In [2]:
# train_model_dashboard.ipynb

# ===================================================
# 1. Imports
# ===================================================
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# ===================================================
# 2. Load dataset
# ===================================================
df = pd.read_csv("cleaned_student_data_v2.csv")  # replace with your dataset file

# Show columns
print("Columns available in the dataset:")
print(df.columns.tolist())

# ===================================================
# 3. Select only required columns
# ===================================================
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]
df = df[selected_columns].copy()

# ===================================================
# 4. Convert target to numeric
# ===================================================
df['Target'] = df['Target'].replace({"Dropout": 0, "Graduate": 1})
print("‚úÖ Unique values in Target:", df['Target'].unique())

# ===================================================
# 5. Separate features and target
# ===================================================
X = df.drop('Target', axis=1)
y = df['Target']

# ===================================================
# 6. Train-test split
# ===================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# ===================================================
# 7. Handle imbalance with SMOTE
# ===================================================
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

from collections import Counter
print("\nüìà Class distribution after SMOTE:", Counter(y_train_resampled))

# ===================================================
# 8. Train CatBoost model
# ===================================================
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    eval_metric='Accuracy',
    verbose=100,
    random_seed=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test))

# ===================================================
# 9. Evaluate model
# ===================================================
y_pred = model.predict(X_test)
print(f"\nüéØ Model Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ===================================================
# 10. Save model
# ===================================================
model.save_model("best_catboost_model_dashboard.pkl")
print("\nüíæ Model saved as 'best_catboost_model_dashboard.pkl'")


Columns available in the dataset:
['Previous qualification (grade)', 'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Target']
‚úÖ Unique values in Target: [0 1 'Enrolled']


TypeError: '<' not supported between instances of 'int' and 'str'

In [None]:
# train_model_dashboard.ipynb

# ===================================================
# 1. Imports
# ===================================================
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# ===================================================
# 2. Load dataset
# ===================================================
df = pd.read_csv("student_data.csv")  # replace with your dataset file

# Show columns
print("Columns available in the dataset:")
print(df.columns.tolist())

# ===================================================
# 3. Select only required columns
# ===================================================
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]
df = df[selected_columns].copy()

# ===================================================
# 4. Convert target to numeric
# ===================================================
df['Target'] = df['Target'].replace({"Dropout": 0, "Graduate": 1})
print("‚úÖ Unique values in Target:", df['Target'].unique())

# ===================================================
# 5. Separate features and target
# ===================================================
X = df.drop('Target', axis=1)
y = df['Target']

# ===================================================
# 6. Train-test split
# ===================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# ===================================================
# 7. Handle imbalance with SMOTE
# ===================================================
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

from collections import Counter
print("\nüìà Class distribution after SMOTE:", Counter(y_train_resampled))

# ===================================================
# 8. Train CatBoost model
# ===================================================
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    eval_metric='Accuracy',
    verbose=100,
    random_seed=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test))

# ===================================================
# 9. Evaluate model
# ===================================================
y_pred = model.predict(X_test)
print(f"\nüéØ Model Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ===================================================
# 10. Save model
# ===================================================
model.save_model("best_catboost_model_dashboard.pkl")
print("\nüíæ Model saved as 'best_catboost_model_dashboard.pkl'")


In [3]:
# train_model_dashboard.ipynb

# ===================================================
# 1. Imports
# ===================================================
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# ===================================================
# 2. Load dataset
# ===================================================
df = pd.read_csv("student_data.csv")  # replace with your dataset file

# Show columns
print("Columns available in the dataset:")
print(df.columns.tolist())

# ===================================================
# 3. Select only required columns
# ===================================================
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]
df = df[selected_columns].copy()

# ===================================================
# 4. Convert target to numeric
# ===================================================
df['Target'] = df['Target'].replace({"Dropout": 0, "Graduate": 1})
print("‚úÖ Unique values in Target:", df['Target'].unique())

# ===================================================
# 5. Separate features and target
# ===================================================
X = df.drop('Target', axis=1)
y = df['Target']

# ===================================================
# 6. Train-test split
# ===================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# ===================================================
# 7. Handle imbalance with SMOTE
# ===================================================
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

from collections import Counter
print("\nüìà Class distribution after SMOTE:", Counter(y_train_resampled))

# ===================================================
# 8. Train CatBoost model
# ===================================================
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    eval_metric='Accuracy',
    verbose=100,
    random_seed=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test))

# ===================================================
# 9. Evaluate model
# ===================================================
y_pred = model.predict(X_test)
print(f"\nüéØ Model Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ===================================================
# 10. Save model
# ===================================================
model.save_model("best_catboost_model_dashboard.pkl")
print("\nüíæ Model saved as 'best_catboost_model_dashboard.pkl'")

FileNotFoundError: [Errno 2] No such file or directory: 'student_data.csv'

In [4]:
# train_model_dashboard.ipynb

# ===================================================
# 1. Imports
# ===================================================
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# ===================================================
# 2. Load dataset
# ===================================================
df = pd.read_csv("cleaned_student_data_v2.csv")  # replace with your dataset file

# Show columns
print("Columns available in the dataset:")
print(df.columns.tolist())

# ===================================================
# 3. Select only required columns
# ===================================================
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]
df = df[selected_columns].copy()

# ===================================================
# 4. Convert target to numeric
# ===================================================
df['Target'] = df['Target'].replace({"Dropout": 0, "Graduate": 1})
print("‚úÖ Unique values in Target:", df['Target'].unique())

# ===================================================
# 5. Separate features and target
# ===================================================
X = df.drop('Target', axis=1)
y = df['Target']

# ===================================================
# 6. Train-test split
# ===================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# ===================================================
# 7. Handle imbalance with SMOTE
# ===================================================
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

from collections import Counter
print("\nüìà Class distribution after SMOTE:", Counter(y_train_resampled))

# ===================================================
# 8. Train CatBoost model
# ===================================================
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    eval_metric='Accuracy',
    verbose=100,
    random_seed=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test))

# ===================================================
# 9. Evaluate model
# ===================================================
y_pred = model.predict(X_test)
print(f"\nüéØ Model Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ===================================================
# 10. Save model
# ===================================================
model.save_model("best_catboost_model_dashboard.pkl")
print("\nüíæ Model saved as 'best_catboost_model_dashboard.pkl'")

Columns available in the dataset:
['Previous qualification (grade)', 'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Target']
‚úÖ Unique values in Target: [0 1 'Enrolled']


TypeError: '<' not supported between instances of 'int' and 'str'

In [5]:
# ===============================
# Student Dropout Prediction Training Notebook
# ===============================

# 1Ô∏è‚É£ Import libraries
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle

# 2Ô∏è‚É£ Load dataset
df = pd.read_csv("cleaned_student_data.csv")  # replace with your actual file

# Show available columns
print("Columns available in the dataset:")
for idx, col in enumerate(df.columns):
    print(f"{idx}: {col}")

# ‚úÖ Selected columns based on previous discussion
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]

df = df[selected_columns]

# 3Ô∏è‚É£ Clean Target column
# Convert 'Dropout' -> 0, 'Graduate' and 'Enrolled' -> 1
df['Target'] = df['Target'].replace({
    "Dropout": 0,
    "Graduate":1 
})

print("\n‚úÖ Unique values in Target after cleaning:", df['Target'].unique())

# 4Ô∏è‚É£ Handle missing values (if any)
df = df.dropna()  # simple approach, can be enhanced

# 5Ô∏è‚É£ Prepare features and target
X = df.drop('Target', axis=1)
y = df['Target']

# 6Ô∏è‚É£ Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\n‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# 7Ô∏è‚É£ Check class distribution before SMOTE
print("\nüìä Class distribution before SMOTE:")
print(Counter(y_train))

# 8Ô∏è‚É£ Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))

# 9Ô∏è‚É£ Train CatBoost Classifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=100,
    random_state=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test), verbose=100)

# 10Ô∏è‚É£ Evaluate model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nüéØ Model Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 11Ô∏è‚É£ Save the trained model for dashboard
with open("best_catboost_model_final.pkl", "wb") as f:
    pickle.dump(model, f)

print("\nüíæ Model saved as 'best_catboost_model_final.pkl'")


SyntaxError: expression expected after dictionary key and ':' (2484574044.py, line 41)

In [6]:
# ===============================
# Student Dropout Prediction Training Notebook
# ===============================

# 1Ô∏è‚É£ Import libraries
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle

# 2Ô∏è‚É£ Load dataset
df = pd.read_csv("cleaned_student_data.csv")  # replace with your actual file

# Show available columns
print("Columns available in the dataset:")
for idx, col in enumerate(df.columns):
    print(f"{idx}: {col}")

# ‚úÖ Selected columns based on previous discussion
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]

df = df[selected_columns]

# 3Ô∏è‚É£ Clean Target column
# Convert 'Dropout' -> 0, 'Graduate' and 'Enrolled' -> 1
df['Target'] = df['Target'].replace({
    "Dropout": 0,
    "Graduate":1 
})

print("\n‚úÖ Unique values in Target after cleaning:", df['Target'].unique())

# 4Ô∏è‚É£ Handle missing values (if any)
df = df.dropna()  # simple approach, can be enhanced

# 5Ô∏è‚É£ Prepare features and target
X = df.drop('Target', axis=1)
y = df['Target']

# 6Ô∏è‚É£ Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\n‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# 7Ô∏è‚É£ Check class distribution before SMOTE
print("\nüìä Class distribution before SMOTE:")
print(Counter(y_train))

# 8Ô∏è‚É£ Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))

# 9Ô∏è‚É£ Train CatBoost Classifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=100,
    random_state=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test), verbose=100)

# 10Ô∏è‚É£ Evaluate model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nüéØ Model Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 11Ô∏è‚É£ Save the trained model for dashboard
with open("best_catboost_model_final.pkl", "wb") as f:
    pickle.dump(model, f)

print("\nüíæ Model saved as 'best_catboost_model_final.pkl'")

Columns available in the dataset:
0: Target_num
1: Curricular units 1st sem (grade)
2: Admission grade
3: Previous qualification (grade)
4: Age at enrollment


KeyError: "['Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Target'] not in index"

In [None]:
# ===============================
# Student Dropout Prediction Training Notebook
# ===============================

# 1Ô∏è‚É£ Import libraries
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle

# 2Ô∏è‚É£ Load dataset
df = pd.read_csv("cleaned_student_data.csv")  

# Show available columns
print("Columns available in the dataset:")
for idx, col in enumerate(df.columns):
    print(f"{idx}: {col}")

# ‚úÖ Selected columns
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]

df = df[selected_columns]

# 3Ô∏è‚É£ Clean Target column
df['Target'] = df['Target'].replace({
    "Dropout": 0,
    "Graduate": 1,
    "Enrolled": 1
})
print("\n‚úÖ Unique values in Target after cleaning:", df['Target'].unique())

# 4Ô∏è‚É£ Handle missing values
df = df.dropna()

# 5Ô∏è‚É£ Prepare features and target
X = df.drop('Target', axis=1)
y = df['Target']

# 6Ô∏è‚É£ Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\n‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# 7Ô∏è‚É£ Check class distribution before SMOTE
print("\nüìä Class distribution before SMOTE:")
print(Counter(y_train))

# 8Ô∏è‚É£ Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))

# 9Ô∏è‚É£ Train CatBoost Classifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=100,
    random_state=42
)
model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test), verbose=100)

# 10Ô∏è‚É£ Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nüéØ Model Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 11Ô∏è‚É£ Save the model
with open("best_catboost_model_final.pkl", "wb") as f:
    pickle.dump(model, f)
print("\nüíæ Model saved as 'best_catboost_model_final.pkl'")


Columns available in the dataset:
0: Target_num
1: Curricular units 1st sem (grade)
2: Admission grade
3: Previous qualification (grade)
4: Age at enrollment


KeyError: "['Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Target'] not in index"

In [None]:
# ===============================
# Student Dropout Prediction Training Notebook
# ===============================

# 1Ô∏è‚É£ Import libraries
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle

# 2Ô∏è‚É£ Load dataset
df = pd.read_csv("cleaned_student_data_v2.csv")  

# Show available columns
print("Columns available in the dataset:")
for idx, col in enumerate(df.columns):
    print(f"{idx}: {col}")

# ‚úÖ Selected columns
selected_columns = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Target'
]

df = df[selected_columns]

# 3Ô∏è‚É£ Clean Target column
df['Target'] = df['Target'].replace({
    "Dropout": 0,
    "Graduate": 1,
    "Enrolled": 1
})
print("\n‚úÖ Unique values in Target after cleaning:", df['Target'].unique())

# 4Ô∏è‚É£ Handle missing values
df = df.dropna()

# 5Ô∏è‚É£ Prepare features and target
X = df.drop('Target', axis=1)
y = df['Target']

# 6Ô∏è‚É£ Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\n‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# 7Ô∏è‚É£ Check class distribution before SMOTE
print("\nüìä Class distribution before SMOTE:")
print(Counter(y_train))

# 8Ô∏è‚É£ Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))

# 9Ô∏è‚É£ Train CatBoost Classifier
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=100,
    random_state=42
)
model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test), verbose=100)

# 10Ô∏è‚É£ Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nüéØ Model Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 11Ô∏è‚É£ Save the model
with open("best_catboost_model_final.pkl", "wb") as f:
    pickle.dump(model, f)
print("\nüíæ Model saved as 'best_catboost_model_final.pkl'")

Columns available in the dataset:
0: Previous qualification (grade)
1: Admission grade
2: Age at enrollment
3: Curricular units 1st sem (approved)
4: Curricular units 1st sem (grade)
5: Curricular units 2nd sem (approved)
6: Curricular units 2nd sem (grade)
7: Target

‚úÖ Unique values in Target after cleaning: [0 1]

‚úÖ Data split completed!
Training set: (3539, 7)
Test set: (885, 7)

üìä Class distribution before SMOTE:
Counter({1: 2402, 0: 1137})

üìà Class distribution after SMOTE:
Counter({0: 2402, 1: 2402})


  df['Target'] = df['Target'].replace({


0:	learn: 0.8172356	test: 0.8327684	best: 0.8327684 (0)	total: 137ms	remaining: 2m 17s
100:	learn: 0.8819734	test: 0.8429379	best: 0.8474576 (78)	total: 743ms	remaining: 6.62s
200:	learn: 0.9154871	test: 0.8361582	best: 0.8474576 (78)	total: 1.59s	remaining: 6.33s
300:	learn: 0.9331807	test: 0.8338983	best: 0.8474576 (78)	total: 2.17s	remaining: 5.05s
400:	learn: 0.9479600	test: 0.8338983	best: 0.8474576 (78)	total: 2.71s	remaining: 4.05s
500:	learn: 0.9612823	test: 0.8305085	best: 0.8474576 (78)	total: 3.27s	remaining: 3.26s
600:	learn: 0.9708576	test: 0.8338983	best: 0.8474576 (78)	total: 3.81s	remaining: 2.53s
700:	learn: 0.9766861	test: 0.8338983	best: 0.8474576 (78)	total: 4.34s	remaining: 1.85s
800:	learn: 0.9814738	test: 0.8282486	best: 0.8474576 (78)	total: 4.88s	remaining: 1.21s
900:	learn: 0.9852206	test: 0.8271186	best: 0.8474576 (78)	total: 5.44s	remaining: 598ms
999:	learn: 0.9879267	test: 0.8248588	best: 0.8474576 (78)	total: 5.96s	remaining: 0us

bestTest = 0.8474576271


In [9]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("cleaned_student_data_v2.csv")

# Show column names
print("Columns available in the dataset:")
print(df.columns.tolist())

# Show first 20 rows
print("\nFirst 20 rows of the dataset:")
print(df.head(20))

# Show min/max values for numeric features
numeric_cols = df.columns.drop("Target")
print("\nFeature ranges:")
print(df[numeric_cols].describe())


Columns available in the dataset:
['Previous qualification (grade)', 'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Target']

First 20 rows of the dataset:
    Previous qualification (grade)  Admission grade  Age at enrollment  \
0                            122.0            127.3                 20   
1                            160.0            142.5                 19   
2                            122.0            124.8                 19   
3                            122.0            119.6                 20   
4                            100.0            141.5                 45   
5                            133.1            114.8                 50   
6                            142.0            128.4                 18   
7                            119.0            113.1                 22   
8                            137.0