In [1]:
# ===========================================================
# 1. Import Libraries and Load Cleaned Dataset
# ===========================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
import pickle

# Load the cleaned dataset
df = pd.read_csv("cleaned_student_data_v2.csv")

print("‚úÖ Dataset loaded successfully!")
print("Shape:", df.shape)
display(df.head())



‚úÖ Dataset loaded successfully!
Shape: (4424, 8)


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Target
0,122.0,127.3,20,0,0.0,0,0.0,Dropout
1,160.0,142.5,19,6,14.0,6,13.666667,Graduate
2,122.0,124.8,19,0,0.0,0,0.0,Dropout
3,122.0,119.6,20,6,13.428571,5,12.4,Graduate
4,100.0,141.5,45,5,12.333333,6,13.0,Graduate


In [2]:
# ===========================================================
# 2. Feature Engineering
# ===========================================================

# Add new interaction features that combine performance from both semesters

df["Total_approved_units"] = (
    df["Curricular units 1st sem (approved)"] +
    df["Curricular units 2nd sem (approved)"]
)

df["Average_semester_grade"] = (
    (df["Curricular units 1st sem (grade)"] +
     df["Curricular units 2nd sem (grade)"]) / 2
)

# Create performance efficiency (approved units / total average grade)
df["Performance_index"] = df["Total_approved_units"] / (df["Average_semester_grade"] + 1)

# Dropout indicator often correlates with low grades + low approved units
df["Low_performance_flag"] = ((df["Average_semester_grade"] < 10) & 
                              (df["Total_approved_units"] < 5)).astype(int)

print("‚úÖ Feature engineering completed.")
display(df.head())


‚úÖ Feature engineering completed.


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Target,Total_approved_units,Average_semester_grade,Performance_index,Low_performance_flag
0,122.0,127.3,20,0,0.0,0,0.0,Dropout,0,0.0,0.0,1
1,160.0,142.5,19,6,14.0,6,13.666667,Graduate,12,13.833333,0.808989,0
2,122.0,124.8,19,0,0.0,0,0.0,Dropout,0,0.0,0.0,1
3,122.0,119.6,20,6,13.428571,5,12.4,Graduate,11,12.914286,0.790554,0
4,100.0,141.5,45,5,12.333333,6,13.0,Graduate,11,12.666667,0.804878,0


In [3]:
# ===========================================================
# 3. Split Features and Target
# ===========================================================

X = df.drop("Target", axis=1)
y = df["Target"]

# Convert Target to binary: Dropout = 0, Graduate = 1
y = y.replace({"Dropout": 0, "Graduate": 1})

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)


TypeError: '<' not supported between instances of 'int' and 'str'

In [3]:
# ===========================================================
# 3. Split Features and Target
# ===========================================================

X = df.drop("Target", axis=1)
y = df["Target"]

# Convert Target to binary: Dropout = 0, Graduate = 1
y = y.replace({"Dropout": 0, "Graduate": 1})

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)


TypeError: '<' not supported between instances of 'int' and 'str'

In [9]:
from collections import Counter

# Before SMOTE
print("üìä Class distribution before SMOTE:")
print(Counter(y_train))

# Apply SMOTE (if not already done)
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# After SMOTE
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))


üìä Class distribution before SMOTE:
Counter({1: 1767, 0: 1137})


ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [4]:
# ===========================================================
# 3. Split Features and Target (Fixed version)
# ===========================================================

X = df.drop("Target", axis=1)
y = df["Target"]

# Convert all to string first (in case there are mixed types)
y = y.astype(str).str.strip().str.capitalize()

# Map categorical to binary values
y = y.replace({"Dropout": 0, "Graduate": 1})

# Drop any rows where target is missing or invalid
valid_targets = [0, 1]
mask = y.isin(valid_targets)
X = X[mask]
y = y[mask]

# Reset index
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Data split completed successfully!")
print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)
print("\nUnique values in target:", y.unique())


‚úÖ Data split completed successfully!
Training set: (2904, 11)
Testing set: (726, 11)

Unique values in target: [0 1]


In [5]:
# ===========================================================
# 4. Train CatBoost Classifier
# ===========================================================

model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=8,
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=False,
    random_seed=42
)

model.fit(X_train, y_train)

# Save model
with open("best_catboost_model_v3.pkl", "wb") as f:
    pickle.dump(model, f)

print("‚úÖ Model training completed and saved as 'best_catboost_model_v3.pkl'")


‚úÖ Model training completed and saved as 'best_catboost_model_v3.pkl'


In [6]:
# ===========================================================
# 5. Evaluate Model Performance
# ===========================================================
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"üéØ Improved Model Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Dropout", "Graduate"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [7]:
# ===========================================================
# 5. Evaluate Model Performance (Fixed)
# ===========================================================
y_pred = model.predict(X_test)

# Ensure both y_test and y_pred are numeric
y_pred = pd.Series(y_pred).replace({"Dropout": 0, "Graduate": 1}).astype(int)
y_test = pd.Series(y_test).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"üéØ Improved Model Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


üéØ Improved Model Accuracy: 0.8774

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.76      0.83       284
           1       0.86      0.95      0.90       442

    accuracy                           0.88       726
   macro avg       0.89      0.86      0.87       726
weighted avg       0.88      0.88      0.88       726

Confusion Matrix:
[[217  67]
 [ 22 420]]


In [8]:
from collections import Counter

# Before SMOTE
print("üìä Class distribution before SMOTE:")
print(Counter(y_train))

# Apply SMOTE (if not already done)
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# After SMOTE
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))


üìä Class distribution before SMOTE:
Counter({1: 1767, 0: 1137})


ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [10]:
# ===========================================================
# 3. Split Features and Target
# ===========================================================

X = df.drop("Target", axis=1)
y = df["Target"]

# Convert Target to binary: Dropout = 0, Graduate = 1
y = y.replace({"Dropout": 0, "Graduate": 1})

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Data split completed!")
print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)
from collections import Counter

# Before SMOTE
print("üìä Class distribution before SMOTE:")
print(Counter(y_train))

# Apply SMOTE (if not already done)
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# After SMOTE
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))


TypeError: '<' not supported between instances of 'int' and 'str'

In [11]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print("üìä Class distribution before SMOTE:")
print(Counter(y_train))


üìä Class distribution before SMOTE:
Counter({1: 1767, 0: 1137})


In [12]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to balance the training set
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Confirm new class distribution
print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))


ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [13]:
print(y_train.unique())
print(y_train.dtype)


[1 0]
object


In [14]:
print("‚úÖ Unique values in y_train after conversion:", y_train.unique())


‚úÖ Unique values in y_train after conversion: [1 0]


In [15]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nüìà Class distribution after SMOTE:")
print(Counter(y_train_resampled))


ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [16]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# ‚úÖ Ensure y_train is a 1D array of integers
y_train = y_train.squeeze()               # remove extra dimension if it's DataFrame
y_train = y_train.astype(int)             # ensure integer type
y_train = pd.Series(y_train)              # force it to be a Series, not DataFrame

print("y_train type:", type(y_train))
print("Unique values:", y_train.unique())

# ‚úÖ Now safely apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nüìä Class distribution after SMOTE:")
print(Counter(y_train_resampled))


y_train type: <class 'pandas.core.series.Series'>
Unique values: [1 0]

üìä Class distribution after SMOTE:
Counter({1: 1767, 0: 1767})


In [17]:
# ===========================================================
# 6. Balance Training Data with SMOTE + Retrain Model
# ===========================================================

from imblearn.over_sampling import SMOTE
from collections import Counter
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Ensure y_train is correctly formatted ---
y_train = y_train.squeeze()
y_train = y_train.astype(int)
y_train = pd.Series(y_train)

print("‚úÖ y_train verified! Unique values:", y_train.unique())

# --- Apply SMOTE to handle imbalance ---
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nüìä Class distribution after SMOTE:")
print(Counter(y_train_resampled))

# --- Retrain the CatBoost model on balanced data ---
model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=8,
    eval_metric='Accuracy',
    verbose=100,
    random_state=42
)

model.fit(X_train_resampled, y_train_resampled, eval_set=(X_test, y_test), verbose=100)

# --- Evaluate performance ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nüéØ Model Accuracy after SMOTE: {accuracy:.4f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


‚úÖ y_train verified! Unique values: [1 0]

üìä Class distribution after SMOTE:
Counter({1: 1767, 0: 1767})
0:	learn: 0.8675722	test: 0.8567493	best: 0.8567493 (0)	total: 14.6ms	remaining: 5.81s
100:	learn: 0.9202037	test: 0.8650138	best: 0.8719008 (21)	total: 1.43s	remaining: 4.24s
200:	learn: 0.9493492	test: 0.8705234	best: 0.8719008 (21)	total: 2.59s	remaining: 2.57s
300:	learn: 0.9634975	test: 0.8705234	best: 0.8719008 (21)	total: 3.61s	remaining: 1.19s
399:	learn: 0.9745331	test: 0.8719008	best: 0.8719008 (21)	total: 4.63s	remaining: 0us

bestTest = 0.8719008264
bestIteration = 21

Shrink model to first 22 iterations.

üéØ Model Accuracy after SMOTE: 0.8719

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.79      0.83       284
           1       0.87      0.92      0.90       442

    accuracy                           0.87       726
   macro avg       0.87      0.86      0.86       726
weighted avg       0.87      0

In [18]:
import pickle

# Save the trained model to a .pkl file
with open("best_catboost_model_smote.pkl", "wb") as f:
    pickle.dump(model, f)

print("üíæ Model saved successfully as 'best_catboost_model_smote.pkl'")


üíæ Model saved successfully as 'best_catboost_model_smote.pkl'


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 1Ô∏è‚É£ Feature Distributions
# ==============================
num_cols = X_train.select_dtypes(include='number').columns.tolist()
for col in num_cols:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col], kde=True, bins=20)
    plt.title(f'Distribution of {col}')
    plt.show()

# ==============================
# 2Ô∏è‚É£ Correlation Heatmap
# ==============================
plt.figure(figsize=(10,6))
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# ==============================
# 3Ô∏è‚É£ Confusion Matrix
# ==============================
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ==============================
# 4Ô∏è‚É£ Feature Importance
# ==============================
plt.figure(figsize=(8,5))
feature_importances = model.get_feature_importance()
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title("Feature Importances")
plt.show()


NameError: name 'X_train' is not defined