In [26]:
import mlflow
import mlflow.sklearn
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
import sklearn
sklearn.set_config(transform_output="pandas")

In [3]:
df = pd.read_csv("processed_data/dataset_after_feature_building.csv")
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Parental_influence,Course_Dropout_prob,Course_Enrolled_prob,Course_Graduate_prob,Application mode_Dropout_prob,Application mode_Enrolled_prob,Application mode_Graduate_prob,Previous qualification_Dropout_prob,Previous qualification_Enrolled_prob,Previous qualification_Graduate_prob
0,1,8,5,2,1,1,1,13,10,6,...,-4.824723,0.381395,0.172093,0.446512,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
1,1,6,1,11,1,1,1,1,3,4,...,-18.023059,0.380952,0.162698,0.456349,0.166667,0.333333,0.5,0.290019,0.187786,0.522195
2,1,1,5,5,1,1,1,22,27,10,...,14.511525,0.225664,0.185841,0.588496,0.201991,0.175644,0.622365,0.290019,0.187786,0.522195
3,1,8,2,15,1,1,1,23,27,6,...,14.122941,0.305136,0.102719,0.592145,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
4,2,12,1,3,0,1,1,22,28,10,...,15.327614,0.330233,0.097674,0.572093,0.55414,0.15414,0.29172,0.290019,0.187786,0.522195


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 52 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [5]:
selected_features = [
    "Tuition fees up to date",
    "Age at enrollment",
    "2nd_sem_perf_ratio",
    "Failed units ratio",
    "Course_Graduate_prob",
    "Mother's qualification",
    "Father's qualification",
    "Inflation rate",
    "Late_enrollment",
    "Course_Graduate_prob",
    "Application mode_Enrolled_prob",
    "Curricular units 2nd sem (grade)",
    "Parental_influence",
    "Curricular units 1st sem (grade)",
    "Target"
  ]

In [6]:
df = df.loc[:, selected_features]

In [7]:
df.head()

Unnamed: 0,Tuition fees up to date,Age at enrollment,2nd_sem_perf_ratio,Failed units ratio,Course_Graduate_prob,Mother's qualification,Father's qualification,Inflation rate,Late_enrollment,Course_Graduate_prob.1,Application mode_Enrolled_prob,Curricular units 2nd sem (grade),Parental_influence,Curricular units 1st sem (grade),Target
0,1,20,0.0,0.0,0.446512,13,10,1.4,0,0.446512,0.182339,0.0,-4.824723,0.0,Dropout
1,0,19,1.0,0.0,0.456349,1,3,-0.3,0,0.456349,0.333333,13.666667,-18.023059,14.0,Graduate
2,0,19,0.0,1.0,0.588496,22,27,1.4,0,0.588496,0.175644,0.0,14.511525,0.0,Dropout
3,1,20,0.833333,0.0,0.592145,23,27,-0.8,0,0.592145,0.182339,12.4,14.122941,13.428571,Graduate
4,1,45,1.0,0.166667,0.572093,22,28,-0.3,1,0.572093,0.15414,13.0,15.327614,12.333333,Graduate


In [8]:
df.shape

(4424, 15)

In [9]:
X = df.drop(columns=['Target'])
y = df['Target']

In [17]:
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

In [12]:
y_train

array([2, 1, 0, ..., 2, 0, 1])

In [13]:
X_train.shape, X_test.shape

((3539, 14), (885, 14))

In [23]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVC": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
}

In [24]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

# Start MLflow Experiment
mlflow.set_experiment("Multi-Class Model Comparison")

# Train & Evaluate Each Model
for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name}"):
        
        # Handle XGBoost Issue (Convert X_train if needed)
        if model_name == "XGBoost":
            X_train_model = np.array(X_train)  # Ensure it's NumPy
            X_test_model = np.array(X_test)
        else:
            X_train_model = X_train
            X_test_model = X_test
        
        # Train Model
        model.fit(X_train_model, y_train)
        y_pred = model.predict(X_test_model)

        # Compute Metrics
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision_macro": precision_score(y_test, y_pred, average="macro"),
            "recall_macro": recall_score(y_test, y_pred, average="macro"),
            "f1_macro": f1_score(y_test, y_pred, average="macro")
        }

        # Log Parameters & Metrics
        mlflow.log_params({"model": model_name})
        mlflow.log_metrics(metrics)

        # Log Model
        mlflow.sklearn.log_model(model, f"model_{model_name}")

        # Compute & Save Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(cm, index=np.unique(y_test), columns=np.unique(y_test))

        plt.figure(figsize=(6, 4))
        sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title(f"Confusion Matrix - {model_name}")

        # Save Confusion Matrix as Image
        cm_image_path = f"confusion_matrix_{model_name}.png"
        plt.savefig(cm_image_path)
        plt.close()

        # Log Confusion Matrix Image in MLflow
        mlflow.log_artifact(cm_image_path)

print("✅ MLflow logging complete! Compare different models in MLflow UI.")



🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/677329907534610718/runs/d3a96372e01f43e6be83c8e600c681bf
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/677329907534610718


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/677329907534610718/runs/e6d2d5be14e34a9cb0e492bb2f7981ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/677329907534610718




🏃 View run GradientBoosting at: http://127.0.0.1:5000/#/experiments/677329907534610718/runs/ac8e8feb1021434188292470ea6bd288
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/677329907534610718


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run SVC at: http://127.0.0.1:5000/#/experiments/677329907534610718/runs/6376e8d3015541018898263c1fc1f753
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/677329907534610718


Parameters: { "use_label_encoder" } are not used.



🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/677329907534610718/runs/ea7b3aa16b9143e89fcde49afad4e280
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/677329907534610718
✅ MLflow logging complete! Compare different models in MLflow UI.


In [27]:
# Define Hyperparameter Grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1.0],
    "min_samples_split": [2, 5, 10]
}

# Initialize Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)

# Grid Search with Cross-Validation
grid_search = GridSearchCV(gb_model, param_grid, scoring="accuracy", cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get Best Model
best_model = grid_search.best_estimator_

# Predict with Best Model
y_pred = best_model.predict(X_test)

# Compute Metrics
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred, average="macro"),
    "recall_macro": recall_score(y_test, y_pred, average="macro"),
    "f1_macro": f1_score(y_test, y_pred, average="macro")
}

# Start MLflow Run
mlflow.set_experiment("GradientBoosting Hyperparameter Tuning")

with mlflow.start_run(run_name="Best_GradientBoosting_Model"):
    # Log Best Parameters
    mlflow.log_params(grid_search.best_params_)

    # Log Metrics
    mlflow.log_metrics(metrics)

    # Log Model
    mlflow.sklearn.log_model(best_model, "best_gradient_boosting_model")

    # Compute & Save Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # cm_df = pd.DataFrame(cm, index=np.unique(y_test), columns=np.unique(y_test))

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix - Best Gradient Boosting Model")

    # Save Confusion Matrix as Image
    cm_image_path = "confusion_matrix_gb.png"
    plt.savefig(cm_image_path)
    plt.close()

    # Log Confusion Matrix Image in MLflow
    mlflow.log_artifact(cm_image_path)

print("✅ Hyperparameter tuning complete! Check MLflow UI for results.")

Fitting 3 folds for each of 162 candidates, totalling 486 fits


2025/02/04 12:28:55 INFO mlflow.tracking.fluent: Experiment with name 'GradientBoosting Hyperparameter Tuning' does not exist. Creating a new experiment.


🏃 View run Best_GradientBoosting_Model at: http://127.0.0.1:5000/#/experiments/275122257120077088/runs/525f17369b3944c6a4c85b2822bde730
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/275122257120077088
✅ Hyperparameter tuning complete! Check MLflow UI for results.
