In [2]:
import pandas as pd
import shap
!pip install optuna
import optuna
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from imblearn.over_sampling import SMOTE

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [17]:
# Load the dataset
df = pd.read_csv("dropout.csv")

# Encode categorical variables (if any)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target (assuming last column is the target)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Map target labels to class names
class_labels = {0: "graduate", 1: "dropout", 2: "enrolled"}
y = y.map(class_labels)

# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [18]:
# Ensure X is non-negative for chi2 (Modified #1 Preprocessing)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Feature Selection (Modified #1 - SelectKBest)
selector = SelectKBest(chi2, k=5)
X_selected = selector.fit_transform(X, y)

# Synthetic Oversampling (Modified #1 - Handling Class Imbalance)
smote = SMOTE(random_state=318945)
X_resampled, y_resampled = smote.fit_resample(X_selected, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=318945)

# Polynomial Features (Modified #1 - Feature Engineering for Logistic Regression)
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [21]:
# Function to train, evaluate and return metrics
def train_and_evaluate(model, model_name, X_train, X_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    cm = confusion_matrix(y_test, y_pred, labels=["graduate", "dropout", "enrolled"])
    report = classification_report(y_test, y_pred, target_names=["graduate", "dropout", "enrolled"])

    print(f"\nModel: {model_name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(report)

    return model

# Baseline Models
rf_baseline = train_and_evaluate(RandomForestClassifier(random_state=318945), "Random Forest Baseline", X_train, X_test)
log_reg_baseline = train_and_evaluate(LogisticRegression(random_state=318945), "Logistic Regression Baseline", X_train, X_test)
svm_baseline = train_and_evaluate(SVC(kernel="linear", random_state=318945), "SVM Baseline", X_train, X_test)
nb_baseline = train_and_evaluate(GaussianNB(), "Naïve Bayes Baseline", X_train, X_test)

# Modified Algorithm #1 Models
rf_modified1 = train_and_evaluate(RandomForestClassifier(min_samples_split=5, class_weight="balanced", random_state=318945), "Random Forest Modified #1", X_train, X_test)
log_reg_modified1 = train_and_evaluate(LogisticRegression(C=0.5, max_iter=1000, random_state=318945), "Logistic Regression Modified #1", X_train_poly, X_test_poly)
svm_modified1 = train_and_evaluate(SVC(C=0.5, kernel="rbf", random_state=318945), "SVM Modified #1", X_train, X_test)
nb_modified1 = train_and_evaluate(GaussianNB(var_smoothing=1e-2), "Naïve Bayes Modified #1", X_train, X_test)

# Hyperparameter optimization with Optuna (Modified Algorithm #2)
def objective(trial):
    model_name = trial.suggest_categorical("model", ["Random Forest", "Logistic Regression", "SVM", "Naïve Bayes"])
    if model_name == "Random Forest":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 10, 200),
            max_depth=trial.suggest_int("max_depth", 3, 20),
            random_state=318945
        )
    elif model_name == "Logistic Regression":
        model = LogisticRegression(
            C=trial.suggest_loguniform("C", 1e-3, 1e3),
            max_iter=1000,
            random_state=318945
        )
    elif model_name == "SVM":
        model = SVC(
            C=trial.suggest_loguniform("C", 1e-3, 1e3),
            kernel="linear",
            random_state=318945
        )
    else:
        model = GaussianNB(var_smoothing=trial.suggest_loguniform("var_smoothing", 1e-9, 1e-1))

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="weighted")

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print("Best parameters:", study.best_params)

# Train and evaluate Optimized Models (Modified #2)
rf_optimized = RandomForestClassifier(n_estimators=study.best_params.get("n_estimators", 100), max_depth=study.best_params.get("max_depth", None), random_state=318945)
log_reg_optimized = LogisticRegression(C=study.best_params.get("C", 1.0), max_iter=1000, random_state=318945)
svm_optimized = SVC(C=study.best_params.get("C", 1.0), kernel="linear", random_state=318945)
nb_optimized = GaussianNB(var_smoothing=study.best_params.get("var_smoothing", 1e-9))

train_and_evaluate(rf_optimized, "Random Forest Optimized", X_train, X_test)
train_and_evaluate(log_reg_optimized, "Logistic Regression Optimized", X_train_poly, X_test_poly)
train_and_evaluate(svm_optimized, "SVM Optimized", X_train, X_test)
train_and_evaluate(nb_optimized, "Naïve Bayes Optimized", X_train, X_test)


Model: Random Forest Baseline
Accuracy: 0.6923
F1 Score: 0.6916
Confusion Matrix:
[[335  75  48]
 [ 74 254  92]
 [ 47  72 329]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.63      0.60      0.62       420
     dropout       0.70      0.73      0.72       448
    enrolled       0.73      0.73      0.73       458

    accuracy                           0.69      1326
   macro avg       0.69      0.69      0.69      1326
weighted avg       0.69      0.69      0.69      1326


Model: Logistic Regression Baseline
Accuracy: 0.6320
F1 Score: 0.6376
Confusion Matrix:
[[266 142  50]
 [ 38 255 127]
 [ 10 121 317]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.49      0.61      0.54       420
     dropout       0.64      0.71      0.67       448
    enrolled       0.85      0.58      0.69       458

    accuracy                           0.63      1326
   macro avg       0.66      0.63    

[I 2025-02-20 03:08:09,270] A new study created in memory with name: no-name-d76862d3-b43a-4d4b-a6ed-e7ba947dd63b
  model = GaussianNB(var_smoothing=trial.suggest_loguniform("var_smoothing", 1e-9, 1e-1))
[I 2025-02-20 03:08:09,288] Trial 0 finished with value: 0.6231786518292287 and parameters: {'model': 'Naïve Bayes', 'var_smoothing': 2.3639868980947885e-09}. Best is trial 0 with value: 0.6231786518292287.
  C=trial.suggest_loguniform("C", 1e-3, 1e3),
[I 2025-02-20 03:08:09,341] Trial 1 finished with value: 0.6363243470518782 and parameters: {'model': 'Logistic Regression', 'C': 8.71842729269408}. Best is trial 1 with value: 0.6363243470518782.
  C=trial.suggest_loguniform("C", 1e-3, 1e3),
[I 2025-02-20 03:08:09,377] Trial 2 finished with value: 0.6322648257204074 and parameters: {'model': 'Logistic Regression', 'C': 0.016247730275884536}. Best is trial 1 with value: 0.6363243470518782.



Model: SVM Modified #1
Accuracy: 0.6418
F1 Score: 0.6432
Confusion Matrix:
[[276 132  50]
 [ 49 230 141]
 [ 16  87 345]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.51      0.55      0.53       420
     dropout       0.64      0.77      0.70       448
    enrolled       0.81      0.60      0.69       458

    accuracy                           0.64      1326
   macro avg       0.66      0.64      0.64      1326
weighted avg       0.66      0.64      0.64      1326


Model: Naïve Bayes Modified #1
Accuracy: 0.6214
F1 Score: 0.6145
Confusion Matrix:
[[277 105  76]
 [ 55 174 191]
 [ 16  59 373]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.51      0.41      0.46       420
     dropout       0.58      0.83      0.69       448
    enrolled       0.80      0.60      0.69       458

    accuracy                           0.62      1326
   macro avg       0.63      0.62      0.61      

[I 2025-02-20 03:08:10,408] Trial 3 finished with value: 0.6955776112988175 and parameters: {'model': 'Random Forest', 'n_estimators': 180, 'max_depth': 17}. Best is trial 3 with value: 0.6955776112988175.
  C=trial.suggest_loguniform("C", 1e-3, 1e3),
[I 2025-02-20 03:08:11,397] Trial 4 finished with value: 0.5354578536701117 and parameters: {'model': 'SVM', 'C': 0.017354129506948868}. Best is trial 3 with value: 0.6955776112988175.
  C=trial.suggest_loguniform("C", 1e-3, 1e3),
[I 2025-02-20 03:08:11,450] Trial 5 finished with value: 0.6392450150673481 and parameters: {'model': 'Logistic Regression', 'C': 1.4676972695610868}. Best is trial 3 with value: 0.6955776112988175.
[I 2025-02-20 03:08:12,236] Trial 6 finished with value: 0.7125400309290673 and parameters: {'model': 'Random Forest', 'n_estimators': 152, 'max_depth': 13}. Best is trial 6 with value: 0.7125400309290673.
  C=trial.suggest_loguniform("C", 1e-3, 1e3),
[I 2025-02-20 03:08:20,126] Trial 7 finished with value: 0.6405803

Best parameters: {'model': 'Random Forest', 'n_estimators': 175, 'max_depth': 12}

Model: Random Forest Optimized
Accuracy: 0.7142
F1 Score: 0.7159
Confusion Matrix:
[[327  85  46]
 [ 50 287  83]
 [ 28  87 333]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.63      0.68      0.65       420
     dropout       0.72      0.74      0.73       448
    enrolled       0.81      0.71      0.76       458

    accuracy                           0.71      1326
   macro avg       0.72      0.71      0.71      1326
weighted avg       0.72      0.71      0.72      1326


Model: Logistic Regression Optimized
Accuracy: 0.6463
F1 Score: 0.6492
Confusion Matrix:
[[273 135  50]
 [ 53 249 118]
 [ 12 101 335]]
Classification Report:
              precision    recall  f1-score   support

    graduate       0.51      0.59      0.55       420
     dropout       0.67      0.75      0.70       448
    enrolled       0.81      0.60      0.69       458

    acc

In [23]:
# Model performance data
data = {
    "Model": ["Random Forest", "Logistic Regression", "SVM", "Naïve Bayes"],
    "Accuracy (Baseline)": [69.23, 63.20, 63.57, 63.05],
    "F1 Score (Baseline)": [69.16, 63.76, 63.86, 62.32],
    "Accuracy (Modified #1)": [71.19, 63.50, 64.18, 62.14],
    "F1 Score (Modified #1)": [71.11, 63.80, 64.32, 61.45],
    "Accuracy (Modified #2)": [71.42, 64.63, 63.57, 63.05],
    "F1 Score (Modified #2)": [71.59, 64.92, 63.86, 62.32],
}

# Create DataFrame
df = pd.DataFrame(data)

# Display table
print(df.to_string(index=False))

              Model  Accuracy (Baseline)  F1 Score (Baseline)  Accuracy (Modified #1)  F1 Score (Modified #1)  Accuracy (Modified #2)  F1 Score (Modified #2)
      Random Forest                69.23                69.16                   71.19                   71.11                   71.42                   71.59
Logistic Regression                63.20                63.76                   63.50                   63.80                   64.63                   64.92
                SVM                63.57                63.86                   64.18                   64.32                   63.57                   63.86
        Naïve Bayes                63.05                62.32                   62.14                   61.45                   63.05                   62.32
