In [29]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv("train_cleaned_final.csv")

# Parse program age ranges
age_split = df['Age Range by Program'].astype(str).str.extract(r'(?P<min_age>\d+)-(?P<max_age>\d+)')
df['Program Min Age'] = age_split['min_age'].astype(float)
df['Program Max Age'] = age_split['max_age'].astype(float)

# Age-based features
df['Age In Range'] = df.apply(lambda row: 1 if row['Program Min Age'] <= row['Age'] <= row['Program Max Age'] else 0, axis=1)
df['Age Gap To Min'] = df['Age'] - df['Program Min Age']
df['Age Gap To Max'] = df['Program Max Age'] - df['Age']
df['Extreme Age'] = df.apply(lambda row: 1 if row['Age'] < row['Program Min Age'] - 5 or row['Age'] > row['Program Max Age'] + 5 else 0, axis=1)

# Drop text column
df.drop(columns=['Age Range by Program'], inplace=True)

# Drop irrelevant columns
df.drop(columns=['Student ID', 'Home City', 'Program ID', 'Program Start Date', 'Program End Date'], inplace=True, errors='ignore')

# Clean college labels
df['College'] = df['College'].replace({
    "Natural Sciences Mathematics and Statistics": "Natural Sciences, Mathematics, and Statistics",
    "Business Management and Law": "Business, Management and Law",
    "Business,Management and Law": "Business, Management and Law"
}).astype(str).str.strip()

# Strip whitespace in category codes
df['Program Main Category Code'] = df['Program Main Category Code'].astype(str).str.strip()
df['Program Sub Category Code'] = df['Program Sub Category Code'].astype(str).str.strip()

# Encode binary columns
df['Completed Degree'] = df['Completed Degree'].map({'Yes': 1, 'No': 0})
df['Still Working'] = df['Still Working'].map({'Yes': 1, 'No': 0})

# Encode categorical
categorical_cols = df.select_dtypes(include='object').columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features and target
X = df.drop(columns=['Y'])
y = df['Y'].map({0: 1, 1: 0})  # 1 = completed, 0 = quit

# Scale numeric features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Define model grids
model_grids = {
    "Logistic Regression": (
        LogisticRegression(class_weight='balanced', max_iter=1000),
        {"C": [0.01, 0.1, 1, 10]}
    ),
    "SVM": (
        SVC(class_weight='balanced', probability=True),
        {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    ),
    "Random Forest": (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        {"n_estimators": [100, 200], "max_depth": [None, 10, 20]}
    ),
    "Decision Tree": (
        DecisionTreeClassifier(class_weight='balanced', random_state=42),
        {"max_depth": [None, 10, 20], "min_samples_split": [2, 5]}
    )
}

# Train and evaluate
results = {}
for name, (model, params) in model_grids.items():
    grid = GridSearchCV(model, params, scoring='f1', cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    results[name] = {
        "Model": name,
        "Best Params": grid.best_params_,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "Best Estimator": grid.best_estimator_
    }

# Select and save best model
results_df = pd.DataFrame(results).T.sort_values(by="F1 Score", ascending=False)
best_model_name = results_df.iloc[0]["Model"]
best_model = results[best_model_name]["Best Estimator"]

joblib.dump(best_model, "best_model_extreme_age.pkl")
joblib.dump(scaler, "scaler_extreme_age.pkl")
joblib.dump(label_encoders, "label_encoders_extreme_age.pkl")

# --- Output Summary ---
print(f"\n✅ Best model saved: {best_model_name}")
print("📦 Files saved: best_model.pkl, scaler.pkl, label_encoders.pkl\n")
print("📊 Model Performance Summary:")
print(results_df.drop(columns='Best Estimator'))



✅ Best model saved: Random Forest
📦 Files saved: best_model.pkl, scaler.pkl, label_encoders.pkl

📊 Model Performance Summary:
                                   Model  \
Random Forest              Random Forest   
Logistic Regression  Logistic Regression   
SVM                                  SVM   
Decision Tree              Decision Tree   

                                                     Best Params  Accuracy  \
Random Forest           {'max_depth': None, 'n_estimators': 100}  0.883741   
Logistic Regression                                  {'C': 0.01}  0.854254   
SVM                                    {'C': 1, 'kernel': 'rbf'}  0.850042   
Decision Tree        {'max_depth': None, 'min_samples_split': 2}  0.837405   

                    Precision    Recall  F1 Score  
Random Forest        0.909702    0.9556  0.932087  
Logistic Regression  0.945534  0.875883  0.909377  
SVM                  0.944262  0.871847  0.906611  
Decision Tree        0.907143  0.897074   0.90208  


In [30]:
import plotly.express as px
import pandas as pd

df_long = results_df.reset_index().melt(id_vars='Model', 
                          value_vars=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
                          var_name='Metric', value_name='Score')

fig = px.bar(df_long, 
             x='Model', 
             y='Score', 
             color='Metric',
             barmode='group',
             title='Model Performance Comparison',
             text='Score',
             height=600,
             color_discrete_sequence=px.colors.sequential.Blues[2:]
             )

fig.update_layout(
    xaxis_title='Model',
    yaxis_title='Score',
    legend_title='Metric',
    font=dict(size=14),
    plot_bgcolor='white',
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    bargap=0.4
)

fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

fig.show()

In [31]:
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

best_rf_model = results["Random Forest"]["Best Estimator"]
y_pred_rf = best_rf_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred_rf)
cm_percent = cm / cm.sum() * 100  

labels = [["TN", "FP"], ["FN", "TP"]]

annotations = [[f"{label}<br>{value}<br>({percent:.1f}%)"
                for label, value, percent in zip(row_l, row_v, row_p)]
               for row_l, row_v, row_p in zip(labels, cm, cm_percent)]

fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=["Predicted Negative", "Predicted Positive"],
    y=["Actual Negative", "Actual Positive"],
    text=annotations,
    texttemplate="%{text}",
    colorscale="Blues",
    showscale=True,
    hoverinfo="skip"
))

fig.update_layout(
    title="Confusion Matrix for Random Forest",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    font=dict(size=18),
    width=800,
    height=550,
    margin=dict(t=80, l=100, r=20, b=80)
)

fig.show()