In [2]:
pip install -q pandas numpy scikit-learn lightgbm joblib


Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier
import joblib

# Use processed train/test splits from feature engineering
TRAIN_PATH = os.path.join("../data", "train_processed.csv")
TEST_PATH = os.path.join("../data", "test_processed.csv")
MODEL_PATH = os.path.join("../models", "academic_risk_model.joblib")
os.makedirs("../models", exist_ok=True)

# Load processed data
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

print("=== Data Overview ===")
print(f"Train: {len(train_df)} rows, {len(train_df.columns)} cols")
print(f"Test: {len(test_df)} rows, {len(test_df.columns)} cols")

# Prepare features and labels
id_cols = ["student_id", "course_id"]
label_col = "label"
feature_cols = [c for c in train_df.columns if c not in id_cols + [label_col]]

X_train = train_df[feature_cols].values
y_train = train_df[label_col].values
X_test = test_df[feature_cols].values
y_test = test_df[label_col].values

print(f"Features: {len(feature_cols)}")
print(f"Train label distribution: {np.bincount(y_train)}")
print(f"Test label distribution: {np.bincount(y_test)}")

# Baseline: Logistic Regression
print("\n=== Baseline: Logistic Regression ===")
logreg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, random_state=42))
])
logreg_pipeline.fit(X_train, y_train)
y_pred_lr = logreg_pipeline.predict(X_test)

print(f"LogReg F1: {f1_score(y_test, y_pred_lr):.4f}")
print(f"LogReg Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"LogReg Recall: {recall_score(y_test, y_pred_lr):.4f}")

# Main: LightGBM
print("\n=== Main Model: LightGBM ===")
lgbm = LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    n_estimators=100,  # Reduced for small dataset
    learning_rate=0.1,
    max_depth=6,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    verbose=-1
)

lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
y_pred_proba_lgbm = lgbm.predict_proba(X_test)[:, 1]

print(f"LightGBM F1: {f1_score(y_test, y_pred_lgbm):.4f}")
print(f"LightGBM Precision: {precision_score(y_test, y_pred_lgbm):.4f}")
print(f"LightGBM Recall: {recall_score(y_test, y_pred_lgbm):.4f}")

print("\n=== Detailed Classification Report ===")
print(classification_report(y_test, y_pred_lgbm, target_names=['Low Risk (C/D/F)', 'High Success (A/B)']))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred_lgbm))

# Feature importance
print("\n=== Top 10 Feature Importances ===")
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': lgbm.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

# Save model and feature names
joblib.dump({
    'model': lgbm,
    'feature_names': feature_cols,
    'scaler': None  # LightGBM doesn't need scaling
}, MODEL_PATH)

print(f"\nSaved LightGBM model to {MODEL_PATH}")


=== Data Overview ===
Train: 20 rows, 39 cols
Test: 5 rows, 39 cols
Features: 36
Train label distribution: [ 7 13]
Test label distribution: [2 3]

=== Baseline: Logistic Regression ===
LogReg F1: 0.6667
LogReg Precision: 0.6667
LogReg Recall: 0.6667

=== Main Model: LightGBM ===
LightGBM F1: 0.7500
LightGBM Precision: 0.6000
LightGBM Recall: 1.0000

=== Detailed Classification Report ===
                    precision    recall  f1-score   support

  Low Risk (C/D/F)       0.00      0.00      0.00         2
High Success (A/B)       0.60      1.00      0.75         3

          accuracy                           0.60         5
         macro avg       0.30      0.50      0.38         5
      weighted avg       0.36      0.60      0.45         5


=== Confusion Matrix ===
[[0 2]
 [0 3]]

=== Top 10 Feature Importances ===
     feature  importance
0     s_comm           0
1     c_comm           0
20  s_emb_24           0
21  s_emb_27           0
22  s_emb_31           0
23   c_emb_0       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
pip install -q optuna


Note: you may need to restart the kernel to use updated packages.


In [7]:
# Hyperparameter Tuning with Optuna (scales better than Bayesian optimization)
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

print("\n=== Hyperparameter Tuning with Optuna ===")

def objective(trial):
    # Define search space - conservative for small data, will scale well
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'random_state': 42,
        'verbose': -1
    }
    
    # Use cross-validation for robust evaluation on small data
    model = LGBMClassifier(**params)
    
    # 3-fold CV (good for small datasets)
    cv_scores = cross_val_score(
        model, X_train, y_train, 
        cv=3, 
        scoring=make_scorer(f1_score),
        n_jobs=1  # Avoid nested parallelism
    )
    
    return cv_scores.mean()

# Create study - TPE sampler is efficient for both small and large data
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5)
)

# Run optimization - conservative number of trials for small data
n_trials = 30 if len(X_train) < 100 else 100
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

print(f"Best F1 score: {study.best_value:.4f}")
print("Best parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Train final model with best parameters
print("\n=== Final Optimized LightGBM ===")
best_lgbm = LGBMClassifier(**study.best_params, random_state=42, verbose=-1)
best_lgbm.fit(X_train, y_train)

y_pred_best = best_lgbm.predict(X_test)
y_pred_proba_best = best_lgbm.predict_proba(X_test)[:, 1]

print(f"Optimized LightGBM F1: {f1_score(y_test, y_pred_best):.4f}")
print(f"Optimized LightGBM Precision: {precision_score(y_test, y_pred_best):.4f}")
print(f"Optimized LightGBM Recall: {recall_score(y_test, y_pred_best):.4f}")

# Compare with baseline
baseline_f1 = f1_score(y_test, y_pred_lgbm)
optimized_f1 = f1_score(y_test, y_pred_best)
improvement = ((optimized_f1 - baseline_f1) / baseline_f1) * 100

print(f"\nImprovement over baseline: {improvement:+.2f}%")

# Save the best model
BEST_MODEL_PATH = os.path.join("../models", "academic_risk_model_optimized.joblib")
joblib.dump({
    'model': best_lgbm,
    'feature_names': feature_cols,
    'best_params': study.best_params,
    'cv_score': study.best_value,
    'scaler': None
}, BEST_MODEL_PATH)

print(f"Saved optimized model to {BEST_MODEL_PATH}")

# Feature importance for optimized model
print("\n=== Optimized Model Feature Importances ===")
feature_importance_opt = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_lgbm.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance_opt.head(10))


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-27 21:47:50,244] A new study created in memory with name: no-name-32f621a5-5956-497c-9e68-b3eb094c0e36



=== Hyperparameter Tuning with Optuna ===


Best trial: 0. Best value: 0.786869:  30%|███       | 9/30 [00:00<00:00, 44.72it/s]

[I 2025-09-27 21:47:50,276] Trial 0 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 106, 'learning_rate': 0.2746436300285509, 'max_depth': 7, 'num_leaves': 64, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 0.8661761457749352}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,296] Trial 1 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 140, 'learning_rate': 0.17781056670396786, 'max_depth': 3, 'num_leaves': 98, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 0.18340450985343382}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,319] Trial 2 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 95, 'learning_rate': 0.1280294493935025, 'max_depth': 5, 'num_leaves': 36, 'subsample': 0.8447411578889518, 'colsample_bytree': 0.6557975442



[I 2025-09-27 21:47:50,538] Trial 10 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 185, 'learning_rate': 0.05054017248410982, 'max_depth': 6, 'num_leaves': 11, 'subsample': 0.6061470949312417, 'colsample_bytree': 0.775232370984732, 'reg_alpha': 0.015144237102756877, 'reg_lambda': 0.9761398998579952}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,567] Trial 11 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 153, 'learning_rate': 0.2959920050962083, 'max_depth': 6, 'num_leaves': 61, 'subsample': 0.9801238964661397, 'colsample_bytree': 0.7348640964697394, 'reg_alpha': 0.03363260482404265, 'reg_lambda': 0.744750019391158}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,593] Trial 12 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 160, 'learning_rate': 0.1772603826428493, 'max_depth': 5, 'num_leaves': 97, 'subsample': 0.902800604671404, 'colsample_bytree': 0.83188479

Best trial: 0. Best value: 0.786869:  87%|████████▋ | 26/30 [00:00<00:00, 40.04it/s]

[I 2025-09-27 21:47:50,742] Trial 18 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 133, 'learning_rate': 0.19604963171526604, 'max_depth': 5, 'num_leaves': 68, 'subsample': 0.9338003283396147, 'colsample_bytree': 0.6742143359542161, 'reg_alpha': 0.44954668341925197, 'reg_lambda': 0.26839947496575345}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,764] Trial 19 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 80, 'learning_rate': 0.22818868347922028, 'max_depth': 7, 'num_leaves': 87, 'subsample': 0.8416634198004798, 'colsample_bytree': 0.73964158693172, 'reg_alpha': 0.2657826831355138, 'reg_lambda': 0.07488639833738729}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,789] Trial 20 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 110, 'learning_rate': 0.14638612485123334, 'max_depth': 8, 'num_leaves': 70, 'subsample': 0.7727431143998708, 'colsample_bytree': 0.798225

Best trial: 0. Best value: 0.786869: 100%|██████████| 30/30 [00:00<00:00, 38.61it/s]


[I 2025-09-27 21:47:50,968] Trial 27 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 112, 'learning_rate': 0.11937024143120632, 'max_depth': 7, 'num_leaves': 17, 'subsample': 0.9964125388085956, 'colsample_bytree': 0.7291665208174549, 'reg_alpha': 0.10185142889413137, 'reg_lambda': 0.11653766965047542}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:50,996] Trial 28 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 128, 'learning_rate': 0.1371943059767041, 'max_depth': 5, 'num_leaves': 43, 'subsample': 0.8740038680959015, 'colsample_bytree': 0.6395772706424394, 'reg_alpha': 0.22385222076348227, 'reg_lambda': 0.8355841097756369}. Best is trial 0 with value: 0.7868686868686868.
[I 2025-09-27 21:47:51,021] Trial 29 finished with value: 0.7868686868686868 and parameters: {'n_estimators': 119, 'learning_rate': 0.19797739167828052, 'max_depth': 6, 'num_leaves': 56, 'subsample': 0.7309448291222743, 'colsample_bytree': 0.7703