# Supervised Learning: Model Training & Evaluation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1t4z_78XLGZqADzb4eOMoQ3JlFC3c7Ggu?usp=sharing)

## Objective
Train classical machine learning models for student success prediction:

### Classification Models (Logistic Regression & Naive Bayes):
1. **Early Warning System - 1st Semester** (4-class)
2. **Early Warning System - 1st Year** (4-class)
3. **Semester Probation Prediction** (binary)
4. **Academic Recovery** (binary)
5. **STEM Course Success** (binary)

### Regression Model (Linear Regression):
6. **Next Semester GPA Change** (continuous)

## Evaluation Strategy
- **Split**: 80% train, 20% test (stratified for classification)
- **Validation**: 5-fold cross-validation on training set
- **Class Imbalance**: `class_weight='balanced'` for classification
- **Metrics**:
  - Classification: F1 (weighted), Balanced Accuracy
  - Regression: RMSE, MAE, R²
- **Visualization**: Confusion matrices, prediction plots

## 1. Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    f1_score, balanced_accuracy_score, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score
)

import warnings
# Keep the notebook output readable by silencing library warnings.
# NOTE(review): this is a blanket filter, so it also hides any convergence
# warnings raised while fitting the models below.
warnings.filterwarnings('ignore')

# Plotting defaults shared by every figure in the notebook.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("Libraries imported successfully")

In [None]:
# Make sure the output folders exist before anything is written to them.
for output_dir in ('models', 'results'):
    Path(output_dir).mkdir(exist_ok=True)

print("Output directories created")

In [None]:
# Read the prepared datasets (a mapping keyed by dataset name) from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that was produced by a trusted pipeline.
with open('supervised_learning_datasets.pkl', 'rb') as dataset_file:
    datasets = pickle.load(dataset_file)

print("Datasets loaded successfully!")
print(f"\nNumber of datasets: {len(datasets)}")
print("\nDataset names:")
for dataset_key in datasets:
    print(f"  - {dataset_key}")

In [None]:
# Display dataset summary: size, feature count and a target overview per dataset
print("="*70)
print("DATASET SUMMARY")
print("="*70)

for name, data in datasets.items():
    print(f"\n{name.replace('_', ' ').title()}:")
    print(f"  Samples: {len(data['X']):,}")
    print(f"  Features: {len(data['X'].columns)}")

    if 'y_multiclass' in data:
        # A multi-class target takes precedence when the dataset provides one.
        target = data['y_multiclass']
        print("  Type: Multi-class Classification")
        print(f"  Classes: {sorted(target.unique())}")
        print("  Class distribution:")
        for cls, count in target.value_counts().sort_index().items():
            print(f"    {cls}: {count} ({count/len(target):.1%})")
    elif pd.api.types.is_float_dtype(data['y']):
        # Any floating-point target (not only float32/float64) is treated as
        # regression, so e.g. float16 or nullable Float64 targets are no longer
        # reported as binary classification.
        target = data['y']
        print("  Type: Regression")
        print(f"  Target range: [{target.min():.2f}, {target.max():.2f}]")
        print(f"  Target mean: {target.mean():.3f} (std: {target.std():.3f})")
    else:
        # Remaining targets are summarised as 0/1 labels.
        target = data['y']
        negatives = target == 0
        print("  Type: Binary Classification")
        print(f"  Positive class: {target.sum()} ({target.mean():.1%})")
        print(f"  Negative class: {negatives.sum()} ({negatives.mean():.1%})")

## 2. Helper Functions

In [None]:
def evaluate_classification(y_true, y_pred, model_name, dataset_name, class_labels=None):
    """
    Evaluate a classification model and plot its confusion matrix.

    Prints the weighted F1 score and the balanced accuracy, draws the
    confusion matrix as a heatmap, saves the figure as a PNG under
    ``results/`` and displays it.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name of the model (e.g., 'Logistic Regression')
        dataset_name: Name of the dataset
        class_labels: Optional sequence of class names for the confusion
            matrix axes. May be the actual label values or display names.

    Returns:
        dict: Dictionary with keys 'f1_weighted' and 'balanced_accuracy'
    """
    # Calculate metrics
    f1 = f1_score(y_true, y_pred, average='weighted')
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"  F1 Score (weighted): {f1:.4f}")
    print(f"  Balanced Accuracy: {bal_acc:.4f}")

    # Explicit None/length check: plain truthiness raises on array-like labels.
    has_labels = class_labels is not None and len(class_labels) > 0
    tick_labels = list(class_labels) if has_labels else []

    # If class_labels are the real label values, pin the matrix to them so a
    # class that is absent from this split still gets its row and column.
    # Display names (e.g. 'Passed' for label 1) do not match, so the default
    # ordering is used for them.
    observed = set(y_true) | set(y_pred)
    matrix_labels = tick_labels if has_labels and observed.issubset(tick_labels) else None
    cm = confusion_matrix(y_true, y_pred, labels=matrix_labels)

    # Fall back to automatic ticks when the names cannot line up with the matrix.
    if len(tick_labels) != cm.shape[0]:
        tick_labels = 'auto'

    # Plot confusion matrix
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title(f'{model_name} - {dataset_name}\nConfusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()

    # Save figure (create the output folder if this runs outside the notebook flow)
    out_path = Path('results') / f"{dataset_name.replace(' ', '_')}_{model_name.replace(' ', '_')}_confusion_matrix.png"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return {'f1_weighted': f1, 'balanced_accuracy': bal_acc}

In [None]:
def evaluate_regression(y_true, y_pred, model_name, dataset_name):
    """
    Evaluate a regression model and plot its predictions.

    Prints RMSE, MAE and R², draws a predicted-vs-actual scatter plot next to
    a residual plot, saves the figure as a PNG under ``results/`` and
    displays it.

    Args:
        y_true: True values
        y_pred: Predicted values
        model_name: Name of the model
        dataset_name: Name of the dataset

    Returns:
        dict: Dictionary with keys 'rmse', 'mae' and 'r2'
    """
    # Work on plain 1-D float arrays so the residuals are computed by position;
    # subtracting two pandas objects would align on index instead.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R² Score: {r2:.4f}")

    # Plot predictions vs actual
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Scatter plot with the identity line as the "perfect prediction" reference
    lo, hi = y_true.min(), y_true.max()
    axes[0].scatter(y_true, y_pred, alpha=0.5, s=20)
    axes[0].plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')
    axes[0].set_xlabel('True Values')
    axes[0].set_ylabel('Predicted Values')
    axes[0].set_title(f'{model_name} - {dataset_name}\nPredictions vs Actual')
    axes[0].legend()
    axes[0].grid(alpha=0.3)

    # Residuals (true minus predicted) against the predicted value
    residuals = y_true - y_pred
    axes[1].scatter(y_pred, residuals, alpha=0.5, s=20)
    axes[1].axhline(y=0, color='r', linestyle='--', lw=2)
    axes[1].set_xlabel('Predicted Values')
    axes[1].set_ylabel('Residuals')
    axes[1].set_title('Residual Plot')
    axes[1].grid(alpha=0.3)

    plt.tight_layout()

    # Save figure (create the output folder if this runs outside the notebook flow)
    out_path = Path('results') / f"{dataset_name.replace(' ', '_')}_{model_name.replace(' ', '_')}_predictions.png"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return {'rmse': rmse, 'mae': mae, 'r2': r2}

In [None]:
def save_model(model, scaler, metadata, filename):
    """
    Save a trained model together with its scaler and metadata.

    The three objects are bundled into one dict with the keys 'model',
    'scaler' and 'metadata' and pickled to ``models/<filename>``. The
    ``models`` folder is created if it does not exist yet.

    Args:
        model: Trained model object
        scaler: Fitted StandardScaler
        metadata: Dictionary with model information
        filename: Filename to save (without path)
    """
    model_artifact = {
        'model': model,
        'scaler': scaler,
        'metadata': metadata
    }

    filepath = Path('models') / filename
    # Do not rely on an earlier cell having created the folder.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with filepath.open('wb') as f:
        pickle.dump(model_artifact, f)

    print(f"\n  ✅ Model saved: {filepath.as_posix()}")

In [None]:
# Collects one summary row (a dict) per trained task; the summary section
# turns this list into a DataFrame.
all_results = list()

print("Helper functions defined successfully")

## 3. Model Training

### 3.1 Early Warning System - 1st Semester

Predict final grade bin (Probation/At-Risk/Good Standing/Dean's List) from first semester performance.

In [None]:
print("="*70)
print("EARLY WARNING SYSTEM - 1ST SEMESTER")
print("="*70)

# Get data
data = datasets['early_warning_1st_semester']
X = data['X']
y = data['y_multiclass']  # 4-class: Probation, At_Risk, Good_Standing, Deans_List

# Get class labels
class_labels = sorted(y.unique())
print(f"\nClasses: {class_labels}")
print(f"Samples: {len(X):,}")
print(f"Features: {len(X.columns)}")

# Train/test split (stratified). The split happens BEFORE imputation so that
# no statistic computed from test rows can leak into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values with medians learned from the training rows only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

# Scale features (fit on train, apply to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression; class_weight='balanced' compensates for class imbalance
rule = "-"*70
print("\n" + rule)
print("Training Logistic Regression...")
print(rule)

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Stratified 5-fold cross-validation, restricted to the training split
lr_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    lr_model,
    X_train_scaled,
    y_train,
    cv=lr_folds,
    scoring='f1_weighted',
)

print(f"\n5-Fold CV F1 Scores: {cv_scores}")
print(f"Mean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Refit on the whole training split, then score the held-out test split
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

lr_metrics = evaluate_classification(
    y_test,
    y_pred_lr,
    model_name='Logistic Regression',
    dataset_name='Early Warning 1st Semester',
    class_labels=class_labels,
)

In [None]:
# Gaussian Naive Bayes baseline
rule = "-"*70
print("\n" + rule)
print("Training Naive Bayes...")
print(rule)

nb_model = GaussianNB()

# Stratified 5-fold cross-validation, restricted to the training split
nb_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    nb_model,
    X_train_scaled,
    y_train,
    cv=nb_folds,
    scoring='f1_weighted',
)

print(f"\n5-Fold CV F1 Scores: {cv_scores}")
print(f"Mean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Refit on the whole training split, then score the held-out test split
nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)

nb_metrics = evaluate_classification(
    y_test,
    y_pred_nb,
    model_name='Naive Bayes',
    dataset_name='Early Warning 1st Semester',
    class_labels=class_labels,
)

In [None]:
# Compare and save best model
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"\nLogistic Regression - F1: {lr_metrics['f1_weighted']:.4f} | Bal Acc: {lr_metrics['balanced_accuracy']:.4f}")
print(f"Naive Bayes         - F1: {nb_metrics['f1_weighted']:.4f} | Bal Acc: {nb_metrics['balanced_accuracy']:.4f}")

# Keep the model with the higher weighted F1; ties go to Logistic Regression.
# NOTE(review): the winner is picked on test-set F1, so the reported test
# score is slightly optimistic; selecting on the CV mean would avoid that.
if lr_metrics['f1_weighted'] >= nb_metrics['f1_weighted']:
    best_model, best_name, best_metrics = lr_model, 'Logistic Regression', lr_metrics
else:
    best_model, best_name, best_metrics = nb_model, 'Naive Bayes', nb_metrics

print(f"\n🏆 Best Model: {best_name}")

# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'early_warning_1st_semester',
    'model_type': best_name,
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': best_metrics,
    'classes': class_labels
}

save_model(best_model, scaler, metadata, 'early_warning_1st_semester_best.pkl')

# Store results
all_results.append({
    'Dataset': 'Early Warning 1st Semester',
    'Type': 'Multi-class (4)',
    'Best Model': best_name,
    'F1 (weighted)': best_metrics['f1_weighted'],
    'Balanced Accuracy': best_metrics['balanced_accuracy']
})

### 3.2 Early Warning System - 1st Year

Predict final grade bin from first year (2 semesters) performance.

In [None]:
print("="*70)
print("EARLY WARNING SYSTEM - 1ST YEAR")
print("="*70)

data = datasets['early_warning_1st_year']
X = data['X']
y = data['y_multiclass']

class_labels = sorted(y.unique())
print(f"\nClasses: {class_labels}")
print(f"Samples: {len(X):,}")
print(f"Features: {len(X.columns)}")

# Split first, then impute with training-set medians only, so that no
# statistic computed from test rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One seeded splitter for both models so they are scored on identical folds
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression
print("\n" + "-"*70)
print("Training Logistic Regression...")
print("-"*70)

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
cv_scores = cross_val_score(
    lr_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_metrics = evaluate_classification(y_test, y_pred_lr, 'Logistic Regression', 'Early Warning 1st Year', class_labels)

# Naive Bayes
print("\n" + "-"*70)
print("Training Naive Bayes...")
print("-"*70)

nb_model = GaussianNB()
cv_scores = cross_val_score(
    nb_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
nb_metrics = evaluate_classification(y_test, y_pred_nb, 'Naive Bayes', 'Early Warning 1st Year', class_labels)

# Save best
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"\nLogistic Regression - F1: {lr_metrics['f1_weighted']:.4f} | Bal Acc: {lr_metrics['balanced_accuracy']:.4f}")
print(f"Naive Bayes         - F1: {nb_metrics['f1_weighted']:.4f} | Bal Acc: {nb_metrics['balanced_accuracy']:.4f}")

# Keep the model with the higher weighted F1; ties go to Logistic Regression.
# NOTE(review): the winner is picked on test-set F1, so the reported test
# score is slightly optimistic; selecting on the CV mean would avoid that.
if lr_metrics['f1_weighted'] >= nb_metrics['f1_weighted']:
    best_model, best_name, best_metrics = lr_model, 'Logistic Regression', lr_metrics
else:
    best_model, best_name, best_metrics = nb_model, 'Naive Bayes', nb_metrics

print(f"\n🏆 Best Model: {best_name}")

# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'early_warning_1st_year',
    'model_type': best_name,
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': best_metrics,
    'classes': class_labels
}
save_model(best_model, scaler, metadata, 'early_warning_1st_year_best.pkl')

all_results.append({
    'Dataset': 'Early Warning 1st Year',
    'Type': 'Multi-class (4)',
    'Best Model': best_name,
    'F1 (weighted)': best_metrics['f1_weighted'],
    'Balanced Accuracy': best_metrics['balanced_accuracy']
})

### 3.3 Semester-by-Semester Probation Prediction

Predict if a student will be on probation (CGPA < 2.0) at the end of current semester.

In [None]:
print("="*70)
print("SEMESTER PROBATION PREDICTION")
print("="*70)

data = datasets['semester_probation_prediction']
X = data['X']
y = data['y']  # Binary: 0 = Not on probation, 1 = On probation

print(f"\nSamples: {len(X):,}")
print(f"Features: {len(X.columns)}")
print("Class distribution:")
print(f"  Not on Probation: {(y==0).sum()} ({(y==0).mean():.1%})")
print(f"  On Probation: {y.sum()} ({y.mean():.1%})")

# Split first, then impute with training-set medians only, so that no
# statistic computed from test rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One seeded splitter for both models so they are scored on identical folds
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Axis names for the confusion matrices, in label order 0, 1
display_labels = ['Not on Probation', 'On Probation']

# Logistic Regression
print("\n" + "-"*70)
print("Training Logistic Regression...")
print("-"*70)

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
cv_scores = cross_val_score(
    lr_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_metrics = evaluate_classification(y_test, y_pred_lr, 'Logistic Regression', 'Probation Prediction', display_labels)

# Naive Bayes
print("\n" + "-"*70)
print("Training Naive Bayes...")
print("-"*70)

nb_model = GaussianNB()
cv_scores = cross_val_score(
    nb_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
nb_metrics = evaluate_classification(y_test, y_pred_nb, 'Naive Bayes', 'Probation Prediction', display_labels)

# Save best
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"\nLogistic Regression - F1: {lr_metrics['f1_weighted']:.4f} | Bal Acc: {lr_metrics['balanced_accuracy']:.4f}")
print(f"Naive Bayes         - F1: {nb_metrics['f1_weighted']:.4f} | Bal Acc: {nb_metrics['balanced_accuracy']:.4f}")

# Keep the model with the higher weighted F1; ties go to Logistic Regression.
# NOTE(review): the winner is picked on test-set F1, so the reported test
# score is slightly optimistic; selecting on the CV mean would avoid that.
if lr_metrics['f1_weighted'] >= nb_metrics['f1_weighted']:
    best_model, best_name, best_metrics = lr_model, 'Logistic Regression', lr_metrics
else:
    best_model, best_name, best_metrics = nb_model, 'Naive Bayes', nb_metrics

print(f"\n🏆 Best Model: {best_name}")

# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'semester_probation_prediction',
    'model_type': best_name,
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': best_metrics
}
save_model(best_model, scaler, metadata, 'probation_prediction_best.pkl')

all_results.append({
    'Dataset': 'Probation Prediction',
    'Type': 'Binary',
    'Best Model': best_name,
    'F1 (weighted)': best_metrics['f1_weighted'],
    'Balanced Accuracy': best_metrics['balanced_accuracy']
})

### 3.4 Academic Recovery Prediction

For students on probation (CGPA < 2.0), predict if they will recover (CGPA ≥ 2.0) in the next semester.

In [None]:
print("="*70)
print("ACADEMIC RECOVERY PREDICTION")
print("="*70)

data = datasets['academic_recovery']
X = data['X']
y = data['y']  # Binary: 0 = Did not recover, 1 = Recovered

print(f"\nSamples: {len(X):,} (probation instances)")
print(f"Features: {len(X.columns)}")
print("Class distribution:")
print(f"  Did not Recover: {(y==0).sum()} ({(y==0).mean():.1%})")
print(f"  Recovered: {y.sum()} ({y.mean():.1%})")

# Split first, then impute with training-set medians only, so that no
# statistic computed from test rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One seeded splitter for both models so they are scored on identical folds
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Axis names for the confusion matrices, in label order 0, 1
display_labels = ['Did not Recover', 'Recovered']

# Logistic Regression
print("\n" + "-"*70)
print("Training Logistic Regression...")
print("-"*70)

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
cv_scores = cross_val_score(
    lr_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_metrics = evaluate_classification(y_test, y_pred_lr, 'Logistic Regression', 'Academic Recovery', display_labels)

# Naive Bayes
print("\n" + "-"*70)
print("Training Naive Bayes...")
print("-"*70)

nb_model = GaussianNB()
cv_scores = cross_val_score(
    nb_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
nb_metrics = evaluate_classification(y_test, y_pred_nb, 'Naive Bayes', 'Academic Recovery', display_labels)

# Save best
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"\nLogistic Regression - F1: {lr_metrics['f1_weighted']:.4f} | Bal Acc: {lr_metrics['balanced_accuracy']:.4f}")
print(f"Naive Bayes         - F1: {nb_metrics['f1_weighted']:.4f} | Bal Acc: {nb_metrics['balanced_accuracy']:.4f}")

# Keep the model with the higher weighted F1; ties go to Logistic Regression.
# NOTE(review): the winner is picked on test-set F1, so the reported test
# score is slightly optimistic; selecting on the CV mean would avoid that.
if lr_metrics['f1_weighted'] >= nb_metrics['f1_weighted']:
    best_model, best_name, best_metrics = lr_model, 'Logistic Regression', lr_metrics
else:
    best_model, best_name, best_metrics = nb_model, 'Naive Bayes', nb_metrics

print(f"\n🏆 Best Model: {best_name}")

# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'academic_recovery',
    'model_type': best_name,
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': best_metrics
}
save_model(best_model, scaler, metadata, 'academic_recovery_best.pkl')

all_results.append({
    'Dataset': 'Academic Recovery',
    'Type': 'Binary',
    'Best Model': best_name,
    'F1 (weighted)': best_metrics['f1_weighted'],
    'Balanced Accuracy': best_metrics['balanced_accuracy']
})

### 3.5 Next Semester GPA Change (Regression)

Predict the change in GPA in the next semester (continuous value).

In [None]:
print("="*70)
print("NEXT SEMESTER GPA CHANGE - REGRESSION")
print("="*70)

data = datasets['next_semester_gpa_change']
X = data['X']
y = data['y']  # Continuous: GPA change

print(f"\nSamples: {len(X):,}")
print(f"Features: {len(X.columns)}")
print("Target statistics:")
print(f"  Range: [{y.min():.2f}, {y.max():.2f}]")
print(f"  Mean: {y.mean():.3f} (std: {y.std():.3f})")

# Split first (no stratification for a continuous target), then impute with
# training-set medians only, so that no statistic computed from test rows
# leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression
print("\n" + "-"*70)
print("Training Linear Regression...")
print("-"*70)

lr_model = LinearRegression()

# 5-fold CV on training set; the scorer returns negative MSE, so flip the
# sign before taking the square root to get per-fold RMSE.
cv_scores = cross_val_score(
    lr_model, X_train_scaled, y_train,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error'
)
cv_rmse = np.sqrt(-cv_scores)
print(f"\nMean CV RMSE: {cv_rmse.mean():.4f} (+/- {cv_rmse.std():.4f})")

# Train on full training set
lr_model.fit(X_train_scaled, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate
metrics = evaluate_regression(y_test, y_pred, 'Linear Regression', 'Next Semester GPA Change')

# Save model
# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'next_semester_gpa_change',
    'model_type': 'Linear Regression',
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': metrics
}
save_model(lr_model, scaler, metadata, 'next_semester_gpa_change.pkl')

# The last key is left exactly as it was: the summary plotting cell looks up
# this precise column name.
all_results.append({
    'Dataset': 'Next Semester GPA Change',
    'Type': 'Regression',
    'Best Model': 'Linear Regression',
    'RMSE': metrics['rmse'],
    'MAE': metrics['mae'],
    'R¬≤': metrics['r2']
})

### 3.6 STEM Course Success Prediction

Predict if a student will pass a STEM course (Grade ≥ 2.0).

In [None]:
print("="*70)
print("STEM COURSE SUCCESS PREDICTION")
print("="*70)

data = datasets['stem_course_success']
X = data['X']
y = data['y']  # Binary: 0 = Failed, 1 = Passed

print(f"\nSamples: {len(X):,} (course enrollments)")
print(f"Features: {len(X.columns)}")
print("Class distribution:")
print(f"  Failed: {(y==0).sum()} ({(y==0).mean():.1%})")
print(f"  Passed: {y.sum()} ({y.mean():.1%})")

# Split first, then impute with training-set medians only, so that no
# statistic computed from test rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"\nTrain set: {len(X_train):,} | Test set: {len(X_test):,}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One seeded splitter for both models so they are scored on identical folds
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Axis names for the confusion matrices, in label order 0, 1
display_labels = ['Failed', 'Passed']

# Logistic Regression
print("\n" + "-"*70)
print("Training Logistic Regression...")
print("-"*70)

lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
cv_scores = cross_val_score(
    lr_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_metrics = evaluate_classification(y_test, y_pred_lr, 'Logistic Regression', 'STEM Course Success', display_labels)

# Naive Bayes
print("\n" + "-"*70)
print("Training Naive Bayes...")
print("-"*70)

nb_model = GaussianNB()
cv_scores = cross_val_score(
    nb_model, X_train_scaled, y_train,
    cv=cv_splitter,
    scoring='f1_weighted'
)
print(f"\nMean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
nb_metrics = evaluate_classification(y_test, y_pred_nb, 'Naive Bayes', 'STEM Course Success', display_labels)

# Save best
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(f"\nLogistic Regression - F1: {lr_metrics['f1_weighted']:.4f} | Bal Acc: {lr_metrics['balanced_accuracy']:.4f}")
print(f"Naive Bayes         - F1: {nb_metrics['f1_weighted']:.4f} | Bal Acc: {nb_metrics['balanced_accuracy']:.4f}")

# Keep the model with the higher weighted F1; ties go to Logistic Regression.
# NOTE(review): the winner is picked on test-set F1, so the reported test
# score is slightly optimistic; selecting on the CV mean would avoid that.
if lr_metrics['f1_weighted'] >= nb_metrics['f1_weighted']:
    best_model, best_name, best_metrics = lr_model, 'Logistic Regression', lr_metrics
else:
    best_model, best_name, best_metrics = nb_model, 'Naive Bayes', nb_metrics

print(f"\n🏆 Best Model: {best_name}")

# n_samples counts the full dataset (train + test), not only the training rows.
metadata = {
    'dataset': 'stem_course_success',
    'model_type': best_name,
    'n_samples': len(X),
    'n_features': len(X.columns),
    'metrics': best_metrics
}
save_model(best_model, scaler, metadata, 'stem_course_success_best.pkl')

all_results.append({
    'Dataset': 'STEM Course Success',
    'Type': 'Binary',
    'Best Model': best_name,
    'F1 (weighted)': best_metrics['f1_weighted'],
    'Balanced Accuracy': best_metrics['balanced_accuracy']
})

## 4. Summary & Model Comparison

In [None]:
# Create summary DataFrame: one row per task. Metric columns that do not
# apply to a task (e.g. RMSE for a classifier) are left as NaN by pandas.
results_df = pd.DataFrame(all_results)

print("="*70)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*70)
print("\n")
print(results_df.to_string(index=False))

# Save to CSV (written to the current working directory)
results_df.to_csv('model_summary.csv', index=False)
print("\n✅ Summary saved to 'model_summary.csv'")

In [None]:
# Visualize comparison: classification scores on top, regression scores below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Column that holds the R² values in results_df. The name is used exactly as
# the regression cell wrote it into all_results; only display text is cleaned.
r2_col = 'R¬≤'

# Classification models = every task that is not the regression task.
# (Matching the substring 'class' silently dropped all 'Binary' tasks.)
classification_df = results_df[results_df['Type'] != 'Regression'].copy()
x_pos = np.arange(len(classification_df))

axes[0].bar(x_pos - 0.2, classification_df['F1 (weighted)'], 0.4, label='F1 (weighted)', alpha=0.8)
axes[0].bar(x_pos + 0.2, classification_df['Balanced Accuracy'], 0.4, label='Balanced Accuracy', alpha=0.8)
axes[0].set_xlabel('Dataset')
axes[0].set_ylabel('Score')
axes[0].set_title('Classification Models Performance')
axes[0].set_xticks(x_pos)
axes[0].set_xticklabels(classification_df['Dataset'], rotation=45, ha='right')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_ylim([0, 1.1])

# Add value labels on bars
for i, (f1, ba) in enumerate(zip(classification_df['F1 (weighted)'], classification_df['Balanced Accuracy'])):
    axes[0].text(i - 0.2, f1 + 0.02, f'{f1:.3f}', ha='center', va='bottom', fontsize=8)
    axes[0].text(i + 0.2, ba + 0.02, f'{ba:.3f}', ha='center', va='bottom', fontsize=8)

# Regression model
regression_df = results_df[results_df['Type'] == 'Regression'].copy()
if len(regression_df) > 0:
    x_pos_reg = np.arange(len(regression_df))

    axes[1].bar(x_pos_reg - 0.3, regression_df['RMSE'], 0.3, label='RMSE', alpha=0.8)
    axes[1].bar(x_pos_reg, regression_df['MAE'], 0.3, label='MAE', alpha=0.8)
    axes[1].bar(x_pos_reg + 0.3, regression_df[r2_col], 0.3, label='R²', alpha=0.8)
    axes[1].set_xlabel('Dataset')
    axes[1].set_ylabel('Score')
    axes[1].set_title('Regression Model Performance')
    axes[1].set_xticks(x_pos_reg)
    axes[1].set_xticklabels(regression_df['Dataset'], rotation=45, ha='right')
    axes[1].legend()
    axes[1].grid(axis='y', alpha=0.3)

    # Add value labels. R² can be negative, so a label goes below its bar
    # when the value is below zero instead of being drawn inside the bar.
    for i, (rmse, mae, r2) in enumerate(zip(regression_df['RMSE'], regression_df['MAE'], regression_df[r2_col])):
        for x_offset, value in ((-0.3, rmse), (0.0, mae), (0.3, r2)):
            below = value < 0
            axes[1].text(
                i + x_offset,
                value - 0.02 if below else value + 0.02,
                f'{value:.3f}',
                ha='center',
                va='top' if below else 'bottom',
                fontsize=8,
            )

plt.tight_layout()
plt.savefig('results/model_comparison_summary.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ Comparison visualization saved to 'results/model_comparison_summary.png'")



### Best Practices Applied:
1. ✅ Stratified splits for classification (maintains class proportions)
2. ✅ Class weighting to handle imbalance
3. ✅ 5-fold cross-validation for robust evaluation
4. ✅ Feature scaling (StandardScaler)
5. ✅ Comprehensive metrics (F1 weighted, Balanced Accuracy, RMSE, MAE, R²)
