In [3]:
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegressionCV
from Clean_data import clean_data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from Clean_data import clean_data

In [4]:
# --- Load raw data and build a temporal train/test split --------------------
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
RAW_DATA_PATH = "/Users/kaylenamann/Downloads/BC Grad/2025Fall_ADAN7430/Final/Raw Data/EducationData.csv"
TARGET_COLUMN = "math_test_pct_prof_midpt"

data = pd.read_csv(RAW_DATA_PATH)

# The most recent year is held out as the test set; every earlier year is
# used for training.
years = sorted(data["year"].unique())
if len(years) < 2:
    # With a single year, train_years would be empty and the model would be
    # fitted on zero rows; fail early with a clear message instead.
    raise ValueError(
        f"Need at least two distinct years for a temporal split, found: {years}"
    )
train_years = years[:-1]
test_year = years[-1]

train_data = data[data["year"].isin(train_years)].copy()
test_data = data[data["year"] == test_year].copy()

# In training mode clean_data returns five values: the feature matrix, the
# labels, the cutoff, the list of feature columns, and the names of the
# continuous features.
X_train, y_train, cutoff, training_columns, continuous_features = clean_data(
    train_data,
    cutoff=None,
    training_columns=None,
    is_training=True,
)

print(f"\nTotal features: {len(training_columns)}")
print(f"Continuous features to scale: {len(continuous_features)}")
print(f"Categorical features (dummies): {len(training_columns) - len(continuous_features)}")

# The test year is cleaned with the cutoff and column layout obtained from
# the training years.
X_test_raw = clean_data(
    test_data,
    cutoff=cutoff,
    training_columns=training_columns,
    is_training=False,
)

test_data[TARGET_COLUMN] = pd.to_numeric(test_data[TARGET_COLUMN], errors="coerce")

# Find rows with a missing target BEFORE thresholding. `NaN <= cutoff` is
# False, so casting the comparison to int first would silently label every
# missing-target row as class 0, and a later notna() check on the int Series
# could never be False.
valid_idx = test_data[TARGET_COLUMN].notna()
y_test = (test_data.loc[valid_idx, TARGET_COLUMN] <= cutoff).astype(int)
# NOTE(review): assumes X_test_raw keeps test_data's row index — confirm in clean_data.
X_test = X_test_raw[valid_idx]

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Cutoff: {cutoff:.2f}%")

# --- Standardize the continuous columns and fit the classifier --------------
# Every training column that is not listed as continuous is treated as
# categorical; those columns are left unscaled.
categorical_features = list(
    filter(lambda name: name not in continuous_features, training_columns)
)

# The scaler is fitted on the training rows only and then re-used, unchanged,
# on the test rows.
scaler = StandardScaler()
train_continuous_scaled = scaler.fit_transform(X_train[continuous_features])
test_continuous_scaled = scaler.transform(X_test[continuous_features])

# Work on copies so the unscaled frames stay available.
X_train_scaled = X_train.copy()
X_train_scaled[continuous_features] = train_continuous_scaled
X_test_scaled = X_test.copy()
X_test_scaled[continuous_features] = test_continuous_scaled

# L2-penalized logistic regression; the regularization strength is selected
# by 5-fold cross-validation over 10 candidate Cs, scored by ROC-AUC.
logreg_cv_params = {
    "Cs": 10,
    "cv": 5,
    "penalty": "l2",
    "solver": "lbfgs",
    "class_weight": "balanced",
    "scoring": "roc_auc",
    "max_iter": 5000,
    "random_state": 42,
    "n_jobs": -1,
}
model = LogisticRegressionCV(**logreg_cv_params)
model.fit(X_train_scaled, y_train)

# --- Evaluate on the training years and on the held-out test year -----------
def _year_span(year_values):
    """Format a sequence of years as 'first-last' (or the single year).

    Returns an empty string for an empty sequence.
    """
    year_values = list(year_values)
    if not year_values:
        return ""
    if len(year_values) == 1:
        return f"{year_values[0]}"
    return f"{year_values[0]}-{year_values[-1]}"


def _print_split_metrics(title, y_true, y_pred, y_prob):
    """Print a banner followed by accuracy, ROC-AUC and F1 for one data split.

    title  -- heading printed between the two separator lines
    y_true -- true 0/1 labels
    y_pred -- predicted 0/1 labels (used for accuracy and F1)
    y_prob -- predicted score for the class labelled 1 (used for ROC-AUC)
    """
    print("\n" + "="*70)
    print(title)
    print("="*70)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_true, y_prob):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")


# Column 1 of predict_proba is kept as the positive-class score
# (scikit-learn orders the columns by model.classes_).
y_train_pred = model.predict(X_train_scaled)
y_train_prob = model.predict_proba(X_train_scaled)[:, 1]

# The year labels are derived from the actual split rather than hard-coded,
# so the headings stay correct when the input data covers other years.
_print_split_metrics(
    f"TRAINING SET PERFORMANCE ({_year_span(train_years)})",
    y_train, y_train_pred, y_train_prob,
)

y_test_pred = model.predict(X_test_scaled)
y_test_prob = model.predict_proba(X_test_scaled)[:, 1]

_print_split_metrics(
    f"TEST SET PERFORMANCE ({test_year})",
    y_test, y_test_pred, y_test_prob,
)

print(f"\nClassification Report ({test_year}):")
print(classification_report(y_test, y_test_pred,
                            target_names=['Not Bottom Quartile', 'Bottom Quartile']))

# --- Persist the fitted model and everything needed to re-apply it ----------
# NOTE(review): absolute, machine-specific directory; it must already exist,
# joblib.dump does not create it. Consider making it configurable.
MODEL_DIR = "/Users/kaylenamann/Downloads/BC Grad/2025Fall_ADAN7430/Final/Code/model"

# File name -> object. Saved in this order; the resulting paths are the same
# as the previously hard-coded ones.
artifacts = {
    "final_model.pkl": model,
    "cutoff.pkl": cutoff,
    "training_columns.pkl": training_columns,
    "scaler.pkl": scaler,
    "continuous_features.pkl": continuous_features,
}
for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, f"{MODEL_DIR}/{artifact_file}")

# Describe the training period from the actual split instead of a
# hard-coded "2016-2017", so the summary stays true for other data.
if len(train_years) > 1:
    train_span = f"{train_years[0]}-{train_years[-1]}"
else:
    train_span = ", ".join(str(year) for year in train_years)

print("\n" + "="*70)
print("✓ MODEL SAVED")
print("="*70)
print(f"Trained on: {train_years} (years)")
print(f"Validated on: {test_year} (year)")
print(f"This simulates real early-warning: using {train_span} patterns to predict {test_year} outcomes")
print("="*70)


Total features: 16
Continuous features to scale: 8
Categorical features (dummies): 8

Training samples: 3742
Test samples: 1872
Cutoff: 37.00%

TRAINING SET PERFORMANCE (2016-2017)
Accuracy: 0.8311
ROC-AUC: 0.9158
F1 Score: 0.7109

TEST SET PERFORMANCE (2018)
Accuracy: 0.8344
ROC-AUC: 0.8901
F1 Score: 0.7140

Classification Report (2018):
                     precision    recall  f1-score   support

Not Bottom Quartile       0.93      0.84      0.88      1398
    Bottom Quartile       0.63      0.82      0.71       474

           accuracy                           0.83      1872
          macro avg       0.78      0.83      0.80      1872
       weighted avg       0.86      0.83      0.84      1872


✓ MODEL SAVED
Trained on: [np.int64(2016), np.int64(2017)] (years)
Validated on: 2018 (year)
