In [None]:
from CAD.config import Config

import optuna
import joblib
import pandas as pd
import numpy as np
import warnings
import re
import math
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import shap

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, KFold, learning_curve
from pathlib import Path

In [None]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# --- Paths -------------------------------------------------------------------
config = Config()
DATA_PATH = config.PROCESSED_DATA_DIR / 'FinalTrainingData.parquet'
MODEL_PATH = config.MODELS_DIR / 'lgbm_regressor.pkl'

# --- Tuning / reproducibility --------------------------------------------------
N_TRIALS = 50   # number of Optuna trials
SEED = 999      # shared seed for the sampler, the splits and the model

# --- Column groups -------------------------------------------------------------
# Columns without a year suffix that are cast to `category` after loading.
CATEGORICALS = [
    'DISTRICT_CODE',
    'DISTRICT_NAME',
    'LOCATION_ID',
    'SCHOOL_NAME',
    'STUDENT_ID',
    'ETHNIC_CODE',
    'STUDENT_GENDER',
    'ECONOMIC_CODE',
    'SPECIAL_ED_CODE',
    'ENG_PROF_CODE',
    'HISPANIC_IND',
]

# Year-suffixed columns that are also cast to `category` after loading.
OBJECT_COLS = [
    # 2019
    'Att_Rate_Prev1_2019', 'Att_Rate_Prev2_2019', 'Delta_Attendance_2019',
    'Grade_X_Gender_2019', 'Econ_X_Grade_Band_2019',
    # 2020
    'Att_Rate_Prev2_2020', 'Grade_X_Gender_2020', 'Econ_X_Grade_Band_2020',
    # 2021
    'Grade_X_Gender_2021', 'Econ_X_Grade_Band_2021',
    # 2022
    'Grade_X_Gender_2022', 'Econ_X_Grade_Band_2022',
    # 2023
    'Grade_X_Gender_2023', 'Econ_X_Grade_Band_2023',
]

# --- Validation folds ----------------------------------------------------------
# Each pair is (last year usable as a feature, year of the target column).
FOLDS = [(2021, 2022), (2022, 2023), (2023, 2024)]

# Captures a trailing four-digit year suffix such as "_2021".
YEAR_PATTERN = re.compile(r'_(\d{4})$')

In [None]:
# Load the processed training table and cast every configured column that is
# actually present to the pandas `category` dtype in a single pass.
df = pd.read_parquet(DATA_PATH)

_category_dtypes = {
    name: 'category'
    for name in (*CATEGORICALS, *OBJECT_COLS)
    if name in df.columns
}
df = df.astype(_category_dtypes)

# Subset of CATEGORICALS that exists in the loaded frame.
STATIC_FEATURES = [name for name in CATEGORICALS if name in df.columns]

In [None]:
def cols_up_to(year: int) -> list[str]:
    """Select the columns of the global ``df`` usable as features up to ``year``.

    A column is kept when either:

    * its name ends in a four-digit year suffix (per ``YEAR_PATTERN``) and that
      year is less than or equal to ``year``; or
    * it has no year suffix and is listed in ``STATIC_FEATURES`` or
      ``OBJECT_COLS``.

    Args:
        year: Latest year suffix (inclusive) allowed in the result.

    Returns:
        The selected column names, in the order they appear in ``df.columns``.
    """
    def _is_selected(name: str) -> bool:
        suffix = YEAR_PATTERN.search(name)
        if suffix is not None:
            return int(suffix.group(1)) <= year
        return name in STATIC_FEATURES or name in OBJECT_COLS

    return [name for name in df.columns if _is_selected(name)]

In [None]:
# Optuna Hyperparameter Tuning
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: mean hold-out RMSE of an LGBMRegressor across ``FOLDS``.

    For every ``(max_feature_year, target_year)`` pair in ``FOLDS`` the model is
    trained on the columns returned by ``cols_up_to(max_feature_year)`` to
    predict ``Attendance_Rate_<target_year>``, using a seeded 80/20 split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE averaged over all folds (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        # Was misspelled 'randome_state', so the seed was never applied.
        'random_state': SEED,
        'verbosity': -1,
        'n_jobs': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 2000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
        # L1/L2 regularisation. `reg_alpha` / `reg_lambda` are LightGBM aliases
        # of these two parameters; tuning both spellings added two search
        # dimensions that had no effect, so only the canonical names are kept.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }

    rmses = []
    for max_feature_year, target_year in FOLDS:
        feature_cols = cols_up_to(max_feature_year)
        y_col = f'Attendance_Rate_{target_year}'

        x_full, y_full = df.loc[:, feature_cols], df.loc[:, y_col]
        x_train, x_test, y_train, y_test = train_test_split(
            x_full, y_full, test_size=0.2, random_state=SEED
        )

        model = LGBMRegressor(**params)
        # NOTE(review): no early-stopping callback is passed, so the eval_set is
        # only monitored and `best_iteration_` is not expected to truncate the
        # model at prediction time - confirm whether early stopping was intended.
        model.fit(
            x_train, y_train,
            categorical_feature=[c for c in STATIC_FEATURES if c in feature_cols],
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
        )
        preds = model.predict(x_test, num_iteration=model.best_iteration_)
        # Take the square root so the value really is an RMSE (it was raw MSE).
        fold_mse = mean_squared_error(y_test, preds)  # type: ignore
        rmses.append(float(np.sqrt(fold_mse)))

    return float(np.mean(rmses))

In [None]:
# Run the seeded TPE search and report the best trial.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print(f'\nBest RMSE: {study.best_value}')
print(f'\nBest Hyper Parameters: {study.best_params}')

In [None]:
# Hyperparameters hard-coded here so the final fit does not depend on
# re-running the Optuna study.
# NOTE(review): `reg_alpha`/`lambda_l1` and `reg_lambda`/`lambda_l2` look like
# LightGBM aliases of the same two settings - confirm which value is applied.
study_best_params = {
    'learning_rate': 0.03187793347826908,
    'n_estimators': 1416,
    'num_leaves': 15,
    'max_depth': 11,
    'min_child_samples': 9,
    'min_child_weight': 5.0989767957043455,
    'reg_alpha': 0.0065283875392360136,
    'reg_lambda': 1.1449128977564993,
    'feature_fraction': 0.6019754679595203,
    'bagging_fraction': 0.974121394316118,
    'bagging_freq': 1,
    'min_gain_to_split': 0.003244787478175533,
    'lambda_l1': 1.492591506646833e-08,
    'lambda_l2': 1.9818051331341124e-05,
}

# Tuned values first, then the fixed settings.
best_params = dict(
    study_best_params,
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    random_state=SEED,
)

# Final task: features with a year suffix up to 2023 predict 2024 attendance.
final_feature_cols = cols_up_to(2023)
final_target = 'Attendance_Rate_2024'

X_train, X_test, y_train, y_test = train_test_split(
    df[final_feature_cols],
    df[final_target],
    test_size=0.2,
    random_state=SEED,
)

final_categoricals = [c for c in STATIC_FEATURES if c in final_feature_cols]

final_model = LGBMRegressor(**best_params)
final_model.fit(X_train, y_train, categorical_feature=final_categoricals)


In [None]:
# Score the final model on the hold-out split.
y_pred = final_model.predict(X_test, num_iteration=final_model.best_iteration_)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)  # type:ignore

# Rows whose true value is 0 are excluded before computing MAPE.
mask = y_test.ne(0)
y_test_nonzero, y_pred_nonzero = y_test[mask], y_pred[mask]
mape = mean_absolute_percentage_error(y_test_nonzero, y_pred_nonzero)  # type:ignore

rmse = math.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Percentage Error: {mape}")

In [None]:
# Testing
# Hold-out features together with the true and predicted 2024 values.
df_test = X_test.copy()
df_test["y_true"] = y_test.values
df_test["y_pred"] = y_pred

n_students = 6
# Draw the sample from a generator seeded with SEED so the same students are
# plotted on every run; the unseeded np.random.choice changed the figure each
# time. The sample size is capped so a small hold-out set cannot raise.
rng = np.random.default_rng(SEED)
unique_student_ids = np.asarray(df_test["STUDENT_ID"].unique())
sample_ids = rng.choice(
    unique_student_ids,
    size=min(n_students, len(unique_student_ids)),
    replace=False,
)

attendance_cols = [f"Attendance_Rate_{yr}" for yr in range(2019, 2024)]
years = np.arange(2019, 2025)  # 2019-2023 history plus the 2024 target year

fig, axs = plt.subplots(2, 3, figsize=(16, 8), sharey=True)
axs = axs.ravel()

for ax, sid in zip(axs, sample_ids):
    # First hold-out row found for this student.
    row  = df_test.loc[df_test["STUDENT_ID"] == sid].iloc[0]
    hist = row[attendance_cols].values
    ax.plot(years[:-1], hist,  marker="o", label="actual history")
    ax.plot(years[-1],  row["y_pred"],  marker=">", color="tab:green", label="model 24")
    ax.plot(years[-1],  row["y_true"],  marker="X", color="tab:red",   label="true 24")
    ax.set_title(f"Student {sid}")
    ax.set_xticks(years)
    ax.set_ylim(0.7, 1.0)

# One shared legend, taken from the first panel.
handles, labels = axs[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper center", ncol=3)
fig.suptitle("Model vs. actual for sample students", fontsize=14)
fig.tight_layout(rect=[0, 0, 1, 0.93]) # type:ignore
plt.show()


In [None]:
from scipy import stats

# Baseline prediction: reuse the 2023 attendance rate as the 2024 value.
df_test['y_base'] = df_test['Attendance_Rate_2023'].values
df_test["abs_err_model"] = (df_test["y_true"] - df_test["y_pred"]).abs()
df_test["abs_err_base"] = (df_test["y_true"] - df_test["y_base"]).abs()

# Overlay both absolute-error distributions on one axis (baseline drawn first).
fig, ax = plt.subplots(figsize=(7, 4))
for err_col, err_label in (("abs_err_base", "baseline"), ("abs_err_model", "model")):
    ax.hist(df_test[err_col], bins=30, alpha=0.6, label=err_label)
ax.set_xlabel("Absolute error")
ax.set_ylabel("Frequency")
ax.set_title("Error distribution: model vs. baseline")
ax.legend()
plt.show()

In [None]:
# Predict for every row of the full table (this includes rows the model was
# trained on) and store the result next to the features.
df['Predictions'] = final_model.predict(df[final_feature_cols])
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['SCHOOL_YEAR'], errors='ignore')

In [None]:
# Merged interim table without CURR_GRADE_ORD; its column list is reused later
# to choose the columns of the exported frame.
merged_data_path = config.INTERIM_DATA_DIR / 'Merged_Data.csv'
original_df = pd.read_csv(merged_data_path).drop(columns='CURR_GRADE_ORD')

In [None]:
import re
import pandas as pd

# Work on a copy; make sure STUDENT_ID is a regular column before reshaping.
df_wide = df.copy()
if 'STUDENT_ID' not in df_wide.columns:
    df_wide = df_wide.reset_index()

# Year-suffixed columns and their suffix-free stub names.
years_rx = re.compile(r'_(\d{4})$')
year_cols = [name for name in df_wide.columns if years_rx.search(name)]
stubnames = sorted({years_rx.sub('', name) for name in year_cols})

# One row per (STUDENT_ID, SCHOOL_YEAR), ordered by student and then year.
df_long = pd.wide_to_long(
    df_wide,
    stubnames=stubnames,
    i='STUDENT_ID',
    j='SCHOOL_YEAR',
    sep='_',
    suffix=r'\d{4}',
)
df_long = df_long.reset_index()
df_long = df_long.sort_values(['STUDENT_ID', 'SCHOOL_YEAR'])
df_long = df_long.reset_index(drop=True)

# Convert these columns to numeric; values that cannot be parsed become NaN.
to_fix = ['Att_Rate_Prev1', 'Att_Rate_Prev2', 'Delta_Attendance']
df_long[to_fix] = df_long[to_fix].apply(pd.to_numeric, errors='coerce')

In [None]:
# Mean prediction per (grouping column, SCHOOL_YEAR), broadcast back onto every
# row. Keys are the new column names, values the grouping column.
prediction_groupings = {
    'Predictions_District': 'DISTRICT_CODE',
    'Predictions_School': 'LOCATION_ID',
    'Predictions_Grade': 'STUDENT_GRADE_LEVEL',
}
for aggregate_col, group_col in prediction_groupings.items():
    grouped_predictions = df_long.groupby([group_col, 'SCHOOL_YEAR'])['Predictions']
    df_long[aggregate_col] = grouped_predictions.transform('mean')

# Keep the original table's columns followed by the prediction columns.
export_columns = original_df.columns.tolist() + ['Predictions', *prediction_groupings]
df_long = df_long.loc[:, export_columns]
df_long.info()

In [None]:
# Write the long-format predictions to disk; to_csv defaults are used, so the
# row index is written as the first column.
predictions_path = config.PROCESSED_DATA_DIR / 'Predictions.csv'
df_long.to_csv(predictions_path)