# OOF Stacking + Hill Climb (Educational)
This notebook blends many community out-of-fold (OOF) predictions. We: 1) load the OOF and submission predictions, 2) optionally hill-climb to find a weighted blend, and 3) fit a RidgeCV meta-model to finalize the stack.


In [None]:
# Install hillclimbers (one-time per environment)
!pip install hillclimbers -q


In [None]:
# Core imports
import numpy as np
import pandas as pd
import glob
import os
import hashlib
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from hillclimbers import climb_hill, partial


In [None]:
# ==========================================
# 0) CONFIGURATION
# ==========================================
# test_req=True makes the run faster by sampling; use False for final blending
test_req = False
# skip_hillclimb=True bypasses the hill-climb blend and stacks the raw model
# predictions instead.
skip_hillclimb = False

# Machine-specific project root: edit this single path when running elsewhere.
BASE_DIR = r"C:\Users\james\OneDrive\Documents\GitHub\Kaggle-Competitions\Predicting Student Test Scores"
# Derived from BASE_DIR so the two locations cannot drift apart.
OOF_DIR = os.path.join(BASE_DIR, 'OOF_PREDS')

# Name of the prediction/label column read from every CSV.
TARGET = 'exam_score'
TRAIN_PATH = os.path.join(BASE_DIR, 'train.csv')
SAMPLE_SUB_PATH = os.path.join(BASE_DIR, 'sample_submission.csv')

print('Base dir:', BASE_DIR)
print('OOF dir:', OOF_DIR)


In [None]:
# ==========================================
# 1) LOAD OOF + SUB FILES
# ==========================================
# Expected naming: modelname_oof.csv with a matching modelname_sub.csv
# in the same directory.
oof_suffix = '_oof.csv'
sub_suffix = '_sub.csv'

oof_files = sorted(glob.glob(os.path.join(OOF_DIR, '*' + oof_suffix)))
if not oof_files:
    raise FileNotFoundError('No *_oof.csv files found. Check OOF_DIR or filenames.')

# Strip only the trailing suffix. str.replace would also rewrite a directory
# or model name that happens to contain '_oof.csv' somewhere in the path.
stems = [f[:-len(oof_suffix)] for f in oof_files]
sub_files = [stem + sub_suffix for stem in stems]
model_names = [os.path.basename(stem) for stem in stems]

# Fail early, naming every OOF file whose submission counterpart is absent,
# rather than erroring later on the first unreadable path.
missing_subs = [f for f in sub_files if not os.path.isfile(f)]
if missing_subs:
    raise FileNotFoundError(f'Missing *_sub.csv files for these models: {missing_subs}')

train_df = pd.read_csv(TRAIN_PATH)
y_true = train_df[TARGET].values

print(f'Loaded {len(model_names)} models')


In [None]:
# ==========================================
# 2) DEDUPLICATE IDENTICAL SUBMISSIONS
# ==========================================
# Two models count as duplicates when the raw bytes of their submission
# TARGET column hash to the same MD5 digest; only the first one seen is kept.
unique_subs = {}        # digest -> name of the first model that produced it
indices_to_keep = []    # positions (in the current file lists) that survive

for idx, (sub_path, model_name) in enumerate(zip(sub_files, model_names)):
    preds = pd.read_csv(sub_path)[TARGET].values
    digest = hashlib.md5(preds.tobytes()).hexdigest()
    if digest in unique_subs:
        print(f'?? Dropping duplicate: {model_name}')
        continue
    unique_subs[digest] = model_name
    indices_to_keep.append(idx)

# Apply the same positional filter to all three parallel lists so they stay aligned.
oof_files, sub_files, model_names = (
    [seq[k] for k in indices_to_keep]
    for seq in (oof_files, sub_files, model_names)
)

print(f'Models after dedup: {len(model_names)}')


In [None]:
# ==========================================
# 3) BUILD OOF/SUB MATRICES
# ==========================================
# Each column = one model's predictions
oof_columns = [pd.read_csv(f)[TARGET].values for f in oof_files]
sub_columns = [pd.read_csv(f)[TARGET].values for f in sub_files]

# OOF predictions are scored row-by-row against train_df[TARGET], so every
# OOF column must have exactly one prediction per training row.
n_train = len(train_df)
bad_oofs = [name for name, col in zip(model_names, oof_columns) if len(col) != n_train]
if bad_oofs:
    raise ValueError(f'OOF row count does not match train ({n_train} rows) for: {bad_oofs}')

# All submissions must agree on the number of test rows before stacking.
sub_lengths = {name: len(col) for name, col in zip(model_names, sub_columns)}
if len(set(sub_lengths.values())) > 1:
    raise ValueError(f'Submission files have differing row counts: {sub_lengths}')

oofs = np.stack(oof_columns, axis=1)
subs = np.stack(sub_columns, axis=1)

df_oof = pd.DataFrame(oofs, columns=model_names)
df_sub = pd.DataFrame(subs, columns=model_names)

print('OOF shape:', df_oof.shape)
print('SUB shape:', df_sub.shape)


In [None]:
# ==========================================
# 4) OPTIONAL: HILL CLIMBING BLEND
# ==========================================
# Hill climbing finds weights that minimize RMSE on OOFs.
# Test mode uses a coarser weight step and disallows negative weights.
hc_precision = 0.01 if test_req else 0.001
hc_negative = not test_req

if test_req:
    # Fixed seed so the 20% row sample is reproducible between runs.
    np.random.seed(42)
    sample_idx = np.random.choice(len(train_df), size=int(len(train_df) * 0.2), replace=False)
    hc_train = train_df.iloc[sample_idx].reset_index(drop=True)
    hc_oof = df_oof.iloc[sample_idx].reset_index(drop=True)
    y_stack = hc_train[TARGET].values
    print(f'Test mode: using {len(hc_train)} rows')
else:
    hc_train = train_df
    hc_oof = df_oof
    y_stack = y_true

if not skip_hillclimb:
    print('Running hill climb...')
    hc_test, hc_oof_blend = climb_hill(
        train=hc_train,
        target=TARGET,
        objective='minimize',
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so avoid it for portability.
        eval_metric=partial(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
        oof_pred_df=hc_oof,
        test_pred_df=df_sub,
        precision=hc_precision,
        negative_weights=hc_negative,
        return_oof_preds=True
    )
    # Use the blended output as a single stacking feature
    X_train = np.asarray(hc_oof_blend).reshape(-1, 1)
    X_test = np.asarray(hc_test).reshape(-1, 1)
else:
    print('Skipping hill climb: using all model OOFs for stacking')
    # Take the rows from hc_oof (not the full OOF matrix) so X_train stays
    # aligned with y_stack when test mode has subsampled the training rows.
    X_train = hc_oof.to_numpy()
    X_test = df_sub.to_numpy()


In [None]:
# ==========================================
# 5) RIDGE CV STACKING
# ==========================================
# RidgeCV finds the best linear combination of the input predictions.
# The outer KFold yields out-of-fold predictions for an honest RMSE estimate;
# RidgeCV picks its own alpha from `alphas` inside each training fold.
if X_train.shape[0] != len(y_stack):
    raise ValueError(
        f'X_train has {X_train.shape[0]} rows but y_stack has {len(y_stack)}; '
        'features and targets must be row-aligned.'
    )

kf_splits = 3 if test_req else 10
kf = KFold(n_splits=kf_splits, shuffle=True, random_state=42)
alphas = np.logspace(-2, 7, 50)

oof_final = np.zeros(len(y_stack))
sub_final = np.zeros(X_test.shape[0])

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train)):
    X_tr, y_tr = X_train[tr_idx], y_stack[tr_idx]
    X_va = X_train[va_idx]

    model = RidgeCV(alphas=alphas, scoring='neg_root_mean_squared_error')
    model.fit(X_tr, y_tr)

    oof_final[va_idx] = model.predict(X_va)
    # Test predictions are the plain average of the per-fold models.
    sub_final += model.predict(X_test) / kf_splits
    print(f'Fold {fold+1}/{kf_splits} complete. Alpha: {model.alpha_:.4f}')

# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_stack, oof_final))
print(f'FINAL RMSE: {rmse:.6f}')


In [None]:
# ==========================================
# 6) SAVE SUBMISSION
# ==========================================
# Set the template's TARGET column to sub_final and write it next to the
# training data; the stack's RMSE is embedded in the file name.
out_path = os.path.join(BASE_DIR, f'submission_rmse_{rmse:.6f}.csv')
sub_template = pd.read_csv(SAMPLE_SUB_PATH).assign(**{TARGET: sub_final})
sub_template.to_csv(out_path, index=False)
print('Saved:', out_path)
