<a href="https://colab.research.google.com/github/JoshCCorby/Leaving-Cert-Chemistry-Question-Predictor/blob/main/Topic%20Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# improved_rf_lc_predictor_v2.py
# Improvements v2:
# - Added optional grid search tuning for RF hyperparameters.
# - Class balancing via SMOTE for imbalanced 0/1 appearances.
# - SHAP explanations for next-year predictions.
# - Plotly for interactive plots (requires pip install plotly shap imbalanced-learn).
# - More robust year handling with interpolation for missing years.

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import shap
import re

# ---------- USER CONFIG ----------
CSV_URL = "https://docs.google.com/spreadsheets/d/1fZhDRbXM96enJFXnsP8eRJEVOcQ5agxaapMI9GfW_PQ/export?format=csv"  # Switch for experiments sheet if needed
N_LAGS = 3
RANDOM_STATE = 42
THRESHOLD_PROB = 0.5
PLOT_DIR = "plots"
TUNE_MODEL = False  # Set to True for hyperparam tuning (slower)
os.makedirs(PLOT_DIR, exist_ok=True)
# ---------------------------------

# Load and process
df_wide = pd.read_csv(CSV_URL)
df_wide = df_wide.rename(columns={df_wide.columns[0]: 'topic'})
year_cols = [col for col in df_wide.columns if col != 'topic' and 'deferred' not in col.lower()]
df_wide = df_wide[['topic'] + year_cols]

year_map = {}
for col in year_cols:
    year_str = re.search(r'\d{4}', col)
    if year_str:
        year_map[col] = int(year_str.group())
    else:
        raise ValueError(f"Invalid year header: {col}")

year_cols_sorted = sorted(year_cols, key=lambda c: year_map[c])
years = [year_map[c] for c in year_cols_sorted]

def compute_appear(cell):
    if pd.isna(cell) or not isinstance(cell, str) or cell.strip() == '':
        return 0
    return 1

for col in year_cols:
    df_wide[col] = df_wide[col].apply(compute_appear)

df = pd.melt(df_wide, id_vars=['topic'], value_vars=year_cols, var_name='year_col', value_name='appear')
df['year'] = df['year_col'].map(year_map)
df = df.drop(columns=['year_col'])
df = df[['year', 'topic', 'appear']].sort_values(['topic', 'year']).reset_index(drop=True)
df['year'] = df['year'].astype(int)

# Fill missing years with linear interpolation
df = df.set_index(['topic', 'year']).unstack().stack().reset_index()
df['appear'] = df['appear'].interpolate(method='linear', limit_direction='both').fillna(0).round().astype(int)

def add_lags(g, n):
    g = g.sort_values("year").copy()
    for lag in range(1, n+1):
        g[f"appear_lag{lag}"] = g["appear"].shift(lag).fillna(0)
    g[f"appear_mean_{n}"] = g[[f"appear_lag{i}" for i in range(1, n+1)]].mean(axis=1)
    return g

df = df.groupby("topic", group_keys=False).apply(lambda g: add_lags(g, N_LAGS)).reset_index(drop=True)

lag_cols = [f"appear_lag{i}" for i in range(1, N_LAGS+1)]
df_model = df.dropna(subset=lag_cols).reset_index(drop=True)

le = LabelEncoder()
df_model["topic_id"] = le.fit_transform(df_model["topic"])

features = lag_cols + [f"appear_mean_{N_LAGS}", "year", "topic_id"]
X = df_model[features].astype(float)
y = df_model["appear"].astype(int)

# Balance classes
smote = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = smote.fit_resample(X, y)

# Time-series CV
tscv = TimeSeriesSplit(n_splits=len(np.unique(df_model['year'])) - 1)
cv_accuracies, cv_f1s = [], []

for train_idx, test_idx in tscv.split(X_res):
    X_train_cv, X_test_cv = X_res.iloc[train_idx], X_res.iloc[test_idx]
    y_train_cv, y_test_cv = y_res.iloc[train_idx], y_res.iloc[test_idx]
    model_cv = RandomForestClassifier(random_state=RANDOM_STATE)
    model_cv.fit(X_train_cv, y_train_cv)
    y_pred_cv = model_cv.predict(X_test_cv)
    cv_accuracies.append(accuracy_score(y_test_cv, y_pred_cv))
    cv_f1s.append(f1_score(y_test_cv, y_pred_cv))

print("CV Mean Accuracy:", np.mean(cv_accuracies))
print("CV Mean F1:", np.mean(cv_f1s))

# Final model with optional tuning
param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}
if TUNE_MODEL:
    model = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE), param_grid, cv=tscv, scoring='f1')
else:
    model = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200)

last_year = df_model["year"].max()
train_mask = df_model["year"] < last_year
X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test F1:", f1_score(y_test, y_pred))

# Feature importances
fi = pd.DataFrame({"feature": features, "importance": model.feature_importances_}).sort_values("importance", ascending=False)
print("\nFeature importances:")
print(fi.to_string(index=False))

# SHAP explanations
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_future)  # For predictions below
print("\nSHAP summary saved to plots/shap_summary.png")
shap.summary_plot(shap_values, X_future, show=False)
plt.savefig(os.path.join(PLOT_DIR, 'shap_summary.png'))

# Historical plot
pivot_df = df.pivot(index='topic', columns='year', values='appear')
fig = px.heatmap(pivot_df, title='Historical Topic Appearances', color_continuous_scale='Blues')
fig.write_html(os.path.join(PLOT_DIR, 'historical_appearances.html'))
print("\nSaved interactive historical plot to plots/historical_appearances.html")

# Predict next year
next_year = last_year + 1
pred_rows = []
for topic, g in df.sort_values("year").groupby("topic"):
    g = g.sort_values("year")
    last_vals = g["appear"].iloc[-N_LAGS:].values if len(g) >= N_LAGS else np.pad(g["appear"].values, (N_LAGS - len(g), 0), constant_values=0)
    row = {f"appear_lag{l+1}": last_vals[-(l+1)] for l in range(N_LAGS)}
    row[f"appear_mean_{N_LAGS}"] = np.mean(last_vals[-N_LAGS:])
    row["year"] = next_year
    row["topic"] = topic
    row["topic_id"] = le.transform([topic])[0]
    pred_rows.append(row)

X_future = pd.DataFrame(pred_rows)[features].astype(float)
preds_prob = model.predict_proba(X_future)[:, 1]

results = pd.DataFrame({"topic": [r["topic"] for r in pred_rows], "predicted_prob_appear": preds_prob})
results = results.sort_values("predicted_prob_appear", ascending=False).reset_index(drop=True)
print(f"\nTop predicted topics for {next_year} (by prob of appearing):")
print(results.head(15).to_string(index=False))

results["predicted_appear"] = (results["predicted_prob_appear"] >= THRESHOLD_PROB).astype(int)
print(f"\nTopics predicted to appear (threshold {THRESHOLD_PROB}):")
print(results[results["predicted_appear"]==1]["topic"].to_string(index=False))

# Predictions plot
fig = px.bar(results.head(10), x='predicted_prob_appear', y='topic', orientation='h', title=f'Top 10 Predicted Topics for {next_year}')
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.write_html(os.path.join(PLOT_DIR, 'predicted_probs.html'))
print("\nSaved interactive predictions plot to plots/predicted_probs.html")

results.to_csv("lc_predicted_2026.csv", index=False)
print("\nPredictions saved to lc_predicted_2026.csv")

  df = df.set_index(['topic', 'year']).unstack().stack().reset_index()
  df = df.groupby("topic", group_keys=False).apply(lambda g: add_lags(g, N_LAGS)).reset_index(drop=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CV Mean Accuracy: 0.8184210526315789
CV Mean F1: 0.4807305157618108


NameError: name 'test_mask' is not defined