In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import shap
import joblib
import sqlite3

model = joblib.load("student_model.joblib")
scaler = joblib.load("scaler.joblib")
encoder = joblib.load("encoder.joblib")

data_path = "/content/student-combined.csv"
data = pd.read_csv(data_path, sep=';')

number_columns = [
    "age", "Medu", "Fedu", "traveltime", "studytime", "failures",
    "famrel", "freetime", "goout", "Dalc", "Walc", "health", "absences",
    "G1", "G2", "study_effort", "alcohol_index", "parents_education",
    "grade_change", "high_absences"
]
word_columns = [
    "school", "sex", "famsize", "Pstatus", "Mjob", "Fjob", "reason",
    "guardian", "schoolsup", "famsup", "paid", "activities", "nursery",
    "higher", "internet", "romantic"
]

def add_new_columns(data):
    """Add engineered features to the dataset."""
    # Check if 'G3' exists before calculating 'final_grade'
    if 'G3' in data.columns:
        data["final_grade"] = (data["G1"] + data["G2"] + data["G3"]) / 3
        data["final_grade"] = data["final_grade"].round(2)
    #If G3 column doesn't exist
    else:
        #Set final_grade to nan
        data['final_grade'] = np.nan

    data["alcohol_index"] = data["Dalc"] + data["Walc"]
    data["parents_education"] = data["Medu"] + data["Fedu"]
    data["grade_change"] = data["G2"] - data["G1"]
    avg_absences = data["absences"].mean()
    data["high_absences"] = data["absences"].apply(lambda x: 1 if x > avg_absences else 0)
    data["study_effort"] = data["studytime"] * (5 - data["traveltime"])
    return data

data = add_new_columns(data)

X_scaled = scaler.transform(data[number_columns])
clusters = KMeans(n_clusters=3, random_state=42).fit_predict(X_scaled)
data["cluster"] = clusters

def process_user_data(user_input):
    """Process user input, predict risk, and generate insights."""
    # Convert user input to DataFrame
    user_df = pd.DataFrame([user_input])

    # Add engineered features
    user_df = add_new_columns(user_df)

    # Preprocess for model
    user_numbers = scaler.transform(user_df[number_columns])
    user_words = encoder.transform(user_df[word_columns])
    user_ready = np.hstack((user_numbers, user_words))

    # Predict with Random Forest
    risk_prob = model.predict_proba(user_ready)[0][1]
    at_risk = model.predict(user_ready)[0]

    # SHAP analysis
    explainer = shap.TreeExplainer(model)
    predicted_class = model.predict(user_ready)[0]
    shap_values = explainer.shap_values(user_ready)[predicted_class]
    shap_contributions = dict(zip(number_columns + list(encoder.get_feature_names_out(word_columns)), shap_values))

    insights = {}

    # 1.
    base_threshold = 0.5
    threshold_adjust = 0.1 * (user_df["parents_education"].iloc[0] / 8)
    dynamic_threshold = base_threshold + threshold_adjust
    risk_label = "High" if risk_prob > dynamic_threshold else "Low" if risk_prob < 0.3 else "Medium"
    insights["dynamic_risk"] = {"probability": f"{risk_prob:.2f}", "label": risk_label, "threshold": f"{dynamic_threshold:.2f}"}

    # 2.
    shap_interactions = explainer.shap_interaction_values(user_ready)
    shap_interactions = shap_interactions[predicted_class]

    if shap_interactions.ndim > 2:
        shap_interactions = shap_interactions.sum(axis=tuple(range(2, shap_interactions.ndim)))

    if shap_interactions.ndim == 2:
        top_interaction_idx = np.argmax(np.abs(shap_interactions).sum(axis=0))
        feature1, feature2 = np.unravel_index(top_interaction_idx, shap_interactions.shape)
        feature_names = number_columns + list(encoder.get_feature_names_out(word_columns))
        insights["top_interaction"] = f"{feature_names[feature1]} + {feature_names[feature2]}: {shap_interactions[feature1, feature2]:.2f} impact"
    else:
        insights["top_interaction"] = "No significant interactions detected."

    # 3.
    user_cluster = KMeans(n_clusters=3, random_state=42).fit(X_scaled).predict(user_numbers)[0]
    peer_avg = data[data["cluster"] == user_cluster][["studytime", "absences", "G1"]].mean()
    insights["peer_benchmark"] = {
        "studytime": f"Yours: {user_df['studytime'].iloc[0]} vs. Peer Avg: {peer_avg['studytime']:.1f}",
        "absences": f"Yours: {user_df['absences'].iloc[0]} vs. Peer Avg: {peer_avg['absences']:.1f}",
        "G1": f"Yours: {user_df['G1'].iloc[0]} vs. Peer Avg: {peer_avg['G1']:.1f}"
    }

    # 4.
    def what_if(user_df, feature, new_value):
        mod_df = user_df.copy()
        mod_df[feature] = new_value
        mod_numbers = scaler.transform(mod_df[number_columns])
        mod_words = encoder.transform(mod_df[word_columns])
        mod_ready = np.hstack((mod_numbers, mod_words))
        return model.predict_proba(mod_ready)[0][1]

    what_if_study = what_if(user_df, "studytime", user_df["studytime"].iloc[0] + 1)
    what_if_absences = what_if(user_df, "absences", max(0, user_df["absences"].iloc[0] - 5))
    insights["what_if"] = {
        "current": f"{risk_prob:.2f}",
        "study_plus_1": f"{what_if_study:.2f}",
        "absences_minus_5": f"{what_if_absences:.2f}"
    }

    # 5.
    trajectory = [user_df["G1"].iloc[0], user_df["G2"].iloc[0]]
    if "G3" in user_df:
        trajectory.append(user_df["G3"].iloc[0])
    else:
        trajectory.append(trajectory[-1] + (trajectory[-1] - trajectory[-2]))
    insights["trajectory"] = trajectory

    # 6.
    categories = {
        "Academic Effort": ["studytime", "failures", "study_effort"],
        "Lifestyle": ["Dalc", "Walc", "goout", "alcohol_index"],
        "Support": ["famrel", "parents_education"]
    }
    risk_profile = {}
    total_shap = np.sum(np.abs(list(shap_contributions.values())))
    for category, feats in categories.items():
        contrib = np.sum([shap_contributions.get(feat, 0) for feat in feats if feat in shap_contributions])
        risk_profile[category] = max(0, contrib) / total_shap * 100 if total_shap > 0 else 0
    insights["risk_profile"] = {k: f"{v:.1f}%" for k, v in risk_profile.items()}

    # 7.
    interventions = [
        ("studytime", user_df["studytime"].iloc[0] + 1, "Study +1 hr"),
        ("absences", max(0, user_df["absences"].iloc[0] - 3), "Attend 3 more classes")
    ]
    impact_scores = []
    for feat, new_val, label in interventions:
        new_risk = what_if(user_df, feat, new_val)
        impact = risk_prob - new_risk
        if impact > 0:
            impact_scores.append(f"{label}: -{impact:.2f}")
    insights["interventions"] = impact_scores

    # 8.
    resilience = {}
    for k, v in shap_contributions.items():
        value_to_compare = v[0] if isinstance(v, np.ndarray) else v

        if value_to_compare < 0 and k in ["famrel", "parents_education", "schoolsup"]:
            resilience[k] = v
    insights["resilience"] = [f"{k}: {-v[0]:.2f} reduction" for k, v in resilience.items()] if resilience else ["No major resilience factors"]

    # 10.
    anomalies = []
    if user_df["G1"].iloc[0] > 12 and user_df["absences"].iloc[0] > data["absences"].mean():
        anomalies.append("High grades but rising absences")
    insights["anomalies"] = anomalies if anomalies else ["No unusual patterns"]

    result = {
        "student_id": int(pd.Timestamp.now().timestamp()),  # Unique ID
        "risk_probability": risk_prob,
        "at_risk": int(at_risk),
        "final_grade": user_df["final_grade"].iloc[0] if "G3" in user_df else None,
        "alcohol_index": int(user_df["alcohol_index"].iloc[0]),
        "insights": insights
    }
    return result

user_input = {
    "school": "GP", "sex": "F", "age": 17, "famsize": "GT3", "Pstatus": "T",
    "Medu": 2, "Fedu": 2, "Mjob": "services", "Fjob": "services", "reason": "course",
    "guardian": "mother", "traveltime": 2, "studytime": 2, "failures": 0,
    "schoolsup": "yes", "famsup": "no", "paid": "no", "activities": "yes",
    "nursery": "yes", "higher": "yes", "internet": "yes", "romantic": "no",
    "famrel": 4, "freetime": 3, "goout": 2, "Dalc": 1, "Walc": 2, "health": 5,
    "absences": 10, "G1": 12, "G2": 10  # G3 optional
}

# Process the user data
result = process_user_data(user_input)

def save_to_database(result):
    conn = sqlite3.connect("student_features.db")
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS user_predictions (
            student_id INTEGER PRIMARY KEY, risk_probability REAL, at_risk INTEGER,
            final_grade REAL, alcohol_index INTEGER
        )
    ''')
    cursor.execute('''
        INSERT OR REPLACE INTO user_predictions (student_id, risk_probability, at_risk, final_grade, alcohol_index)
        VALUES (?, ?, ?, ?, ?)
    ''', (result["student_id"], result["risk_probability"], result["at_risk"], result["final_grade"], result["alcohol_index"]))
    conn.commit()
    conn.close()

save_to_database(result)

print("Result for Website:", result)

Result for Website: {'student_id': 1742491419, 'risk_probability': np.float64(0.29), 'at_risk': 0, 'final_grade': None, 'alcohol_index': 3, 'insights': {'dynamic_risk': {'probability': '0.29', 'label': 'Low', 'threshold': '0.55'}, 'top_interaction': 'age + G2: -0.00 impact', 'peer_benchmark': {'studytime': 'Yours: 2 vs. Peer Avg: 1.9', 'absences': 'Yours: 10 vs. Peer Avg: 3.9', 'G1': 'Yours: 12 vs. Peer Avg: 10.4'}, 'what_if': {'current': '0.29', 'study_plus_1': '0.30', 'absences_minus_5': '0.30'}, 'trajectory': [np.int64(12), np.int64(10), np.int64(8)], 'risk_profile': {'Academic Effort': '0.0%', 'Lifestyle': '0.0%', 'Support': '0.0%'}, 'interventions': [], 'resilience': ['famrel: 0.00 reduction', 'parents_education: 0.00 reduction'], 'anomalies': ['No unusual patterns']}}
