# Student CSV Analysis

Use this notebook to interactively analyze a student CSV file and reproduce the stats and charts used in the app.


In [None]:
# Setup
import pandas as pd
import numpy as np
from pathlib import Path

# Relative path to the dataset; it is resolved against the notebook's
# current working directory.
csv_path = Path('../data/students_synthetic_500.csv')
# `df` is the frame every later cell works on.
df = pd.read_csv(csv_path)
# Show the first rows as a quick sanity check of the load.
df.head()


In [None]:
# Overview stats: mean of each tracked metric that is present in the CSV.
tracked_metrics = (
    'comprehension',
    'attention',
    'focus',
    'retention',
    'engagement_time',
    'assessment_score',
)
# Skip metrics the file does not contain so the selection below cannot fail.
numeric_cols = [name for name in tracked_metrics if name in df.columns]
metric_means = df[numeric_cols].mean(numeric_only=True)
metric_means.round(2)


In [None]:
# Correlations with assessment_score, strongest positive first.
corr = df.corr(numeric_only=True)
has_target = 'assessment_score' in corr.columns
# Without the target column, fall back to showing the full correlation matrix.
corr['assessment_score'].sort_values(ascending=False).round(3) if has_target else corr


In [None]:
# Persona buckets derived from assessment scores (if model unavailable)
def persona_from_score(score: float) -> str:
    """Bucket an assessment score into one of four persona labels.

    Each threshold is an inclusive lower bound. A score that satisfies none
    of the comparisons (including NaN) falls through to 'Needs Support'.
    """
    tiers = (
        (85, 'High Performer'),
        (70, 'Consistent Learner'),
        (50, 'Developing Learner'),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return 'Needs Support'

# Label every student, but only when the score column is available.
if 'assessment_score' in df.columns:
    personas = df['assessment_score'].apply(persona_from_score)
else:
    personas = None
# Display the count per persona, or a short notice when there is nothing to count.
'assessment_score not present' if personas is None else personas.value_counts()


In [None]:
# Score distribution (10-point bins)
import pandas as pd
# Bin edges 0, 10, ..., 100 give ten 10-point buckets.
bins = list(range(0, 101, 10))
# One "lo-hi" label per pair of neighbouring edges ("0-10" ... "90-100").
labels = [f"{lo}-{hi}" for lo, hi in zip(bins[:-1], bins[1:])]
# pd.cut's default intervals are closed on the right; include_lowest=True
# additionally lets the first bucket contain a score of exactly 0.
binned = pd.cut(df['assessment_score'], bins=bins, labels=labels, include_lowest=True)
dist = binned.value_counts().sort_index()
dist


## Source: analyze_csv.py


In [None]:
# Contents of analyze_csv.py
import sys
import json
import io
import pandas as pd
import numpy as np
from pathlib import Path

# Optional ML dependencies. The functions below test these names against None
# before using them, so the script still runs when the packages are missing.
try:
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LinearRegression
    import joblib
except Exception:
    # The three imports share one try block, so a failure in any of them
    # disables all three names together (e.g. joblib becomes None even if
    # only scikit-learn failed to import).
    KMeans = None
    # NOTE(review): LinearRegression is not referenced in the visible code.
    LinearRegression = None
    joblib = None


def compute_overview(df: pd.DataFrame):
    """Return the mean of each tracked metric column, rounded to 2 decimals.

    Metrics that are absent from ``df`` are skipped, so the result may hold
    fewer keys than the tracked list (or none at all).
    """
    tracked = (
        'comprehension',
        'attention',
        'focus',
        'retention',
        'engagement_time',
        'assessment_score',
    )
    present = [name for name in tracked if name in df.columns]
    means = df[present].mean(numeric_only=True).to_dict()

    overview = {}
    for name, average in means.items():
        # float() guarantees a plain Python float in the returned mapping.
        overview[name] = float(round(average, 2))
    return overview


def compute_correlations(df: pd.DataFrame):
    """Return each numeric column's correlation with ``assessment_score``.

    The mapping is built in descending order of correlation and values are
    rounded to 3 decimals. An empty dict is returned when the frame has no
    numeric ``assessment_score`` column.
    """
    matrix = df.corr(numeric_only=True)
    if 'assessment_score' not in matrix.columns:
        return {}

    ranked = matrix['assessment_score'].sort_values(ascending=False)
    result = {}
    for column, coefficient in ranked.items():
        result[column] = float(round(coefficient, 3))
    return result


def predict_scores(df: pd.DataFrame):
    """Predict an assessment score for every row using the saved model.

    Args:
        df: Frame that should contain the five feature columns listed in
            ``needed``.

    Returns:
        A list of predictions rounded to 2 decimals, or ``None`` when a
        feature column is missing, joblib is unavailable, or the model file
        does not exist next to this script.
    """
    needed = ['comprehension','attention','focus','retention','engagement_time']
    # Validate the input first: the check is cheap, and it avoids unpickling
    # the model at all when no prediction could be produced anyway.
    if not all(c in df.columns for c in needed):
        return None

    model_path = Path(__file__).resolve().parent / 'student_score_model.pkl'
    if joblib is None or not model_path.exists():
        return None

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. The model file must come from a trusted source.
    model = joblib.load(model_path)
    X = df[needed].astype(float).values
    preds = model.predict(X)
    return [float(round(p, 2)) for p in preds]


def persona_from_score(score: float) -> str:
    """Map a numeric score to its persona label.

    Thresholds are checked from highest to lowest and the first one the
    score reaches wins. Scores below 50, and values that fail every
    comparison such as NaN, are labelled 'Needs Support'.
    """
    thresholds = (
        (85, 'High Performer'),
        (70, 'Consistent Learner'),
        (50, 'Developing Learner'),
    )
    matches = (label for minimum, label in thresholds if score >= minimum)
    return next(matches, 'Needs Support')


def cluster_personas(df: pd.DataFrame):
    """Assign a persona label to every row of ``df``.

    Model predictions are used when ``predict_scores`` returns them.
    Otherwise the rows are grouped with a 3-cluster KMeans over the five
    feature columns.

    Returns:
        A list with one label per row, or ``None`` when neither approach is
        possible: scikit-learn is missing, a feature column is missing, there
        are fewer rows than clusters, or a feature value is NaN/infinite.
    """
    preds = predict_scores(df)
    if preds is not None:
        return [persona_from_score(p) for p in preds]

    if KMeans is None:
        return None
    needed = ['comprehension','attention','focus','retention','engagement_time']
    if not all(c in df.columns for c in needed):
        return None
    X = df[needed].astype(float).values

    tier_names = ['High Performer', 'Consistent Learner', 'Developing Learner']
    # KMeans raises on NaN/inf input and when there are fewer samples than
    # clusters; report "personas unavailable" instead of crashing the caller.
    if len(X) < len(tier_names) or not np.isfinite(X).all():
        return None

    km = KMeans(n_clusters=len(tier_names), random_state=42, n_init=10)
    labels = km.fit_predict(X)

    # Cluster ids carry no ordering, so a fixed id -> label map would hand out
    # the tier names arbitrarily. Rank the clusters by their centroid's mean
    # feature value instead and give the strongest cluster the top tier.
    # NOTE(review): assumes higher feature values mean stronger performance
    # and that the features share a comparable scale - confirm with the data.
    centroid_strength = km.cluster_centers_.mean(axis=1)
    ranked = sorted(
        range(len(tier_names)),
        key=lambda cluster: centroid_strength[cluster],
        reverse=True,
    )
    mapping = {cluster: tier_names[rank] for rank, cluster in enumerate(ranked)}
    return [mapping.get(int(l), 'Persona') for l in labels]


def _json_safe(value):
    """Return ``value`` with NaN/+-inf floats replaced by ``None``, recursively."""
    import math

    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


def main():
    """Read a CSV from stdin and print the analysis as one JSON document.

    On empty input or a CSV that cannot be parsed, a JSON object with an
    ``error`` key is printed instead and the function returns normally.
    """
    raw = sys.stdin.buffer.read()
    if not raw:
        print(json.dumps({"error":"no_input"}))
        return
    try:
        # Undecodable bytes are dropped rather than failing the whole upload.
        text = raw.decode('utf-8', errors='ignore')
        df = pd.read_csv(io.StringIO(text))
    except Exception as e:
        print(json.dumps({"error": f"csv_parse_failed: {e}"}))
        return

    overview = compute_overview(df)
    correlations = compute_correlations(df)
    personas = cluster_personas(df)

    result = {
        "overview": overview,
        "correlations": correlations,
        "personas": personas,
        "count": int(len(df)),
        "columns": list(df.columns),
        "preview": df.to_dict(orient='records')
    }
    # Missing cells and undefined correlations are NaN, which json.dumps would
    # emit as the bare token NaN - not valid JSON, so strict parsers reject
    # the whole payload. Convert non-finite floats to null before serialising.
    print(json.dumps(_json_safe(result)))


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


## Source: predict.py


In [None]:
# Contents of predict.py
import sys
import json
from pathlib import Path
import numpy as np

# joblib is needed to load the model. If it cannot be imported, report the
# problem as a JSON object on stdout and exit with status 1.
try:
    import joblib
except Exception as e:
    print(json.dumps({"error": "joblib not installed"}))
    sys.exit(1)


def load_model(model_path: Path):
    """Load and return the model stored at ``model_path``.

    Raises:
        FileNotFoundError: If nothing exists at ``model_path``.
    """
    if model_path.exists():
        # joblib.load unpickles the file, so it must come from a trusted source.
        return joblib.load(model_path)
    raise FileNotFoundError(f"Model not found at {model_path}")


def read_input():
    """Read a JSON object from stdin and return it as a dict.

    Returns:
        The parsed object, or ``{}`` when stdin is empty, cannot be read,
        is not valid JSON, or holds valid JSON that is not an object
        (e.g. a list, number or ``null``).
    """
    try:
        data = json.loads(sys.stdin.read() or "{}")
    except Exception:
        # Best-effort boundary: any read or parse failure means "no features".
        return {}
    # Callers use dict methods on the payload, so a list/number/null
    # would crash them later; treat it like any other unusable input.
    if not isinstance(data, dict):
        return {}
    return data


def to_features(payload: dict):
    """Build the single-row feature matrix for the model from ``payload``.

    The five features are read in a fixed order. A missing key, or a value
    that cannot be converted with ``float()``, contributes 0.0.

    Returns:
        A float array of shape (1, 5).
    """
    feature_names = ("comprehension", "attention", "focus", "retention", "engagement_time")

    def _as_float(raw):
        # Anything float() rejects (None, non-numeric text, ...) becomes 0.0.
        try:
            return float(raw)
        except Exception:
            return 0.0

    row = [_as_float(payload.get(name, 0)) for name in feature_names]
    return np.array(row, dtype=float).reshape(1, -1)


def persona_from_score(score: float) -> str:
    """Translate a predicted score into a persona label.

    The bands are 85 and above, 70 and above, 50 and above, and everything
    else. A value that fails every comparison (such as NaN) lands in the
    last band.
    """
    if score >= 85:
        persona = "High Performer"
    elif score >= 70:
        persona = "Consistent Learner"
    elif score >= 50:
        persona = "Developing Learner"
    else:
        persona = "Needs Support"
    return persona


def main():
    """Predict a score for the JSON payload on stdin and print the result.

    Output is always one JSON object on stdout: ``score`` and ``persona`` on
    success, or ``error`` when the model cannot be loaded or the prediction
    fails.
    """
    # The model file is expected to sit in the same directory as this script.
    model_file = Path(__file__).resolve().parent / "student_score_model.pkl"
    try:
        model = load_model(model_file)
    except Exception as exc:
        print(json.dumps({"error": str(exc)}))
        return

    # stdin is only consumed once the model has loaded successfully.
    features = to_features(read_input())
    try:
        predictions = model.predict(features)
        y_pred = float(predictions[0])
    except Exception as exc:
        print(json.dumps({"error": f"prediction_failed: {exc}"}))
        return

    print(json.dumps({
        "score": round(y_pred, 2),
        "persona": persona_from_score(y_pred),
    }))


# Run the prediction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
