# Cognitive Skills & Student Performance Analysis

This notebook generates a synthetic student dataset, explores correlations, builds a regression model to predict assessment scores, and clusters students into personas.

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Seeded generator so every run of the notebook produces the same cohort.
rng = np.random.default_rng(seed=42)

# Number of synthetic students to generate.
n = 200

# Pools sampled when assembling each student's display name and class label.
first_names = [
    'Alex', 'Jordan', 'Taylor', 'Casey', 'Riley',
    'Avery', 'Quinn', 'Morgan', 'Parker', 'Reese',
    'Jamie', 'Rowan', 'Skyler', 'Dakota', 'Emerson',
    'Finley', 'Harper', 'Jules', 'Kai', 'Logan',
]
last_names = [
    'Smith', 'Johnson', 'Brown', 'Davis', 'Miller',
    'Wilson', 'Moore', 'Taylor', 'Anderson', 'Thomas',
]
classes = ['A', 'B', 'C', 'D']

def clamp(x, lo=0, hi=100):
    """Limit ``x`` to the closed interval ``[lo, hi]``.

    The work is delegated to ``np.clip``, so ``x`` may be a scalar or an
    array; array inputs are bounded element by element.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to bound.
    lo : number, default 0
        Lower bound; anything smaller is replaced by ``lo``.
    hi : number, default 100
        Upper bound; anything larger is replaced by ``hi``.

    Returns
    -------
    Whatever ``np.clip`` returns for ``x`` with the given bounds.
    """
    bounded = np.clip(x, lo, hi)
    return bounded

# Latent ability shared by all skills: normal draw with mean 55 and sd 15.
base = rng.normal(loc=55, scale=15, size=n)

# Skills are the latent ability plus skill-specific noise, bounded to 0-100.
# NOTE: the random draws below must stay in this order to reproduce the data.
comprehension = clamp(base + rng.random(size=n) * 20)
attention = clamp(
    (base + rng.normal(size=n) * 15) + rng.random(size=n) * 10
)
focus = clamp(
    (base + rng.normal(size=n) * 10) + rng.random(size=n) * 10
)

# Retention follows the mean of comprehension and focus, plus noise.
retention = clamp((comprehension + focus) / 2 + rng.normal(size=n) * 10)

# Engagement time is bounded to 0-120 and rises slightly with attention.
engagement_time = clamp(
    (60 + rng.normal(size=n) * 20) + attention * 0.3,
    lo=0,
    hi=120,
)

# Weights used to synthesise the score from the skills.
trueW = {'comp': 0.35, 'att': 0.2, 'foc': 0.25, 'ret': 0.3, 'eng': 0.05}
noise = rng.normal(size=n) * 8

# Accumulate the weighted terms left to right, then add the noise last.
score_raw = trueW['comp'] * comprehension
score_raw = score_raw + trueW['att'] * attention
score_raw = score_raw + trueW['foc'] * focus
score_raw = score_raw + trueW['ret'] * retention
# Engagement time (0-120) is divided by 1.2 before weighting.
score_raw = score_raw + trueW['eng'] * (engagement_time / 1.2)
score_raw = score_raw + noise

# Rescale the raw score and bound it to 0-100.
assessment_score = clamp(score_raw / 1.6)

# "<first> <last>" per student; the first name is drawn before the last name
# for each student, so the random stream is consumed in a fixed order.
names = [
    " ".join((rng.choice(first_names), rng.choice(last_names)))
    for _ in range(n)
]

# Random class label per student, e.g. "Class B".
classes_vec = [
    f"Class {rng.choice(classes)}"
    for _ in range(n)
]

# Sequential identifiers starting at S1000.
student_id = [f"S{number}" for number in range(1000, 1000 + n)]

# Assemble the student table; column order follows the list below.
df = pd.DataFrame(dict(zip(
    [
        'student_id', 'name', 'class',
        'comprehension', 'attention', 'focus', 'retention',
        'engagement_time', 'assessment_score',
    ],
    [
        student_id, names, classes_vec,
        comprehension, attention, focus, retention,
        engagement_time, assessment_score,
    ],
)))
df.head(5)

In [None]:
# Correlations
# Correlate every numeric column with the assessment score and order the
# result from strongest to weakest. The score's own entry is included.
corr = (
    df.loc[:, [
        'comprehension',
        'attention',
        'focus',
        'retention',
        'engagement_time',
        'assessment_score',
    ]]
    .corr()
    .loc[:, 'assessment_score']
    .sort_values(ascending=False)
)
corr

In [None]:
# Regression model
# Feature matrix (five skill columns) and target vector as plain arrays.
X = df[[
    'comprehension',
    'attention',
    'focus',
    'retention',
    'engagement_time',
]].to_numpy()
y = df['assessment_score'].to_numpy()

# Ordinary least squares fit on the full dataset.
reg = LinearRegression()
reg.fit(X, y)

# R^2 is computed on the same rows the model was fitted on (in-sample).
pred = reg.predict(X)
r2 = r2_score(y_true=y, y_pred=pred)
(reg.coef_, reg.intercept_, r2)

In [None]:
# Clustering into personas
# Three clusters over the same feature matrix used for the regression.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)

# Store each student's cluster label alongside their record.
clusters = kmeans.labels_
df['cluster'] = clusters

# Cluster centres, one row per cluster, labelled with the feature names.
centers = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=[
        'comprehension',
        'attention',
        'focus',
        'retention',
        'engagement_time',
    ],
)
centers

In [None]:
# Plots
sns.set_theme(style='whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: correlation of each skill with the assessment score.
# `corr` is sorted in descending order, so the score's correlation with
# itself (1.0) is the FIRST entry, not the last. Slicing with [:-1] would
# keep that trivial bar and drop the weakest real skill, so the target is
# removed by label instead.
skill_corr = corr.drop(labels='assessment_score')
sns.barplot(x=skill_corr.index, y=skill_corr.values, ax=axes[0])
axes[0].set_title('Skill correlation with score')
# Tilt the feature names so the longer labels do not overlap.
axes[0].tick_params(axis='x', labelrotation=30)

# Right panel: attention against score, coloured by cluster label.
sns.scatterplot(data=df, x='attention', y='assessment_score', hue='cluster', palette='tab10', ax=axes[1])
axes[1].set_title('Attention vs Performance')

plt.tight_layout()
plt.show()