# Cognitive Skills & Student Performance Analysis

## Introduction & Objective

This notebook explores the relationship between cognitive skills and student assessment performance using a synthetic dataset. We perform EDA, correlation analysis, regression modeling, clustering, and generate actionable insights for instructors.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib
import os

# Create the directory that every plt.savefig call in this notebook writes to.
# NOTE(review): this path is relative to the working directory, whereas the data
# and model paths used elsewhere in the notebook start with '../' — confirm that
# 'notebooks/outputs' resolves to the intended location when the notebook is run.
os.makedirs('notebooks/outputs', exist_ok=True)


## Load & Describe Data


In [None]:
# Load the student dataset and fail fast if a column the analysis relies on is
# missing, so schema problems surface here instead of as a KeyError further down.
df = pd.read_csv('../data/students.csv')
required_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time', 'assessment_score']
missing_columns = [col for col in required_columns if col not in df.columns]
assert not missing_columns, f'students.csv is missing expected columns: {missing_columns}'
df.head()


In [None]:
# Print column dtypes and non-null counts to spot missing values before analysis
df.info()


In [None]:
# Summary statistics of the numeric columns, as a quick sanity check on value ranges
df.describe()


## Exploratory Data Analysis (EDA)


In [None]:
# Distribution of each cognitive feature and of the target score, one panel per column
features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time', 'assessment_score']
hist_axes = df[features].hist(bins=20, figsize=(12,8))
# DataFrame.hist returns an array of Axes; grab the figure they all belong to
hist_fig = hist_axes.flat[0].get_figure()
hist_fig.tight_layout()
hist_fig.savefig('notebooks/outputs/histograms.png')
plt.show()


In [None]:
# Pairwise scatter plots of every feature against every other feature.
# plt.savefig writes the current matplotlib figure, which at this point is the
# grid that sns.pairplot has just drawn.
sns.pairplot(df[features])
plt.savefig('notebooks/outputs/pairplot.png')
plt.show()


In [None]:
# Pairwise correlation of all features, rendered as an annotated heatmap
corr = df[features].corr()
corr_fig, corr_ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
corr_fig.savefig('notebooks/outputs/corr_matrix.png')
plt.show()


**Interpretation:**

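Read the `assessment_score` row of the heatmap: the features with the largest absolute coefficients are the strongest linear correlates of performance. All five cognitive features (comprehension, attention, focus, retention, and engagement time) are expected to correlate with assessment score; confirm which of them are actually strongest against the values shown above.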


## Feature Engineering


In [None]:
# Separate predictors from the target, then z-score the predictors
predictor_cols = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = df[predictor_cols]
y = df['assessment_score']
# Fit the scaler once and keep it around; X_scaled is the standardized copy of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Optionally add interaction features here
# ...


## Modeling: Regression to Predict Assessment Score


In [None]:
# Baseline model: ordinary least squares, scored with shuffled 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
lr = LinearRegression()
# One cross-validation run per scorer; each entry is the mean score over the folds
lr_cv_means = {
    scorer: cross_val_score(lr, X_scaled, y, cv=cv, scoring=scorer).mean()
    for scorer in ('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2')
}
# scikit-learn reports errors as negated scores, so flip the sign back
mae = -lr_cv_means['neg_mean_absolute_error']
rmse = np.sqrt(-lr_cv_means['neg_mean_squared_error'])
r2 = lr_cv_means['r2']
print(f'Linear Regression MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}')


In [None]:
# Stronger: Random Forest with GridSearchCV
# Hyperparameters are chosen by cross-validated MAE on the same folds (`cv`) used
# for the linear baseline.
rf = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100], 'max_depth': [None, 5, 10]}
grid = GridSearchCV(rf, param_grid, cv=cv, scoring='neg_mean_absolute_error')
grid.fit(X_scaled, y)
best_rf = grid.best_estimator_  # refit on all rows with the best parameters

# Report out-of-fold metrics. Scoring best_rf on the rows it was trained on would
# give optimistically biased numbers that cannot be compared with the
# cross-validated linear baseline. cross_val_score clones the estimator, so
# best_rf itself stays fitted on the full data.
# Caveat: the parameters were tuned on these same folds, so the scores are still
# slightly optimistic; use nested CV or a held-out test set for a final estimate.
mae_rf = -cross_val_score(best_rf, X_scaled, y, cv=cv, scoring='neg_mean_absolute_error').mean()
rmse_rf = np.sqrt(-cross_val_score(best_rf, X_scaled, y, cv=cv, scoring='neg_mean_squared_error').mean())
r2_rf = cross_val_score(best_rf, X_scaled, y, cv=cv, scoring='r2').mean()

# In-sample predictions, kept for inspection only; not used for the metrics above
y_pred = best_rf.predict(X_scaled)
print(f'Random Forest MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}, R²: {r2_rf:.2f}')


### Save Model & Preprocessing Pipeline


In [None]:
from sklearn.pipeline import make_pipeline
# Bundle scaling and the tuned model so callers can pass raw feature values.
# The pipeline must be fitted as a whole: a freshly constructed StandardScaler()
# is unfitted, and a pipeline containing it raises NotFittedError on predict.
# Fitting here re-trains best_rf in place on the same rows and with the same
# random_state it was tuned with, now behind the pipeline's own scaler.
pipeline = make_pipeline(StandardScaler(), best_rf)
pipeline.fit(X, y)
os.makedirs('../models', exist_ok=True)
joblib.dump(pipeline, '../models/pipeline.pkl')
print('Pipeline saved to ../models/pipeline.pkl')


## Clustering: Learning Personas


In [None]:
# Normalize cognitive features with a dedicated scaler, so the scaler fitted for
# the regression features is not silently re-fitted by this cell.
cog_features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
cluster_scaler = StandardScaler()
X_cog = cluster_scaler.fit_transform(df[cog_features])

# Elbow method and silhouette score for each candidate number of clusters
k_values = range(2, 7)
inertia = []
silhouette = []
for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would otherwise make the clusters version-dependent.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_cog)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X_cog, kmeans.labels_))

metrics_fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(10,4))
ax_inertia.plot(k_values, inertia, marker='o')
ax_inertia.set_title('Elbow Method')
ax_inertia.set_xlabel('n_clusters')
ax_inertia.set_ylabel('Inertia')
ax_silhouette.plot(k_values, silhouette, marker='o', color='orange')
ax_silhouette.set_title('Silhouette Score')
ax_silhouette.set_xlabel('n_clusters')
ax_silhouette.set_ylabel('Silhouette')
metrics_fig.tight_layout()
metrics_fig.savefig('notebooks/outputs/clustering_metrics.png')
plt.show()

# Choose best k (e.g., 4). This is a manual choice; check it against the elbow
# and silhouette curves plotted above.
best_k = 4
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_cog)
# The persona cell below reads this column, so the label is stored on df itself
df['cluster'] = clusters
# Save with clusters
df.to_csv('../data/students_with_clusters.csv', index=False)


### Persona Descriptions

For each cluster, we describe the persona and suggest teaching strategies.


In [None]:
def _assign_persona(avg):
    """Map a cluster's mean feature values to a (persona, recommendations) pair.

    Rules are checked in order and the first match wins; a cluster matching
    none of them is labelled an average learner.
    """
    if avg['attention'] < 60 and avg['retention'] < 60:
        return 'Needs revision-based learning', 'Use spaced repetition, regular quizzes.'
    if avg['comprehension'] > 75 and avg['focus'] > 75:
        return 'Self-motivated high performer', 'Offer advanced material, encourage peer teaching.'
    if avg['engagement_time'] < 120:
        return 'Low engagement risk', 'Increase interactive activities, parental involvement.'
    return 'Average learner', 'Monitor progress, provide balanced support.'


# One (description, persona, recommendations) tuple per cluster id
persona_desc = []
for cluster_id in range(best_k):
    members = df[df['cluster'] == cluster_id]
    cluster_avg = members[cog_features].mean()
    summary = (
        f'Cluster {cluster_id}: '
        f'Comprehension={cluster_avg["comprehension"]:.1f}, Attention={cluster_avg["attention"]:.1f}, Focus={cluster_avg["focus"]:.1f}, Retention={cluster_avg["retention"]:.1f}, Engagement={cluster_avg["engagement_time"]:.1f}'
    )
    label, advice = _assign_persona(cluster_avg)
    persona_desc.append((summary, label, advice))

for summary, label, advice in persona_desc:
    print(f'{summary} -> {label}. Recommendations: {advice}')


## Insights & Recommendations

1. Assessment scores are most strongly correlated with comprehension and focus.
2. Students with low engagement time tend to underperform.
3. Cluster analysis reveals distinct learning personas.
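4. Random Forest may outperform linear regression in predictive accuracy; draw this conclusion only from out-of-sample (cross-validated) metrics for both models, not from scores computed on the training data.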
5. Instructors should tailor strategies based on cluster personas.


## Export Artifacts

All figures are saved to `notebooks/outputs/`. Model pipeline is saved to `../models/pipeline.pkl`. Clustered data is saved to `../data/students_with_clusters.csv`.


In [None]:
# Sample code to load pipeline and predict for new students
def predict_sample(new_data, model_path='../models/pipeline.pkl'):
    """Load a saved pipeline and return its predictions for ``new_data``.

    Parameters
    ----------
    new_data
        Input passed unchanged to the pipeline's ``predict`` method.
        Presumably a DataFrame with the same feature columns used for
        training (see the commented example below) — verify against caller.
    model_path : str, optional
        Location of the joblib-serialized pipeline. Defaults to the path this
        notebook writes the pipeline to.

    Returns
    -------
    Whatever the loaded pipeline's ``predict`` returns.

    Notes
    -----
    ``joblib.load`` unpickles the file, which can execute arbitrary code;
    only load pipeline files from a trusted source.
    """
    loaded_pipeline = joblib.load(model_path)
    return loaded_pipeline.predict(new_data)

# Example usage:
# new_students = pd.DataFrame({
#     'comprehension': [80],
#     'attention': [75],
#     'focus': [78],
#     'retention': [70],
#     'engagement_time': [200]
# })
# print(predict_sample(new_students))
