# Cognitive Skills & Student Performance Analysis

This notebook analyzes the relationship between cognitive skills and student performance, builds predictive models, and identifies student personas through clustering.

In [None]:
# Import required libraries (standard library first, then third-party)
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from pandas / scikit-learn -- consider narrowing it.
warnings.filterwarnings('ignore')

# Set style for plots. The 'seaborn-v0_8' sheet is only registered by
# Matplotlib >= 3.6; older releases ship it under the name 'seaborn', so fall
# back to that name instead of failing on an unknown style.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Read the student records from the project's data directory
df = pd.read_csv('../data/students.csv')

# Quick orientation: dimensions, column list, then a preview of the first rows
column_names = df.columns.tolist()
print("Dataset shape:", df.shape)
print("\nColumn names:", column_names, sep="\n")
print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Number of missing entries per column
print("\nMissing values:")
print(df.isnull().sum())

# How many students fall into each value of the 'class' column
print("\nClass distribution:")
print(df['class'].value_counts())

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its defaults
summary_stats = df.describe()
print("Summary Statistics:")
summary_stats

In [None]:
# Histogram grid: one panel per cognitive skill / performance metric
skills = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Distribution of Cognitive Skills and Performance Metrics', fontsize=16)

# axes.flat walks the 2x3 grid row by row, which matches the order of `skills`
for panel_ax, skill in zip(axes.flat, skills):
    panel_title = skill.replace("_", " ").title()
    panel_ax.hist(df[skill], bins=20, alpha=0.7, edgecolor='black')
    panel_ax.set_title(f'{panel_title}')
    panel_ax.set_xlabel('Score')
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 2. Correlation Analysis

In [None]:
# Columns that take part in the correlation study
numeric_cols = [
    'comprehension', 'attention', 'focus', 'retention',
    'assessment_score', 'engagement_time',
]
correlation_data = df[numeric_cols]

# Pairwise Pearson correlation between every pair of selected columns
pearson_corr = correlation_data.corr(method='pearson')
print("Pearson Correlation Matrix:", pearson_corr.round(3), sep="\n")

In [None]:
# Pairwise Spearman (rank-based) correlation over the same columns
spearman_corr = correlation_data.corr(method='spearman')
print("Spearman Correlation Matrix:", spearman_corr.round(3), sep="\n")

In [None]:
# Side-by-side heatmaps of the two correlation matrices
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

heatmap_panels = [
    (axes[0], pearson_corr, 'Pearson Correlation Matrix'),
    (axes[1], spearman_corr, 'Spearman Correlation Matrix'),
]

# Both panels share the same look: annotated square cells on a diverging
# palette centred at zero, with a slightly shrunken colour bar
for panel_ax, corr_matrix, panel_title in heatmap_panels:
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        ax=panel_ax,
        cbar_kws={'shrink': 0.8},
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 3. Statistical Significance Testing

In [None]:
# Calculate p-values for correlations with assessment_score
target = 'assessment_score'
features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']


def significance_stars(p_value):
    """Translate a p-value into star notation.

    Returns "***" for p < 0.001, "**" for p < 0.01, "*" for p < 0.05 and
    "ns" otherwise. A NaN p-value (e.g. from a constant column) yields "n/a"
    rather than being reported as non-significant.
    """
    if np.isnan(p_value):
        return "n/a"
    for threshold, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value < threshold:
            return stars
    return "ns"


print(f"Statistical significance of correlations with {target}:")
print("=" * 60)

for feature in features:
    # Use only rows where both values are present: NaN inputs propagate to
    # NaN statistics, which would otherwise be printed as "ns".
    paired = df[[feature, target]].dropna()

    print(f"\n{feature.upper()}:")
    if len(paired) < 2:
        # A correlation needs at least two complete observations
        print("  Not enough complete observations to compute a correlation")
        continue

    # Pearson correlation and p-value
    pearson_r, pearson_p = stats.pearsonr(paired[feature], paired[target])

    # Spearman correlation and p-value
    spearman_r, spearman_p = stats.spearmanr(paired[feature], paired[target])

    print(f"  Pearson:  r = {pearson_r:.3f}, p = {pearson_p:.2e}")
    print(f"  Spearman: r = {spearman_r:.3f}, p = {spearman_p:.2e}")

    # Report each test on its own: labelling the feature with the smaller of
    # the two p-values would overstate the evidence.
    print(f"  Significance: Pearson {significance_stars(pearson_p)}, "
          f"Spearman {significance_stars(spearman_p)}")

## 4. Machine Learning Models

In [None]:
# Assemble the design matrix (five predictor columns) and the regression target
predictor_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = df[predictor_columns]
y = df['assessment_score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

In [None]:
# Baseline Model: linear regression fitted on the training split
lr_model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred_lr = lr_model.predict(X_test)

# Test-set error metrics (RMSE is the square root of the mean squared error)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression Results:")
for metric_name, metric_value in (("MAE", lr_mae), ("RMSE", lr_rmse), ("R²", lr_r2)):
    print(f"{metric_name}: {metric_value:.3f}")

# One fitted coefficient per predictor, listed in column order
print("\nFeature Coefficients:")
for column_name, weight in zip(X.columns, lr_model.coef_):
    print(f"  {column_name}: {weight:.3f}")

In [None]:
# Advanced Model: 100-tree random forest, depth capped at 10, seeded for repeatability
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Test-set error metrics, same definitions as for the baseline model
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Results:")
for metric_name, metric_value in (("MAE", rf_mae), ("RMSE", rf_rmse), ("R²", rf_r2)):
    print(f"{metric_name}: {metric_value:.3f}")

# Pair each predictor with its importance and list the largest first
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Test-set metrics of both regressors, one row per model
comparison_df = pd.DataFrame(
    [
        ('Linear Regression', lr_mae, lr_rmse, lr_r2),
        ('Random Forest', rf_mae, rf_rmse, rf_r2),
    ],
    columns=['Model', 'MAE', 'RMSE', 'R²'],
)

print("Model Comparison:", comparison_df, sep="\n")

In [None]:
# 2x2 diagnostic grid, one column per model:
#   top row    - residuals against predictions
#   bottom row - predictions against actual values
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Performance Analysis', fontsize=16)

# Residual = actual - predicted
residuals_lr = y_test - y_pred_lr
residuals_rf = y_test - y_pred_rf

# Endpoints of the "perfect prediction" diagonal
diagonal = [y_test.min(), y_test.max()]

model_panels = [
    ('Linear Regression', y_pred_lr, residuals_lr),
    ('Random Forest', y_pred_rf, residuals_rf),
]

for col_idx, (model_name, predictions, residuals) in enumerate(model_panels):
    # Residuals vs predicted, with a reference line at zero error
    residual_ax = axes[0, col_idx]
    residual_ax.scatter(predictions, residuals, alpha=0.6)
    residual_ax.axhline(y=0, color='red', linestyle='--')
    residual_ax.set_title(f'{model_name} - Residuals vs Predicted')
    residual_ax.set_xlabel('Predicted Values')
    residual_ax.set_ylabel('Residuals')

    # Actual vs predicted, with the y = x reference line
    fit_ax = axes[1, col_idx]
    fit_ax.scatter(y_test, predictions, alpha=0.6)
    fit_ax.plot(diagonal, diagonal, 'red', linestyle='--')
    fit_ax.set_title(f'{model_name} - Actual vs Predicted')
    fit_ax.set_xlabel('Actual Values')
    fit_ax.set_ylabel('Predicted Values')

plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the random forest importances (table is already
# sorted, largest first)
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Random Forest - Feature Importance')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Features')
plt.tight_layout()
plt.show()

## 5. Student Clustering and Personas

In [None]:
# Prepare data for clustering
clustering_features = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
X_cluster = df[clustering_features]

# Standardize the features so each one has zero mean and unit variance and
# no single column dominates the distance computation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Determine optimal number of clusters using elbow method: fit K-means for
# k = 2..8 and record the inertia of each fit.
# n_init is pinned explicitly because its default changed from 10 to 'auto'
# in scikit-learn 1.4; leaving it implicit makes the curve depend on the
# installed version.
k_range = range(2, 9)
inertias = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in k_range
]

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, 'bo-')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()

In [None]:
# Apply K-means clustering with the chosen number of clusters (4)
optimal_k = 4
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so relying on it gives version-dependent cluster labels.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe (re-running the cell simply overwrites the column)
df['cluster'] = cluster_labels

print("Cluster distribution:")
print(df['cluster'].value_counts().sort_index())

In [None]:
# Mean of every clustering feature within each cluster (one row per cluster id)
features_by_cluster = df.groupby('cluster')[clustering_features]
cluster_summary = features_by_cluster.mean()
print("Cluster Characteristics (Mean Values):", cluster_summary.round(2), sep="\n")

In [None]:
# Overlaid per-cluster histograms, one panel per clustering feature
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Student Clusters by Cognitive Skills', fontsize=16)

# axes.flat walks the 2x3 grid row by row, matching the feature order
for panel_ax, feature in zip(axes.flat, clustering_features):
    for cluster_id in range(optimal_k):
        in_cluster = df['cluster'] == cluster_id
        panel_ax.hist(
            df.loc[in_cluster, feature],
            alpha=0.6,
            label=f'Cluster {cluster_id}',
            bins=15,
        )

    panel_title = feature.replace("_", " ").title()
    panel_ax.set_title(f'{panel_title}')
    panel_ax.set_xlabel('Score')
    panel_ax.set_ylabel('Frequency')
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Create radar chart for cluster profiles

# Divide each feature by its largest cluster mean, so the best cluster on a
# feature sits at 1.0 (assumes the cluster means are positive)
cluster_normalized = cluster_summary.div(cluster_summary.max())

# Set up radar chart: one spoke per feature, evenly spaced around the circle
categories = list(cluster_normalized.columns)
N = len(categories)

angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles += angles[:1]  # Complete the circle

fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))

# Four named colours cover the default four clusters; with more clusters,
# switch to a colormap so the loop below cannot run out of colours.
colors = ['red', 'blue', 'green', 'orange']
n_profiles = len(cluster_normalized)
if n_profiles > len(colors):
    colors = [plt.cm.tab10(idx % 10) for idx in range(n_profiles)]

# Iterate over the clusters actually present in the summary instead of
# assuming that row position i corresponds to cluster id i
for color, (cluster_id, profile) in zip(colors, cluster_normalized.iterrows()):
    values = profile.tolist()
    values += values[:1]  # Complete the circle

    ax.plot(angles, values, 'o-', linewidth=2, label=f'Cluster {cluster_id}', color=color)
    ax.fill(angles, values, alpha=0.25, color=color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels([cat.replace('_', ' ').title() for cat in categories])
ax.set_ylim(0, 1)
ax.set_title('Student Cluster Profiles (Normalized)', size=16, y=1.1)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
ax.grid(True)

plt.tight_layout()
plt.show()

## 6. Persona Interpretation

In [None]:
# Define persona labels based on cluster characteristics.
# K-means numbers its clusters arbitrarily, so a fixed id -> persona table can
# attach a name to the wrong group. The labels are derived from the cluster
# means instead.
skill_columns = ['comprehension', 'attention', 'focus', 'retention']
cluster_profiles = df.groupby('cluster')[skill_columns + ['assessment_score']].mean()

# Cluster ids ordered from highest to lowest mean assessment score
ranked_clusters = (
    cluster_profiles['assessment_score']
    .sort_values(ascending=False)
    .index
    .tolist()
)

if len(ranked_clusters) == 4:
    top_cluster, *middle_clusters, bottom_cluster = ranked_clusters

    # Of the two mid-ranked clusters, the one with the most uneven skill
    # profile is labelled as the specialists; the other as average performers.
    # NOTE(review): this compares raw skill means, which assumes the four
    # skills are measured on a common scale -- confirm against the data.
    middle_skills = cluster_profiles.loc[middle_clusters, skill_columns]
    skill_spread = middle_skills.max(axis=1) - middle_skills.min(axis=1)
    specialist_cluster = skill_spread.idxmax()
    average_cluster = next(c for c in middle_clusters if c != specialist_cluster)

    persona_labels = {
        top_cluster: "High Achievers",
        bottom_cluster: "Struggling Learners",
        average_cluster: "Average Performers",
        specialist_cluster: "Focused Specialists",
    }
else:
    # The four named personas only fit a four-cluster solution; for any other
    # cluster count fall back to rank-based names instead of raising KeyError.
    persona_labels = {
        cluster_id: f"Performance Tier {rank}"
        for rank, cluster_id in enumerate(ranked_clusters, start=1)
    }

# Add persona labels to dataframe
df['persona'] = df['cluster'].map(persona_labels)

# Detailed persona analysis
print("STUDENT PERSONAS ANALYSIS")
print("=" * 50)

# groupby yields the clusters in ascending id order
for cluster, cluster_data in df.groupby('cluster'):
    persona = persona_labels[cluster]
    share = len(cluster_data) / len(df) * 100

    print(f"\n{persona.upper()} (Cluster {cluster})")
    print(f"Number of students: {len(cluster_data)} ({share:.1f}%)")
    print(f"Average assessment score: {cluster_data['assessment_score'].mean():.1f}")
    print(f"Average engagement time: {cluster_data['engagement_time'].mean():.0f} minutes/week")

    print("\nCognitive Skills Profile:")
    for skill in skill_columns:
        avg_score = cluster_data[skill].mean()
        print(f"  {skill.title()}: {avg_score:.1f}")

    # Share of each class within this cluster, as a percentage
    print("\nClass distribution:")
    class_dist = cluster_data['class'].value_counts(normalize=True) * 100
    for class_name, percentage in class_dist.items():
        print(f"  Class {class_name}: {percentage:.1f}%")

## 7. Key Insights and Recommendations

### Key Findings:

1. **Strong Correlations**: All cognitive skills show significant positive correlations with assessment scores, with comprehension and retention being the strongest predictors.

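2. **Model Performance**: The Random Forest model outperforms Linear Regression on the held-out test set, achieving better predictive accuracy with an R² that indicates a good fit (see the Model Comparison table in Section 4 for each model's MAE, RMSE, and R²).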

3. **Student Personas**: Four distinct student personas were identified:
   - **High Achievers**: Excellent across all metrics, high engagement
   - **Struggling Learners**: Below average in most areas, need intensive support
   - **Average Performers**: Balanced profile, room for targeted improvement
   - **Focused Specialists**: Strong in specific areas, may need broader skill development

### Recommendations:

1. **Personalized Learning**: Tailor interventions based on student personas
2. **Focus on Comprehension**: Prioritize comprehension skills, as they show the strongest association with assessment scores
3. **Engagement Strategies**: Increase engagement time for struggling learners
4. **Early Intervention**: Use the predictive model to identify at-risk students early
5. **Balanced Development**: Help focused specialists develop broader skill sets


In [None]:
# Save the Random Forest model and its metadata for the prediction script
model_path = '../models/final_model.pkl'
model_info_path = '../models/model_info.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The Random Forest is the model that gets saved. Flag the case where the
# baseline scored better, so the "best performer" assumption is checked
# against the measured test-set R² rather than taken on faith.
if lr_r2 > rf_r2:
    print(f"WARNING: Linear Regression (R² = {lr_r2:.3f}) scored higher than "
          f"Random Forest (R² = {rf_r2:.3f}) on the test set; "
          "the Random Forest model is saved anyway.")

# Save the Random Forest model
# NOTE: pickle files should only ever be loaded from trusted sources.
with open(model_path, 'wb') as f:
    pickle.dump(rf_model, f)

print(f"Model saved successfully to {model_path}")

# Also save the feature names for prediction script, together with the model
# and its test-set metrics
model_info = {
    'model': rf_model,
    'features': list(X.columns),
    # Derived from the object so the label cannot drift from the saved model
    'model_type': type(rf_model).__name__,
    'performance': {
        'mae': rf_mae,
        'rmse': rf_rmse,
        'r2': rf_r2
    }
}

with open(model_info_path, 'wb') as f:
    pickle.dump(model_info, f)

print(f"Model info saved successfully to {model_info_path}")

In [None]:
# Final summary
# Name the best model from the measured test-set R² values instead of assuming
# the Random Forest won. max() keeps the first entry on a tie, so the Random
# Forest is reported when both scores are equal.
best_name, best_r2 = max(
    [("Random Forest", rf_r2), ("Linear Regression", lr_r2)],
    key=lambda entry: entry[1],
)

print("ANALYSIS COMPLETE!")
print("=" * 30)
print(f"Dataset: {len(df)} students analyzed")
print(f"Best Model: {best_name} (R² = {best_r2:.3f})")
print(f"Student Personas: {optimal_k} clusters identified")
print("Model saved: ../models/final_model.pkl")
if best_name != "Random Forest":
    # The pickled artifact is always rf_model, so make the mismatch explicit
    print("Note: the saved model is the Random Forest, not the best-scoring model")
print("\nReady for dashboard integration!")