In [2]:
!pip install faker


Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install xgboost


Defaulting to user installation because normal site-packages is not writeable


In [None]:
# ============================================
# 📌 Student Cognitive Skills Analysis (Improved Dataset + ML + Clustering)
# ============================================

import pandas as pd
import numpy as np
from faker import Faker
import random

# ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------
# 1. Generate Synthetic Dataset
# ---------------------------
# Three independent random sources are used below, and each must be seeded for
# "Restart Kernel -> Run All" to reproduce the same dataset:
#   * stdlib `random`         -> class labels
#   * NumPy global generator  -> skill columns (and the target noise added later)
#   * Faker's shared generator -> student names
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

fake = Faker()
Faker.seed(SEED)

n_students = 500

# Note: np.random.randint excludes the upper bound, so e.g. "comprehension"
# takes integer values from 50 up to and including 99.
data = {
    "student_id": list(range(1, n_students + 1)),
    "name": [fake.name() for _ in range(n_students)],
    "class": [random.choice(["A", "B", "C", "D"]) for _ in range(n_students)],
    "comprehension": np.random.randint(50, 100, n_students),
    "attention": np.random.randint(40, 100, n_students),
    "focus": np.random.randint(30, 100, n_students),
    "retention": np.random.randint(20, 100, n_students),
    "engagement_time": np.random.randint(10, 60, n_students)
}

# One row per student; columns follow the key order of `data`.
df = pd.DataFrame(data)

# ---------------------------
# 2. Add Richer Features
# ---------------------------
# Interaction / ratio features derived purely from the raw skill columns.
df["concentration_synergy"] = df["focus"] * df["attention"]
df["learning_efficiency"] = df["retention"] / (df["engagement_time"] + 1)  # +1 avoids /0
df["understand_and_retain"] = df["comprehension"] * df["retention"]

# ---------------------------
# 3. Define Assessment Score (Target)
# ---------------------------
# The target is a weighted sum of the raw skills and the engineered features,
# plus Gaussian noise (mean 0, std 2), rounded to two decimals. The two product
# features are divided by 100 to bring them back to roughly the scale of a
# single skill score.
df["assessment_score"] = (
    0.3 * df["comprehension"] +
    0.25 * df["retention"] +
    0.2 * df["attention"] +
    0.15 * df["focus"] +
    0.1 * df["engagement_time"] +
    0.05 * df["concentration_synergy"] / 100 +   # scaled
    0.05 * df["learning_efficiency"] +
    0.05 * df["understand_and_retain"] / 100 +
    np.random.normal(0, 2, n_students)  # less noise
).round(2)

print("✅ Improved Dataset Created. Shape:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is displayed), so print it explicitly.
print(df.head())

# ---------------------------
# 4. Train-Test Split
# ---------------------------
# Model inputs: the five raw skill columns followed by the three engineered ones.
raw_skill_columns = ["comprehension", "attention", "focus", "retention", "engagement_time"]
engineered_columns = ["concentration_synergy", "learning_efficiency", "understand_and_retain"]
features = raw_skill_columns + engineered_columns

X = df[features]
y = df["assessment_score"]

# Hold out 20% of the students for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------
# 5. Random Forest (Hyperparameter Tuning)
# ---------------------------
# 3 x 3 x 2 x 2 = 36 candidate settings, each scored with 5-fold CV on R2.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

rf_estimator = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf_estimator,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test set.
best_rf = grid_search.best_estimator_
y_pred_rf_test = best_rf.predict(X_test)
rf_test_mse = mean_squared_error(y_test, y_pred_rf_test)
rf_test_r2 = r2_score(y_test, y_pred_rf_test)

print("\n🌲 Random Forest Results (Test Set)")
print("Best Params:", grid_search.best_params_)
print("MSE:", rf_test_mse)
print("R2 Score:", rf_test_r2)

# ---------------------------
# 6. XGBoost Model
# ---------------------------
# Fixed (untuned) hyperparameters; each tree sees 80% of the rows and 80% of
# the columns.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Score the fitted booster on the held-out test set.
y_pred_xgb_test = xgb.predict(X_test)
xgb_test_mse = mean_squared_error(y_test, y_pred_xgb_test)
xgb_test_r2 = r2_score(y_test, y_pred_xgb_test)

print("\n🚀 XGBoost Results (Test Set)")
print("MSE:", xgb_test_mse)
print("R2 Score:", xgb_test_r2)

# ---------------------------
# 7. Predict for Entire Dataset
# ---------------------------
# Score every student with the fitted XGBoost model. Most of these rows were
# part of the training split, so their predictions are in-sample.
all_student_features = df[features]
df["predicted_score"] = xgb.predict(all_student_features)

# ---------------------------
# 8. Clustering Students into Learning Personas
# ---------------------------
# Standardise the features first: K-Means is distance based and the product
# features (e.g. focus * attention) are orders of magnitude larger than the
# raw skill scores.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# n_init is pinned explicitly: its default changed across scikit-learn releases
# (10 -> 'auto'), which would otherwise emit a FutureWarning on some versions
# and yield different cluster assignments on others.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df["learning_persona"] = kmeans.fit_predict(X_scaled)

# ---------------------------
# 9. Export Enriched Dataset
# ---------------------------
# Write the full frame (raw columns, engineered features, target, prediction and
# persona label) to the current working directory, without the row index.
df.to_csv(
    "students_results.csv",
    index=False,
)
print("✅ Enriched dataset exported as 'students_results.csv'")

# ---------------------------
# 10. Feature Importance (XGBoost)
# ---------------------------
# One bar per model input, in the same order as `features`.
xgb_importances = xgb.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=features, y=xgb_importances, ax=ax)
ax.tick_params(axis="x", labelrotation=45)  # long feature names overlap otherwise
ax.set_title("Feature Importance (XGBoost)")
plt.show()

# ---------------------------
# 11. Predicted vs Actual Scatter
# ---------------------------
# df["predicted_score"] covers every student, including the rows the model was
# fitted on. Colour the points by split so the held-out test rows can be told
# apart from the in-sample training rows; lumping them together makes the fit
# look better than the test metrics justify.
is_test_row = df.index.isin(X_test.index)
split_label = pd.Series(
    np.where(is_test_row, "test (held-out)", "train (in-sample)"),
    index=df.index,
    name="split",
)

score_min = df["assessment_score"].min()
score_max = df["assessment_score"].max()

plt.figure(figsize=(8,6))
sns.scatterplot(x=df["assessment_score"], y=df["predicted_score"], hue=split_label)
# Dashed y = x reference line: a perfect prediction would fall exactly on it.
plt.plot([score_min, score_max], [score_min, score_max], 'r--', lw=2)
plt.xlabel("Actual Assessment Score")
plt.ylabel("Predicted Score")
plt.title("Predicted vs Actual Assessment Score")
plt.show()


✅ Improved Dataset Created. Shape: (500, 12)
Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
# Show the kernel's current working directory.
import os

working_dir = os.getcwd()
print(working_dir)


In [None]:
# Pairwise correlation between every model input and the target.
corr_columns = features + ["assessment_score"]
corr_matrix = df[corr_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix: Skills vs Assessment Score")
plt.show()


In [None]:
# Scatter matrix of all model inputs plus the target, with KDE curves on the
# diagonal.
pairplot_columns = features + ["assessment_score"]
sns.pairplot(
    data=df,
    vars=pairplot_columns,
    kind='scatter',
    diag_kind='kde',
)
# y slightly above 1 places the title over the grid instead of on top of it.
plt.suptitle("Pairplot of Cognitive Skills vs Assessment Score", y=1.02)
plt.show()


In [None]:
# Histogram (20 bins) of the target with a KDE overlay.
plt.figure(figsize=(8, 5))
sns.histplot(
    data=df,
    x="assessment_score",
    bins=20,
    kde=True,
    color='skyblue',
)
plt.xlabel("Assessment Score")
plt.ylabel("Count")
plt.title("Distribution of Assessment Scores")
plt.show()


In [None]:
# Spread of the target within each class.
plt.figure(figsize=(8,5))
# Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so map the
# colours through `hue` explicitly. dodge=False keeps one full-width box per
# class on older seaborn versions, which would otherwise offset the boxes.
ax = sns.boxplot(x="class", y="assessment_score", hue="class", data=df,
                 palette="Set2", dodge=False)
# The hue legend just repeats the x-axis labels; drop it if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title("Assessment Score Distribution by Class")
plt.show()


In [None]:
# Attention vs target, coloured by the K-Means cluster label.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="attention",
    y="assessment_score",
    hue="learning_persona",
    palette="viridis",
    s=100,
)
plt.title("Learning Personas: Attention vs Assessment Score")
plt.show()


In [None]:
# XGBoost feature importances, labelled by feature name and sorted so the most
# important feature comes first.
importances = pd.Series(xgb.feature_importances_, index=features).sort_values(ascending=False)
plt.figure(figsize=(10,5))
# Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so map the
# colours through `hue` explicitly. dodge=False keeps one full-width bar per
# feature on older seaborn versions.
ax = sns.barplot(x=importances.index, y=importances.values, hue=importances.index,
                 palette="magma", dodge=False)
# The hue legend just repeats the x-axis labels; drop it if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title("Feature Importance (XGBoost)")
plt.xticks(rotation=45)
plt.show()
