# Model Training for Real-Time Clustering

This notebook trains two models for real-time student engagement clustering:

1. **K-Means**: Initial clustering from the first question
2. **Random Forest**: Dynamic cluster updates during the session

**Prerequisites**: Run `01_Preprocessing_Final.ipynb` first to generate training data.

In [None]:
# Mount Google Drive
# Exposes the user's Drive under /content/drive; every data and model path used
# later in this notebook lives below /content/drive/MyDrive.
# `google.colab` is only importable inside a Colab runtime, so this cell is
# expected to fail elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix,
    silhouette_score, davies_bouldin_score, calinski_harabasz_score
)
from sklearn.model_selection import train_test_split, cross_val_score
# Silence every Python warning for the rest of the session to keep cell output
# readable. NOTE(review): this also hides deprecation and convergence warnings;
# consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG. The scikit-learn calls below additionally pass
# their own random_state=42.
np.random.seed(42)

## Step 1: Load Preprocessed Data

In [None]:
data_dir = '/content/drive/MyDrive/FYP_Data/Preprocessed'


def _load_split(split_name):
    """Return the (features, labels) arrays stored on disk for one data split."""
    features = np.load(f'{data_dir}/X_{split_name}_scaled.npy')
    labels = np.load(f'{data_dir}/y_{split_name}.npy')
    return features, labels


# Every split is stored as an X_<name>_scaled.npy / y_<name>.npy pair.
X_initial, y_initial = _load_split('initial')                    # initial question data
X_completed, y_completed = _load_split('completed')              # completed question data
X_not_completed, y_not_completed = _load_split('not_completed')  # not completed question data

print("Data loaded successfully!")

# One summary line per split: row count and column count of the feature matrix.
summary_rows = [
    ("\nInitial questions", X_initial),
    ("Completed questions", X_completed),
    ("Not completed questions", X_not_completed),
]
for title, features in summary_rows:
    n_samples, n_features = features.shape[0], features.shape[1]
    print(f"{title}: {n_samples} samples, {n_features} features")

## Step 2: Train K-Means for Initial Clustering

K-Means provides a quick baseline clustering when students first join.

In [None]:
# Train K-Means with k=3 (Passive, Moderate, Active)
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=42)
kmeans_labels = kmeans.fit_predict(X_initial)

# Evaluate clustering quality with three internal metrics (no ground truth needed)
silhouette = silhouette_score(X_initial, kmeans_labels)
davies_bouldin = davies_bouldin_score(X_initial, kmeans_labels)
calinski = calinski_harabasz_score(X_initial, kmeans_labels)

print("K-Means Clustering Results:")
print(f"  Silhouette Score: {silhouette:.4f} (higher is better, >0.4 is good)")
print(f"  Davies-Bouldin Index: {davies_bouldin:.4f} (lower is better, <1.0 is good)")
print(f"  Calinski-Harabasz Score: {calinski:.2f} (higher is better)")

# Map clusters to engagement levels based on characteristics
cluster_centers = kmeans.cluster_centers_
# Assuming feature 0 is response time (lower is better) -- TODO confirm against
# the column order produced by the preprocessing notebook.
cluster_scores = -cluster_centers[:, 0]  # Negate so lower time = higher score
cluster_order = np.argsort(cluster_scores)  # Ascending: lowest score (slowest centre) first

# Lowest score -> 'Passive', highest -> 'Active'.
# Keys are converted to plain Python ints: np.argsort yields NumPy integer
# scalars, which would otherwise be persisted with the mapping and printed as
# `np.int64(...)` on NumPy 2. Plain ints hash and compare equal to NumPy integer
# labels, so lookups with `kmeans_labels` values keep working.
engagement_levels = ['Passive', 'Moderate', 'Active']
cluster_mapping = {
    int(cluster_id): level
    for cluster_id, level in zip(cluster_order, engagement_levels)
}

print(f"\nCluster Mapping: {cluster_mapping}")

# Apply mapping
kmeans_mapped_labels = [cluster_mapping[label] for label in kmeans_labels]
print("\nCluster Distribution:")
unique, counts = np.unique(kmeans_mapped_labels, return_counts=True)
for eng_level, count in zip(unique, counts):
    print(f"  {eng_level}: {count} ({count/len(kmeans_mapped_labels)*100:.1f}%)")

## Step 3: Combine Data for Dynamic Clustering Model

Merge completed and not-completed questions for comprehensive training.

In [None]:
# For not completed, we need to match feature dimensions with completed.
# We'll use the first 5 features, which are common to both datasets.
n_common_features = 5

# Validate shapes up front so a mismatch is reported clearly instead of
# surfacing as an opaque np.vstack error.
if X_completed.shape[1] != n_common_features:
    raise ValueError(
        f"X_completed has {X_completed.shape[1]} features, "
        f"expected {n_common_features}"
    )
if X_not_completed.shape[1] < n_common_features:
    raise ValueError(
        f"X_not_completed has {X_not_completed.shape[1]} features, "
        f"need at least {n_common_features}"
    )

# Check each split separately: errors that cancel out across the two splits
# would give matching combined lengths but misaligned labels.
for split_name, features, labels in [
    ('completed', X_completed, y_completed),
    ('not_completed', X_not_completed, y_not_completed),
]:
    if len(features) != len(labels):
        raise ValueError(
            f"{split_name}: {len(features)} feature rows but {len(labels)} labels"
        )

X_not_completed_reduced = X_not_completed[:, :n_common_features]

# Combine datasets (completed rows first, then not-completed rows)
# NOTE(review): both inputs come from separate *_scaled.npy files; confirm they
# were scaled consistently before treating their columns as comparable.
X_combined = np.vstack([X_completed, X_not_completed_reduced])
y_combined = np.concatenate([y_completed, y_not_completed])

print(f"Combined training data: {X_combined.shape[0]} samples")
print("\nClass distribution:")
unique, counts = np.unique(y_combined, return_counts=True)
for label, count in zip(unique, counts):
    print(f"  {label}: {count} ({count/len(y_combined)*100:.1f}%)")

## Step 4: Train Random Forest for Dynamic Updates

Random Forest learns patterns from the combined data and predicts cluster updates in real time.

In [None]:
# Split data: 80/20 hold-out, stratified so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Train Random Forest
# Depth and leaf-size limits constrain each tree; n_jobs=-1 uses all CPU cores.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

print("\nTraining Random Forest...")
rf_model.fit(X_train, y_train)
print("✅ Training complete!")

# Predictions on the held-out test set
y_pred = rf_model.predict(X_test)

# Evaluate
# 'weighted' averages per-class scores by class support; zero_division=0 scores
# a class with no predictions as 0 instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\n📊 Random Forest Performance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1-Score: {f1:.4f}")

print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

## Step 5: Cross-Validation

In [None]:
# Score the Random Forest configuration with 5-fold cross-validation on the
# full combined dataset, using the weighted F1 metric.
cv_scores = cross_val_score(
    estimator=rf_model,
    X=X_combined,
    y=y_combined,
    scoring='f1_weighted',
    cv=5,
)

# Summarise the folds: mean, spread, and the raw per-fold values.
cv_mean, cv_spread = cv_scores.mean(), cv_scores.std()
report_lines = [
    "5-Fold Cross-Validation Results:",
    f"  Mean F1-Score: {cv_mean:.4f} (+/- {cv_spread:.4f})",
    f"  Individual Folds: {cv_scores}",
]
for line in report_lines:
    print(line)

## Step 6: Confusion Matrix

In [None]:
# Confusion matrix
# Fixed label order so rows/columns read Passive -> Moderate -> Active.
# Assumes the labels in y_test are exactly these strings -- confirm against
# the preprocessing output.
engagement_labels = ['Passive', 'Moderate', 'Active']
cm = confusion_matrix(y_test, y_pred, labels=engagement_labels)

# The Models folder is otherwise only created in Step 8; create it here so
# savefig does not fail on a fresh run.
cm_path = '/content/drive/MyDrive/FYP_Data/Models/confusion_matrix.png'
os.makedirs(os.path.dirname(cm_path), exist_ok=True)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=engagement_labels,
            yticklabels=engagement_labels)
plt.title('Confusion Matrix - Random Forest')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig(cm_path, dpi=300, bbox_inches='tight')
plt.show()

print("✅ Confusion matrix saved!")

## Step 7: Feature Importance

In [None]:
# Feature importance
# NOTE(review): these names are assumed to match the column order of the
# training matrix -- confirm against the preprocessing notebook.
feature_names = ['Cumulative Accuracy', 'Avg Response Time', 'Total Questions', 'Current Response Time', 'Is Correct']
importances = rf_model.feature_importances_

# zip() below would silently drop entries on a length mismatch, so check first.
if len(feature_names) != len(importances):
    raise ValueError(
        f"{len(feature_names)} feature names but {len(importances)} importances"
    )

# The Models folder is otherwise only created in Step 8; create it here so
# savefig does not fail on a fresh run.
importance_path = '/content/drive/MyDrive/FYP_Data/Models/feature_importance.png'
os.makedirs(os.path.dirname(importance_path), exist_ok=True)

plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.xlabel('Importance')
plt.title('Feature Importance - Random Forest')
plt.tight_layout()
plt.savefig(importance_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n📊 Feature Importance:")
for name, importance in zip(feature_names, importances):
    print(f"  {name}: {importance:.4f}")

## Step 8: Save Trained Models

In [None]:
# Create models directory (`os` is imported in the notebook's import cell)
models_dir = '/content/drive/MyDrive/FYP_Data/Models'
os.makedirs(models_dir, exist_ok=True)

# Save K-Means together with its cluster-id -> engagement-level mapping
joblib.dump(kmeans, f'{models_dir}/kmeans_initial.pkl')
joblib.dump(cluster_mapping, f'{models_dir}/cluster_mapping.pkl')

# Save Random Forest
joblib.dump(rf_model, f'{models_dir}/rf_dynamic.pkl')

# Load scalers and save to models directory
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# scaler files produced by your own preprocessing run.
with open(f'{data_dir}/scaler_initial.pkl', 'rb') as f:
    scaler_initial = pickle.load(f)
with open(f'{data_dir}/scaler_completed.pkl', 'rb') as f:
    scaler_completed = pickle.load(f)

# The "completed" scaler is stored under the name scaler_dynamic.pkl.
joblib.dump(scaler_initial, f'{models_dir}/scaler_initial.pkl')
joblib.dump(scaler_completed, f'{models_dir}/scaler_dynamic.pkl')

print("\n✅ All models saved successfully!")
print("\nSaved files:")
# The two PNGs are written by the plotting cells in Steps 6 and 7, not here.
saved_files = [
    'kmeans_initial.pkl',
    'cluster_mapping.pkl',
    'rf_dynamic.pkl',
    'scaler_initial.pkl',
    'scaler_dynamic.pkl',
    'confusion_matrix.png',
    'feature_importance.png',
]
for file_name in saved_files:
    print(f"  - {file_name}")

## Summary

**Training Complete!**

**Model 1 - K-Means (Initial Clustering)**:
- Silhouette Score: printed in Step 2 (variable `silhouette`)
- Davies-Bouldin Index: printed in Step 2 (variable `davies_bouldin`)
- Purpose: Quick baseline classification from initial question

**Model 2 - Random Forest (Dynamic Updates)**:
- Accuracy: printed in Step 4 (variable `accuracy`)
- F1-Score: printed in Step 4 (variable `f1`)
- Cross-Validation: mean weighted F1 printed in Step 5 (`cv_scores.mean()`)
- Purpose: Real-time cluster updates during session

**Next Step**: Use `03_RealTime_Inference_Demo.ipynb` to see real-time clustering in action!