# Model 1: Predictive Clustering + Engagement Prediction

**Objectives**:
1. **Part A - Clustering (K-Means)**: Real-time student engagement clustering
2. **Part B - Prediction (Random Forest)**: Engagement level prediction

**Prerequisites**: Run `01_Preprocessing_Enhanced_Dataset.ipynb` first

**Evaluation**:
- Clustering: Silhouette Score, Davies-Bouldin Index, Calinski-Harabasz Index
- Classification: Accuracy, Precision, Recall, F1-Score, ROC-AUC

## Setup

In [None]:
# Install packages
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost imbalanced-learn -q
print("‚úÖ Packages installed")

In [None]:
# Attach Google Drive to the Colab runtime; the data and model paths used by
# later cells live underneath this mount point.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)
print("‚úÖ Drive mounted")

In [None]:
# Imports
# Data handling, plotting and (de)serialisation of fitted objects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Part A (K-Means clustering) and Part B (Random Forest classification)
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Clustering-quality metrics (first line) and classification metrics (rest)
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score, calinski_harabasz_score,
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score
)
# PCA is used for the 2-D cluster plot; SMOTE for oversampling the training set
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
# Silence FutureWarning messages only; other warning categories still show
warnings.filterwarnings('ignore', category=FutureWarning)

print("‚úÖ Libraries imported")

## Load Preprocessed Data

In [None]:
# Directory holding the artifacts read by this notebook
DATA_PATH = '/content/drive/MyDrive/FYP_Data/Preprocessed/'


def _load_array(filename):
    """Return the NumPy array stored in DATA_PATH + filename."""
    return np.load(DATA_PATH + filename)


def _load_pickle(filename):
    """Return the object unpickled from DATA_PATH + filename."""
    with open(DATA_PATH + filename, 'rb') as handle:
        return pickle.load(handle)


# Feature matrix / label vector pairs, one pair per question group
X_initial = _load_array('X_initial_scaled.npy')
y_initial = _load_array('y_initial.npy')

X_completed = _load_array('X_completed_scaled.npy')
y_completed = _load_array('y_completed.npy')

X_not_completed = _load_array('X_not_completed_scaled.npy')
y_not_completed = _load_array('y_not_completed.npy')

# Pickled scaler objects for the initial and completed question sets
scaler_initial = _load_pickle('scaler_initial.pkl')
scaler_completed = _load_pickle('scaler_completed.pkl')

print("‚úÖ Data loaded")
for shape_line in (
    f"\nInitial Questions: {X_initial.shape}",
    f"Completed Questions: {X_completed.shape}",
    f"Not Completed Questions: {X_not_completed.shape}",
):
    print(shape_line)

## Part A: K-Means Clustering

### 1. Elbow Method for Optimal K

In [None]:
# Elbow method: fit K-Means on the initial-question features for K = 2..10
# and record the within-cluster sum of squares (inertia) of each fit.
print("Performing Elbow Method...")
K_range = range(2, 11)
inertias = []

for num_clusters in K_range:
    kmeans = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=10,
        random_state=42,
    ).fit(X_initial)
    inertias.append(kmeans.inertia_)
    print(f"K={num_clusters}: Inertia={kmeans.inertia_:.2f}")

# Inertia-vs-K curve; the "elbow" is read off this plot by eye
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Clusters (K)', fontsize=12)
ax.set_ylabel('Inertia (WCSS)', fontsize=12)
ax.set_title('Elbow Method for Optimal K', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xticks(K_range)
fig.tight_layout()
plt.show()

print("\n‚úÖ Optimal K appears to be around 3")

### 2. Train K-Means Model (K=3)

In [None]:
# Fit the final 3-cluster K-Means model on the initial-question features
print("Training K-Means Clustering Model (K=3)...")
kmeans_params = {
    'n_clusters': 3,
    'init': 'k-means++',
    'n_init': 10,
    'max_iter': 300,
    'random_state': 42,
}
kmeans = KMeans(**kmeans_params)

# Fit, then read the cluster id assigned to every training row
kmeans.fit(X_initial)
cluster_labels = kmeans.labels_

print("\n‚úÖ Model trained")
print(f"Cluster distribution: {np.bincount(cluster_labels)}")

### 3. Map Clusters to Engagement Levels

In [None]:
# Rank the three clusters by a single score: the mean over all features of
# the cluster's per-feature mean, computed on the scaled inputs.
cluster_means = {}
for cluster_id in range(3):
    members = X_initial[cluster_labels == cluster_id]
    cluster_means[cluster_id] = members.mean(axis=0).mean()
    print(f"Cluster {cluster_id}: Mean Score = {cluster_means[cluster_id]:.3f}")

# Ascending score order gives the engagement level:
# lowest -> 0 (Passive), middle -> 1 (Moderate), highest -> 2 (Active)
sorted_clusters = sorted(cluster_means.items(), key=lambda pair: pair[1])
cluster_mapping = {
    cluster_id: level
    for level, (cluster_id, _score) in enumerate(sorted_clusters)
}

# Translate raw cluster ids to engagement levels through a lookup table
_level_lookup = np.empty(3, dtype=int)
for cluster_id, level in cluster_mapping.items():
    _level_lookup[cluster_id] = level
mapped_labels = _level_lookup[cluster_labels]

cluster_names = {0: 'Passive', 1: 'Moderate', 2: 'Active'}

print("\n‚úÖ Clusters mapped to engagement levels")
total = len(mapped_labels)
for level, level_name in cluster_names.items():
    count = np.count_nonzero(mapped_labels == level)
    print(f"{level_name}: {count} ({count/total*100:.1f}%)")

### 4. Evaluate Clustering Quality

In [None]:
# Internal clustering-quality metrics, computed on the engagement-level labels
silhouette = silhouette_score(X_initial, mapped_labels)
davies_bouldin = davies_bouldin_score(X_initial, mapped_labels)
calinski = calinski_harabasz_score(X_initial, mapped_labels)

divider = "=" * 60
print("Clustering Quality Metrics:")
print(divider)
for metric_line in (
    f"Silhouette Score:        {silhouette:.4f}",
    f"Davies-Bouldin Index:    {davies_bouldin:.4f}",
    f"Calinski-Harabasz:       {calinski:.4f}",
    f"Inertia (WCSS):          {kmeans.inertia_:.4f}",
):
    print(metric_line)
print(divider)

print("\nInterpretation:")

# Silhouette: higher is better (thresholds 0.5 and 0.3)
silhouette_verdict = (
    f"‚úÖ Silhouette ({silhouette:.3f}): Excellent clustering"
    if silhouette > 0.5
    else f"‚úÖ Silhouette ({silhouette:.3f}): Good clustering"
    if silhouette > 0.3
    else f"‚ö†Ô∏è  Silhouette ({silhouette:.3f}): Weak clustering"
)
print(silhouette_verdict)

# Davies-Bouldin: lower is better (thresholds 1.0 and 2.0)
davies_bouldin_verdict = (
    f"‚úÖ Davies-Bouldin ({davies_bouldin:.3f}): Excellent separation"
    if davies_bouldin < 1.0
    else f"‚úÖ Davies-Bouldin ({davies_bouldin:.3f}): Good separation"
    if davies_bouldin < 2.0
    else f"‚ö†Ô∏è  Davies-Bouldin ({davies_bouldin:.3f}): Poor separation"
)
print(davies_bouldin_verdict)

### 5. Visualize Clusters with PCA

In [None]:
# Project the features onto their first two principal components for a 2-D
# scatter plot. random_state is pinned (like every other estimator in this
# notebook) so the projection is reproducible when PCA picks a randomized
# SVD solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_initial)

# Plot
plt.figure(figsize=(12, 8))
colors = ['#FF6B6B', '#FFD93D', '#6BCB77']  # Red, Yellow, Green

# One scatter layer per engagement level so each gets its own legend entry
for i in range(3):
    mask = mapped_labels == i
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1],
               c=colors[i], label=cluster_names[i],
               alpha=0.6, s=100, edgecolors='black', linewidth=0.5)

# Plot centroids: the K-Means centres pushed through the same PCA projection
centroids_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1],
           c='black', marker='X', s=300,
           edgecolors='white', linewidth=2,
           label='Centroids', zorder=10)

# Axis labels report the share of variance captured by each component
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontsize=12)
plt.title('Student Engagement Clusters (K-Means)', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### 6. Save Clustering Model

In [None]:
# Save the fitted K-Means model and the cluster-id -> engagement-level mapping
import os

MODEL_PATH = '/content/drive/MyDrive/FYP_Data/Models/'
# Create the target directory from Python rather than via a shell escape, so
# the cell does not depend on IPython variable interpolation or a POSIX shell.
os.makedirs(MODEL_PATH, exist_ok=True)

with open(MODEL_PATH + 'kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

with open(MODEL_PATH + 'cluster_mapping.pkl', 'wb') as f:
    pickle.dump(cluster_mapping, f)

print("‚úÖ Clustering model saved")

## Part B: Engagement Prediction (Supervised Learning)

### 1. Prepare Data for Classification

In [None]:
# Stratified 70/30 split of the completed-question data for supervised learning
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'stratify': y_completed,
}
X_train, X_test, y_train, y_test = train_test_split(
    X_completed, y_completed, **split_options
)

print("Data Split:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Per-class counts and percentages in the training portion
print("\nClass distribution in training:")
class_ids, class_sizes = np.unique(y_train, return_counts=True)
n_train = len(y_train)
for class_id, class_size in zip(class_ids, class_sizes):
    print(f"  {cluster_names[class_id]}: {class_size} ({class_size/n_train*100:.1f}%)")

### 2. Handle Class Imbalance with SMOTE

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched
print("Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled

print(f"\nBefore SMOTE: {X_train.shape}")
print(f"After SMOTE: {X_train_balanced.shape}")

# Per-class counts and percentages after resampling
print("\nBalanced class distribution:")
balanced_ids, balanced_sizes = np.unique(y_train_balanced, return_counts=True)
n_balanced = len(y_train_balanced)
for class_id, class_size in zip(balanced_ids, balanced_sizes):
    print(f"  {cluster_names[class_id]}: {class_size} ({class_size/n_balanced*100:.1f}%)")

### 3. Train Random Forest Classifier

In [None]:
# Fit a Random Forest on the SMOTE-balanced training data
print("Training Random Forest Classifier...")
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,  # use every available CPU core
}
rf_model = RandomForestClassifier(**rf_params).fit(
    X_train_balanced, y_train_balanced
)
print("‚úÖ Model trained")

### 4. Evaluate Random Forest

In [None]:
# Evaluate the Random Forest on the held-out test split.
# Class ids and display names are fixed up-front so the report keeps all
# three rows even if a class is missing from y_test / y_pred_rf.
class_labels = [0, 1, 2]
class_names = ['Passive', 'Moderate', 'Active']

# Predictions (hard labels for the threshold metrics, probabilities for ROC-AUC)
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)

# Calculate metrics (weighted averages account for class-size differences)
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')

# ROC-AUC is listed among the notebook's evaluation metrics; compute it
# one-vs-rest over the classes the model was trained on. It is undefined when
# a class has no test samples, in which case NaN is reported instead of
# aborting the cell.
try:
    roc_auc = roc_auc_score(
        y_test, y_proba_rf,
        multi_class='ovr', average='weighted', labels=rf_model.classes_
    )
except ValueError:
    roc_auc = float('nan')

print("Random Forest Performance:")
print("="*60)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")
print("="*60)

# Classification report
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_rf,
                          labels=class_labels,
                          target_names=class_names))

### 5. Confusion Matrix

In [None]:
# Plot confusion matrix for the Random Forest predictions.
# labels= pins the matrix to 3x3 in Passive/Moderate/Active order, so the
# hard-coded tick labels below always line up with the rows and columns even
# if a class is absent from y_test and y_pred_rf.
class_names = ['Passive', 'Moderate', 'Active']
cm = confusion_matrix(y_test, y_pred_rf, labels=[0, 1, 2])

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=class_names,
           yticklabels=class_names)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 6. Train XGBoost (Alternative)

In [None]:
# Fit an XGBoost classifier on the same balanced training data as an
# alternative to the Random Forest
print("Training XGBoost Classifier...")
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'eval_metric': 'mlogloss',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_balanced, y_train_balanced)
print("‚úÖ XGBoost trained")

# Score on the held-out test split
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')

print("\nXGBoost Performance:")
print(f"Accuracy:  {accuracy_xgb:.4f}")
print(f"F1-Score:  {f1_xgb:.4f}")

### 7. Model Comparison

In [None]:
# Side-by-side table of the two classifiers' test-set scores
comparison = pd.DataFrame(
    [
        ('Random Forest', accuracy, f1),
        ('XGBoost', accuracy_xgb, f1_xgb),
    ],
    columns=['Model', 'Accuracy', 'F1-Score'],
)

print("Model Comparison:")
print(comparison.to_string(index=False))

# Pick the winner by test accuracy; a tie goes to the Random Forest
rf_is_best = accuracy >= accuracy_xgb
if rf_is_best:
    best_model_name, best_model = 'Random Forest', rf_model
else:
    best_model_name, best_model = 'XGBoost', xgb_model
print(f"\n‚úÖ Best Model: {best_model_name}")

### 8. Cross-Validation

In [None]:
# 5-fold cross-validation of the selected model.
# SMOTE is applied inside each fold (via an imbalanced-learn pipeline) on the
# original training split. Cross-validating on the already-oversampled data
# would let synthetic points interpolated from validation samples leak into
# the training folds and inflate the scores.
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

print("Performing 5-fold cross-validation...")
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', clone(best_model)),  # unfitted copy with the same hyperparameters
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train,
                           cv=StratifiedKFold(n_splits=5), scoring='accuracy')

print(f"\nCV Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

### 9. Save Prediction Models

In [None]:
# Pickle both fitted classifiers into MODEL_PATH, one file per model
_model_files = {
    'random_forest_model.pkl': rf_model,
    'xgboost_model.pkl': xgb_model,
}
for _filename, _fitted_model in _model_files.items():
    with open(MODEL_PATH + _filename, 'wb') as out_file:
        pickle.dump(_fitted_model, out_file)

print("‚úÖ Prediction models saved to:")
print(MODEL_PATH)

## Summary & Results

In [None]:
# Final recap of the clustering and Random Forest metrics computed above,
# plus the names of the model files written to disk.
banner = "=" * 80

print(banner)
print(" " * 25 + "MODEL TRAINING SUMMARY")
print(banner)

print("\nüìä CLUSTERING (K-Means):")
for clustering_line in (
    f"  Silhouette Score:     {silhouette:.4f}",
    f"  Davies-Bouldin Index: {davies_bouldin:.4f}",
    "  Number of Clusters:   3",
):
    print(clustering_line)

print("\nüéØ CLASSIFICATION (Random Forest):")
for classification_line in (
    f"  Accuracy:  {accuracy:.4f}",
    f"  Precision: {precision:.4f}",
    f"  Recall:    {recall:.4f}",
    f"  F1-Score:  {f1:.4f}",
):
    print(classification_line)

print("\nüíæ SAVED MODELS:")
for saved_line in (
    "  ‚úÖ kmeans_model.pkl",
    "  ‚úÖ random_forest_model.pkl",
    "  ‚úÖ xgboost_model.pkl",
    "  ‚úÖ cluster_mapping.pkl",
):
    print(saved_line)

print("\n" + banner)
print("‚úÖ Model training complete!")
print(banner)