# Preprocessing for Real-Time Clustering Model

This notebook prepares training data for the real-time student engagement clustering system.

**Purpose**: Create labeled training data where cluster assignments are generated dynamically based on cumulative student performance.

**Workflow**:
1. Load datasets from Google Drive
2. Filter participating students ONLY
3. Separate initial questions from regular questions
4. Apply dynamic cluster assignment logic
5. Prepare initial question features
6. Create feature matrices and labels for training
7. Scale features
8. Save preprocessed data for model training

In [None]:
# Mount Google Drive at /content/drive so the CSV inputs and the output folder
# referenced later in this notebook are reachable.
# Requires the google.colab package, i.e. this cell is meant to run in a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler  # used in the feature-scaling step
import pickle  # used to persist the fitted scalers
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides pandas / scikit-learn deprecation and data
# warnings — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator for reproducibility.
# NOTE(review): nothing visible in this notebook draws random numbers, so the
# seed appears to be precautionary — confirm before relying on it.
np.random.seed(42)

## Step 1: Load Datasets

In [None]:
# Read the merged quiz records and the participant event log from Drive
df = pd.read_csv('/content/drive/MyDrive/FYP_Data/Merge_Enhanced_Fixed.csv')
participant_df = pd.read_csv('/content/drive/MyDrive/FYP_Data/Participant_Tracking.csv')

# Quick sanity report: row counts and the available columns
print(f"Dataset loaded: {len(df)} records")
print(f"Columns: {df.columns.tolist()}")
print(f"\nParticipant tracking: {len(participant_df)} events")

# Last expression of the cell, so the notebook renders a preview of the first rows
df.head()

## Step 2: Filter Participating Students ONLY

**Critical**: Only students who joined the session should be included in clustering.

In [None]:
# A student counts as participating when the tracking log holds at least one
# 'Joined' event for their admission number.
joined_mask = participant_df['Event Type'] == 'Joined'
participated_students = participant_df.loc[joined_mask, 'Admission No'].unique()

print(f"Total students in dataset: {df['Admission No'].nunique()}")
print(f"Students who participated: {len(participated_students)}")

# Keep only the quiz records that belong to participating students.
# .copy() detaches the result from df so later column assignments are safe.
is_participant = df['Admission No'].isin(participated_students)
df_filtered = df.loc[is_participant].copy()

print(f"\nRecords after filtering: {len(df_filtered)}")
print(f"Students after filtering: {df_filtered['Admission No'].nunique()}")

## Step 3: Separate Initial Questions from Regular Questions

In [None]:
# Separate initial questions (Quiz# = 0) from regular questions (Quiz# > 0).
# Rows whose Quiz# is missing or negative match neither filter and are left out.
df_initial = df_filtered[df_filtered['Quiz#'] == 0].copy()
df_regular = df_filtered[df_filtered['Quiz#'] > 0].copy()

# The expected number of initial questions is one per student that is actually
# present after filtering (a participant without any records cannot have one).
filtered_students = df_filtered['Admission No'].dropna().unique()

print(f"Initial questions: {df_initial.shape[0]} (should be {len(filtered_students)})")
print(f"Regular questions: {df_regular.shape[0]}")

# Verify each student has exactly 1 initial question.
# groupby().size() only lists students that have at least one initial row, so the
# counts are re-indexed over every filtered student with a fill value of 0;
# otherwise students with NO initial question would never be reported.
initial_counts = (
    df_initial.groupby('Admission No').size()
    .reindex(filtered_students, fill_value=0)
)
invalid_students = int((initial_counts != 1).sum())
print(f"\nStudents with != 1 initial question: {invalid_students}")
if invalid_students > 0:
    print("WARNING: Some students have != 1 initial question!")

## Step 4: Apply Dynamic Cluster Assignment Logic

For each student, calculate cumulative metrics (accuracy and average response time) after each question and assign a cluster based on that running performance.

In [None]:
def assign_cluster(accuracy, avg_response_time, has_network_issue, *,
                   active_accuracy=0.80, active_response_time=30,
                   moderate_accuracy=0.50, moderate_response_time=60):
    """
    Assign an engagement cluster from cumulative performance metrics.

    Rules, checked in this order:
    - Network issue flagged: 'Passive' (performance can't be judged fairly)
    - accuracy > active_accuracy and avg_response_time < active_response_time:
      'Active'
    - accuracy > moderate_accuracy and avg_response_time < moderate_response_time:
      'Moderate' (this also covers high accuracy with a medium response time)
    - Otherwise: 'Passive'

    All comparisons are strict, so a value sitting exactly on a threshold falls
    into the lower tier. NaN metrics fail every comparison and yield 'Passive'.

    Args:
        accuracy: Cumulative fraction of correct answers (correct / total).
        avg_response_time: Mean response time so far, in seconds.
        has_network_issue: Truthy when the current attempt is attributed to a
            network problem.
        active_accuracy: Accuracy that must be exceeded for 'Active'.
        active_response_time: Response time (seconds) to stay under for 'Active'.
        moderate_accuracy: Accuracy that must be exceeded for 'Moderate'.
        moderate_response_time: Response time (seconds) to stay under for
            'Moderate'.

    Returns:
        One of the strings 'Active', 'Moderate' or 'Passive'.
    """
    # A flagged network issue overrides every performance-based rule.
    if has_network_issue:
        return 'Passive'

    if accuracy > active_accuracy and avg_response_time < active_response_time:
        return 'Active'
    if accuracy > moderate_accuracy and avg_response_time < moderate_response_time:
        return 'Moderate'
    return 'Passive'

# Order every student's regular questions by timestamp so the running metrics
# below are accumulated in the order the questions were asked.
df_regular_sorted = df_regular.sort_values(['Admission No', 'Timestamp']).copy()

# One record (features + cluster label) is appended per regular question
training_data = []

for student_id in participated_students:
    student_rows = df_regular_sorted[df_regular_sorted['Admission No'] == student_id]

    # Running totals for this student; they restart for every student
    correct_count = 0
    total_count = 0
    response_times = []

    for _, row in student_rows.iterrows():
        completed = row['Attempt Status'] == 'Completed'
        current_response_time = row['Response Time (seconds)']

        # Every attempt counts towards the totals, completed or not
        total_count += 1
        response_times.append(current_response_time)

        if completed:
            correct_count += row['Is_Correct']
            has_network_issue = False
        else:
            # Not completed: blame the network only when the metrics recorded
            # for this attempt cross one of the thresholds
            has_network_issue = (
                row['RTT (ms)'] > 3000
                or row['Jitter (ms)'] > 2000
                or row['Stability (%)'] < 75
            )

        # Cumulative metrics up to and including the current question
        cumulative_accuracy = correct_count / total_count
        avg_response_time = np.mean(response_times)

        cluster = assign_cluster(cumulative_accuracy, avg_response_time, has_network_issue)

        # Fields shared by both record types. Key insertion order is kept the
        # same as before because it determines the DataFrame column order.
        features = {
            'cumulative_accuracy': cumulative_accuracy,
            'avg_response_time': avg_response_time,
            'total_questions': total_count,
            'current_response_time': current_response_time,
        }
        if completed:
            # Completed attempts: response time + correctness only
            features['is_correct'] = row['Is_Correct']
        else:
            # Not-completed attempts: carry the network metrics instead
            features['rtt'] = row['RTT (ms)']
            features['jitter'] = row['Jitter (ms)']
            features['stability'] = row['Stability (%)']
        features['cluster'] = cluster
        features['admission_no'] = student_id
        features['question_type'] = 'completed' if completed else 'not_completed'

        training_data.append(features)

# Convert to DataFrame (fields missing from a record type become NaN)
training_df = pd.DataFrame(training_data)

cluster_column = training_df['cluster']
print(f"Training samples created: {len(training_df)}")
print(f"\nCluster distribution:")
print(cluster_column.value_counts())
print(f"\nCluster distribution (%):")
print(cluster_column.value_counts(normalize=True) * 100)

## Step 5: Prepare Initial Question Features

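Initial questions use a different feature set (response time + network metrics), intended for the K-Means baseline. In this notebook each initial question only receives a simple rule-based starting label (`Moderate` or `Passive`).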

In [None]:
# Feature matrix for the initial questions: response time + network metrics
initial_feature_columns = [
    'Response Time (seconds)',
    'RTT (ms)',
    'Jitter (ms)',
    'Stability (%)',
]
initial_features = df_initial[initial_feature_columns].to_numpy()

# Rule-based starting label: a fast answer (< 45 s) on a stable connection
# (stability > 80 %) starts as 'Moderate'; everything else starts as 'Passive'.
initial_clusters = [
    'Moderate' if response_time < 45 and stability > 80 else 'Passive'
    for response_time, stability in zip(
        df_initial['Response Time (seconds)'],
        df_initial['Stability (%)'],
    )
]

# The list is in row order, so it lines up with df_initial positionally
df_initial['initial_cluster'] = initial_clusters

print(f"Initial question features shape: {initial_features.shape}")
print(f"\nInitial cluster distribution:")
print(df_initial['initial_cluster'].value_counts())

## Step 6: Create Feature Matrices for Training

In [None]:
# Split the training rows by attempt outcome
completed_df = training_df.loc[training_df['question_type'] == 'completed'].copy()
not_completed_df = training_df.loc[training_df['question_type'] == 'not_completed'].copy()

print(f"Completed questions: {len(completed_df)}")
print(f"Not completed questions: {len(not_completed_df)}")

# Columns present in both feature sets, in the order the models will see them
shared_feature_columns = [
    'cumulative_accuracy',
    'avg_response_time',
    'total_questions',
    'current_response_time',
]

# Completed questions: shared columns + correctness (NO network metrics)
X_completed = completed_df[shared_feature_columns + ['is_correct']].to_numpy()
y_completed = completed_df['cluster'].to_numpy()

# Not completed questions: shared columns + network metrics
X_not_completed = not_completed_df[
    shared_feature_columns + ['rtt', 'jitter', 'stability']
].to_numpy()
y_not_completed = not_completed_df['cluster'].to_numpy()

print(f"\nCompleted features shape: {X_completed.shape}")
print(f"Not completed features shape: {X_not_completed.shape}")

## Step 7: Scale Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Each feature set gets its own scaler, fitted on that set only, so the three
# scalers can be saved and reused independently.

# Initial question features
scaler_initial = StandardScaler().fit(initial_features)
X_initial_scaled = scaler_initial.transform(initial_features)

# Completed question features
scaler_completed = StandardScaler().fit(X_completed)
X_completed_scaled = scaler_completed.transform(X_completed)

# Not completed question features
scaler_not_completed = StandardScaler().fit(X_not_completed)
X_not_completed_scaled = scaler_not_completed.transform(X_not_completed)

# Report mean / std over each whole scaled matrix as a quick sanity check
print("Feature scaling completed.")
print(f"\nInitial features - Mean: {X_initial_scaled.mean():.4f}, Std: {X_initial_scaled.std():.4f}")
print(f"Completed features - Mean: {X_completed_scaled.mean():.4f}, Std: {X_completed_scaled.std():.4f}")
print(f"Not completed features - Mean: {X_not_completed_scaled.mean():.4f}, Std: {X_not_completed_scaled.std():.4f}")

## Step 8: Save Preprocessed Data

In [None]:
# Create output directory (no error if it already exists)
import os
output_dir = '/content/drive/MyDrive/FYP_Data/Preprocessed'
os.makedirs(output_dir, exist_ok=True)

# Save arrays.
# The label arrays come out of pandas as object-dtype arrays of Python strings.
# np.save can only store object arrays by pickling them, and np.load refuses to
# read such files unless allow_pickle=True is passed. Converting the labels to
# a fixed-width unicode dtype keeps the .npy files pickle-free, so they load
# with np.load's defaults (and still load when allow_pickle=True is given).
np.save(f'{output_dir}/X_initial_scaled.npy', X_initial_scaled)
np.save(f'{output_dir}/y_initial.npy', df_initial['initial_cluster'].to_numpy().astype(str))

np.save(f'{output_dir}/X_completed_scaled.npy', X_completed_scaled)
np.save(f'{output_dir}/y_completed.npy', y_completed.astype(str))

np.save(f'{output_dir}/X_not_completed_scaled.npy', X_not_completed_scaled)
np.save(f'{output_dir}/y_not_completed.npy', y_not_completed.astype(str))

# Save the fitted scalers so the same scaling can be re-applied later.
# NOTE: pickle files must only be loaded from a trusted location.
for scaler_file, scaler in (
    ('scaler_initial.pkl', scaler_initial),
    ('scaler_completed.pkl', scaler_completed),
    ('scaler_not_completed.pkl', scaler_not_completed),
):
    with open(f'{output_dir}/{scaler_file}', 'wb') as f:
        pickle.dump(scaler, f)

# Save full training dataset as CSV (unscaled features + labels)
training_df.to_csv(f'{output_dir}/Final_Training_Data.csv', index=False)

print("\n✅ All preprocessed data saved successfully!")
print(f"\nSaved files:")
print(f"  - X_initial_scaled.npy ({X_initial_scaled.shape})")
print(f"  - y_initial.npy ({len(df_initial)} labels)")
print(f"  - X_completed_scaled.npy ({X_completed_scaled.shape})")
print(f"  - y_completed.npy ({len(y_completed)} labels)")
print(f"  - X_not_completed_scaled.npy ({X_not_completed_scaled.shape})")
print(f"  - y_not_completed.npy ({len(y_not_completed)} labels)")
print(f"  - scaler_initial.pkl")
print(f"  - scaler_completed.pkl")
print(f"  - scaler_not_completed.pkl")
print(f"  - Final_Training_Data.csv ({len(training_df)} records)")

## Summary

**Preprocessing Complete!**

- ✅ Filtered participating students only
- ✅ Applied dynamic cluster assignment based on cumulative performance
- ✅ Created separate features for:
  - Initial questions (for K-Means baseline)
  - Completed questions (response time + correctness)
  - Not completed questions (+ network metrics)
- ✅ Scaled all features
- ✅ Saved 10 files for model training

**Next Step**: Open `02_Model_Training_RealTime.ipynb` to train the models.