# Data Preprocessing with Detailed Step-by-Step Outputs

**Fixed Issues**:
- ✅ Each student has ONLY 1 initial question (not 3)
- ✅ Clusters update dynamically based on performance
- ✅ Show dataset state after each transformation
- ✅ Balanced data distribution

**Dataset**: `Merge_Enhanced_Fixed.csv`

## Setup

In [None]:
# Install packages
# Uses the notebook shell escape (`!`) to run pip; `-q` keeps pip's output quiet.
!pip install pandas numpy matplotlib seaborn scikit-learn -q
# NOTE(review): this message is printed unconditionally; a failing shell
# command presumably does not stop the cell — confirm before relying on it.
print("‚úÖ Packages installed")

In [None]:
# Mount Drive
# Google Colab only: exposes the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
print("‚úÖ Drive mounted")

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings.
warnings.filterwarnings('ignore')

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("‚úÖ Libraries imported")

## Step 1: Load Fixed Dataset

In [None]:
# Load dataset
# DATA_PATH stays a plain string with a trailing '/' because file names are
# appended to it with `+`.
DATA_PATH = '/content/drive/MyDrive/FYP_Data/'
df = pd.read_csv(DATA_PATH + 'Merge_Enhanced_Fixed.csv')

print("="*80)
print(" "*20 + "STEP 1: DATASET LOADED")
print("="*80)
print(f"\nTotal records: {len(df)}")
print(f"Total students: {df['Admission No'].nunique()}")
print(f"Columns: {len(df.columns)}")

print(f"\nÔøΩÔøΩ Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

print(f"\nüìä First 5 rows:")
display(df.head())

## Step 2: Verify Initial Questions (1 per student)

In [None]:
# Check initial questions
# Rows with Quiz# == 0 are the initial questions; rows with Quiz# > 0 are the
# regular quiz questions.
initial_q = df[df['Quiz#'] == 0]
regular_q = df[df['Quiz#'] > 0]

print("="*80)
print(" "*15 + "STEP 2: VERIFIED INITIAL QUESTIONS")
print("="*80)

print(f"\n‚úÖ Initial Questions (Quiz# = 0):")
print(f"  Total records: {len(initial_q)}")
print(f"  Unique students: {initial_q['Admission No'].nunique()}")

# Verify 1 per student
initial_per_student = initial_q.groupby('Admission No').size()
print(f"  Questions per student: {initial_per_student.unique()}")

# The groupby only contains students that have at least one Quiz# == 0 row,
# so students with no initial question must be detected separately by
# comparing against the full student list.
students_without_initial = (
    df['Admission No'].nunique() - initial_q['Admission No'].nunique()
)

problems = []
if (initial_per_student > 1).any():
    problems.append("Some students have multiple initial questions")
if initial_per_student.empty:
    problems.append("No initial questions found")
elif students_without_initial > 0:
    problems.append(
        f"{students_without_initial} student(s) have no initial question"
    )

if problems:
    for problem in problems:
        print(f"  ‚ùå ERROR: {problem}")
else:
    print("  ‚úÖ CORRECT: Each student has exactly 1 initial question")

print(f"\nüìä Initial Cluster Distribution:")
for cluster, count in initial_q['Engagement Level'].value_counts().items():
    print(f"  {cluster}: {count} ({count/len(initial_q)*100:.1f}%)")

print(f"\nüìä Regular Questions (Quiz# > 0):")
print(f"  Total records: {len(regular_q)}")

print(f"\nüìä Sample Initial Questions:")
display(initial_q.head(3)[['Admission No', 'Student Name', 'Question', 
                           'Response Time (sec)', 'Engagement Level', 'Network Quality']])

## Step 3: Filter Participating Students

In [None]:
# Load participant tracking
participant_df = pd.read_csv(DATA_PATH + 'Participant_Tracking.csv')

# Get participating students: every admission number with a 'Joined' event
participated = participant_df[
    participant_df['Event Type'] == 'Joined'
]['Admission No'].unique()

print("="*80)
print(" "*15 + "STEP 3: FILTER PARTICIPATING STUDENTS")
print("="*80)

total_students = df['Admission No'].nunique()
print(f"\nStudents who joined sessions: {len(participated)}")
print(f"Total students in dataset: {total_students}")

# Filter: keep only rows whose student appears in the joined list
df_filtered = df[df['Admission No'].isin(participated)].copy()
remaining_students = df_filtered['Admission No'].nunique()

print(f"\n‚úÖ After filtering:")
print(f"  Records: {len(df_filtered)} (from {len(df)})")
print(f"  Students: {remaining_students}")

# Compare the students that survived the filter with the students in the
# dataset. Counting the joined IDs from the tracking file instead would be
# wrong whenever that file lists IDs that are absent from the dataset.
if total_students > 0 and remaining_students == total_students:
    print(f"  ‚úÖ All students participated (100% participation rate)")

# Update df
df = df_filtered.copy()
print(f"\nüìä Dataset shape after filtering: {df.shape}")

## Step 4: Analyze Cluster Updates

In [None]:
# Load cluster history
cluster_history = pd.read_csv(DATA_PATH + 'Cluster_Update_History.csv')
transition_counts = cluster_history['Transitions']

rule = "=" * 80
print(rule)
print(" " * 15 + "STEP 4: ANALYZE DYNAMIC CLUSTER UPDATES")
print(rule)

# Headline numbers: one row per entry in the history file, plus the summed
# 'Transitions' column.
print(f"\nStudents with cluster transitions: {len(cluster_history)}")
print(f"Total cluster transitions: {transition_counts.sum()}")

print(f"\nüìä Cluster Transition Examples:")
display(cluster_history.head(10))

# Rows with at least one recorded transition
students_changed = len(cluster_history[transition_counts > 0])

print(f"\nüìä Transition Statistics:")
print(f"  Students who improved (changed cluster): {students_changed}")
print(f"  Average transitions per student: {transition_counts.mean():.2f}")
print(f"  Max transitions: {transition_counts.max()}")

# Visualize transitions: histogram on the left, initial-vs-final heatmap on
# the right, drawn on explicit axes.
fig, (hist_ax, heat_ax) = plt.subplots(1, 2, figsize=(12, 5))

transition_counts.hist(bins=20, edgecolor='black', ax=hist_ax)
hist_ax.set_xlabel('Number of Transitions')
hist_ax.set_ylabel('Number of Students')
hist_ax.set_title('Distribution of Cluster Transitions')
hist_ax.grid(True, alpha=0.3)

transition_matrix = pd.crosstab(
    cluster_history['Initial Cluster'],
    cluster_history['Final Cluster'],
)
sns.heatmap(transition_matrix, annot=True, fmt='d', cmap='YlOrRd', ax=heat_ax)
heat_ax.set_title('Cluster Transition Matrix')
heat_ax.set_xlabel('Final Cluster')
heat_ax.set_ylabel('Initial Cluster')

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Cluster updates tracked successfully")

## Step 5: Separate by Question Type

In [None]:
# Separate
# Quiz# == 0 rows are the initial questions; Quiz# > 0 rows are quiz questions.
initial_questions = df[df['Quiz#'] == 0].copy()
quiz_questions = df[df['Quiz#'] > 0].copy()

# Further separate by completion
completed = quiz_questions[quiz_questions['Attempt Status'] == 'Completed'].copy()
not_completed = quiz_questions[quiz_questions['Attempt Status'] == 'Not Completed'].copy()

# Percentages fall back to 0.0 when there are no quiz questions at all,
# instead of raising ZeroDivisionError.
quiz_total = len(quiz_questions)
completed_pct = len(completed) / quiz_total * 100 if quiz_total else 0.0
not_completed_pct = len(not_completed) / quiz_total * 100 if quiz_total else 0.0

print("="*80)
print(" "*15 + "STEP 5: SEPARATE BY QUESTION TYPE")
print("="*80)

print(f"\nüìä Dataset Breakdown:")
print(f"  Initial Questions: {len(initial_questions)}")
print(f"  Quiz Questions: {quiz_total}")
print(f"    - Completed: {len(completed)} ({completed_pct:.1f}%)")
print(f"    - Not Completed: {len(not_completed)} ({not_completed_pct:.1f}%)")

print(f"\nüìä Engagement Distribution by Type:")
print(f"\nInitial Questions:")
print(initial_questions['Engagement Level'].value_counts())
print(f"\nCompleted Questions:")
print(completed['Engagement Level'].value_counts())
print(f"\nNot Completed Questions:")
print(not_completed['Engagement Level'].value_counts())

## Step 6: Feature Engineering

In [None]:
def prepare_features(data_df):
    """Return a copy of ``data_df`` with numeric encodings of its label columns.

    Args:
        data_df: DataFrame with the columns 'Is Correct', 'Engagement Level'
            and 'Network Quality'. It is not modified.

    Returns:
        A new DataFrame with three added columns:

        * ``Is_Correct_Binary`` - 1 when ``str(value).lower() == 'yes'``,
          otherwise 0.
        * ``Engagement_Encoded`` - Passive=0, Moderate=1, Active=2; any other
          label becomes NaN.
        * ``Network_Quality_Encoded`` - Poor=0, Fair=1, Good=2, Excellent=3;
          missing or unrecognised labels are filled with 1.
    """
    df_prep = data_df.copy()

    # Binary encoding
    df_prep['Is_Correct_Binary'] = df_prep['Is Correct'].apply(
        lambda x: int(str(x).lower() == 'yes')
    )

    # Engagement encoding
    engagement_map = {'Passive': 0, 'Moderate': 1, 'Active': 2}
    df_prep['Engagement_Encoded'] = df_prep['Engagement Level'].map(engagement_map)

    # Network quality encoding
    network_map = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3}
    # Fill before assigning: `df[col].fillna(..., inplace=True)` operates on a
    # temporary Series and does not update the frame under pandas
    # Copy-on-Write, which would leave NaN in the encoded column.
    df_prep['Network_Quality_Encoded'] = (
        df_prep['Network Quality'].map(network_map).fillna(1)
    )

    return df_prep

# Apply
# Encode each of the three subsets with prepare_features, in the same order
# as before, and rebind the names to the encoded copies.
initial_questions, completed, not_completed = (
    prepare_features(subset)
    for subset in (initial_questions, completed, not_completed)
)

rule = "=" * 80
print(rule)
print(" " * 15 + "STEP 6: FEATURE ENGINEERING")
print(rule)

# Summary of the columns added by the encoding step
for summary_line in (
    "\n‚úÖ Created binary encodings:",
    "  - Is_Correct_Binary (0/1)",
    "  - Engagement_Encoded (0=Passive, 1=Moderate, 2=Active)",
    "  - Network_Quality_Encoded (0=Poor, 1=Fair, 2=Good, 3=Excellent)",
):
    print(summary_line)

print("\nüìä Sample encoded data:")
preview_columns = ['Engagement Level', 'Engagement_Encoded',
                   'Network Quality', 'Network_Quality_Encoded']
display(initial_questions[preview_columns].head())

## Step 7: Feature Selection by Status

In [None]:
rule = "=" * 80
print(rule)
print(" " * 15 + "STEP 7: FEATURE SELECTION BY STATUS")
print(rule)

# Network measurements shared by the initial and not-completed stages
network_metrics = ['RTT (ms)', 'Jitter (ms)', 'Stability (%)']

# STAGE 1: Initial (baseline clustering) - response time plus network metrics
initial_features = ['Response Time (sec)'] + network_metrics
X_initial = initial_questions.loc[:, initial_features].copy()
y_initial = initial_questions.loc[:, 'Engagement_Encoded'].copy()

print("\n‚úÖ Stage 1 - Initial Questions (Baseline):")
print(f"  Features: {initial_features}")
print(f"  Shape: {X_initial.shape}")
print("  Rationale: Use all metrics for initial engagement assessment")

# STAGE 2: Completed (NO network params) - response time and correctness only
completed_features = ['Response Time (sec)', 'Is_Correct_Binary']
X_completed = completed.loc[:, completed_features].copy()
y_completed = completed.loc[:, 'Engagement_Encoded'].copy()

print("\n‚úÖ Stage 2 - Completed Questions:")
print(f"  Features: {completed_features}")
print(f"  Shape: {X_completed.shape}")
print("  ‚ö†Ô∏è  Network params EXCLUDED (student succeeded)")
print("  Rationale: Network not a factor if question completed successfully")

# STAGE 3: Not Completed (USE network params) - adds the encoded network quality
not_completed_features = ['Response Time (sec)', *network_metrics,
                          'Network_Quality_Encoded']
X_not_completed = not_completed.loc[:, not_completed_features].copy()
y_not_completed = not_completed.loc[:, 'Engagement_Encoded'].copy()

print("\n‚úÖ Stage 3 - Not Completed Questions:")
print(f"  Features: {not_completed_features}")
print(f"  Shape: {X_not_completed.shape}")
print("  ‚úÖ Network params INCLUDED for validation")
print("  Rationale: Need to check if network caused failure")

# Descriptive statistics for each stage's feature matrix
print("\nüìä Feature Statistics:")
for heading, feature_frame in (
    ("\nInitial Questions:", X_initial),
    ("\nCompleted Questions:", X_completed),
    ("\nNot Completed Questions:", X_not_completed),
):
    print(heading)
    display(feature_frame.describe())

## Step 8: Scale Features

In [None]:
# One StandardScaler per stage: each is fitted on its own feature matrix and
# then used to transform that same matrix.
scaler_initial = StandardScaler().fit(X_initial)
scaler_completed = StandardScaler().fit(X_completed)
scaler_not_completed = StandardScaler().fit(X_not_completed)

X_initial_scaled = scaler_initial.transform(X_initial)
X_completed_scaled = scaler_completed.transform(X_completed)
X_not_completed_scaled = scaler_not_completed.transform(X_not_completed)

rule = "=" * 80
print(rule)
print(" " * 15 + "STEP 8: STANDARDIZE FEATURES")
print(rule)

print("\n‚úÖ Scaled using StandardScaler:")
for stage_label, scaled_matrix in (
    ("Initial", X_initial_scaled),
    ("Completed", X_completed_scaled),
    ("Not Completed", X_not_completed_scaled),
):
    print(f"  {stage_label}: {scaled_matrix.shape}")

# Per-column mean and standard deviation of the scaled initial features
initial_column_means = X_initial_scaled.mean(axis=0)
initial_column_stds = X_initial_scaled.std(axis=0)

print("\nüìä Scaling Statistics (Initial):")
print(f"  Mean: {initial_column_means}")
print(f"  Std: {initial_column_stds}")

print("\n‚úÖ All features normalized to mean=0, std=1")

## Step 9: Save Preprocessed Data

In [None]:
# Save
import os
import pickle

OUTPUT_PATH = '/content/drive/MyDrive/FYP_Data/Preprocessed/'
# Create the output folder from Python rather than through a shell escape:
# this works outside IPython and raises immediately if the folder cannot be
# created, instead of failing later inside np.save.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Save arrays: file name -> array written with np.save
arrays_to_save = {
    'X_initial_scaled.npy': X_initial_scaled,
    'y_initial.npy': y_initial.values,
    'X_completed_scaled.npy': X_completed_scaled,
    'y_completed.npy': y_completed.values,
    'X_not_completed_scaled.npy': X_not_completed_scaled,
    'y_not_completed.npy': y_not_completed.values,
}
for file_name, array in arrays_to_save.items():
    np.save(OUTPUT_PATH + file_name, array)

# Save scalers: the fitted scalers are pickled next to the arrays
scalers_to_save = {
    'scaler_initial.pkl': scaler_initial,
    'scaler_completed.pkl': scaler_completed,
    'scaler_not_completed.pkl': scaler_not_completed,
}
for file_name, scaler in scalers_to_save.items():
    with open(OUTPUT_PATH + file_name, 'wb') as f:
        pickle.dump(scaler, f)

print("="*80)
print(" "*15 + "STEP 9: SAVE PREPROCESSED DATA")
print("="*80)

print(f"\n‚úÖ Saved to: {OUTPUT_PATH}")
print(f"\nFiles created:")
print(f"  1. X_initial_scaled.npy ({X_initial_scaled.shape})")
print(f"  2. y_initial.npy ({len(y_initial)},)")
print(f"  3. X_completed_scaled.npy ({X_completed_scaled.shape})")
print(f"  4. y_completed.npy ({len(y_completed)},)")
print(f"  5. X_not_completed_scaled.npy ({X_not_completed_scaled.shape})")
print(f"  6. y_not_completed.npy ({len(y_not_completed)},)")
print(f"  7. scaler_initial.pkl")
print(f"  8. scaler_completed.pkl")
print(f"  9. scaler_not_completed.pkl")

print(f"\n" + "="*80)
print(" "*25 + "PREPROCESSING COMPLETE")
print("="*80)
print(f"\n‚úÖ All data preprocessed and saved")
print(f"‚úÖ Ready for model training")
print(f"\nNext: Run 02_Model1_Clustering_Prediction.ipynb")