# Data Preprocessing Notebook
## Privacy, Security, and Compliance of GenAI in LMS

**Module:** 7150CEM
**Date:** November 2024

This notebook performs data cleaning, transformation, and preparation for analysis.

In [None]:
# Import libraries
# pandas is the only library the cells visible in this notebook actually call.
# NOTE(review): numpy, matplotlib, seaborn and scipy.stats are not referenced by
# any visible cell; they may be leftovers or used by cells outside this view.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Simple marker that the import cell completed without error
print('Environment ready')

## 1. Load Raw Data

In [None]:
# Read the raw survey export; the path is resolved relative to the current
# working directory of the kernel.
df_raw = pd.read_csv('../data_proc/survey_raw_copy.csv')
raw_rows, raw_cols = df_raw.shape
print(f'Loaded: {raw_rows:,} rows × {raw_cols} columns')

## 2. Assess Missing Data

In [None]:
# Percentage of missing values in every column of the raw data
missing_pct = df_raw.isnull().sum().div(len(df_raw)).mul(100)
# Keep only the columns where more than 20% of the values are missing and
# list them from most to least affected
high_missing = missing_pct[missing_pct > 20]
print('Variables with >20% missing:')
print(high_missing.sort_values(ascending=False))

## 3. Data Cleaning

In [None]:
# Remove exact duplicate rows (rows identical across all columns)
df_clean = df_raw.drop_duplicates()
print(f'Removed {len(df_raw) - len(df_clean)} duplicates')

# Remove rows with excessive missing data: a row is kept only when strictly
# less than 50% of its values are missing (a row that is exactly 50% missing
# is dropped as well).
missing_per_row = df_clean.isnull().sum(axis=1) / len(df_clean.columns)
# .copy() detaches the filtered result from the frame it was sliced from, so
# the column assignments made in later cells operate on an independent frame
# and do not raise SettingWithCopyWarning.
df_clean = df_clean[missing_per_row < 0.5].copy()
print(f'Remaining rows: {len(df_clean):,}')

## 4. Variable Transformation

In [None]:
# Standardize Likert scales to 1-5
# The Q18 series (privacy concerns) is stated to be in 1-5 format already
# (not verified here), so this cell only collects and counts every column
# whose name begins with "Q18".
q18_cols = list(filter(lambda name: name.startswith('Q18'), df_clean.columns))
print(f'Privacy concern variables (Q18): {len(q18_cols)}')

## 5. Create Composite Indices

In [None]:
# Privacy Concern Index (PCI): row-wise mean of the Q18 item columns.
# Only columns whose name starts with "Q18" AND is exactly four characters
# long are treated as items; longer Q18-prefixed names are excluded.
q18_cols = [col for col in df_clean.columns if col.startswith('Q18') and len(col) == 4]
if q18_cols:
    # Coerce items to numeric (unparseable answers become NaN) so that a stray
    # text value cannot make mean() fail; same approach as the Age recoding.
    q18_items = df_clean[q18_cols].apply(pd.to_numeric, errors='coerce')
    # mean(axis=1) skips NaN, so each row's index is the mean of its answered items.
    # assign() returns a new frame instead of writing into a possibly-sliced
    # one, which avoids SettingWithCopyWarning.
    df_clean = df_clean.assign(Privacy_Concern_Index=q18_items.mean(axis=1))
    print('Privacy Concern Index created')
    print(f'  Mean: {df_clean["Privacy_Concern_Index"].mean():.2f}')
    print(f'  Median: {df_clean["Privacy_Concern_Index"].median():.2f}')
    print(f'  Std: {df_clean["Privacy_Concern_Index"].std():.2f}')
else:
    # Make the skip visible instead of silently producing no index
    print('Privacy Concern Index NOT created: no matching Q18 item columns found')

In [None]:
# Data Protection Awareness Index (DPAI): row-wise mean of the Q19 columns.
# Every column whose name starts with "Q19" is treated as an item (no
# name-length filter is applied here).
q19_cols = [col for col in df_clean.columns if col.startswith('Q19')]
if q19_cols:
    # Coerce items to numeric (unparseable answers become NaN) so that a
    # text-valued Q19 column cannot make mean() fail.
    q19_items = df_clean[q19_cols].apply(pd.to_numeric, errors='coerce')
    # mean(axis=1) skips NaN, so each row's index is the mean of its answered items.
    # assign() returns a new frame instead of writing into a possibly-sliced
    # one, which avoids SettingWithCopyWarning.
    df_clean = df_clean.assign(Data_Protection_Awareness_Index=q19_items.mean(axis=1))
    print('Data Protection Awareness Index created')
    print(f'  Mean: {df_clean["Data_Protection_Awareness_Index"].mean():.2f}')
else:
    # Make the skip visible instead of silently producing no index
    print('Data Protection Awareness Index NOT created: no Q19 columns found')

## 6. Demographic Recoding

In [None]:
# Age groups: derive a numeric Age and a categorical Age_Group from column Q3
if 'Q3' in df_clean.columns:
    # Answers that cannot be parsed as a number become NaN
    age = pd.to_numeric(df_clean['Q3'], errors='coerce')
    # pd.cut uses right-closed bins by default, so the groups are
    # (0, 21], (21, 25], (25, 30], (30, 40], (40, 100].
    # Consequences: the '18-21' label also captures any age from 1 to 17, and
    # ages <= 0, > 100 or NaN receive no group (NaN).
    age_group = pd.cut(age,
                       bins=[0, 21, 25, 30, 40, 100],
                       labels=['18-21', '22-25', '26-30', '31-40', '41+'])
    # assign() returns a new frame instead of writing into a possibly-sliced
    # one, which avoids SettingWithCopyWarning.
    df_clean = df_clean.assign(Age=age, Age_Group=age_group)
    print('Age groups created')
    print(df_clean['Age_Group'].value_counts())
    # Surface the rows that fell outside every bin rather than dropping them silently
    print(f'Rows without an age group: {int(age_group.isna().sum()):,}')

## 7. Data Quality Checks

In [None]:
# Count age outliers with the 1.5 × IQR rule
if 'Age' in df_clean.columns:
    # Q1 / Q3 here are the first and third quartiles of Age
    # (Q3 is not the survey column of the same name).
    Q1, Q3 = (df_clean['Age'].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5*IQR
    upper_fence = Q3 + 1.5*IQR
    # NaN ages compare False on both sides, so they are never counted as outliers
    below_fence = df_clean['Age'] < lower_fence
    above_fence = df_clean['Age'] > upper_fence
    outliers = df_clean[below_fence | above_fence]
    print(f'Age outliers detected: {len(outliers)}')

## 8. Save Cleaned Dataset

In [None]:
# Write the cleaned frame to CSV without the index column, then report
# where it went and its final dimensions.
output_file = '../data_proc/survey_clean.csv'
df_clean.to_csv(output_file, index=False)
final_rows, final_cols = df_clean.shape
print(f'✓ Cleaned dataset saved: {output_file}')
print(f'  Final shape: {final_rows:,} rows × {final_cols} columns')

## 9. Preprocessing Summary

In [None]:
# Final report: row counts before/after cleaning and which indices exist
n_raw = len(df_raw)
n_final = len(df_clean)
n_removed = n_raw - n_final
# Guard the percentage against an empty raw dataset (e.g. a header-only CSV),
# which would otherwise raise ZeroDivisionError.
pct_removed = n_removed / n_raw * 100 if n_raw else 0.0

print('='*80)
print('PREPROCESSING SUMMARY')
print('='*80)
print(f'Original rows: {n_raw:,}')
print(f'Final rows: {n_final:,}')
print(f'Rows removed: {n_removed:,} ({pct_removed:.1f}%)')
print('\nComposite indices created:')
# An index is listed only if its column was actually added to df_clean
if 'Privacy_Concern_Index' in df_clean.columns:
    print('  ✓ Privacy Concern Index')
if 'Data_Protection_Awareness_Index' in df_clean.columns:
    print('  ✓ Data Protection Awareness Index')
print('\n✓ Data preprocessing complete')