In [1]:
# =============================================================================
# CELL 1: Setup (Always Run First)
# =============================================================================

# Add Python_files directory to path
import sys
import os
sys.path.append(os.path.join('..', 'Python_files'))

# Import all required modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from data_loader import load_and_validate
from data_processor import process_all_data
from analysis_engine import run_complete_analysis, create_cross_tables
from visualization import (
    create_plotly_regression_plot, 
    create_student_segments_plot,
    plot_all_cross_tables,
    create_all_visualizations
)
from config_file import VARIABLE_CONFIGS, RESULT_CODE_MAPPING, CATEGORICAL_VARS

print("✅ Setup complete - ready to analyze!")
print(f"📁 Working directory: {os.getcwd()}")
print(f"📊 Available variables: {', '.join(CATEGORICAL_VARS)}")

✅ Setup complete - ready to analyze!
📁 Working directory: c:\Dissertation_Term3\MScProject\Notebooks
📊 Available variables: gender, region, highest_education, imd_band, age_band, disability


In [2]:
# =============================================================================
# CELL 2: Quick Load and Process (Run Once)
# =============================================================================

# Load and process all data in one step
print("🔄 Loading data...")
data, is_valid = load_and_validate()

if is_valid:
    print("\n🔄 Processing data...")
    processed_data = process_all_data(data)
    print(f"\n✅ Ready! Dataset: {processed_data.shape[0]:,} students × {processed_data.shape[1]} features")
    
    # Quick data overview
    print(f"\n📊 QUICK OVERVIEW:")
    print(f"   • Score range: {processed_data['score'].min():.0f} - {processed_data['score'].max():.0f}")
    print(f"   • VLE clicks range: {processed_data['total_click_vle'].min()} - {processed_data['total_click_vle'].max():,}")
    print(f"   • High performers: {(processed_data['excellent_Score'] == 1).sum():,} ({(processed_data['excellent_Score'] == 1).mean()*100:.1f}%)")
    print(f"   • Active in VLE: {(processed_data['active_in_VLE'] == 1).sum():,} ({(processed_data['active_in_VLE'] == 1).mean()*100:.1f}%)")
else:
    print("❌ Data loading failed. Please check your Data folder and files.")


🔄 Loading data...
LOADING STUDENT ASSESSMENT DATA
Loading data from: ..\Data
✓ Loaded assessments.csv: (206, 6)
✓ Loaded courses.csv: (22, 3)
✓ Loaded studentAssessment.csv: (173912, 5)
✓ Loaded studentInfo.csv: (32593, 12)
✓ Loaded studentRegistration.csv: (32593, 5)
✓ Loaded studentVle.csv: (10655280, 6)
✓ Loaded vle.csv: (6364, 6)

Validating data structure...
✓ assessments has all required columns
✓ student_assessment has all required columns
✓ student_registration has all required columns
✓ student_vle has all required columns
✓ student_info has all required columns

✓ All data validation checks passed

DATA SUMMARY

ASSESSMENTS:
  Shape: (206, 6)
  Columns: ['code_module', 'code_presentation', 'id_assessment', 'assessment_type', 'date', 'weight']
  Missing values: 11

COURSES:
  Shape: (22, 3)
  Columns: ['code_module', 'code_presentation', 'module_presentation_length']
  Missing values: None

STUDENT ASSESSMENT:
  Shape: (173912, 5)
  Columns: ['id_assessment', 'id_student', 'da

In [3]:
# =============================================================================
# CELL 3: Show Processed Data
# =============================================================================

# Display the processed dataset
print(f"📋 PROCESSED DATASET OVERVIEW")
print(f"   Shape: {processed_data.shape}")
print(f"   Memory: {processed_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"\n📝 Columns: {list(processed_data.columns)}")

print("\n📊 FIRST 10 ROWS:")
display(processed_data.head(10))

print("\n📈 KEY STATISTICS:")
key_cols = ['score', 'total_click_vle', 'average_click_vle', 'excellent_Score', 'active_in_VLE']
display(processed_data[key_cols].describe())

print("\n🎯 FINAL RESULTS DISTRIBUTION:")
result_counts = processed_data['final_result'].value_counts().sort_index()
for result, count in result_counts.items():
    pct = count / len(processed_data) * 100
    print(f"   {result:12}: {count:,} ({pct:.1f}%)")

📋 PROCESSED DATASET OVERVIEW
   Shape: (27725, 24)
   Memory: 17.7 MB

📝 Columns: ['code_module', 'code_presentation', 'id_student', 'date_registration', 'date_unregistration', 'id_assessment', 'date_first_assessment', 'date_submitted', 'is_banked', 'score', 'total_click_vle', 'average_click_vle', 'excellent_Score', 'active_in_VLE', 'student_engagementt', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'studied_credits', 'disability', 'final_result', 'final_result_code']

📊 FIRST 10 ROWS:


Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration,id_assessment,date_first_assessment,date_submitted,is_banked,score,...,student_engagementt,gender,region,highest_education,imd_band,age_band,studied_credits,disability,final_result,final_result_code
0,AAA,2013J,11391,-159.0,,1752,19,18.0,0.0,78.0,...,1,M,East Anglian Region,HE Qualification,90-100%,55<=,240,N,Pass,2
1,AAA,2013J,28400,-53.0,,1752,19,22.0,0.0,70.0,...,1,F,Scotland,HE Qualification,20-30%,35-55,60,N,Pass,2
2,AAA,2013J,31604,-52.0,,1752,19,17.0,0.0,72.0,...,1,F,South East Region,A Level or Equivalent,50-60%,35-55,60,N,Pass,2
3,AAA,2013J,32885,-176.0,,1752,19,26.0,0.0,69.0,...,1,F,West Midlands Region,Lower Than A Level,50-60%,0-35,60,N,Pass,2
4,AAA,2013J,38053,-110.0,,1752,19,19.0,0.0,79.0,...,1,M,Wales,A Level or Equivalent,80-90%,35-55,60,N,Pass,2
5,AAA,2013J,45462,-67.0,,1752,19,20.0,0.0,70.0,...,1,M,Scotland,HE Qualification,30-40%,0-35,60,N,Pass,2
6,AAA,2013J,45642,-29.0,,1752,19,18.0,0.0,72.0,...,1,F,North Western Region,A Level or Equivalent,90-100%,0-35,120,N,Pass,2
7,AAA,2013J,52130,-33.0,,1752,19,19.0,0.0,72.0,...,1,F,East Anglian Region,A Level or Equivalent,70-80%,0-35,90,N,Pass,2
8,AAA,2013J,53025,-179.0,,1752,19,9.0,0.0,71.0,...,1,M,North Region,Post Graduate Qualification,,55<=,60,N,Pass,2
9,AAA,2013J,57506,-103.0,,1752,19,18.0,0.0,68.0,...,1,M,South Region,Lower Than A Level,70-80%,35-55,60,N,Pass,2



📈 KEY STATISTICS:


Unnamed: 0,score,total_click_vle,average_click_vle,excellent_Score,active_in_VLE
count,27725.0,27725.0,27725.0,27725.0,27725.0
mean,64.635564,329.356213,329.353133,0.601876,0.355996
std,31.174086,388.209551,152.890566,0.48952,0.478823
min,0.0,0.0,106.3,0.0,0.0
25%,57.0,85.0,162.4,0.0,0.0
50%,75.0,215.0,337.4,1.0,0.0
75%,86.0,442.0,482.1,1.0,1.0
max,100.0,7129.0,559.6,1.0,1.0



🎯 FINAL RESULTS DISTRIBUTION:
   Distinction : 3,024 (10.9%)
   Fail        : 7,044 (25.4%)
   Pass        : 12,361 (44.6%)
   Withdrawn   : 5,296 (19.1%)


In [4]:
# =============================================================================
# CELL 4: Create Cross-Tabulation Tables
# =============================================================================

# Create and display cross tables
print("📊 Creating cross-tabulation tables...")
cross_tables = create_cross_tables(processed_data)

print("\n📋 CROSS-TABULATION TABLES:")
print("   (Proportions by row - shows outcome distribution within each category)")
print("   Columns: 0=Withdrawn, 1=Fail, 2=Pass/Distinction\n")

# Display all cross tables with sample sizes
for var_name, table in cross_tables.items():
    print(f"\n🔍 {var_name.upper().replace('_', ' ')}:")
    print("-" * 60)
    display(table)
    
    # Show sample sizes
    sample_sizes = processed_data[var_name].value_counts().sort_index()
    print("📊 Sample sizes:")
    for category, count in sample_sizes.items():
        print(f"   {category}: {count:,} students")
    print()

📊 Creating cross-tabulation tables...
Creating cross-tabulation tables...
  Creating cross table for: gender
    Categories: 2, Outcomes: 3
  Creating cross table for: region
    Categories: 13, Outcomes: 3
  Creating cross table for: highest_education
    Categories: 5, Outcomes: 3
  Creating cross table for: imd_band
    Categories: 10, Outcomes: 3
  Creating cross table for: age_band
    Categories: 3, Outcomes: 3
  Creating cross table for: disability
    Categories: 2, Outcomes: 3
✓ Created 6 cross-tabulation tables

📋 CROSS-TABULATION TABLES:
   (Proportions by row - shows outcome distribution within each category)
   Columns: 0=Withdrawn, 1=Fail, 2=Pass/Distinction


🔍 GENDER:
------------------------------------------------------------


final_result_code,0,1,2
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,0.18,0.248,0.571
M,0.2,0.259,0.541


📊 Sample sizes:
   F: 12,478 students
   M: 15,247 students


🔍 REGION:
------------------------------------------------------------


final_result_code,0,1,2
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East Anglian Region,0.175,0.247,0.579
East Midlands Region,0.204,0.238,0.558
Ireland,0.185,0.233,0.581
London Region,0.193,0.285,0.523
North Region,0.194,0.211,0.595
North Western Region,0.205,0.299,0.496
Scotland,0.197,0.27,0.534
South East Region,0.177,0.211,0.612
South Region,0.18,0.206,0.614
South West Region,0.189,0.224,0.587


📊 Sample sizes:
   East Anglian Region: 2,828 students
   East Midlands Region: 1,938 students
   Ireland: 1,118 students
   London Region: 2,606 students
   North Region: 1,547 students
   North Western Region: 2,355 students
   Scotland: 3,160 students
   South East Region: 1,776 students
   South Region: 2,639 students
   South West Region: 2,068 students
   Wales: 1,913 students
   West Midlands Region: 2,109 students
   Yorkshire Region: 1,668 students


🔍 HIGHEST EDUCATION:
------------------------------------------------------------


final_result_code,0,1,2
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A Level or Equivalent,0.173,0.223,0.604
HE Qualification,0.173,0.189,0.638
Lower Than A Level,0.217,0.314,0.469
No Formal quals,0.256,0.357,0.387
Post Graduate Qualification,0.173,0.118,0.709


📊 Sample sizes:
   A Level or Equivalent: 12,104 students
   HE Qualification: 4,166 students
   Lower Than A Level: 10,900 students
   No Formal quals: 266 students
   Post Graduate Qualification: 289 students


🔍 IMD BAND:
------------------------------------------------------------


final_result_code,0,1,2
imd_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-10%,0.22,0.343,0.436
10-20,0.205,0.319,0.476
20-30%,0.226,0.279,0.495
30-40%,0.19,0.26,0.55
40-50%,0.192,0.254,0.554
50-60%,0.18,0.258,0.562
60-70%,0.189,0.213,0.597
70-80%,0.18,0.236,0.584
80-90%,0.169,0.207,0.624
90-100%,0.163,0.188,0.65


📊 Sample sizes:
   0-10%: 2,667 students
   10-20: 2,853 students
   20-30%: 3,009 students
   30-40%: 3,018 students
   40-50%: 2,740 students
   50-60%: 2,711 students
   60-70%: 2,524 students
   70-80%: 2,538 students
   80-90%: 2,391 students
   90-100%: 2,245 students


🔍 AGE BAND:
------------------------------------------------------------


final_result_code,0,1,2
age_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-35,0.195,0.27,0.535
35-55,0.182,0.218,0.6
55<=,0.169,0.149,0.682


📊 Sample sizes:
   0-35: 19,327 students
   35-55: 8,203 students
   55<=: 195 students


🔍 DISABILITY:
------------------------------------------------------------


final_result_code,0,1,2
disability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,0.182,0.252,0.565
Y,0.273,0.27,0.458


📊 Sample sizes:
   N: 25,087 students
   Y: 2,638 students



In [5]:
# =============================================================================
# CELL 5: Show Specific Cross Table
# =============================================================================

# Choose which cross table to display
# Options: 'gender', 'age_band', 'disability', 'highest_education', 'imd_band', 'region'
variable_to_show = 'imd_band'  # ⚙️ CHANGE THIS to see different variables

if variable_to_show in cross_tables:
    print(f"📊 Cross table for {variable_to_show.upper().replace('_', ' ')}:")
    print("=" * 50)
    display(cross_tables[variable_to_show])
    
    # Show interpretation
    print("\n💡 INTERPRETATION:")
    table = cross_tables[variable_to_show]
    best_category = table[2].idxmax()  # Category with highest pass rate
    worst_category = table[0].idxmax()  # Category with highest withdrawal rate
    print(f"   • Highest pass rate: {best_category} ({table.loc[best_category, 2]:.1%})")
    print(f"   • Highest withdrawal rate: {worst_category} ({table.loc[worst_category, 0]:.1%})")
else:
    print(f"❌ Variable '{variable_to_show}' not found.")
    print(f"📝 Available variables: {list(cross_tables.keys())}")

📊 Cross table for IMD BAND:


final_result_code,0,1,2
imd_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-10%,0.22,0.343,0.436
10-20,0.205,0.319,0.476
20-30%,0.226,0.279,0.495
30-40%,0.19,0.26,0.55
40-50%,0.192,0.254,0.554
50-60%,0.18,0.258,0.562
60-70%,0.189,0.213,0.597
70-80%,0.18,0.236,0.584
80-90%,0.169,0.207,0.624
90-100%,0.163,0.188,0.65



💡 INTERPRETATION:
   • Highest pass rate: 90-100% (65.0%)
   • Highest withdrawal rate: 20-30% (22.6%)


In [6]:
# =============================================================================
# CELL 6: Create Individual Plot
# =============================================================================

# Create a plot for a specific variable
# Options: 'gender', 'age_band', 'disability', 'highest_education', 'imd_band', 'region'
variable_to_plot = 'imd_band'  # ⚙️ CHANGE THIS to plot different variables

if variable_to_plot in cross_tables and variable_to_plot in VARIABLE_CONFIGS:
    print(f"📈 Creating plot for {variable_to_plot.replace('_', ' ').title()}...")
    
    cross_table = cross_tables[variable_to_plot]
    config = VARIABLE_CONFIGS[variable_to_plot]
    
    # Handle special cases
    if variable_to_plot == 'region':
        sorted_regions = sorted(cross_table.index.tolist())
        cross_table = cross_table.reindex(sorted_regions)
        x_values = list(range(len(sorted_regions)))
        categories = sorted_regions
    elif variable_to_plot == 'highest_education' and 'order' in config:
        cross_table = cross_table.reindex(config['order'])
        x_values = config['x_values']
        categories = config['categories']
    else:
        x_values = config['x_values']
        categories = config['categories']
    
    # Create and show the plot
    fig = create_plotly_regression_plot(
        cross_table,
        variable_to_plot.replace('_', ' ').title(),
        x_values,
        config['x_label'],
        categories
    )
    
    # Show regression stats
    print("\n📊 REGRESSION STATISTICS:")
    from analysis_engine import calculate_linear_regression
    for col in cross_table.columns:
        y_values = cross_table[col].values
        regression = calculate_linear_regression(x_values, y_values)
        result_name = RESULT_CODE_MAPPING.get(col, f'Code {col}')
        trend = "📈 Positive" if regression['slope'] > 0 else "📉 Negative"
        strength = "💪 Strong" if regression['r_squared'] > 0.7 else "🔸 Moderate" if regression['r_squared'] > 0.3 else "🔹 Weak"
        print(f"   {result_name:>10}: R² = {regression['r_squared']:.3f}, Slope = {regression['slope']:+.4f} ({trend}, {strength})")
    
    fig.show()
    
else:
    print(f"❌ Variable '{variable_to_plot}' not available for plotting.")
    print(f"📝 Available variables: {list(cross_tables.keys())}")

📈 Creating plot for Imd Band...

📊 REGRESSION STATISTICS:
    Withdrawn: R² = 0.819, Slope = -0.0006 (📉 Negative, 💪 Strong)
         Fail: R² = 0.909, Slope = -0.0015 (📉 Negative, 💪 Strong)
         Pass: R² = 0.948, Slope = +0.0022 (📈 Positive, 💪 Strong)


In [10]:
## Complete Linear Regression Analysis - All Variables

# Enhanced version with detailed analysis and organized display
print("🔬 COMPREHENSIVE LINEAR REGRESSION ANALYSIS")
print("=" * 70)
print("This analysis examines linear relationships between all demographic")
print("variables and student outcomes (Withdrawn=0, Fail=1, Pass=2).\n")

# Ensure we have the required data
if 'cross_tables' not in locals():
    print("🔄 Creating cross-tabulation tables...")
    cross_tables = create_cross_tables(processed_data)

# Create all plots with enhanced statistics
from analysis_engine import calculate_linear_regression
import pandas as pd

print("📊 CREATING PLOTS WITH STATISTICAL ANALYSIS:")
print("-" * 60)

# Store all results for summary
all_results = []
plot_order = ['gender', 'age_band', 'disability', 'highest_education', 'imd_band', 'region']

for i, var_name in enumerate(plot_order, 1):
    if var_name in cross_tables:
        print(f"\n{i}. 📈 {var_name.replace('_', ' ').title()}:")
        print("   " + "-" * 45)
        
        cross_table = cross_tables[var_name]
        config = VARIABLE_CONFIGS[var_name].copy()
        
        # Handle special cases for x-values
        if var_name == 'region':
            sorted_regions = sorted(cross_table.index.tolist())
            cross_table = cross_table.reindex(sorted_regions)
            x_values = list(range(len(sorted_regions)))
            categories = sorted_regions
        elif var_name == 'highest_education' and 'order' in config:
            cross_table = cross_table.reindex(config['order'])
            x_values = config['x_values']
            categories = config['categories']
        else:
            x_values = config['x_values']
            categories = config['categories']
        
        # Calculate and display regression statistics
        print("   📊 Regression Statistics:")
        var_results = []
        for col in cross_table.columns:
            y_values = cross_table[col].values
            regression = calculate_linear_regression(x_values, y_values)
            result_name = RESULT_CODE_MAPPING.get(col, f'Code {col}')
            
            # Classify relationship strength
            r2 = regression['r_squared']
            if r2 > 0.7:
                strength = "💪 Strong"
            elif r2 > 0.4:
                strength = "🔸 Moderate" 
            elif r2 > 0.2:
                strength = "🔹 Weak"
            else:
                strength = "⚪ Very Weak"
            
            trend = "📈 Positive" if regression['slope'] > 0 else "📉 Negative"
            
            print(f"      {result_name:>11}: R² = {r2:.3f} | Slope = {regression['slope']:+.4f} | {strength} {trend}")
            
            # Store for summary
            var_results.append({
                'Variable': var_name.replace('_', ' ').title(),
                'Outcome': result_name,
                'R_Squared': r2,
                'Slope': regression['slope'],
                'Strength': strength.split()[1],  # Remove emoji
                'Trend': trend.split()[1]  # Remove emoji
            })
        
        all_results.extend(var_results)
        
        # Create and show the plot
        try:
            fig = create_plotly_regression_plot(
                cross_table,
                var_name.replace('_', ' ').title(),
                x_values,
                config['x_label'],
                categories
            )
            
            print("   ✅ Plot created successfully")
            fig.show()
            
        except Exception as e:
            print(f"   ❌ Error creating plot: {e}")

# Create comprehensive summary
print(f"\n📋 COMPREHENSIVE ANALYSIS SUMMARY:")
print("=" * 60)

if all_results:
    results_df = pd.DataFrame(all_results)
    
    # Overall statistics
    avg_r2 = results_df['R_Squared'].mean()
    max_r2 = results_df['R_Squared'].max()
    strong_count = len(results_df[results_df['R_Squared'] > 0.5])
    
    print(f"📊 Overall Statistics:")
    print(f"   • Total relationships analyzed: {len(results_df)}")
    print(f"   • Average R²: {avg_r2:.3f}")
    print(f"   • Strongest relationship: R² = {max_r2:.3f}")
    print(f"   • Strong relationships (R² > 0.5): {strong_count}")
    
    # Best predictors by variable
    print(f"\n🏆 Best Predictors by Variable:")
    best_by_var = results_df.loc[results_df.groupby('Variable')['R_Squared'].idxmax()]
    for _, row in best_by_var.iterrows():
        print(f"   • {row['Variable']:15} → {row['Outcome']:10}: R² = {row['R_Squared']:.3f}")
    
    # Strongest overall relationships
    print(f"\n🎯 Top 5 Strongest Relationships:")
    top_5 = results_df.nlargest(5, 'R_Squared')
    for i, (_, row) in enumerate(top_5.iterrows(), 1):
        trend_emoji = "📈" if row['Trend'] == 'Positive' else "📉"
        print(f"   {i}. {trend_emoji} {row['Variable']} → {row['Outcome']}: R² = {row['R_Squared']:.3f}")
    
    # Trend analysis
    print(f"\n📈 Trend Analysis:")
    positive_trends = len(results_df[results_df['Trend'] == 'Positive'])
    negative_trends = len(results_df[results_df['Trend'] == 'Negative'])
    print(f"   • Positive trends: {positive_trends} ({positive_trends/len(results_df)*100:.1f}%)")
    print(f"   • Negative trends: {negative_trends} ({negative_trends/len(results_df)*100:.1f}%)")
    
    # Display full results table
    print(f"\n📋 Complete Results Table:")
    print("-" * 50)
    display(results_df.sort_values('R_Squared', ascending=False).round(4))

else:
    print("❌ No results to summarize - check if plots were created successfully")

print(f"\n🎉 Complete analysis finished!")
print(f"💡 All plots are interactive - hover, zoom, and explore the data!")
print(f"📊 Use the regression lines to understand trends and relationships.")

🔬 COMPREHENSIVE LINEAR REGRESSION ANALYSIS
This analysis examines linear relationships between all demographic
variables and student outcomes (Withdrawn=0, Fail=1, Pass=2).

📊 CREATING PLOTS WITH STATISTICAL ANALYSIS:
------------------------------------------------------------

1. 📈 Gender:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 1.000 | Slope = +0.0200 | 💪 Strong 📈 Positive
             Fail: R² = 1.000 | Slope = +0.0110 | 💪 Strong 📈 Positive
             Pass: R² = 1.000 | Slope = -0.0300 | 💪 Strong 📉 Negative
   ✅ Plot created successfully



2. 📈 Age Band:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 1.000 | Slope = -0.0130 | 💪 Strong 📉 Negative
             Fail: R² = 0.993 | Slope = -0.0605 | 💪 Strong 📉 Negative
             Pass: R² = 0.996 | Slope = +0.0735 | 💪 Strong 📈 Positive
   ✅ Plot created successfully



3. 📈 Disability:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 1.000 | Slope = +0.0910 | 💪 Strong 📈 Positive
             Fail: R² = 1.000 | Slope = +0.0180 | 💪 Strong 📈 Positive
             Pass: R² = 1.000 | Slope = -0.1070 | 💪 Strong 📉 Negative
   ✅ Plot created successfully



4. 📈 Highest Education:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 0.788 | Slope = -0.0210 | 💪 Strong 📉 Negative
             Fail: R² = 0.984 | Slope = -0.0603 | 💪 Strong 📉 Negative
             Pass: R² = 0.966 | Slope = +0.0813 | 💪 Strong 📈 Positive
   ✅ Plot created successfully



5. 📈 Imd Band:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 0.819 | Slope = -0.0006 | 💪 Strong 📉 Negative
             Fail: R² = 0.909 | Slope = -0.0015 | 💪 Strong 📉 Negative
             Pass: R² = 0.948 | Slope = +0.0022 | 💪 Strong 📈 Positive
   ✅ Plot created successfully



6. 📈 Region:
   ---------------------------------------------
   📊 Regression Statistics:
        Withdrawn: R² = 0.010 | Slope = +0.0003 | ⚪ Very Weak 📈 Positive
             Fail: R² = 0.036 | Slope = +0.0018 | ⚪ Very Weak 📈 Positive
             Pass: R² = 0.040 | Slope = -0.0021 | ⚪ Very Weak 📉 Negative
   ✅ Plot created successfully



📋 COMPREHENSIVE ANALYSIS SUMMARY:
📊 Overall Statistics:
   • Total relationships analyzed: 18
   • Average R²: 0.805
   • Strongest relationship: R² = 1.000
   • Strong relationships (R² > 0.5): 15

🏆 Best Predictors by Variable:
   • Age Band        → Withdrawn : R² = 1.000
   • Disability      → Withdrawn : R² = 1.000
   • Gender          → Withdrawn : R² = 1.000
   • Highest Education → Fail      : R² = 0.984
   • Imd Band        → Pass      : R² = 0.948
   • Region          → Pass      : R² = 0.040

🎯 Top 5 Strongest Relationships:
   1. 📈 Gender → Withdrawn: R² = 1.000
   2. 📈 Gender → Fail: R² = 1.000
   3. 📉 Gender → Pass: R² = 1.000
   4. 📉 Age Band → Withdrawn: R² = 1.000
   5. 📈 Disability → Withdrawn: R² = 1.000

📈 Trend Analysis:
   • Positive trends: 9 (50.0%)
   • Negative trends: 9 (50.0%)

📋 Complete Results Table:
--------------------------------------------------


Unnamed: 0,Variable,Outcome,R_Squared,Slope,Strength,Trend
0,Gender,Withdrawn,1.0,0.02,Strong,Positive
2,Gender,Pass,1.0,-0.03,Strong,Negative
3,Age Band,Withdrawn,1.0,-0.013,Strong,Negative
6,Disability,Withdrawn,1.0,0.091,Strong,Positive
7,Disability,Fail,1.0,0.018,Strong,Positive
8,Disability,Pass,1.0,-0.107,Strong,Negative
1,Gender,Fail,1.0,0.011,Strong,Positive
5,Age Band,Pass,0.9956,0.0735,Strong,Positive
4,Age Band,Fail,0.9935,-0.0605,Strong,Negative
10,Highest Education,Fail,0.9844,-0.0603,Strong,Negative



🎉 Complete analysis finished!
💡 All plots are interactive - hover, zoom, and explore the data!
📊 Use the regression lines to understand trends and relationships.


In [7]:
# =============================================================================
# CELL 7: Full Analysis with All Plots
# =============================================================================

# Run complete analysis and create all visualizations
print("🔄 Running complete analysis...")
cross_tables, summary_stats, insights, segments = run_complete_analysis(processed_data)

print("\n📊 REGRESSION SUMMARY TABLE:")
print("=" * 60)
display(summary_stats.round(4))

print("\n🎯 KEY INSIGHTS:")
print("=" * 40)
for i, insight in enumerate(insights.values(), 1):
    print(f"{i}. {insight}")

print("\n👥 STUDENT SEGMENTS:")
print("=" * 40)
for segment_name, stats in segments.items():
    name = segment_name.replace('_', ' ').title()
    emoji = {'High Performers': '🌟', 'High Engagement': '🎯', 'Overall Engaged': '💪', 
             'No Vle': '😴', 'Low Performers': '📉'}.get(name, '📊')
    print(f"\n{emoji} {name}:")
    print(f"   Count: {stats['count']:,} students ({stats['percentage']:.1f}% of total)")
    if 'completion_rate' in stats:
        print(f"   Success Rate: {stats['completion_rate']:.1f}%")
    if 'withdrawal_rate' in stats:
        print(f"   Withdrawal Rate: {stats['withdrawal_rate']:.1f}%")

🔄 Running complete analysis...

STARTING STATISTICAL ANALYSIS
Creating cross-tabulation tables...
  Creating cross table for: gender
    Categories: 2, Outcomes: 3
  Creating cross table for: region
    Categories: 13, Outcomes: 3
  Creating cross table for: highest_education
    Categories: 5, Outcomes: 3
  Creating cross table for: imd_band
    Categories: 10, Outcomes: 3
  Creating cross table for: age_band
    Categories: 3, Outcomes: 3
  Creating cross table for: disability
    Categories: 2, Outcomes: 3
✓ Created 6 cross-tabulation tables
Analyzing cross-table trends...
✓ Analyzed trends for 18 variable-outcome combinations
Analyzing student segments...
✓ Analyzed 5 student segments

STATISTICAL ANALYSIS COMPLETED

KEY INSIGHTS:
  • Strongest correlation: Gender with Withdrawn (R² = 1.000, Positive trend)
  • Strong relationships (R² > 0.5): 15 out of 18 combinations
  • Best predictor variable: Disability (avg R² = 1.000)

STUDENT SEGMENTS:
  • High Performers: 16,687 students (

Unnamed: 0,Variable,Outcome,R_Squared,Slope,Intercept,Trend,Strength
0,Gender,Withdrawn,1.0,0.02,0.18,Positive,Strong
1,Gender,Fail,1.0,0.011,0.248,Positive,Strong
2,Gender,Pass,1.0,-0.03,0.571,Negative,Strong
3,Region,Withdrawn,0.0101,0.0003,0.1893,Positive,Weak
4,Region,Fail,0.0361,0.0018,0.2414,Positive,Weak
5,Region,Pass,0.0399,-0.0021,0.5697,Negative,Weak
6,Highest Education,Withdrawn,0.7876,-0.021,0.2404,Negative,Strong
7,Highest Education,Fail,0.9844,-0.0603,0.3608,Negative,Strong
8,Highest Education,Pass,0.966,0.0813,0.3988,Positive,Strong
9,Imd Band,Withdrawn,0.8186,-0.0006,0.222,Negative,Strong



🎯 KEY INSIGHTS:
1. Strongest correlation: Gender with Withdrawn (R² = 1.000, Positive trend)
2. Strong relationships (R² > 0.5): 15 out of 18 combinations
3. Best predictor variable: Disability (avg R² = 1.000)

👥 STUDENT SEGMENTS:

🌟 High Performers:
   Count: 16,687 students (60.2% of total)
   Success Rate: 70.0%

🎯 High Engagement:
   Count: 9,870 students (35.6% of total)
   Success Rate: 72.5%

💪 Overall Engaged:
   Count: 19,176 students (69.2% of total)
   Success Rate: 67.9%

😴 No Vle:
   Count: 981 students (3.5% of total)
   Withdrawal Rate: 30.0%

📉 Low Performers:
   Count: 7,013 students (25.3% of total)
   Withdrawal Rate: 34.5%


In [8]:
# =============================================================================
# CELL 8: Student Segments Visualization
# =============================================================================

# Create and display student segments chart
print("📊 Creating student segments visualization...")
segments_fig = create_student_segments_plot(segments)
segments_fig.show()

# Show detailed segment comparison
print("\n📋 SEGMENT COMPARISON:")
segments_df = pd.DataFrame(segments).T
segments_df = segments_df.round(1)
display(segments_df)

📊 Creating student segments visualization...



📋 SEGMENT COMPARISON:


Unnamed: 0,count,percentage,avg_vle_clicks,completion_rate,avg_score,withdrawal_rate
high_performers,16687.0,60.2,413.4,70.0,,
high_engagement,9870.0,35.6,,72.5,74.8,
overall_engaged,19176.0,69.2,,67.9,79.1,
no_vle,981.0,3.5,,,19.8,30.0
low_performers,7013.0,25.3,170.9,,,34.5


In [9]:
# =============================================================================
# CELL 9: Custom Data Exploration
# =============================================================================

# Custom exploration of specific aspects
print("🔍 CUSTOM DATA EXPLORATION:")
print("=" * 50)

# 1. Score distribution by gender
print("\n1️⃣  Average score by gender:")
gender_scores = processed_data.groupby('gender')['score'].agg(['mean', 'std', 'count'])
gender_scores.columns = ['Average Score', 'Std Dev', 'Count']
display(gender_scores.round(2))

# 2. VLE engagement by final result
print("\n2️⃣  VLE engagement by final result:")
vle_by_result = processed_data.groupby('final_result')['total_click_vle'].agg(['mean', 'median', 'std'])
vle_by_result.columns = ['Mean Clicks', 'Median Clicks', 'Std Dev']
display(vle_by_result.round(1))

# 3. Pass rates by demographic factors
print("\n3️⃣  Pass rates by key demographics:")
pass_rates = {}
for var in ['gender', 'age_band', 'disability']:
    if var in processed_data.columns:
        rates = processed_data.groupby(var).agg({
            'final_result_code': ['count', lambda x: (x >= 2).sum()]
        })
        rates.columns = ['Total', 'Passed']
        rates['Pass_Rate_%'] = (rates['Passed'] / rates['Total'] * 100).round(1)
        pass_rates[var] = rates

for var, rates in pass_rates.items():
    print(f"\n📊 {var.replace('_', ' ').title()}:")
    display(rates)

# 4. Correlation matrix for key numeric variables
print("\n4️⃣  Correlations between key variables:")
numeric_vars = ['score', 'total_click_vle', 'excellent_Score', 'active_in_VLE', 'final_result_code']
corr_matrix = processed_data[numeric_vars].corr().round(3)
display(corr_matrix)

print("\n✅ Custom exploration complete!")

🔍 CUSTOM DATA EXPLORATION:

1️⃣  Average score by gender:


Unnamed: 0_level_0,Average Score,Std Dev,Count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,62.28,31.43,12478
M,66.56,30.83,15247



2️⃣  VLE engagement by final result:


Unnamed: 0_level_0,Mean Clicks,Median Clicks,Std Dev
final_result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Distinction,533.5,381.5,549.7
Fail,211.3,125.0,273.0
Pass,376.8,268.0,395.1
Withdrawn,259.2,169.5,317.2



3️⃣  Pass rates by key demographics:

📊 Gender:


Unnamed: 0_level_0,Total,Passed,Pass_Rate_%
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,12478,7129,57.1
M,15247,8256,54.1



📊 Age Band:


Unnamed: 0_level_0,Total,Passed,Pass_Rate_%
age_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-35,19327,10332,53.5
35-55,8203,4920,60.0
55<=,195,133,68.2



📊 Disability:


Unnamed: 0_level_0,Total,Passed,Pass_Rate_%
disability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N,25087,14178,56.5
Y,2638,1207,45.8



4️⃣  Correlations between key variables:


Unnamed: 0,score,total_click_vle,excellent_Score,active_in_VLE,final_result_code
score,1.0,0.279,0.779,0.243,0.374
total_click_vle,0.279,1.0,0.266,0.605,0.187
excellent_Score,0.779,0.266,1.0,0.222,0.33
active_in_VLE,0.243,0.605,0.222,1.0,0.22
final_result_code,0.374,0.187,0.33,0.22,1.0



✅ Custom exploration complete!
