In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

def create_student_split_files(data_dir="data", random_state=42):
    """
    Split the encoded student data into three stratified CSV files.

    Reads ``studentdata_encoded.csv`` from ``data_dir`` and writes next to it:
    - studentdata_training.csv (64%)
    - studentdata_validation.csv (16%)
    - studentdata_testing.csv (20%)

    Every split is stratified on ``PerformanceCategory_Encoded`` and the
    output files keep the column order of the input file.

    Args:
        data_dir: Directory holding the input file and receiving the three
            output files. It is created if it does not exist.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(df_train, df_val, df_testing)`` on success. On any failure
        (missing input file, missing target column, or any other exception)
        an error message is printed and ``(None, None, None)`` is returned.
    """
    target_col = 'PerformanceCategory_Encoded'
    source_path = os.path.join(data_dir, "studentdata_encoded.csv")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    try:
        # Load the original encoded data
        df = pd.read_csv(source_path)
        print(f"Loaded {len(df):,} students")

        # Show original column order
        print(f"Original column order: {list(df.columns)}")

        # Fail with a readable message instead of a bare KeyError repr.
        if target_col not in df.columns:
            print(f"Error: required column '{target_col}' not found in {source_path}")
            return None, None, None

        # PerformanceCategory_Encoded is the target
        print(f"Performance distribution: {df[target_col].value_counts().to_dict()}")
        print("0 = High, 1 = Medium, 2 = Low (At Risk)")

        # Every column except the target is a feature.
        print(f"Features used: {len(df.columns) - 1}")

        # FIRST SPLIT: 80% development, 20% testing (holdout).
        # Splitting the whole frame (rather than separate X / y) keeps the
        # target attached and the original column order intact.
        print("\nPerforming first split (80% development, 20% testing)")
        df_dev, df_testing = train_test_split(
            df,
            test_size=0.2,
            random_state=random_state,
            stratify=df[target_col],
        )

        # SECOND SPLIT: 20% of the 80% development set -> 16% validation,
        # leaving 64% of the full data for training.
        print("\nPerforming second split (64% training, 16% validation)")
        df_train, df_val = train_test_split(
            df_dev,
            test_size=0.2,
            random_state=random_state,
            stratify=df_dev[target_col],
        )

        # Hand back independent frames so callers can modify them freely.
        df_train = df_train.copy()
        df_val = df_val.copy()
        df_testing = df_testing.copy()

        print(f"Training set: {len(df_train):,} students (64%)")
        print(f"Validation set: {len(df_val):,} students (16%)")
        print(f"Testing set: {len(df_testing):,} students (20%)")

        # Save to separate files (row index is not written)
        df_train.to_csv(os.path.join(data_dir, "studentdata_training.csv"), index=False)
        df_val.to_csv(os.path.join(data_dir, "studentdata_validation.csv"), index=False)
        df_testing.to_csv(os.path.join(data_dir, "studentdata_testing.csv"), index=False)
        print("Split files created successfully!")

        # Show performance distribution per split
        print("\nPerformance distribution:")
        for label, frame in (
            ("Training", df_train),
            ("Validation", df_val),
            ("Testing", df_testing),
        ):
            print(f"   {label}: {frame[target_col].value_counts().to_dict()}")

        return df_train, df_val, df_testing

    except FileNotFoundError:
        print("Error: studentdata_encoded.csv not found!")
        print(f"Make sure your file is in the '{data_dir}' folder")
        return None, None, None
    except Exception as e:
        # Deliberate catch-all: callers rely on the (None, None, None) result
        # rather than on an exception propagating.
        print(f"Error: {e}")
        return None, None, None

if __name__ == "__main__":
    # Create the split files when run as a script. The three frames stay bound
    # at module level; each is None if the split failed.
    df_train, df_val, df_testing = create_student_split_files()

Loaded 999,997 students
Original column order: ['Age', 'TestScore_Math', 'TestScore_Reading', 'TestScore_Science', 'AttendanceRate', 'StudyHours', 'Extracurricular', 'PartTimeJob', 'ParentSupport', 'Gender_Encoded', 'SchoolType_Encoded', 'ParentalEducation_Encoded', 'PerformanceCategory_Encoded', 'SESQuartile_Encoded']
Performance distribution: {0: 715844, 2: 142244, 1: 141909}
0 = High, 1 = Medium, 2 = Low (At Risk)
Features used: 13

Performing first split (80% development, 20% testing)

Performing second split (64% training, 16% validation)
Training set: 639,997 students (64%)
Validation set: 160,000 students (16%)
Testing set: 200,000 students (20%)
Split files created successfully!

Performance distribution:
   Training: {0: 458140, 2: 91036, 1: 90821}
   Validation: {0: 114535, 2: 22759, 1: 22706}
   Testing: {0: 143169, 2: 28449, 1: 28382}
