In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def _map_categories(series, mapping):
    """Map category labels to integer codes, failing loudly on unknown labels.

    ``Series.map`` converts every value that is absent from ``mapping`` into
    NaN without any warning, so a typo or a new category (e.g. 'male' vs
    'Male') would silently corrupt the encoded column. Values that are
    already missing (NaN) in the input are passed through as NaN.

    Args:
        series: Column holding the raw category labels.
        mapping: Dict from raw label to integer code.

    Returns:
        A Series of codes aligned with ``series``.

    Raises:
        ValueError: If a non-null value in ``series`` has no entry in ``mapping``.
    """
    encoded = series.map(mapping)
    unmapped = series[encoded.isna() & series.notna()]
    if not unmapped.empty:
        examples = unmapped.unique()[:10].tolist()
        raise ValueError(
            f"Column {series.name!r} contains values with no encoding: {examples}"
        )
    return encoded


# Load data
df = pd.read_csv("data/studentdata_featured.csv")

# Binary Encoding
binary_columns = ['Gender', 'SchoolType']
binary_mappings = {
    'Gender': {'Male': 0, 'Female': 1},
    'SchoolType': {'Public': 0, 'Private': 1},
}
# Binary columns are optional: encode only those that are present, and remember
# which ones they were so the drop below does not fail on an absent column.
present_binary_columns = [col for col in binary_columns if col in df.columns]
for col in present_binary_columns:
    df[f'{col}_Encoded'] = _map_categories(df[col], binary_mappings[col])

# Ordinal Encoding: codes follow increasing education level.
parental_education_order = {'<HS': 1, 'HS': 2, 'SomeCollege': 3, 'Bachelors+': 4}
df['ParentalEducation_Encoded'] = _map_categories(
    df['ParentalEducation'], parental_education_order
)

# Label Encoding for targets
# LabelEncoder numbers classes in sorted order of their labels, so the codes
# are not necessarily ordered by performance level. Keep `le1` around to decode
# predictions via `le1.classes_` / `le1.inverse_transform`.
le1 = LabelEncoder()
df['PerformanceCategory_Encoded'] = le1.fit_transform(df['PerformanceCategory'])

# SES_Quartile: identity mapping, spelled out to make each code's meaning explicit
# (and to reject any value outside 1-4).
ses_quartile_codes = {
    1: 1,  # Lowest
    2: 2,  # Low-medium
    3: 3,  # Medium-high
    4: 4,  # Highest
}
df['SESQuartile_Encoded'] = _map_categories(df['SES_Quartile'], ses_quartile_codes)

# Drop original columns now that their encoded counterparts exist.
# NOTE(review): GPA has no encoded counterpart; presumably it is dropped to avoid
# leaking the target into the features -- confirm.
df = df.drop(
    columns=['ParentalEducation', 'PerformanceCategory', 'SES_Quartile', 'GPA']
    + present_binary_columns
)

# Summary
print("DATA ENCODING COMPLETE")
df.to_csv("data/studentdata_encoded.csv", index=False)
print("Saved to: data/studentdata_encoded.csv")

DATA ENCODING COMPLETE
Saved to: data/studentdata_encoded.csv


In [2]:
# Preview the first rows of the encoded frame to eyeball the new *_Encoded columns.
df.head()

Unnamed: 0,Age,TestScore_Math,TestScore_Reading,TestScore_Science,AttendanceRate,StudyHours,Extracurricular,PartTimeJob,ParentSupport,Gender_Encoded,SchoolType_Encoded,ParentalEducation_Encoded,PerformanceCategory_Encoded,SESQuartile_Encoded
0,15,72.346053,62.217134,73.008079,0.868836,0.310172,1,1,1,1,0,2,0,1
1,16,77.889157,72.74803,76.303717,0.909595,1.175586,1,0,0,1,1,1,0,1
2,17,72.966587,65.585472,68.099411,0.870952,1.112556,1,0,0,1,0,2,0,2
3,16,96.674049,88.035852,100.0,1.0,1.067679,0,0,0,1,0,2,1,2
4,16,81.98927,77.485372,72.715066,0.897957,0.841936,1,0,1,0,0,4,2,3


In [3]:
# (rows, columns) of the encoded frame after the original columns were dropped.
df.shape

(999997, 14)

In [4]:
# Dtypes and non-null counts per column; a quick check that encoding left no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999997 entries, 0 to 999996
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Age                          999997 non-null  int64  
 1   TestScore_Math               999997 non-null  float64
 2   TestScore_Reading            999997 non-null  float64
 3   TestScore_Science            999997 non-null  float64
 4   AttendanceRate               999997 non-null  float64
 5   StudyHours                   999997 non-null  float64
 6   Extracurricular              999997 non-null  int64  
 7   PartTimeJob                  999997 non-null  int64  
 8   ParentSupport                999997 non-null  int64  
 9   Gender_Encoded               999997 non-null  int64  
 10  SchoolType_Encoded           999997 non-null  int64  
 11  ParentalEducation_Encoded    999997 non-null  int64  
 12  PerformanceCategory_Encoded  999997 non-null  int64  
 13 