In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("data/studentdata_clean.csv")
print(f"Original shape: {df.shape}\n")

# 1. Average Test Score
df['AvgTestScore'] = df[['TestScore_Math', 'TestScore_Reading', 'TestScore_Science']].mean(axis=1)

# 2. Performance Category
def categorize_performance(gpa):
    if gpa >= 3.5:
        return 'High_Achiever'
    elif gpa >= 2.5:
        return 'Average_Performer'
    else:
        return 'Struggling_Learner'

df['PerformanceCategory'] = df['GPA'].apply(categorize_performance)

# 3. Study Efficiency (GPA per study hour)
df['StudyEfficiency'] = np.where(df['StudyHours'] > 0, 
                                  df['GPA'] / df['StudyHours'], 
                                  df['GPA'])

# 4. Support Index
education_map = {'<HS': 1, 'HS': 2, 'SomeCollege': 3, 'Bachelors+': 4}
df['ParentalEducation_Numeric'] = df['ParentalEducation'].map(education_map)
df['SupportIndex'] = ((df['ParentalEducation_Numeric'] / 4 * 0.5) + 
                       (df['ParentSupport'] * 0.5)) * 100

# 5. Engagement Score
df['EngagementScore'] = ((df['AttendanceRate'] * 0.6) + 
                          (df['StudyHours'] / 4 * 0.4)) * 100

# 6. At-Risk Indicators
df['AtRisk_LowGPA'] = (df['GPA'] < 2.5).astype(int)
df['AtRisk_LowAttendance'] = (df['AttendanceRate'] < 0.85).astype(int)
df['AtRisk_LowEngagement'] = (df['EngagementScore'] < 70).astype(int)
df['AtRisk_Count'] = (df['AtRisk_LowGPA'] + 
                       df['AtRisk_LowAttendance'] + 
                       df['AtRisk_LowEngagement'])
df['AtRisk'] = (df['AtRisk_Count'] >= 2).astype(int)

# 7. Risk Level
def categorize_risk(count):
    if count == 0:
        return 'Low_Risk'
    elif count == 1:
        return 'Moderate_Risk'
    else:
        return 'High_Risk'

df['RiskLevel'] = df['AtRisk_Count'].apply(categorize_risk)

# 8. Test Score Consistency
df['TestScore_StdDev'] = df[['TestScore_Math', 'TestScore_Reading', 'TestScore_Science']].std(axis=1)

# 9. Subject Strengths (compared to personal average)
df['MathStrength'] = (df['TestScore_Math'] > df['AvgTestScore']).astype(int)
df['ReadingStrength'] = (df['TestScore_Reading'] > df['AvgTestScore']).astype(int)
df['ScienceStrength'] = (df['TestScore_Science'] > df['AvgTestScore']).astype(int)

# SUMMARY
print("=" * 60)
print("FEATURE ENGINEERING COMPLETE")
print("=" * 60)
print(f"New features created: {df.shape[1] - 11}")
print(f"Final shape: {df.shape}\n")

print("Performance Category Distribution:")
print(df['PerformanceCategory'].value_counts())
print(f"\nRisk Level Distribution:")
print(df['RiskLevel'].value_counts())
print(f"\nAt-Risk Students: {df['AtRisk'].sum()} ({df['AtRisk'].mean()*100:.1f}%)")

# Save featured dataset (NO ENCODING)
output_path = "data/studentdata_featured.csv"
df.to_csv(output_path, index=False)
print(f"\n✓ Saved: {output_path}")

Original shape: (999997, 14)

FEATURE ENGINEERING COMPLETE
New features created: 19
Final shape: (999997, 30)

Performance Category Distribution:
PerformanceCategory
Average_Performer     715844
Struggling_Learner    142244
High_Achiever         141909
Name: count, dtype: int64

Risk Level Distribution:
RiskLevel
Moderate_Risk    644856
High_Risk        218600
Low_Risk         136541
Name: count, dtype: int64

At-Risk Students: 218600 (21.9%)

✓ Saved: data/studentdata_featured.csv


In [2]:
df.head()

Unnamed: 0,Age,Gender,SES_Quartile,ParentalEducation,SchoolType,TestScore_Math,TestScore_Reading,TestScore_Science,GPA,AttendanceRate,...,AtRisk_LowGPA,AtRisk_LowAttendance,AtRisk_LowEngagement,AtRisk_Count,AtRisk,RiskLevel,TestScore_StdDev,MathStrength,ReadingStrength,ScienceStrength
0,15,Female,1,HS,Public,72.346053,62.217134,73.008079,2.521745,0.868836,...,0,0,1,1,0,Moderate_Risk,6.04811,1,0,1
1,16,Female,1,<HS,Private,77.889157,72.74803,76.303717,3.275626,0.909595,...,0,0,1,1,0,Moderate_Risk,2.632734,1,0,1
2,17,Female,2,HS,Public,72.966587,65.585472,68.099411,2.974137,0.870952,...,0,0,1,1,0,Moderate_Risk,3.752558,1,0,0
3,16,Female,2,HS,Public,96.674049,88.035852,100.0,3.67659,1.0,...,0,0,0,0,0,Low_Risk,6.175506,1,0,1
4,16,Male,3,Bachelors+,Public,81.98927,77.485372,72.715066,2.255014,0.897957,...,1,0,1,2,1,High_Risk,4.63774,1,1,0


In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("data/studentdata_clean.csv")
print(f"Original shape: {df.shape}\n")

Original shape: (999997, 14)



In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999997 entries, 0 to 999996
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                999997 non-null  int64  
 1   Gender             999997 non-null  object 
 2   SES_Quartile       999997 non-null  int64  
 3   ParentalEducation  999997 non-null  object 
 4   SchoolType         999997 non-null  object 
 5   TestScore_Math     999997 non-null  float64
 6   TestScore_Reading  999997 non-null  float64
 7   TestScore_Science  999997 non-null  float64
 8   GPA                999997 non-null  float64
 9   AttendanceRate     999997 non-null  float64
 10  StudyHours         999997 non-null  float64
 11  PartTimeJob        999997 non-null  int64  
 12  ParentSupport      999997 non-null  int64  
 13  FreeTime           999997 non-null  int64  
dtypes: float64(6), int64(5), object(3)
memory usage: 106.8+ MB
