# Feature Engineering for Student Success Prediction

This notebook creates features from all data sources for predicting student outcomes.

## Feature Categories
1. **Admissions Features** - Demographics, financial aid, exam type
2. **High School Exam Features** - Normalized scores from WASSCE, IB, A-Level, etc.
3. **Year 1 Academic Features** - GPA, courses, failures in first year
4. **Year 1-2 Academic Features** - Extended academic history
5. **Target Variables** - For each research question

In [10]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()

# Check if we are in the 'notebooks' directory and move up if so
if current_dir.endswith("notebooks"):
    os.chdir("..") # Move up one level to project root
    sys.path.append(os.getcwd()) # Add project root to python path

print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /Users/user/coding/School/Ashesi/Semester-1/Machine-learning-&-data-science/final-project/project-claude


In [13]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Project paths
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
PROCESSED_DIR.mkdir(exist_ok=True)

# Import custom modules
# import sys
# sys.path.insert(0, str(PROJECT_ROOT))
from src.data.loader import load_all_datasets
from src.data.exam_normalizer import (
    normalize_wassce_grade, normalize_ib_score, normalize_a_level_grade,
    normalize_french_bac_score, normalize_hsdiploma_grade, normalize_exam_score
)

# Load all datasets
print("Loading datasets...")
datasets = load_all_datasets(verbose=True)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/user/coding/School/Ashesi/Semester-1/Machine-learning-&-data-science/final-project/data/processed'

## 1. Admissions Features

Extract demographics, financial aid, and exam type information from application data.

In [2]:
def create_admissions_features(application_df):
    """Create admissions-based features from application data."""
    df = application_df.copy()
    features = pd.DataFrame()
    
    # Student identifier
    features['StudentRef'] = df['StudentRef']
    
    # Gender encoding
    features['gender_male'] = (df['Gender'] == 'M').astype(int)
    
    # International status (assume Country0 is Ghana/local)
    features['is_international'] = (df['Nationality'] != 'Country0').astype(int)
    
    # Offer type (for enrolled students)
    features['offer_enrolled'] = (df['Offer type'] == 'Enrolled').astype(int)
    
    # Intended major categories
    intended_major = df['Offer course name'].fillna('')
    features['intended_cs'] = intended_major.str.contains('Computer Science', case=False).astype(int)
    features['intended_engineering'] = intended_major.str.contains('Engineering', case=False).astype(int)
    features['intended_business'] = intended_major.str.contains('Business', case=False).astype(int)
    features['intended_mis'] = intended_major.str.contains('MIS|Information Systems', case=False).astype(int)
    
    # Financial aid need
    financial_aid_col = 'Extra question: Do you Need Financial Aid?'
    if financial_aid_col in df.columns:
        features['needs_financial_aid'] = (df[financial_aid_col] == 'Yes').astype(int)
    
    # Exam type category
    exam_type_col = 'Extra question: Type of Exam'
    if exam_type_col in df.columns:
        exam_type = df[exam_type_col].fillna('Unknown')
        features['exam_wassce'] = exam_type.str.contains('WASSCE', case=False, na=False).astype(int)
        features['exam_ib'] = exam_type.str.contains('IB|International Baccalaureate', case=False, na=False).astype(int)
        features['exam_alevel'] = exam_type.str.contains('A Level|IGCSE|KCSE', case=False, na=False).astype(int)
    
    # Previous application flag
    prev_app_col = 'Extra question: Have you applied to Ashesi before? If "yes" indicate the year.'
    if prev_app_col in df.columns:
        features['has_previous_application'] = (~df[prev_app_col].isin(['No', 'NaN', np.nan])).astype(int)
    
    # Disadvantaged background
    if 'Disadvantaged background' in df.columns:
        features['disadvantaged_background'] = df['Disadvantaged background'].notna().astype(int)
    
    return features.drop_duplicates('StudentRef')

# Create admissions features
admissions_features = create_admissions_features(datasets['application'])
print(f"Admissions features shape: {admissions_features.shape}")
print(f"\nFeature columns:")
for col in admissions_features.columns[1:]:
    print(f"  - {col}")
admissions_features.head()

Admissions features shape: (12207, 14)

Feature columns:
  - gender_male
  - is_international
  - offer_enrolled
  - intended_cs
  - intended_engineering
  - intended_business
  - intended_mis
  - needs_financial_aid
  - exam_wassce
  - exam_ib
  - exam_alevel
  - has_previous_application
  - disadvantaged_background


Unnamed: 0,StudentRef,gender_male,is_international,offer_enrolled,intended_cs,intended_engineering,intended_business,intended_mis,needs_financial_aid,exam_wassce,exam_ib,exam_alevel,has_previous_application,disadvantaged_background
0,Sd25fcbb18e84f890,1,0,0,1,0,0,0,1,1,0,0,1,0
1,Sfd5f545f824e3b45,1,0,0,0,0,0,1,0,0,0,1,0,0
2,Se4a2f9bcf28873f3,1,1,0,0,1,0,0,1,0,0,1,1,0
3,S2c5748435b37f518,1,0,0,0,1,0,0,1,0,0,1,0,0
4,Sa3d7b10c3d22ffe0,1,1,0,0,1,0,0,1,0,0,0,0,0


## 2. High School Exam Features

Normalize and extract features from all 6 exam types.

In [3]:
def create_wassce_features(wassce_df):
    """Create normalized features from WASSCE exam data."""
    df = wassce_df.copy()
    features = pd.DataFrame()
    features['StudentRef'] = df['StudentRef']
    features['exam_source'] = 'wassce'
    
    # Core subjects
    core_subjects = ['Mathematics', 'English Language', 'Integrated Science', 'Social Studies']
    
    for subj in core_subjects:
        if subj in df.columns:
            col_name = subj.lower().replace(' ', '_')
            features[f'hs_{col_name}'] = df[subj].apply(normalize_wassce_grade)
    
    # Elective math
    if 'Elective Math' in df.columns:
        features['hs_elective_math'] = df['Elective Math'].apply(normalize_wassce_grade)
        features['has_elective_math'] = df['Elective Math'].notna().astype(int)
    
    # Science subjects
    science_cols = ['Physics', 'Chemistry', 'Biology']
    science_scores = []
    for col in science_cols:
        if col in df.columns:
            features[f'hs_{col.lower()}'] = df[col].apply(normalize_wassce_grade)
            science_scores.append(f'hs_{col.lower()}')
    
    # Best science score
    if science_scores:
        features['hs_best_science'] = features[science_scores].max(axis=1)
    
    # Total aggregate (lower is better for WASSCE)
    if 'Total Aggregate' in df.columns:
        features['hs_total_aggregate'] = pd.to_numeric(df['Total Aggregate'], errors='coerce')
        # Invert aggregate for consistency (higher = better)
        features['hs_aggregate_score'] = 100 - features['hs_total_aggregate'] * 2  # Scale to ~0-100
        features['hs_aggregate_score'] = features['hs_aggregate_score'].clip(0, 100)
    
    return features

# Process WASSCE
if datasets['wassce'] is not None:
    wassce_features = create_wassce_features(datasets['wassce'])
    print(f"WASSCE features: {wassce_features.shape}")
    wassce_features.head()

WASSCE features: (1274, 14)


In [4]:
def create_ib_features(ib_df):
    """Create normalized features from IB exam data."""
    df = ib_df.copy()
    features = pd.DataFrame()
    features['StudentRef'] = df['StudentRef']
    features['exam_source'] = 'ib'
    
    # IB total points
    if 'Points' in df.columns:
        features['hs_total_points'] = pd.to_numeric(df['Points'], errors='coerce')
        # Normalize to 0-100 (IB max is typically 45)
        features['hs_aggregate_score'] = (features['hs_total_points'] / 45) * 100
    
    # Look for Math subjects
    math_cols = [col for col in df.columns if 'math' in col.lower()]
    for col in math_cols[:1]:  # Take first match
        features['hs_mathematics'] = df[col].apply(normalize_ib_score)
    
    # Look for English subjects
    eng_cols = [col for col in df.columns if 'english' in col.lower()]
    for col in eng_cols[:1]:
        features['hs_english_language'] = df[col].apply(normalize_ib_score)
    
    # Look for Science subjects
    science_cols = [col for col in df.columns if any(s in col.lower() for s in ['physics', 'chemistry', 'biology'])]
    if science_cols:
        science_scores = df[science_cols].applymap(normalize_ib_score)
        features['hs_best_science'] = science_scores.max(axis=1)
    
    return features

# Process IB
if datasets['ib'] is not None:
    ib_features = create_ib_features(datasets['ib'])
    print(f"IB features: {ib_features.shape}")

IB features: (131, 7)


In [5]:
def create_alevel_features(alevel_df):
    """Create normalized features from O/A Level exam data."""
    df = alevel_df.copy()
    features = pd.DataFrame()
    features['StudentRef'] = df['StudentRef']
    features['exam_source'] = 'a_level'
    
    # Points/aggregate
    if 'Points' in df.columns:
        features['hs_total_points'] = pd.to_numeric(df['Points'], errors='coerce')
        # Normalize (A-level points vary, rough scale)
        features['hs_aggregate_score'] = features['hs_total_points'].clip(0, 100)
    
    # Math
    math_cols = [col for col in df.columns if 'math' in col.lower()]
    for col in math_cols[:1]:
        features['hs_mathematics'] = df[col].apply(normalize_a_level_grade)
    
    # English
    eng_cols = [col for col in df.columns if 'english' in col.lower()]
    for col in eng_cols[:1]:
        features['hs_english_language'] = df[col].apply(normalize_a_level_grade)
    
    # Science
    science_cols = [col for col in df.columns if any(s in col.lower() for s in ['physics', 'chemistry', 'biology'])]
    if science_cols:
        for col in science_cols:
            features[f'hs_{col.lower().split()[0]}'] = df[col].apply(normalize_a_level_grade)
        features['hs_best_science'] = features[[f'hs_{col.lower().split()[0]}' for col in science_cols]].max(axis=1)
    
    return features

# Process O/A Level
if datasets['o_a_level'] is not None:
    alevel_features = create_alevel_features(datasets['o_a_level'])
    print(f"O/A Level features: {alevel_features.shape}")

O/A Level features: (343, 11)


In [6]:
def combine_exam_features(exam_feature_list):
    """Combine all exam features into a single DataFrame."""
    # Common columns to keep
    common_cols = ['StudentRef', 'exam_source', 'hs_mathematics', 'hs_english_language', 
                   'hs_best_science', 'hs_aggregate_score', 'has_elective_math']
    
    combined_list = []
    for df in exam_feature_list:
        if df is not None and len(df) > 0:
            # Add missing columns with NaN
            for col in common_cols:
                if col not in df.columns:
                    df[col] = np.nan
            combined_list.append(df[common_cols])
    
    if combined_list:
        combined = pd.concat(combined_list, ignore_index=True)
        return combined.drop_duplicates('StudentRef')
    return pd.DataFrame()

# Combine all exam features
exam_feature_list = []
if datasets['wassce'] is not None:
    exam_feature_list.append(create_wassce_features(datasets['wassce']))
if datasets['ib'] is not None:
    exam_feature_list.append(create_ib_features(datasets['ib']))
if datasets['o_a_level'] is not None:
    exam_feature_list.append(create_alevel_features(datasets['o_a_level']))

hs_exam_features = combine_exam_features(exam_feature_list)
print(f"\nCombined HS exam features: {hs_exam_features.shape}")
print(f"\nExam source distribution:")
print(hs_exam_features['exam_source'].value_counts())
hs_exam_features.head()


Combined HS exam features: (1717, 7)

Exam source distribution:
exam_source
wassce     1264
a_level     322
ib          131
Name: count, dtype: int64


Unnamed: 0,StudentRef,exam_source,hs_mathematics,hs_english_language,hs_best_science,hs_aggregate_score,has_elective_math
0,S7047a4e6df5e8cf5,wassce,70.0,80.0,,78.0,0.0
1,S5e71a2543b6dae93,wassce,90.0,60.0,,82.0,1.0
2,S1b0f4121c3b0de8a,wassce,90.0,90.0,90.0,86.0,1.0
3,Sd8609988972b7669,wassce,90.0,90.0,,86.0,0.0
4,S81023cf42ee1bcb8,wassce,90.0,80.0,80.0,80.0,1.0


## 3. Year 1 Academic Features

Extract features from first-year academic performance (semesters 1-2).

In [9]:
def create_year1_features(cgpa_df, transcript_df):
    """Create features from Year 1 academic data (semesters 1-2)."""
    
    # Filter to Year 1 data (Semester 1 and Semester 2)
    cgpa = cgpa_df.copy()
    transcript = transcript_df.copy()
    
    # Extract semester number
    cgpa['semester_num'] = cgpa['Semester/Year'].str.extract(r'Semester\s*(\d+)').astype(float)
    transcript['semester_num'] = transcript['Semester/Year'].str.extract(r'Semester\s*(\d+)').astype(float)
    
    # Year 1 data
    cgpa_y1 = cgpa[cgpa['semester_num'].isin([1, 2])].copy()
    transcript_y1 = transcript[transcript['semester_num'].isin([1, 2])].copy()
    
    features = pd.DataFrame()
    
    # GPA features from CGPA data
    gpa_agg = cgpa_y1.groupby('StudentRef').agg({
        'GPA': ['mean', 'min', 'max', 'std'],
        'CGPA': ['last', 'min']
    })
    gpa_agg.columns = ['y1_gpa_mean', 'y1_gpa_min', 'y1_gpa_max', 'y1_gpa_std', 'y1_cgpa_end', 'y1_cgpa_min']
    gpa_agg = gpa_agg.reset_index()
    
    features = gpa_agg.copy()
    
    # GPA trend (semester 2 - semester 1)
    sem1_gpa = cgpa_y1[cgpa_y1['semester_num'] == 1].set_index('StudentRef')['GPA']
    sem2_gpa = cgpa_y1[cgpa_y1['semester_num'] == 2].set_index('StudentRef')['GPA']
    sem3_gpa = cgpa_y1[cgpa_y1['semester_num'] == 3].set_index('StudentRef')['GPA']
    gpa_trend = sem2_gpa - sem1_gpa
    features = features.merge(
        gpa_trend.rename('y1_gpa_trend').reset_index(), 
        on='StudentRef', how='left'
    )
    
    # Course performance from transcript
    # Define failure grades
    fail_grades = ['E', 'F', 'D', 'D-', 'D+']
    transcript_y1['is_fail'] = transcript_y1['Grade'].isin(fail_grades)
    transcript_y1['is_a_grade'] = transcript_y1['Grade'].isin(['A', 'A-', 'A+'])
    transcript_y1['is_b_grade'] = transcript_y1['Grade'].isin(['B', 'B-', 'B+'])
    
    course_agg = transcript_y1.groupby('StudentRef').agg({
        'Course Name': 'count',
        'is_fail': 'sum',
        'is_a_grade': 'sum',
        'is_b_grade': 'sum',
        'Grade point': 'mean'
    })
    course_agg.columns = ['y1_courses_taken', 'y1_fail_count', 'y1_a_count', 'y1_b_count', 'y1_avg_grade_point']
    course_agg = course_agg.reset_index()
    
    # Calculate percentages
    course_agg['y1_fail_rate'] = course_agg['y1_fail_count'] / course_agg['y1_courses_taken']
    course_agg['y1_a_rate'] = course_agg['y1_a_count'] / course_agg['y1_courses_taken']
    
    features = features.merge(course_agg, on='StudentRef', how='left')
    
    # Probation flag
    features['y1_ever_probation'] = (features['y1_cgpa_min'] < 2.0).astype(int)
    
    return features

# Create Year 1 features
year1_features = create_year1_features(datasets['cgpa'], datasets['transcript'])
print(f"Year 1 features shape: {year1_features.shape}")
print(f"\nFeature columns:")
for col in year1_features.columns[1:]:
    print(f"  - {col}")
year1_features.head()

Year 1 features shape: (35115, 16)

Feature columns:
  - y1_gpa_mean
  - y1_gpa_min
  - y1_gpa_max
  - y1_gpa_std
  - y1_cgpa_end
  - y1_cgpa_min
  - y1_gpa_trend
  - y1_courses_taken
  - y1_fail_count
  - y1_a_count
  - y1_b_count
  - y1_avg_grade_point
  - y1_fail_rate
  - y1_a_rate
  - y1_ever_probation


Unnamed: 0,StudentRef,y1_gpa_mean,y1_gpa_min,y1_gpa_max,y1_gpa_std,y1_cgpa_end,y1_cgpa_min,y1_gpa_trend,y1_courses_taken,y1_fail_count,y1_a_count,y1_b_count,y1_avg_grade_point,y1_fail_rate,y1_a_rate,y1_ever_probation
0,S00039f6fd1b74390,3.834,3.78,3.9,0.055498,3.8,3.78,0.1,25.0,0.0,17.0,6.0,3.66,0.0,0.68,0
1,S00039f6fd1b74390,3.834,3.78,3.9,0.055498,3.8,3.78,0.1,25.0,0.0,17.0,6.0,3.66,0.0,0.68,0
2,S00039f6fd1b74390,3.834,3.78,3.9,0.055498,3.8,3.78,0.05,25.0,0.0,17.0,6.0,3.66,0.0,0.68,0
3,S00039f6fd1b74390,3.834,3.78,3.9,0.055498,3.8,3.78,0.05,25.0,0.0,17.0,6.0,3.66,0.0,0.68,0
4,S00039f6fd1b74390,3.834,3.78,3.9,0.055498,3.8,3.78,0.12,25.0,0.0,17.0,6.0,3.66,0.0,0.68,0


In [8]:
datasets['cgpa'].head()

Unnamed: 0,Yeargroup,StudentRef,Admission Year,Program,Semester/Year,Academic Year,Student Status,Gender,Nationality,Application Category,GPA,CGPA
0,2017,Sb01f8b2a9888be6f,2013-2014,B.Sc - Management Information Systems,Semester 1,2015-2016,Active,Male,Country0,Unknown,3.63,3.26
1,2017,Sb01f8b2a9888be6f,2013-2014,B.Sc - Management Information Systems,Semester 2,2015-2016,Active,Male,Country0,Unknown,2.7,3.15
2,2017,S87f7615365ccf796,2013-2014,B.Sc - Computer Science,Semester 1,2015-2016,Active,Male,Country0,Unknown,3.6,3.66
3,2017,S87f7615365ccf796,2013-2014,B.Sc - Computer Science,Semester 2,2015-2016,Active,Male,Country0,Unknown,3.4,3.61
4,2018,Sfea019092a0d1158,2014-2015,B.Sc - Business Administration,Semester 1,2015-2016,Graduated,Female,Country1,Unknown,3.45,3.11


## 4. Math Track Detection

Identify which math track each student started on (Calculus, Pre-Calculus, or College Algebra).

In [11]:
def detect_math_track(transcript_df):
    """Detect math track for each student from transcript data."""
    df = transcript_df.copy()
    
    # Define math track courses
    calculus_patterns = ['Calculus', 'MATH142']
    precalc_patterns = ['Pre-Calculus', 'Pre Calculus', 'PreCalculus', 'MATH141']
    algebra_patterns = ['College Algebra', 'Algebra', 'MATH140']
    
    def identify_track(course_name):
        course = str(course_name)
        for pattern in precalc_patterns:
            if pattern.lower() in course.lower():
                return 'precalculus'
        for pattern in calculus_patterns:
            if pattern.lower() in course.lower():
                return 'calculus'
        for pattern in algebra_patterns:
            if pattern.lower() in course.lower():
                return 'college_algebra'
        return None
    
    # Find math courses
    df['math_track_detected'] = df['Course Name'].apply(identify_track)
    
    # Get first math course per student (earliest semester)
    math_courses = df[df['math_track_detected'].notna()].copy()
    math_courses['semester_num'] = math_courses['Semester/Year'].str.extract(r'(\d+)').astype(float)
    
    # Get the first math track
    first_math = math_courses.sort_values('semester_num').groupby('StudentRef').first()
    first_math = first_math[['math_track_detected', 'Grade', 'Grade point']].reset_index()
    first_math.columns = ['StudentRef', 'math_track', 'first_math_grade', 'first_math_grade_point']
    
    # Encode math track
    track_encoding = {'calculus': 3, 'precalculus': 2, 'college_algebra': 1}
    first_math['math_track_encoded'] = first_math['math_track'].map(track_encoding)
    
    return first_math

# Detect math tracks
math_track_features = detect_math_track(datasets['transcript'])
print(f"Math track features: {math_track_features.shape}")
print(f"\nMath track distribution:")
print(math_track_features['math_track'].value_counts())
math_track_features.head()

Math track features: (3636, 5)

Math track distribution:
math_track
calculus           2035
precalculus        1288
college_algebra     313
Name: count, dtype: int64


Unnamed: 0,StudentRef,math_track,first_math_grade,first_math_grade_point,math_track_encoded
0,S00039f6fd1b74390,calculus,B+,3.5,3
1,S000901505ca1ec7f,calculus,A,4.0,3
2,S0021eb5e8ac9bfec,precalculus,B,3.0,2
3,S0027200343737e85,calculus,A,4.0,3
4,S002e2924edb73507,calculus,D+,1.5,3


## 5. Target Variable Creation

Create target variables for each research question.

In [13]:
def create_target_variables(cgpa_df, transcript_df, ajc_df):
    """Create all target variables for research questions."""
    cgpa = cgpa_df.copy()
    transcript = transcript_df.copy()
    ajc = ajc_df.copy() if ajc_df is not None else pd.DataFrame()
    
    # Get unique students
    all_students = set(cgpa['StudentRef'].unique())
    targets = pd.DataFrame({'StudentRef': list(all_students)})
    
    # Extract semester number
    cgpa['semester_num'] = cgpa['Semester/Year'].str.extract(r'Semester\s*(\d+)').astype(float)
    
    # RQ1: First year struggle (CGPA < 2.0 in Year 1)
    y1_cgpa = cgpa[cgpa['semester_num'].isin([1, 2, 3])].groupby('StudentRef')['CGPA'].min()
    targets = targets.merge(
        (y1_cgpa < 2.0).rename('target_y1_struggle').reset_index(),
        on='StudentRef', how='left'
    )
    targets['target_y1_struggle'] = targets['target_y1_struggle'].fillna(False).astype(int)
    
    # RQ2: AJC case
    if len(ajc) > 0:
        ajc_students = set(ajc['StudentRef'].unique())
        targets['target_ajc_case'] = targets['StudentRef'].isin(ajc_students).astype(int)
        
        # AJC guilty verdict
        guilty_students = set(ajc[ajc['Verdict'] == 'Guilty']['StudentRef'].unique())
        targets['target_ajc_guilty'] = targets['StudentRef'].isin(guilty_students).astype(int)
    else:
        targets['target_ajc_case'] = 0
        targets['target_ajc_guilty'] = 0
    
    # RQ3/5: Major success (Graduation CGPA >= 3.0)
    final_cgpa = cgpa.groupby('StudentRef')['CGPA'].last()
    targets = targets.merge(
        (final_cgpa >= 3.0).rename('target_major_success').reset_index(),
        on='StudentRef', how='left'
    )
    targets['target_major_success'] = targets['target_major_success'].fillna(False).astype(int)
    
    # RQ4/6: Ever on probation (proxy for major struggle)
    ever_probation = cgpa.groupby('StudentRef')['CGPA'].min() < 2.0
    targets = targets.merge(
        ever_probation.rename('target_ever_probation').reset_index(),
        on='StudentRef', how='left'
    )
    targets['target_ever_probation'] = targets['target_ever_probation'].fillna(False).astype(int)
    
    # RQ9: Extended graduation (> 8 semesters)
    semesters_count = cgpa.groupby('StudentRef')['semester_num'].max()
    targets = targets.merge(
        (semesters_count > 8).rename('target_extended_graduation').reset_index(),
        on='StudentRef', how='left'
    )
    targets['target_extended_graduation'] = targets['target_extended_graduation'].fillna(False).astype(int)
    
    # Final CGPA for RQ7 analysis (continuous outcome)
    targets = targets.merge(
        final_cgpa.rename('target_final_cgpa').reset_index(),
        on='StudentRef', how='left'
    )
    
    return targets

# Create targets
target_variables = create_target_variables(datasets['cgpa'], datasets['transcript'], datasets['ajc'])
print(f"Target variables shape: {target_variables.shape}")
print(f"\nTarget variable distributions:")
for col in target_variables.columns[1:]:
    if col.startswith('target_') and col != 'target_final_cgpa':
        rate = target_variables[col].mean() * 100
        print(f"  {col}: {rate:.1f}% positive")
target_variables.head()

Target variables shape: (3718, 8)

Target variable distributions:
  target_y1_struggle: 11.4% positive
  target_ajc_case: 3.6% positive
  target_ajc_guilty: 3.0% positive
  target_major_success: 55.5% positive
  target_ever_probation: 11.4% positive
  target_extended_graduation: 0.0% positive


Unnamed: 0,StudentRef,target_y1_struggle,target_ajc_case,target_ajc_guilty,target_major_success,target_ever_probation,target_extended_graduation,target_final_cgpa
0,S13a5dd89e5905708,0,0,0,0,0,0,2.62
1,Sa91bf01410e46f3f,0,0,0,0,0,0,2.52
2,S404a286fdbf52269,0,0,0,0,0,0,2.48
3,Sfef16c3ae009e655,0,0,0,1,0,0,3.65
4,S5c99ba5f5b27e81d,1,0,0,0,1,0,2.42


## 6. Combine All Features

Merge all feature sets into final datasets for modeling.

In [14]:
def create_modeling_dataset(admissions, hs_exam, year1, math_track, targets):
    """Combine all features into modeling datasets."""
    
    # Start with targets
    full_data = targets.copy()
    
    # Merge admissions features
    full_data = full_data.merge(admissions, on='StudentRef', how='left')
    
    # Merge HS exam features
    full_data = full_data.merge(hs_exam, on='StudentRef', how='left')
    
    # Merge Year 1 features
    full_data = full_data.merge(year1, on='StudentRef', how='left')
    
    # Merge math track features
    full_data = full_data.merge(math_track, on='StudentRef', how='left')
    
    return full_data

# Create full dataset
full_features = create_modeling_dataset(
    admissions_features, 
    hs_exam_features, 
    year1_features, 
    math_track_features, 
    target_variables
)

print(f"Full features dataset: {full_features.shape}")
print(f"\nColumns ({len(full_features.columns)}):")
for i, col in enumerate(full_features.columns):
    dtype = full_features[col].dtype
    missing = full_features[col].isna().sum()
    print(f"  {i+1}. {col} ({dtype}, {missing} missing)")

Full features dataset: (35130, 46)

Columns (46):
  1. StudentRef (object, 0 missing)
  2. target_y1_struggle (int64, 0 missing)
  3. target_ajc_case (int64, 0 missing)
  4. target_ajc_guilty (int64, 0 missing)
  5. target_major_success (int64, 0 missing)
  6. target_ever_probation (int64, 0 missing)
  7. target_extended_graduation (int64, 0 missing)
  8. target_final_cgpa (float64, 0 missing)
  9. gender_male (float64, 15376 missing)
  10. is_international (float64, 15376 missing)
  11. offer_enrolled (float64, 15376 missing)
  12. intended_cs (float64, 15376 missing)
  13. intended_engineering (float64, 15376 missing)
  14. intended_business (float64, 15376 missing)
  15. intended_mis (float64, 15376 missing)
  16. needs_financial_aid (float64, 15376 missing)
  17. exam_wassce (float64, 15376 missing)
  18. exam_ib (float64, 15376 missing)
  19. exam_alevel (float64, 15376 missing)
  20. has_previous_application (float64, 15376 missing)
  21. disadvantaged_background (float64, 15376 

In [15]:
# Summary statistics
print("\nDataset Summary:")
print(f"Total students: {len(full_features)}")
print(f"Students with HS exam data: {full_features['exam_source'].notna().sum()}")
print(f"Students with Year 1 data: {full_features['y1_gpa_mean'].notna().sum()}")
print(f"Students with math track: {full_features['math_track'].notna().sum()}")

# Check class balance
print("\nTarget Variable Balance:")
for col in ['target_y1_struggle', 'target_ajc_case', 'target_major_success', 
            'target_ever_probation', 'target_extended_graduation']:
    positive = full_features[col].sum()
    total = full_features[col].notna().sum()
    print(f"  {col}: {positive}/{total} ({positive/total*100:.1f}%)")


Dataset Summary:
Total students: 35130
Students with HS exam data: 13562
Students with Year 1 data: 35115
Students with math track: 34635

Target Variable Balance:
  target_y1_struggle: 3868/35130 (11.0%)
  target_ajc_case: 1604/35130 (4.6%)
  target_major_success: 19825/35130 (56.4%)
  target_ever_probation: 3868/35130 (11.0%)
  target_extended_graduation: 0/35130 (0.0%)


In [16]:
# Save processed datasets
print("\nSaving processed datasets...")

# Full feature set
full_features.to_csv(PROCESSED_DIR / 'full_features.csv', index=False)
print(f"  Saved: full_features.csv ({len(full_features)} rows)")

# Admissions-only features (for RQ1, RQ2)
admissions_only = full_features[['StudentRef'] + 
    [col for col in full_features.columns if col.startswith('target_')] +
    [col for col in admissions_features.columns if col != 'StudentRef'] +
    [col for col in hs_exam_features.columns if col != 'StudentRef']
].drop_duplicates('StudentRef')
admissions_only.to_csv(PROCESSED_DIR / 'admissions_features.csv', index=False)
print(f"  Saved: admissions_features.csv ({len(admissions_only)} rows)")

# Year 1 features
year1_features.to_csv(PROCESSED_DIR / 'year1_features.csv', index=False)
print(f"  Saved: year1_features.csv ({len(year1_features)} rows)")

# Math track features
math_track_features.to_csv(PROCESSED_DIR / 'math_track_features.csv', index=False)
print(f"  Saved: math_track_features.csv ({len(math_track_features)} rows)")

# Target variables
target_variables.to_csv(PROCESSED_DIR / 'targets.csv', index=False)
print(f"  Saved: targets.csv ({len(target_variables)} rows)")

print(f"\nAll files saved to: {PROCESSED_DIR}")


Saving processed datasets...
  Saved: full_features.csv (35130 rows)
  Saved: admissions_features.csv (3718 rows)
  Saved: year1_features.csv (35115 rows)
  Saved: math_track_features.csv (3636 rows)
  Saved: targets.csv (3718 rows)

All files saved to: /Users/user/coding/School/Ashesi/Semester-1/Machine-learning-&-data-science/final-project/project-claude/data/processed


In [17]:
print("\n" + "="*60)
print(" FEATURE ENGINEERING COMPLETE ")
print("="*60)
print(f"\nFeature sets created:")
print(f"  1. full_features.csv - All features combined")
print(f"  2. admissions_features.csv - Admissions + HS exam only")
print(f"  3. year1_features.csv - Year 1 academic features")
print(f"  4. math_track_features.csv - Math track detection")
print(f"  5. targets.csv - All target variables")
print(f"\nNext notebook: 04_unsupervised_learning.ipynb")


 FEATURE ENGINEERING COMPLETE 

Feature sets created:
  1. full_features.csv - All features combined
  2. admissions_features.csv - Admissions + HS exam only
  3. year1_features.csv - Year 1 academic features
  4. math_track_features.csv - Math track detection
  5. targets.csv - All target variables

Next notebook: 04_unsupervised_learning.ipynb
