# In [2]:
import pandas as pd
import numpy as np  

def clean_data(df, cutoff=None, training_columns=None, is_training=True):
    """Build the model feature matrix and, when training, the binary target.

    The input frame is copied, so the caller's ``df`` is never modified.

    Derived columns:
      * ``title_i_binary``: 1 when ``title_i_status`` contains "eligible"
        (case-insensitive), else 0.
      * ``school_level_clean``: ``school_level`` collapsed to
        Elementary / Middle / High, with anything unmapped set to "Other".
      * ``school_type_clean``: ``school_type`` with special-education and
        vocational schools merged into "Specialized".
      * ``poverty_sq``: square of the numeric ``meps_poverty_pct``.
      * ``enroll_<dummy>``: ``enrollment`` multiplied by each
        ``school_level_clean_*`` dummy column.

    Args:
        df: DataFrame holding the raw columns read below. The column
            ``math_test_pct_prof_midpt`` is only required when
            ``is_training`` is true.
        cutoff: Score threshold; rows with a score ``<= cutoff`` get
            ``y == 1``. When ``None`` in training mode, the 25th percentile
            of the non-missing scores is used. Ignored when not training.
        training_columns: Column list returned by a previous training call.
            When given in prediction mode, the output is aligned to exactly
            these columns (missing ones are filled with 0).
        is_training: Whether to build the target and return the training
            tuple, or to return only the feature matrix.

    Returns:
        ``(X, y, cutoff, columns)`` when ``is_training`` is true, where
        ``columns`` is ``X.columns.tolist()``; otherwise just ``X``.
    """
    df = df.copy()

    # NOTE(review): this is a plain substring test, so a label such as
    # "Not eligible" would also be coded 1 -- confirm against the actual
    # values of title_i_status.
    df["title_i_binary"] = df["title_i_status"].apply(
        lambda x: 1 if "eligible" in str(x).lower() else 0
    )

    school_level_map = {
        "Prekindergarten": "Elementary",
        "Primary": "Elementary",
        "Middle": "Middle",
        "Secondary": "High",
    }
    df["school_level_clean"] = (
        df["school_level"].map(school_level_map).fillna("Other")
    )

    df["school_type_clean"] = df["school_type"].replace({
        "Special education school": "Specialized",
        "Vocational school": "Specialized",
    })

    # Non-numeric poverty values become NaN instead of raising.
    df["meps_poverty_pct"] = pd.to_numeric(
        df["meps_poverty_pct"], errors="coerce"
    )
    df["poverty_sq"] = df["meps_poverty_pct"] ** 2

    if is_training:
        target_col = "math_test_pct_prof_midpt"
        df[target_col] = pd.to_numeric(df[target_col], errors="coerce")

        if cutoff is None:
            cutoff = df[target_col].dropna().quantile(0.25)

        df["y"] = (df[target_col] <= cutoff).astype(int)
        # A NaN score compares False and would be labelled 0 above, so rows
        # with an unknown score must be dropped on the score column itself;
        # dropping on "y" can never remove anything.
        df = df.dropna(subset=[target_col])

    predictors = [
        "enrollment",
        "direct_certification",
        "meps_poverty_pct",
        "meps_mod_poverty_pct",
        "poverty_sq",
        "school_level_clean",
        "school_type_clean",
        "charter",
        "magnet",
        "title_i_binary",
    ]

    align_to_training = not is_training and training_columns is not None

    # When aligning to training columns, encode every category and let the
    # reindex below pick the training columns. With drop_first=True the
    # dropped baseline would be whichever category sorts first in *this*
    # frame, which can differ from the training baseline and silently
    # recode rows as the training baseline.
    X = pd.get_dummies(df[predictors], drop_first=not align_to_training)

    if align_to_training:
        X = X.reindex(columns=training_columns, fill_value=0)

    # Interaction features: enrollment x each school-level dummy. In aligned
    # prediction mode these columns already exist (zero-filled by reindex)
    # and are overwritten here with the real products.
    level_dummies = [
        col for col in X.columns if col.startswith("school_level_clean_")
    ]
    for col in level_dummies:
        X[f"enroll_{col}"] = X["enrollment"] * X[col]

    if is_training:
        return X, df["y"], cutoff, X.columns.tolist()
    return X