In [8]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import sys
import os

from CAD.config import Config

In [4]:
# Project configuration: supplies the year range and the data directories.
cfg = Config()
MIN_YEAR, MAX_YEAR = cfg.getMinMaxYear()

# One raw attendance frame per year, keyed by year (MIN_YEAR..MAX_YEAR inclusive).
# Each workbook is read from RAW_DATA_DIR as "<year>_attendance_data.xlsx".
raw_yearly_df: dict[int, pd.DataFrame] = {
    year: pd.read_excel(
        cfg.RAW_DATA_DIR / f"{year}_attendance_data.xlsx",
        engine="openpyxl",
    )
    for year in range(MIN_YEAR, MAX_YEAR + 1)
}

In [9]:
# The 2024 file spells these three attendance columns in upper case; rename them
# to the Title_Case spelling so they line up with the other yearly frames when
# everything is concatenated later.
# NOTE(review): assumes the other years already use the Title_Case names — confirm.
col_rename_map = {
    upper_name: upper_name.title()
    for upper_name in (
        "TOTAL_DAYS_UNEXCUSED_ABSENT",
        "TOTAL_DAYS_ENROLLED",
        "TOTAL_DAYS_PRESENT",
    )
}
raw_yearly_df[2024] = raw_yearly_df[2024].rename(columns=col_rename_map)

# Long-form ethnicity labels used in the 2024 file -> short codes.
ethnicity_map = {
    "Asian": "A",
    "Black or African American": "B",
    "Caucasian": "C",
    "American Indian/Alaskan Native": "I",
    "Native Hawaiian or Other Pacific Islander": "P",
    "Hispanic": "H",
    "Multi-Racial": "MR",
}

# Per-column value recodes applied to the 2024 data.
# NOTE(review): presumably these targets match the encodings used by the other
# yearly files — confirm against an earlier year.
value_recodes_2024 = {
    "ETHNIC_CODE": ethnicity_map,
    "ECONOMIC_CODE": {0: "N", 1: "F"},
    "SPECIAL_ED_CODE": {0: "N", 1: "Y"},
    "HISPANIC_IND": {0: "No", 1: "Yes"},
    "STUDENT_GENDER": {"M": "Male", "F": "Female"},
    "STUDENT_GRADE_LEVEL": {"KF": 0, "PK": -1},
}

# Work on a copy so the frame stored in raw_yearly_df[2024] is not modified here.
recoded_2024 = raw_yearly_df[2024].copy()

# Assign each recoded column back explicitly. The previous form,
# `frame[col].replace(..., inplace=True)`, mutates a temporary Series: under
# pandas Copy-on-Write the parent frame is left unchanged, and the FutureWarning
# that announces this is suppressed at the top of this notebook.
for column, mapping in value_recodes_2024.items():
    recoded_2024[column] = recoded_2024[column].replace(mapping)

# With "KF"/"PK" mapped to 0/-1 the grade level can be cast to integer.
recoded_2024["STUDENT_GRADE_LEVEL"] = recoded_2024["STUDENT_GRADE_LEVEL"].astype(int)

In [None]:
# Every 2024 row whose STUDENT_ID appears more than once (all occurrences kept).
dup_2024 = recoded_2024.loc[recoded_2024["STUDENT_ID"].duplicated(keep=False)].copy()

# Student IDs grouped by which attribute(s) differ across their duplicate rows.
bucket_grade_ids = []          # several grade levels, exactly one school
bucket_school_ids = []         # several schools, exactly one grade level
bucket_grade_school_ids = []   # several schools and several grade levels
bucket_other_ids = []          # everything else

for sid, group in dup_2024.groupby("STUDENT_ID"):
    # nunique() ignores missing values, so a count of 0 is possible.
    n_grades = group["STUDENT_GRADE_LEVEL"].nunique()
    n_schools = group["SCHOOL_NAME"].nunique()

    if n_schools > 1:
        if n_grades > 1:
            target_bucket = bucket_grade_school_ids
        elif n_grades == 1:
            target_bucket = bucket_school_ids
        else:
            target_bucket = bucket_other_ids
    elif n_schools == 1 and n_grades > 1:
        target_bucket = bucket_grade_ids
    else:
        target_bucket = bucket_other_ids
    target_bucket.append(sid)

# Duplicate rows split by bucket.
dup_student_ids = dup_2024["STUDENT_ID"]
grade_df = dup_2024[dup_student_ids.isin(bucket_grade_ids)]
school_df = dup_2024[dup_student_ids.isin(bucket_school_ids)]
grade_school_df = dup_2024[dup_student_ids.isin(bucket_grade_school_ids)]
other_df = dup_2024[dup_student_ids.isin(bucket_other_ids)]


# Resolve students whose duplicate 2024 rows differ only by grade level, using
# the grade recorded for the same student in 2023.
prev_year_df = raw_yearly_df[2023].copy()
# Assign the recoded column back: a chained `replace(..., inplace=True)` on a
# column selection does not update the frame under pandas Copy-on-Write.
prev_year_df["STUDENT_GRADE_LEVEL"] = prev_year_df["STUDENT_GRADE_LEVEL"].replace({"KF": 0, "PK": -1})
# One 2023 row per student (first occurrence) to look the previous grade up from.
prev_unique = prev_year_df.drop_duplicates("STUDENT_ID", keep="first")

# Left merge keeps every grade-duplicate row; students absent from 2023 get a
# missing STUDENT_GRADE_LEVEL_prev.
merge_df = grade_df.merge(
    prev_unique[["STUDENT_ID", "STUDENT_GRADE_LEVEL"]],
    on="STUDENT_ID",
    how="left",
    suffixes=("", "_prev"),
)
merge_df["expected_grade"] = merge_df["STUDENT_GRADE_LEVEL_prev"] + 1
merge_df["has_prev"] = merge_df["STUDENT_GRADE_LEVEL_prev"].notna()

# Keepers: rows whose grade is last year's grade + 1, or rows at grade -1 ("PK")
# for students with no 2023 record.
is_keeper = (merge_df["STUDENT_GRADE_LEVEL"] == merge_df["expected_grade"]) | (
    ~merge_df["has_prev"] & (merge_df["STUDENT_GRADE_LEVEL"] == -1)
)
keepers_df = merge_df.loc[is_keeper, grade_df.columns]

# Flagged: students with NO keeper row; keep their highest-grade row.
# Previously this was computed from every non-keeper row, so a student who
# already had a keeper also contributed a flagged row and stayed duplicated
# after the concat below.
# NOTE(review): this reduces the number of rows flagged_df contributes, so any
# hard-coded positional row index applied to the concatenated frame later in the
# notebook should be re-checked.
unresolved_df = merge_df[~merge_df["STUDENT_ID"].isin(keepers_df["STUDENT_ID"])]
flagged_df = unresolved_df.loc[
    unresolved_df.groupby("STUDENT_ID")["STUDENT_GRADE_LEVEL"].idxmax(),
    grade_df.columns,
]


# Remove every duplicated student from the 2024 data, then add back the
# de-duplicated rows chosen for each bucket.
ids_to_drop = bucket_grade_ids + bucket_school_ids + bucket_grade_school_ids + bucket_other_ids
clean_2024 = recoded_2024[~recoded_2024["STUDENT_ID"].isin(ids_to_drop)].copy()

# School-only duplicates: keep the last row per (student, school) pair, so a
# student listed at several schools still has one row per school.
# Only ETHNIC_CODE is defaulted to "C" here; the previous frame-wide
# `.fillna("C")` also wrote "C" into every other column with missing values
# (numeric attendance columns, STUDENT_GENDER, ...).
school_rows = school_df.drop_duplicates(["STUDENT_ID", "SCHOOL_NAME"], keep="last").fillna({"ETHNIC_CODE": "C"})

clean_2024 = pd.concat([
    clean_2024,
    other_df.drop_duplicates("STUDENT_ID", keep="first"),
    school_rows,
    keepers_df,
    flagged_df,
    grade_school_df.drop_duplicates("STUDENT_ID", keep="last"),
], ignore_index=True)

In [None]:
# Impute missing 2024 genders from the 2023 data, matching on (school, student).
# lookup_gender maps (SCHOOL_NAME, STUDENT_ID) -> first non-missing 2023 gender.
lookup_gender = (
    prev_year_df.dropna(subset=["STUDENT_GENDER"])
    .groupby(["SCHOOL_NAME", "STUDENT_ID"], sort=False)["STUDENT_GENDER"]
    .first()
)
mask_gender_na = clean_2024["STUDENT_GENDER"].isna()
if mask_gender_na.any():
    keys = list(zip(clean_2024.loc[mask_gender_na, "SCHOOL_NAME"], clean_2024.loc[mask_gender_na, "STUDENT_ID"]))
    # Keys without a 2023 match stay missing and fall through to the mode fill.
    clean_2024.loc[mask_gender_na, "STUDENT_GENDER"] = [lookup_gender.get(k, pd.NA) for k in keys]

# Remaining gaps get the most frequent gender. The result is assigned back
# because a chained `fillna(..., inplace=True)` on a column selection does not
# update the frame under pandas Copy-on-Write.
gender_mode = clean_2024["STUDENT_GENDER"].mode()
if not gender_mode.empty:  # mode() is empty when every value is missing
    clean_2024["STUDENT_GENDER"] = clean_2024["STUDENT_GENDER"].fillna(gender_mode.iloc[0])

# Missing ethnicity codes default to "C".
clean_2024["ETHNIC_CODE"] = clean_2024["ETHNIC_CODE"].fillna("C")


# Drop the row labelled 72159 and store the cleaned frame as the 2024 entry.
# NOTE(review): 72159 is a hard-coded index label on a frame built with
# ignore_index=True, so it depends on the exact row order of the concat that
# produced clean_2024; the reason this row is removed is not recorded here —
# confirm and consider selecting it by content instead.
clean_2024 = clean_2024.drop(index=72159)
raw_yearly_df[2024] = clean_2024

In [13]:
# Stack all yearly frames in ascending year order and write the combined table
# to the interim data directory as CSV (without the index column).
merged_df = pd.concat(
    [frame for _, frame in sorted(raw_yearly_df.items(), key=lambda item: item[0])],
    ignore_index=True,
)
output_path = cfg.INTERIM_DATA_DIR / "Merged_Data.csv"
merged_df.to_csv(output_path, index=False)