In [1]:
import pandas as pd

# ==============================
# FILE PATHS
# ==============================
ENROLMENT_PATH = "../data/api_data_aadhar_enrolment_0_500000.csv"
BIOMETRIC_PATH = "../data/api_data_aadhar_biometric_0_500000.csv"
DEMOGRAPHIC_PATH = "../data/api_data_aadhar_demographic_0_500000.csv"

# OUTPUT PATHS
CLEAN_ENROLMENT_PATH = "../data/clean_enrolment.csv"
CLEAN_BIOMETRIC_PATH = "../data/clean_biometric.csv"
CLEAN_DEMOGRAPHIC_PATH = "../data/clean_demographic.csv"


# ==============================
# COMMON CLEANING FUNCTIONS
# ==============================
def parse_date(df):
    """Convert the ``date`` column of *df* to datetimes, in place.

    Values are parsed as day-month-year (``%d-%m-%Y``). Entries that do
    not match that format become ``NaT`` instead of raising, because the
    conversion runs with ``errors="coerce"``.

    Returns the same DataFrame object that was passed in.
    """
    parsed = pd.to_datetime(df["date"], format="%d-%m-%Y", errors="coerce")
    df["date"] = parsed
    return df


def standardize_text(df):
    """Normalise the ``state`` and ``district`` columns of *df*, in place.

    Each value has surrounding whitespace stripped and is then converted
    to title case. ``state`` is processed before ``district``.

    Returns the same DataFrame object that was passed in.
    """
    for column in ("state", "district"):
        trimmed = df[column].str.strip()
        df[column] = trimmed.str.title()
    return df


def validate_non_negative(df, cols):
    """Check that none of the columns named in *cols* holds a value below zero.

    Columns are checked in the order given; the first one found to
    contain a negative value is reported.

    Returns *df* unchanged when every listed column passes.

    Raises:
        ValueError: if a listed column contains at least one negative value.
    """
    for name in cols:
        has_negative = df[name].lt(0).any()
        if has_negative:
            raise ValueError(f"Negative values found in column: {name}")
    return df


# ==============================
# CLEAN ENROLMENT DATA
# ==============================
# Load the raw enrolment CSV, parse its dates, normalise the state/district
# text, and discard rows whose date could not be parsed (NaT).
enrolment_df = (
    pd.read_csv(ENROLMENT_PATH)
    .pipe(parse_date)
    .pipe(standardize_text)
    .dropna(subset=["date"])
)

# Stop with a ValueError if any of these columns contains a negative value.
enrolment_cols = ["age_0_5", "age_5_17", "age_18_greater"]
validate_non_negative(enrolment_df, enrolment_cols)

# Row-wise total across the three age columns. Plain addition is used, so
# a missing value in any of them makes that row's total missing as well.
enrolment_df["total_enrolment"] = (
    enrolment_df["age_0_5"]
    .add(enrolment_df["age_5_17"])
    .add(enrolment_df["age_18_greater"])
)

# Write the cleaned frame to disk without the index column.
enrolment_df.to_csv(CLEAN_ENROLMENT_PATH, index=False)


# ==============================
# CLEAN BIOMETRIC DATA
# ==============================
# Load the raw biometric CSV, parse its dates, normalise the state/district
# text, and discard rows whose date could not be parsed (NaT).
biometric_df = (
    pd.read_csv(BIOMETRIC_PATH)
    .pipe(parse_date)
    .pipe(standardize_text)
    .dropna(subset=["date"])
)

# Stop with a ValueError if either column contains a negative value.
biometric_cols = ["bio_age_5_17", "bio_age_17_"]
validate_non_negative(biometric_df, biometric_cols)

# Row-wise total across both biometric columns. Plain addition is used, so
# a missing value in either one makes that row's total missing as well.
biometric_df["total_biometric_activity"] = biometric_df["bio_age_5_17"].add(
    biometric_df["bio_age_17_"]
)

# Write the cleaned frame to disk without the index column.
biometric_df.to_csv(CLEAN_BIOMETRIC_PATH, index=False)


# ==============================
# CLEAN DEMOGRAPHIC DATA
# ==============================
# Load the raw demographic CSV, parse its dates, normalise the state/district
# text, and discard rows whose date could not be parsed (NaT).
demographic_df = (
    pd.read_csv(DEMOGRAPHIC_PATH)
    .pipe(parse_date)
    .pipe(standardize_text)
    .dropna(subset=["date"])
)

# Stop with a ValueError if either column contains a negative value.
demographic_cols = ["demo_age_5_17", "demo_age_17_"]
validate_non_negative(demographic_df, demographic_cols)

# Row-wise total across both demographic columns. Plain addition is used, so
# a missing value in either one makes that row's total missing as well.
demographic_df["total_demographic_updates"] = demographic_df["demo_age_5_17"].add(
    demographic_df["demo_age_17_"]
)

# Write the cleaned frame to disk without the index column.
demographic_df.to_csv(CLEAN_DEMOGRAPHIC_PATH, index=False)


# Final status message; execution only reaches this line if none of the
# preceding load / validate / save steps raised an exception.
print("✅ All datasets cleaned and saved successfully.")


✅ All datasets cleaned and saved successfully.
