In [None]:
#UiDAI_11371 hacakthon_2026 analysis file

In [2]:
import pandas as pd


In [3]:
biometric = pd.read_csv("biometric_merged.csv")
demographic = pd.read_csv("demographic_merged.csv")
enrolment = pd.read_csv("enrollment_merged.csv")

print("Files loaded successfully")


Files loaded successfully


In [4]:
def clean_dataframe(df, name):
    print(f"\nCleaning {name}")

    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
    )

    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].str.strip()

    if "date" in df.columns:
      df["date"] = df["date"].astype(str).str.strip()
      df["date"] = pd.to_datetime(
        df["date"],
        format="mixed",
        dayfirst=True
    )


    print(df.info())
    print("Missing values:\n", df.isnull().sum())

    return df


In [5]:
biometric = clean_dataframe(biometric, "Biometric Updates")
demographic = clean_dataframe(demographic, "Demographic Updates")
enrolment = clean_dataframe(enrolment, "Enrolment Data")



Cleaning Biometric Updates
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205406 entries, 0 to 1205405
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   date          1205406 non-null  datetime64[ns]
 1   state         1205406 non-null  object        
 2   district      1205405 non-null  object        
 3   pincode       1205405 non-null  float64       
 4   bio_age_5_17  1205405 non-null  float64       
 5   bio_age_17_   1205405 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 55.2+ MB
None
Missing values:
 date            0
state           0
district        1
pincode         1
bio_age_5_17    1
bio_age_17_     1
dtype: int64

Cleaning Demographic Updates
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900700 entries, 0 to 900699
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   

In [9]:
CANONICAL_STATES = {
    "andhra pradesh", "arunachal pradesh", "assam", "bihar", "chhattisgarh",
    "goa", "gujarat", "haryana", "himachal pradesh", "jharkhand",
    "karnataka", "kerala", "madhya pradesh", "maharashtra", "manipur",
    "meghalaya", "mizoram", "nagaland", "odisha", "punjab", "rajasthan",
    "sikkim", "tamil nadu", "telangana", "tripura", "uttar pradesh",
    "uttarakhand", "west bengal",
    "andaman and nicobar islands", "chandigarh",
    "dadra and nagar haveli and daman and diu",
    "delhi", "jammu and kashmir", "ladakh",
    "lakshadweep", "puducherry"
}


In [10]:
CANONICAL_STATES = {
    "andaman and nicobar islands",
    "andhra pradesh", "arunachal pradesh", "assam", "bihar",
    "chhattisgarh", "goa", "gujarat", "haryana", "himachal pradesh",
    "jharkhand", "karnataka", "kerala", "madhya pradesh",
    "maharashtra", "manipur", "meghalaya", "mizoram", "nagaland",
    "odisha", "punjab", "rajasthan", "sikkim", "tamil nadu",
    "telangana", "tripura", "uttar pradesh", "uttarakhand",
    "west bengal", "chandigarh", "delhi", "jammu and kashmir",
    "ladakh", "lakshadweep", "puducherry",
    "dadra and nagar haveli and daman and diu"
}


In [11]:
STATE_CANONICAL_MAP = {
    "orissa": "odisha",
    "odisha": "odisha",
    "pondicherry": "puducherry",
    "uttaranchal": "uttarakhand",
    "jammu & kashmir": "jammu and kashmir",
    "jammu and kashmir": "jammu and kashmir",
    "jammu and kashmir ": "jammu and kashmir",
    "andaman & nicobar islands": "andaman and nicobar islands",
    "daman & diu": "dadra and nagar haveli and daman and diu",
    "dadra & nagar haveli": "dadra and nagar haveli and daman and diu",
    "the dadra and nagar haveli and daman and diu":
        "dadra and nagar haveli and daman and diu",
    "west bengal": "west bengal",
    "west bengal ": "west bengal",
    "westbengal": "west bengal"
}


In [12]:
DISTRICT_CANONICAL_MAP = {
    # Gujarat
    "ahmadabad": "ahmedabad",
    "ahmed abad": "ahmedabad",
    "ahmednagar": "ahmednagar",
    "ahmadnagar": "ahmednagar",

    # Karnataka
    "bangalore": "bengaluru",
    "bangalore urban": "bengaluru urban",
    "mysore": "mysuru",
    "shimoga": "shivamogga",
    "bellary": "ballari",
    "bijapur": "vijayapura",

    # UP
    "allahabad": "prayagraj",
    "faizabad": "ayodhya",
    "raebareli": "rae bareli",

    # WB
    "hooghly": "hoogly",
    "hugli": "hoogly",
    "hoogli": "hoogly",
    "puruliya": "purulia",

    # Telangana
    "rangareddi": "rangareddy",
    "karim nagar": "karimnagar"
}


In [13]:
import re
import unicodedata
import pandas as pd

def normalize_text(x):
    if pd.isna(x):
        return x
    x = unicodedata.normalize("NFKD", str(x))
    x = x.lower().strip()
    x = x.replace("&", "and")
    x = re.sub(r"[^\w\s]", "", x)
    x = re.sub(r"\s+", " ", x)
    return x


In [14]:
def clean_dataset(df, name):
    print(f"\nCleaning {name}")

    df["state_clean"] = df["state"].apply(normalize_text)
    df["district_clean"] = df["district"].apply(normalize_text)

    # Apply canonical mappings
    df["state_clean"] = df["state_clean"].replace(STATE_CANONICAL_MAP)
    df["district_clean"] = df["district_clean"].replace(DISTRICT_CANONICAL_MAP)

    # Drop invalid states
    df = df[df["state_clean"].isin(CANONICAL_STATES)]

    # Drop garbage district rows
    df = df[~df["district_clean"].str.contains(
        r"\b(near|road|colony|cross|hospital|thana)\b", na=False
    )]

    return df


In [15]:


biometric_clean = clean_dataset(biometric, "Biometric")
demographic_clean = clean_dataset(demographic, "Demographic")
enrolment_clean = clean_dataset(enrolment, "Enrolment")



Cleaning Biometric


  df = df[~df["district_clean"].str.contains(



Cleaning Demographic


  df = df[~df["district_clean"].str.contains(



Cleaning Enrolment


  df = df[~df["district_clean"].str.contains(


In [16]:
print("Biometric district:")
print(sorted(biometric_clean["district_clean"].unique()))

print("\nDemographic district:")
print(sorted(demographic_clean["district_clean"].unique()))

print("\nEnrolment district:")
print(sorted(enrolment_clean["district_clean"].unique()))

Biometric district:
['adilabad', 'agar malwa', 'agra', 'ahilyanagar', 'ahmed nagar', 'ahmedabad', 'ahmednagar', 'aizawl', 'ajmer', 'akhera', 'akola', 'alappuzha', 'aligarh', 'alipurduar', 'alirajpur', 'alluri sitharama raju', 'almora', 'alwar', 'ambala', 'ambedkar nagar', 'amethi', 'amravati', 'amreli', 'amritsar', 'amroha', 'anakapalli', 'anand', 'anantapur', 'ananthapur', 'ananthapuramu', 'anantnag', 'andamans', 'angul', 'anjaw', 'annamayya', 'anugal', 'anugul', 'anugul ', 'anuppur', 'araria', 'ariyalur', 'arvalli', 'arwal', 'ashok nagar', 'auraiya', 'auraiya ', 'aurangabad', 'aurangabadbh', 'ayodhya', 'azamgarh', 'badgam', 'bagalkot', 'bagalkot ', 'bageshwar', 'baghpat', 'bagpat', 'bahraich', 'bajali', 'baksa', 'balaghat', 'balangir', 'baleshwar', 'baleswar', 'ballari', 'ballia', 'bally jagachha', 'balod', 'baloda bazar', 'balotra', 'balrampur', 'banas kantha', 'banaskantha', 'banda', 'bandipore', 'bandipur', 'bangalore rural', 'banka', 'bankura', 'banswara', 'bapatla', 'bara banki'

In [17]:
print("Biometric states:")
print(sorted(biometric_clean["state_clean"].unique()))

print("\nDemographic states:")
print(sorted(demographic_clean["state_clean"].unique()))

print("\nEnrolment states:")
print(sorted(enrolment_clean["state_clean"].unique()))


Biometric states:
['andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra and nagar haveli and daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal']

Demographic states:
['andaman and nicobar islands', 'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra and nagar haveli and daman and diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jammu and kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikki

In [18]:
def count_unique_geo(df, name):
    print(f"\n{name}")
    print("Unique states   :", df["state_clean"].nunique())
    print("Unique districts:", df["district_clean"].nunique())


In [19]:
count_unique_geo(biometric_clean, "Biometric")
count_unique_geo(demographic_clean, "Demographic")
count_unique_geo(enrolment_clean, "Enrolment")



Biometric
Unique states   : 36
Unique districts: 924

Demographic
Unique states   : 36
Unique districts: 923

Enrolment
Unique states   : 36
Unique districts: 937


In [20]:
def monthly_aggregate(df, dataset_type):
    """
    dataset_type: 'biometric', 'demographic', or 'enrolment'
    """

    # Add month column
    df["year_month"] = df["date"].dt.to_period("M").astype(str)

    if dataset_type == "biometric":
        out = (
            df.groupby(["year_month", "state_clean", "district_clean"], as_index=False)
              .agg({
                  "bio_age_5_17": "sum",
                  "bio_age_17_": "sum"
              })
        )
        out["total_updates"] = (
            out["bio_age_5_17"] + out["bio_age_17_"]
        )

    elif dataset_type == "demographic":
        out = (
            df.groupby(["year_month", "state_clean", "district_clean"], as_index=False)
              .agg({
                  "demo_age_5_17": "sum",
                  "demo_age_17_": "sum"
              })
        )
        out["total_updates"] = (
            out["demo_age_5_17"] + out["demo_age_17_"]
        )

    elif dataset_type == "enrolment":
        out = (
            df.groupby(["year_month", "state_clean", "district_clean"], as_index=False)
              .agg({
                  "age_0_5": "sum",
                  "age_5_17": "sum",
                  "age_18_greater": "sum"
              })
        )
        out["total_updates"] = (
            out["age_0_5"] +
            out["age_5_17"] +
            out["age_18_greater"]
        )

    else:
        raise ValueError("dataset_type must be biometric, demographic, or enrolment")

    return out


In [21]:
biometric_monthly   = monthly_aggregate(biometric_clean, "biometric")
demographic_monthly = monthly_aggregate(demographic_clean, "demographic")
enrolment_monthly   = monthly_aggregate(enrolment_clean, "enrolment")


In [22]:
print("Biometric   :", biometric_monthly["year_month"].nunique())
print("Demographic :", demographic_monthly["year_month"].nunique())
print("Enrolment   :", enrolment_monthly["year_month"].nunique())


Biometric   : 9
Demographic : 9
Enrolment   : 9


In [23]:
biometric_agg=biometric_monthly
demographic_agg=demographic_monthly
enrolment_agg=enrolment_monthly

In [24]:
def sort_ts(df):
    return df.sort_values(["state_clean", "district_clean", "year_month"])

biometric = sort_ts(biometric_agg)
demographic = sort_ts(demographic_agg)
enrolment = sort_ts(enrolment_agg)


In [25]:
biometric_monthly["total_updates"] = (
    biometric_monthly["bio_age_5_17"] +
    biometric_monthly["bio_age_17_"]
)


In [26]:
demographic_monthly["total_updates"] = (
    demographic_monthly["demo_age_5_17"] +
    demographic_monthly["demo_age_17_"]
)


In [27]:
enrolment_monthly["total_updates"] = (
    enrolment_monthly["age_0_5"] +
    enrolment_monthly["age_5_17"] +
    enrolment_monthly["age_18_greater"]
)


In [29]:
# Rename columns for clarity
biometric_ts = biometric_monthly.rename(
    columns={"total_updates": "biometric_updates"}
)

demographic_ts = demographic_monthly.rename(
    columns={"total_updates": "demographic_updates"}
)

enrolment_ts = enrolment_monthly.rename(
    columns={"total_updates": "enrolments"}
)


In [35]:
# Merge all three datasets (outer join = honest coverage)
district_ts = (
    biometric_ts
    .merge(
        demographic_ts,
        on=["year_month", "state_clean", "district_clean"],
        how="outer"
    )
    .merge(
        enrolment_ts,
        on=["year_month", "state_clean", "district_clean"],
        how="outer"
    )
)
# Replace missing activity with zero
district_ts = district_ts.fillna(0)


In [40]:
district_ts["total_activity"] = (
    district_ts["biometric_updates"] +
    district_ts["demographic_updates"] +
    district_ts["enrolments"]
)
baseline = (
    district_ts
    .groupby(["state_clean", "district_clean"])
    .agg(
        mean_activity=("total_activity", "mean"),
        std_activity=("total_activity", "std")
    )
    .reset_index()
)
# Avoid divide-by-zero later
baseline["std_activity"] = baseline["std_activity"].fillna(1)


In [41]:
district_ts = district_ts.merge(
    baseline,
    on=["state_clean", "district_clean"],
    how="left"
)
district_ts["stress_signal"] = (
    (district_ts["total_activity"] - district_ts["mean_activity"])
    / district_ts["std_activity"]
)
district_ts["stress_signal"] = district_ts["stress_signal"].fillna(0)



In [45]:
stress_summary = (
    district_ts
    .groupby(["state_clean", "district_clean"])
    .agg(
        anomaly_count=("stress_signal", lambda x: (x > 2).sum()),
        mean_severity=("stress_signal", "mean"),
        max_severity=("stress_signal", "max")
    )
    .reset_index()
)
for col in ["anomaly_count", "mean_severity", "max_severity"]:
    stress_summary[col] = (
        stress_summary[col] - stress_summary[col].min()
    ) / (stress_summary[col].max() - stress_summary[col].min())
stress_summary["DSI"] = (
    0.4 * stress_summary["anomaly_count"] +
    0.3 * stress_summary["mean_severity"] +
    0.3 * stress_summary["max_severity"]
)


In [46]:
def classify_stress(x):
    if x >= 0.7:
        return "High"
    elif x >= 0.4:
        return "Moderate"
    else:
        return "Low"
stress_summary["stress_level"] = stress_summary["DSI"].apply(classify_stress)
stress_summary = stress_summary.sort_values(
    ["state_clean", "DSI"],
    ascending=[True, False]
)


In [47]:
stress_summary["stress_level"].value_counts()


Unnamed: 0_level_0,count
stress_level,Unnamed: 1_level_1
Low,664
High,202
Moderate,119


In [48]:
stress_summary.head(10)


Unnamed: 0,state_clean,district_clean,anomaly_count,mean_severity,max_severity,DSI,stress_level
3,andaman and nicobar islands,north and middle andaman,0.0,0.75,0.589734,0.40192,Moderate
4,andaman and nicobar islands,south andaman,0.0,0.543478,0.57472,0.33546,Low
1,andaman and nicobar islands,nicobar,0.0,0.380435,0.670969,0.315421,Low
2,andaman and nicobar islands,nicobars,0.0,0.543478,0.383755,0.27817,Low
0,andaman and nicobar islands,andamans,0.0,0.326087,0.564167,0.267076,Low
6,andhra pradesh,alluri sitharama raju,1.0,0.585258,0.895235,0.844148,High
43,andhra pradesh,srikakulam,1.0,0.532609,0.779245,0.793556,High
47,andhra pradesh,vizianagaram,0.0,0.842391,0.647277,0.4469,Moderate
27,andhra pradesh,mahabub nagar,0.0,0.652174,0.727903,0.414023,Moderate
24,andhra pradesh,kurnool,0.0,0.826087,0.516377,0.402739,Moderate


In [49]:
district_ts.to_csv("district_time_series.csv", index=False)
stress_summary.to_csv("district_stress_index.csv", index=False)


In [None]:
#plots are in streamlit link check below

In [None]:
https://udidhackathon-11371.streamlit.app/