In [387]:
import polars as pl

df = pl.read_csv(
    "enrollment.csv",
    infer_schema_length=1000,   
    low_memory=True
)

print(df.shape)


(1006029, 7)


In [388]:
print(df.schema)


Schema([('date', String), ('state', String), ('district', String), ('pincode', Int64), ('age_0_5', Int64), ('age_5_17', Int64), ('age_18_greater', Int64)])


In [389]:
import polars as pl

pl.Config.set_tbl_rows(1000)     # number of rows to display
pl.Config.set_tbl_cols(50)       # number of columns
pl.Config.set_tbl_width_chars(200)


polars.config.Config

In [390]:
df.select(pl.col("state").unique().sort())


state
str
"""100000"""
"""Andaman & Nicobar Islands"""
"""Andaman and Nicobar Islands"""
"""Andhra Pradesh"""
"""Arunachal Pradesh"""
"""Assam"""
"""Bihar"""
"""Chandigarh"""
"""Chhattisgarh"""
"""Dadra & Nagar Haveli"""


In [391]:
import polars as pl

state_map = {
    "andaman & nicobar islands": "Andaman and Nicobar Islands",
    "andaman and nicobar islands": "Andaman and Nicobar Islands",

    "andhra pradesh": "Andhra Pradesh",
    "arunachal pradesh": "Arunachal Pradesh",
    "assam": "Assam",
    "bihar": "Bihar",
    "chandigarh": "Chandigarh",

    "chhatisgarh": "Chhattisgarh",
    "chhattisgarh": "Chhattisgarh",

    "dadra & nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "dadra and nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "daman & diu": "Dadra and Nagar Haveli and Daman and Diu",
    "daman and diu": "Dadra and Nagar Haveli and Daman and Diu",
    "dadra and nagar haveli and daman and diu": "Dadra and Nagar Haveli and Daman and Diu",
    "the dadra and nagar haveli and daman and diu" : "Dadra and Nagar Haveli and Daman and Diu",
    "delhi": "Delhi",
    "goa": "Goa",
    "gujarat": "Gujarat",
    "haryana": "Haryana",
    "himachal pradesh": "Himachal Pradesh",

    "jammu & kashmir": "Jammu and Kashmir",
    "jammu and kashmir": "Jammu and Kashmir",

    "jharkhand": "Jharkhand",
    "karnataka": "Karnataka",
    "kerala": "Kerala",
    "ladakh": "Ladakh",
    "lakshadweep": "Lakshadweep",

    "madhya pradesh": "Madhya Pradesh",
    "maharashtra": "Maharashtra",
    "manipur": "Manipur",
    "meghalaya": "Meghalaya",
    "mizoram": "Mizoram",
    "nagaland": "Nagaland",

    "odisha": "Odisha",
    "orissa": "Odisha",

    "pondicherry": "Puducherry",
    "puducherry": "Puducherry",

    "punjab": "Punjab",
    "rajasthan": "Rajasthan",
    "sikkim": "Sikkim",

    "tamil nadu": "Tamil Nadu",
    "tamilnadu": "Tamil Nadu",

    "telangana": "Telangana",
    "tripura": "Tripura",

    "uttar pradesh": "Uttar Pradesh",

    "uttarakhand": "Uttarakhand",
    "uttaranchal": "Uttarakhand",

    "west bengal": "West Bengal",
    "west  bengal": "West Bengal",
    "westbengal": "West Bengal",
    "west bangal": "West Bengal"
}

df = df.with_columns(
    pl.col("state")
      .str.to_lowercase()
      .str.replace_all("&", "and")
      .str.replace_all(r"\s+", " ")
      .map_elements(lambda x: state_map.get(x, x))
      .alias("state")
)


In [392]:
df = df.filter(pl.col("state") != "100000")


In [393]:
df.select(pl.col("state").unique().sort())


state
str
"""Andaman and Nicobar Islands"""
"""Andhra Pradesh"""
"""Arunachal Pradesh"""
"""Assam"""
"""Bihar"""
"""Chandigarh"""
"""Chhattisgarh"""
"""Dadra and Nagar Haveli and Dam…"
"""Delhi"""
"""Goa"""


In [394]:

df = df.with_columns([
    pl.col(pl.Utf8).str.to_lowercase()
])


Andaman

In [395]:
df.filter(
    pl.col("state") == "andaman and nicobar islands"
).select(
    pl.col("district").unique().sort()
)


district
str
"""andamans"""
"""nicobar"""
"""nicobars"""
"""north and middle andaman"""
"""south andaman"""


In [396]:
district_map = {
    "andamans": "North Middle Andaman",
    "north and middle andaman": "North Middle Andaman",

    "nicobar": "Nicobar",
    "nicobars": "Nicobar",

    "south andaman": "South Andaman"
}


In [397]:
df = df.with_columns(
    pl.when(pl.col("state") == "andaman and nicobar islands")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [398]:
df.filter(
    pl.col("state") == "andaman and nicobar islands"
).select(
    pl.col("district").unique().sort()
)


district
str
"""Nicobar"""
"""North Middle Andaman"""
"""South Andaman"""


andhra pradesh

In [399]:
df.filter(
    pl.col("state") == "andhra pradesh"
).select(
    pl.col("district").unique().sort()
)


district
str
"""adilabad"""
"""alluri sitharama raju"""
"""anakapalli"""
"""anantapur"""
"""ananthapur"""
"""ananthapuramu"""
"""annamayya"""
"""bapatla"""
"""chittoor"""
"""cuddapah"""


In [400]:
telangana_district_map = {
    "adilabad": "Adilabad",
    "hyderabad": "Hyderabad",
    "karim nagar": "Karimnagar",
    "karimnagar": "Karimnagar",
    "khammam": "Khammam",
    "mahabub nagar": "Mahbubnagar",
    "mahabubnagar": "Mahbubnagar",
    "mahbubnagar": "Mahbubnagar",
    "medak": "Medak",
    "nalgonda": "Nalgonda",
    "nizamabad": "Nizamabad",
    "rangareddi": "Ranga Reddy",
    "k.v. rangareddy": "Ranga Reddy",
    "k.v.rangareddy": "Ranga Reddy",
    "warangal": "Warangal"
}


In [401]:
andhra_district_map = {
    # North Andhra
    "alluri sitharama raju": "Alluri Sitharama Raju",
    "anakapalli": "Anakapalli",
    "parvathipuram manyam": "Parvathipuram Manyam",
    "polavaram": "Polavaram",
    "srikakulam": "Srikakulam",
    "visakhapatnam": "Visakhapatnam",
    "visakhapatanam" : "Visakhapatnam",
    "vizianagaram": "Vizianagaram",

    # Coastal Andhra
    "bapatla": "Bapatla",
    "dr. b. r. ambedkar konaseema": "Konaseema",
    "konaseema": "Konaseema",
    "east godavari": "East Godavari",
    "eluru": "Eluru",
    "guntur": "Guntur",
    "kakinada": "Kakinada",
    "krishna": "Krishna",
    "markapuram": "Markapuram",
    "n. t. r": "NTR",
    "ntr": "NTR",
    "palnadu": "Palnadu",
    "prakasam": "Prakasam",
    "nellore": "Nellore",
    "spsr nellore":"Nellore",
    "sri potti sriramulu nellore": "Nellore",
    "west godavari": "West Godavari",

    # Rayalaseema
    "ananthapur": "Ananthapuramu",
    "ananthapuramu": "Ananthapuramu",
    "anantapur": "Ananthapuramu",
    "annamayya": "Annamayya",
    "chittoor": "Chittoor",
    "cuddapah": "Kadapa",
    "y. s. r": "Kadapa",
    "kadapa": "Kadapa",
    "kurnool": "Kurnool",
    "nandyal": "Nandyal",
    "sri sathya sai": "Sri Sathya Sai",
    "tirupati": "Tirupati",
}


In [402]:

df = df.with_columns([
    # Normalize district (Andhra / Telangana)
    pl.when(pl.col("district").str.to_lowercase().is_in(telangana_district_map.keys()))
      .then(
          pl.col("district")
            .str.to_lowercase()
            .map_elements(lambda x: telangana_district_map.get(x, x))
      )
      .otherwise(
          pl.col("district")
            .str.to_lowercase()
            .map_elements(lambda x: andhra_district_map.get(x, x))
      )
      .alias("district"),

    # Normalize state (force Telangana where district is Telangana)
    pl.when(
        pl.col("district").str.to_lowercase().is_in(telangana_district_map.keys()) 

    )
    .then(pl.lit("telangana"))
    .otherwise(pl.col("state"))
    .alias("state")
])


In [403]:
df.filter(pl.col("state") == "andhra pradesh")\
  .select(pl.col("district").unique().sort())


district
str
"""Alluri Sitharama Raju"""
"""Anakapalli"""
"""Ananthapuramu"""
"""Annamayya"""
"""Bapatla"""
"""Chittoor"""
"""East Godavari"""
"""Eluru"""
"""Guntur"""
"""Kadapa"""


ArunachalPradesh

In [404]:
df.filter(pl.col("state") == "arunachal pradesh")\
  .select(pl.col("district").unique().sort())


district
str
"""anjaw"""
"""changlang"""
"""dibang valley"""
"""east kameng"""
"""east siang"""
"""kamle"""
"""kra daadi"""
"""kurung kumey"""
"""leparada"""
"""lohit"""


In [405]:
arunachal_district_map = {
    "anjaw": "Anjaw",
    "changlang": "Changlang",
    "dibang valley": "Dibang Valley",
    "east kameng": "East Kameng",
    "east siang": "East Siang",
    "kamle": "Kamle",
    "kra daadi": "Kra Daadi",
    "kurung kumey": "Kurung Kumey",
    "leparada": "Leparada",
    "lohit": "Lohit",
    "longding": "Longding",
    "lower dibang valley": "Lower Dibang Valley",
    "lower siang": "Lower Siang",
    "lower subansiri": "Lower Subansiri",
    "namsai": "Namsai",
    "pakke kessang": "Pakke Kessang",
    "papum pare": "Papum Pare",
    "shi-yomi": "Shi Yomi",
    "shi yomi": "Shi Yomi",
    "siang": "Siang",
    "tawang": "Tawang",
    "tirap": "Tirap",
    "upper siang": "Upper Siang",
    "upper subansiri": "Upper Subansiri",
    "west kameng": "West Kameng",
    "west siang": "West Siang",

}


In [406]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "arunachal pradesh")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("-", " ")
            .str.replace_all(r"\s+", " ")
     
            .map_elements(lambda x: arunachal_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [407]:
df.filter(pl.col("state") == "arunachal pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""Anjaw"""
"""Changlang"""
"""Dibang Valley"""
"""East Kameng"""
"""East Siang"""
"""Kamle"""
"""Kra Daadi"""
"""Kurung Kumey"""
"""Leparada"""
"""Lohit"""


assam

In [408]:
df.filter(pl.col("state") == "assam") \
  .select(pl.col("district").unique().sort())


district
str
"""bajali"""
"""baksa"""
"""barpeta"""
"""biswanath"""
"""bongaigaon"""
"""cachar"""
"""charaideo"""
"""chirang"""
"""darrang"""
"""dhemaji"""


In [409]:
assam_district_map = {
    "bajali": "Bajali",
    "baksa": "Baksa",
    "barpeta": "Barpeta",
    "biswanath": "Biswanath",
    "bongaigaon": "Bongaigaon",
    "cachar": "Cachar",
    "charaideo": "Charaideo",
    "chirang": "Chirang",
    "darrang": "Darrang",
    "dhemaji": "Dhemaji",
    "dhubri": "Dhubri",
    "dibrugarh": "Dibrugarh",
    "dima hasao": "Dima Hasao",
    "north cachar hills": "Dima Hasao",   # legacy name
    "goalpara": "Goalpara",
    "golaghat": "Golaghat",
    "hailakandi": "Hailakandi",
    "hojai": "Hojai",
    "jorhat": "Jorhat",
    "kamrup": "Kamrup",
    "kamrup metro": "Kamrup Metro",
    "karbi anglong": "Karbi Anglong",
    "kokrajhar": "Kokrajhar",
    "lakhimpur": "Lakhimpur",
    "majuli": "Majuli",
    "marigaon": "Marigaon",
    "nagaon": "Nagaon",
    "nalbari": "Nalbari",
    "sibsagar": "Sivasagar",
    "sivasagar": "Sivasagar",
    "sonitpur": "Sonitpur",
    "south salmara mankachar": "South Salmara Mancachar",
    "sribhumi": "Sribhumi",
    "tamulpur district": "Tamulpur",
    "tamulpur": "Tamulpur",
    "tinsukia": "Tinsukia",
    "udalguri": "Udalguri",
    "west karbi anglong": "West Karbi Anglong",
    "karimganj" : "Sribhumi"
}


In [410]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "assam")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("-", " ")
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: assam_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [411]:
df.filter(pl.col("state") == "assam") \
  .select(pl.col("district").unique().sort())


district
str
"""Bajali"""
"""Baksa"""
"""Barpeta"""
"""Biswanath"""
"""Bongaigaon"""
"""Cachar"""
"""Charaideo"""
"""Chirang"""
"""Darrang"""
"""Dhemaji"""


Bihar

In [412]:
df.filter(pl.col("state") == "bihar") \
  .select(pl.col("district").unique().sort())


district
str
"""araria"""
"""arwal"""
"""aurangabad"""
"""aurangabad(bh)"""
"""banka"""
"""begusarai"""
"""bhabua"""
"""bhagalpur"""
"""bhojpur"""
"""buxar"""


In [413]:
bihar_district_map = {
    "araria": "Araria",
    "arwal": "Arwal",

    "aurangabad": "Aurangabad",
    "aurangabad(bh)": "Aurangabad",

    "banka": "Banka",
    "begusarai": "Begusarai",

    "bhagalpur": "Bhagalpur",
    "bhojpur": "Bhojpur",
    "buxar": "Buxar",

    "darbhanga": "Darbhanga",

    "east champaran": "Purbi Champaran",
    "purba champaran": "Purbi Champaran",
    "purbi champaran": "Purbi Champaran",

    "pashchim champaran": "Pashchim Champaran",
    "west champaran": "Pashchim Champaran",

    "gaya": "Gaya",
    "gopalganj": "Gopalganj",
    "jamui": "Jamui",
    "jehanabad": "Jehanabad",

    "bhabua": "Kaimur (Bhabua)",
    "kaimur (bhabua)": "Kaimur (Bhabua)",
    "kaimur": "Kaimur (Bhabua)",

    "katihar": "Katihar",
    "khagaria": "Khagaria",
    "kishanganj": "Kishanganj",
    "lakhisarai": "Lakhisarai",
    "madhepura": "Madhepura",
    "madhubani": "Madhubani",

    "monghyr": "Munger",
    "munger": "Munger",

    "muzaffarpur": "Muzaffarpur",
    "nalanda": "Nalanda",
    "nawada": "Nawada",
    "patna": "Patna",

    "purnea": "Purnia",
    "purnia": "Purnia",

    "rohtas": "Rohtas",
    "saharsa": "Saharsa",

    "samastipur": "Samastipur",
    "samstipur": "Samastipur",

    "saran": "Saran",

    "sheikhpura": "Sheikhpura",
    "sheikpura": "Sheikhpura",

    "sheohar": "Sheohar",
    "sitamarhi": "Sitamarhi",
    "siwan": "Siwan",
    "supaul": "Supaul",
    "vaishali": "Vaishali"
}


In [414]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "bihar")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: bihar_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [415]:
df.filter(pl.col("state") == "bihar") \
  .select(pl.col("district").unique().sort())


district
str
"""Araria"""
"""Arwal"""
"""Aurangabad"""
"""Banka"""
"""Begusarai"""
"""Bhagalpur"""
"""Bhojpur"""
"""Buxar"""
"""Darbhanga"""
"""Gaya"""


chandigarh

In [416]:
df.filter(pl.col("state") == "chandigarh") \
  .select(pl.col("district").unique().sort())


district
str
"""chandigarh"""
"""rupnagar"""


In [417]:
chandigarh_punjab_district_map = {
    "chandigarh": ("chandigarh", "Chandigarh"),
    "mohali": ("punjab", "S.A.S. Nagar"),
    "rupnagar": ("punjab", "Rupnagar"),
}


In [418]:
import polars as pl

df = df.with_columns([
    # Fix district
    pl.when(
        (pl.col("state") == "chandigarh") &
        (pl.col("district").str.to_lowercase() == "mohali")
    )
    .then(pl.lit("S.A.S. Nagar"))
    .otherwise(pl.col("district"))
    .alias("district"),

    # Fix state
    pl.when(
        (pl.col("state") == "chandigarh") &
        (pl.col("district").str.to_lowercase().is_in(["mohali", "rupnagar"]))
    )
    .then(pl.lit("punjab"))
    .otherwise(pl.col("state"))
    .alias("state")
])


chhattisgarh

In [419]:
df.filter(pl.col("state") == "chhattisgarh") \
  .select(pl.col("district").unique().sort())


district
str
"""balod"""
"""baloda bazar"""
"""balrampur"""
"""bastar"""
"""bemetara"""
"""bijapur"""
"""bilaspur"""
"""dakshin bastar dantewada"""
"""dantewada"""
"""dhamtari"""


In [420]:
chhattisgarh_district_map = {
    "balod": "Balod",

    "baloda bazar": "Balodabazar-Bhatapara",
    "balodabazar": "Balodabazar-Bhatapara",
    "balodabazar-bhatapara": "Balodabazar-Bhatapara",

    "balrampur": "Balrampur-Ramanujganj",
    "balrampur-ramanujganj": "Balrampur-Ramanujganj",

    "bastar": "Bastar",
    "bemetara": "Bemetara",
    "bijapur": "Bijapur",
    "bilaspur": "Bilaspur",

    "dakshin bastar dantewada": "Dakshin Bastar Dantewada",
    "dantewada": "Dakshin Bastar Dantewada",

    "dhamtari": "Dhamtari",
    "durg": "Durg",
    "gariyaband": "Gariyaband",

    "gaurela-pendra-marwahi": "Gaurela-Pendra-Marwahi",
    "gaurela pendra marwahi": "Gaurela-Pendra-Marwahi",
    "gaurella pendra marwahi":"Gaurela-Pendra-Marwahi",
    "janjgir - champa": "Janjgir-Champa",
    "janjgir champa": "Janjgir-Champa",
    "janjgir-champa": "Janjgir-Champa",

    "jashpur": "Jashpur",

    "kabeerdham": "Kabeerdham",
    "kawardha": "Kabeerdham",

    "kanker": "Uttar Bastar Kanker",
    "uttar bastar kanker": "Uttar Bastar Kanker",

    "khairagarh chhuikhadan gandai": "Khairagarh-Chhuikhadan-Gandai",
    "khairagarh-chhuikhadan-gandai": "Khairagarh-Chhuikhadan-Gandai",

    "kondagaon": "Kondagaon",
    "korba": "Korba",

    "koriya": "Korea",
    "korea": "Korea",

    "mahasamund": "Mahasamund",

    "manendragarhchirmiribharatpur": "Manendragarh-Chirmiri-Bharatpur",
    "manendragarh–chirmiri–bharatpur": "Manendragarh-Chirmiri-Bharatpur",
    "manendragarh-chirmiri-bharatpur": "Manendragarh-Chirmiri-Bharatpur",

    "mohalla-manpur-ambagarh chowki": "Mohla-Manpur-Ambagarh Chouki",
    "mohla-manpur-ambagarh chouki": "Mohla-Manpur-Ambagarh Chouki",

    "mungeli": "Mungeli",
    "narayanpur": "Narayanpur",
    "raigarh": "Raigarh",
    "raipur": "Raipur",
    "rajnandgaon": "Rajnandgaon",

    "sakti": "Sakti",
    "sarangarh-bilaigarh": "Sarangarh-Bilaigarh",

    "sukma": "Sukma",
    "surajpur": "Surajpur",
    "surguja": "Surguja"
}


In [421]:


df = df.with_columns(
    pl.when(pl.col("state") == "chhattisgarh")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("–", "-")
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: chhattisgarh_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [422]:
df.filter(pl.col("state") == "chhattisgarh") \
  .select(pl.col("district").unique().sort())


district
str
"""Balod"""
"""Balodabazar-Bhatapara"""
"""Balrampur-Ramanujganj"""
"""Bastar"""
"""Bemetara"""
"""Bijapur"""
"""Bilaspur"""
"""Dakshin Bastar Dantewada"""
"""Dhamtari"""
"""Durg"""


dadra and nagar haveli and dam… 

In [423]:
df.filter(pl.col("state") == "dadra and nagar haveli and daman and diu") \
  .select(pl.col("district").unique().sort())


district
str
"""dadra & nagar haveli"""
"""dadra and nagar haveli"""
"""daman"""
"""diu"""


In [424]:
dadra_daman_diu_district_map = {
    "dadra & nagar haveli": "Dadra and Nagar Haveli",
    "dadra and nagar haveli": "Dadra and Nagar Haveli",

    "daman": "Daman",
    "diu": "Diu"
}


In [425]:
import polars as pl

df = df.with_columns([
    # Normalize district
    pl.when(
        pl.col("district").str.to_lowercase().is_in(dadra_daman_diu_district_map.keys())
    )
    .then(
        pl.col("district")
          .str.to_lowercase()
          .str.replace_all("&", "and")
          .str.replace_all(r"\s+", " ")
          .map_elements(lambda x: dadra_daman_diu_district_map.get(x, x))
    )
    .otherwise(pl.col("district"))
    .alias("district"),

])


In [426]:
df.filter(pl.col("state") == "dadra and nagar haveli and daman and diu") \
  .select(pl.col("district").unique().sort())


district
str
"""Dadra and Nagar Haveli"""
"""Daman"""
"""Diu"""


Delhi

In [427]:
df.filter(pl.col("state") == "delhi") \
  .select(pl.col("district").unique().sort())


district
str
"""central delhi"""
"""east delhi"""
"""najafgarh"""
"""new delhi"""
"""north delhi"""
"""north east"""
"""north east *"""
"""north east delhi"""
"""north west delhi"""
"""shahdara"""


In [428]:
delhi_district_map = {
    "central delhi": "Central",
    "central": "Central",

    "east delhi": "East",
    "east": "East",

    "new delhi": "New Delhi",

    "north delhi": "North",
    "north": "North",

    "north east": "North East",
    "north east delhi": "North East",
    "north east *":"North East",
    "north west delhi": "North West",
    "north west": "North West",

    "shahdara": "Shahdara",

    "south delhi": "South",
    "south": "South",

    "south east delhi": "South East",
    "south east": "South East",

    "south west delhi": "South West",
    "south west": "South West",

    "west delhi": "West",
    "west": "West",

    # sub-division, not district
    "najafgarh": "South West"
}


In [429]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "delhi")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: delhi_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [430]:
df.filter(pl.col("state") == "delhi") \
  .select(pl.col("district").unique().sort())


district
str
"""Central"""
"""East"""
"""New Delhi"""
"""North"""
"""North East"""
"""North West"""
"""Shahdara"""
"""South"""
"""South East"""
"""South West"""


Goa

In [431]:
df.filter(pl.col("state") == "goa") \
  .select(pl.col("district").unique().sort())


district
str
"""bardez"""
"""north goa"""
"""south goa"""


In [432]:
goa_district_map = {
    "north goa": "North Goa",
    "south goa": "South Goa",

    # talukas → district
    "bardez": "North Goa",
    "tiswadi": "North Goa"
}


In [433]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "goa")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: goa_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [434]:
df.filter(pl.col("state") == "goa") \
  .select(pl.col("district").unique().sort())


district
str
"""North Goa"""
"""South Goa"""


Gujarat

In [435]:
df.filter(pl.col("state") == "gujarat") \
  .select(pl.col("district").unique().sort())


district
str
"""ahmadabad"""
"""ahmedabad"""
"""amreli"""
"""anand"""
"""arvalli"""
"""banas kantha"""
"""banaskantha"""
"""bharuch"""
"""bhavnagar"""
"""botad"""


In [436]:
gujarat_district_map = {
    "ahmadabad": "Ahmedabad",
    "ahmedabad": "Ahmedabad",

    "amreli": "Amreli",
    "anand": "Anand",
    "arvalli": "Arvalli",

    "banas kantha": "Banas Kantha",
    "banaskantha": "Banas Kantha",

    "bharuch": "Bharuch",
    "bhavnagar": "Bhavnagar",
    "botad": "Botad",

    "chhotaudepur": "Chhotaudepur",

    "dahod": "Dahod",
    "dohad": "Dahod",

    "the dangs": "Dangs",
    "dangs": "Dangs",

    "devbhumi dwarka": "Devbhumi Dwarka",

    "gandhinagar": "Gandhinagar",
    "gir somnath": "Gir Somnath",
    "jamnagar": "Jamnagar",
    "junagadh": "Junagadh",

    "kachchh": "Kachchh",

    "kheda": "Kheda",

    "mahesana": "Mahesana",
    "mahisagar": "Mahisagar",

    "morbi": "Morbi",
    "narmada": "Narmada",
    "navsari": "Navsari",

    "panch mahals": "Panch Mahals",
    "panchmahals": "Panch Mahals",

    "patan": "Patan",
    "porbandar": "Porbandar",
    "rajkot": "Rajkot",

    "sabar kantha": "Sabar Kantha",
    "sabarkantha": "Sabar Kantha",

    "surat": "Surat",

    "surendra nagar": "Surendranagar",
    "surendranagar": "Surendranagar",

    "tapi": "Tapi",

    "vadodara": "Vadodara",
    "valsad": "Valsad",

    # future-proof (only applied if present)
    "vav-tharad": "Vav-Tharad",
    "vav tharad": "Vav-Tharad"
}


In [437]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "gujarat")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("-", " ")
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: gujarat_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [438]:
df.filter(pl.col("state") == "gujarat") \
  .select(pl.col("district").unique().sort())


district
str
"""Ahmedabad"""
"""Amreli"""
"""Anand"""
"""Arvalli"""
"""Banas Kantha"""
"""Bharuch"""
"""Bhavnagar"""
"""Botad"""
"""Chhotaudepur"""
"""Dahod"""


haryana

In [439]:
df.filter(pl.col("state") == "haryana") \
  .select(pl.col("district").unique().sort())


district
str
"""ambala"""
"""bhiwani"""
"""charkhi dadri"""
"""faridabad"""
"""fatehabad"""
"""gurgaon"""
"""gurugram"""
"""hisar"""
"""jhajjar"""
"""jhajjar *"""


In [440]:
haryana_district_map = {
    "ambala": "Ambala",
    "bhiwani": "Bhiwani",
    "charkhi dadri": "Charkhi Dadri",
    "faridabad": "Faridabad",
    "fatehabad": "Fatehabad",

    "gurgaon": "Gurugram",
    "gurugram": "Gurugram",

    "hisar": "Hisar",
    "jhajjar": "Jhajjar",
    "jhajjar *":"Jhajjar",
    "jind": "Jind",
    "kaithal": "Kaithal",
    "karnal": "Karnal",
    "kurukshetra": "Kurukshetra",
    "mahendragarh": "Mahendragarh",
    "akhera":"Nuh",
    "mewat": "Nuh",
    "nuh": "Nuh",

    "palwal": "Palwal",
    "panchkula": "Panchkula",
    "panipat": "Panipat",
    "rewari": "Rewari",
    "rohtak": "Rohtak",
    "sirsa": "Sirsa",
    "sonipat": "Sonipat",

    "yamuna nagar": "Yamunanagar",
    "yamunanagar": "Yamunanagar"
}


In [441]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "haryana")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
         
            .map_elements(lambda x: haryana_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [442]:
df.filter(pl.col("state") == "haryana") \
  .select(pl.col("district").unique().sort())


district
str
"""Ambala"""
"""Bhiwani"""
"""Charkhi Dadri"""
"""Faridabad"""
"""Fatehabad"""
"""Gurugram"""
"""Hisar"""
"""Jhajjar"""
"""Jind"""
"""Kaithal"""


himachal pradesh

In [443]:
df.filter(pl.col("state") == "himachal pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""bilaspur"""
"""chamba"""
"""hamirpur"""
"""kangra"""
"""kinnaur"""
"""kullu"""
"""lahaul and spiti"""
"""lahul & spiti"""
"""lahul and spiti"""
"""mandi"""


In [444]:
himachal_district_map = {
    "bilaspur": "Bilaspur",
    "chamba": "Chamba",
    "hamirpur": "Hamirpur",
    "kangra": "Kangra",
    "kinnaur": "Kinnaur",
    "kullu": "Kullu",

    "lahaul and spiti": "Lahaul And Spiti",
    "lahul & spiti": "Lahaul And Spiti",
    "lahul and spiti": "Lahaul And Spiti",

    "mandi": "Mandi",
    "shimla": "Shimla",
    "sirmaur": "Sirmaur",
    "solan": "Solan",
    "una": "Una"
}


In [445]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "himachal pradesh")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("&", "and")
            .str.replace_all(r"\s+", " ")

            .map_elements(lambda x: himachal_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [446]:
df.filter(pl.col("state") == "himachal pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""Bilaspur"""
"""Chamba"""
"""Hamirpur"""
"""Kangra"""
"""Kinnaur"""
"""Kullu"""
"""Lahaul And Spiti"""
"""Mandi"""
"""Shimla"""
"""Sirmaur"""


Jammu

In [447]:
df.filter(pl.col("state") == "jammu and kashmir") \
  .select(pl.col("district").unique().sort())


district
str
"""anantnag"""
"""badgam"""
"""bandipore"""
"""bandipur"""
"""baramula"""
"""budgam"""
"""doda"""
"""ganderbal"""
"""jammu"""
"""kargil"""


In [448]:
jammu_kashmir_district_map = {
    "?": "Jammu",

    "anantnag": "Anantnag",

    "bandipore": "Bandipora",
    "bandipur": "Bandipora",

    "baramula": "Baramulla",

    "badgam": "Budgam",
    "budgam": "Budgam",

    "doda": "Doda",
    "ganderbal": "Ganderbal",
    "jammu": "Jammu",
    "kathua": "Kathua",
    "kishtwar": "Kishtwar",
    "kulgam": "Kulgam",
    "kupwara": "Kupwara",

    "poonch": "Poonch",
    "punch": "Poonch",

    "pulwama": "Pulwama",

    "rajauri": "Rajouri",
    "rajouri": "Rajouri",

    "ramban": "Ramban",
    "reasi": "Reasi",
    "samba": "Samba",

    "shupiyan": "Shopian",
    "shopian": "Shopian",

    "srinagar": "Srinagar",
    "udhampur": "Udhampur"
}


In [449]:
ladakh_district_map = {
    "leh": "Leh",
    "leh ladakh": "Leh",
    "leh (ladakh)": "Leh",
    "kargil": "Kargil"
}


In [450]:
import polars as pl

df = df.with_columns([
    # Fix district names
    pl.when(pl.col("district").str.to_lowercase().is_in(ladakh_district_map.keys()))
      .then(
          pl.col("district")
            .str.to_lowercase()
            .map_elements(lambda x: ladakh_district_map.get(x, x))
      )
      .otherwise(
          pl.col("district")
            .str.to_lowercase()
            .map_elements(lambda x: jammu_kashmir_district_map.get(x, x))
      )
      .alias("district"),

    # Fix state (move Ladakh districts)
    pl.when(pl.col("district").str.to_lowercase().is_in(ladakh_district_map.values()))
      .then(pl.lit("ladakh"))
      .otherwise(pl.col("state"))
      .alias("state")
])


In [451]:
ladakh_districts = {"leh", "kargil"}


In [452]:
import polars as pl

df = df.with_columns([
    # Normalize district names (J&K + Ladakh)
    pl.col("district")
      .str.to_lowercase()
      .str.replace_all(r"\s+", " ")

      .map_elements(
          lambda x: (
              "Leh" if x in ["leh", "leh ladakh", "leh (ladakh)"] else
              "Kargil" if x == "kargil" else
              x.title()
          )
      )
      .alias("district"),

    # Fix state for Ladakh districts
    pl.when(pl.col("district").str.to_lowercase().is_in(ladakh_districts))
      .then(pl.lit("ladakh"))
      .otherwise(pl.col("state"))
      .alias("state")
])


In [453]:
df.filter(pl.col("state") == "jammu and kashmir") \
  .select(pl.col("district").unique().sort())


district
str
"""Anantnag"""
"""Bandipora"""
"""Baramulla"""
"""Budgam"""
"""Doda"""
"""Ganderbal"""
"""Jammu"""
"""Kathua"""
"""Kishtwar"""
"""Kulgam"""


In [454]:
df.filter(pl.col("state") == "ladakh") \
  .select(pl.col("district").unique().sort())


district
str
"""Kargil"""
"""Leh"""


jharkhand

In [455]:
df.filter(pl.col("state") == "jharkhand") \
  .select(pl.col("district").unique().sort())

district
str
"""Bokaro"""
"""Bokaro *"""
"""Chatra"""
"""Deoghar"""
"""Dhanbad"""
"""Dumka"""
"""East Singhbhum"""
"""East Singhbum"""
"""Garhwa"""
"""Garhwa *"""


In [555]:
jharkhand_district_map = {
    "bokaro": "Bokaro",
    "bokaro *": "Bokaro",

    "chatra": "Chatra",
    "deoghar": "Deoghar",
    "dhanbad": "Dhanbad",
    "dumka": "Dumka",

    "east singhbhum": "East Singhbhum",
    "purbi singhbhum": "East Singhbhum",
    "eastsinghbum":"East Singhbhum",
    "eastsinghbum": "East Singhbhum",
    "garhwa": "Garhwa",
    "garhwa *": "Garhwa",

    "giridih": "Giridih",
    "godda": "Godda",
    "gumla": "Gumla",

    "hazaribag": "Hazaribagh",
    "hazaribagh": "Hazaribagh",

    "jamtara": "Jamtara",
    "khunti": "Khunti",

    "kodarma": "Koderma",
    "koderma": "Koderma",

    "latehar": "Latehar",
    "lohardaga": "Lohardaga",

    "pakaur": "Pakur",
    "pakur": "Pakur",

    "palamau": "Palamu",
    "palamu": "Palamu",

    "ramgarh": "Ramgarh",
    "ranchi": "Ranchi",

    "sahebganj": "Sahebganj",
    "sahibganj": "Sahebganj",

    "seraikela-kharsawan": "Saraikela Kharsawan",
    "seraikela kharsawan": "Saraikela Kharsawan",

    "simdega": "Simdega",

    "west singhbhum": "West Singhbhum",
    "pashchimi singhbhum": "West Singhbhum"
}


In [556]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "jharkhand")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("\*","")
            .str.replace_all("-", " ")
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: jharkhand_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [557]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "jharkhand")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(" ","")
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [558]:
df.filter(pl.col("state") == "jharkhand") \
  .select(pl.col("district").unique().sort())


district
str
"""bokaro"""
"""chatra"""
"""deoghar"""
"""dhanbad"""
"""dumka"""
"""eastsinghbhum"""
"""garhwa"""
"""giridih"""
"""godda"""
"""gumla"""


karnataka

In [460]:
df.filter(pl.col("state") == "karnataka") \
  .select(pl.col("district").unique().sort())

district
str
"""Bagalkot"""
"""Bagalkot *"""
"""Ballari"""
"""Bangalore"""
"""Bangalore Rural"""
"""Belagavi"""
"""Belgaum"""
"""Bellary"""
"""Bengaluru"""
"""Bengaluru Rural"""


In [461]:
karnataka_district_map = {
    # Bagalkote
    "bagalkot": "Bagalkote",
    "bagalkot *": "Bagalkote",

    # Ballari
    "ballari": "Ballari",
    "bellary": "Ballari",

    # Belagavi
    "belagavi": "Belagavi",
    "belgaum": "Belagavi",

    # Bengaluru
    "bangalore": "Bengaluru Urban",
    "bengaluru": "Bengaluru Urban",
    "bangalore urban": "Bengaluru Urban",
    "bengaluru urban": "Bengaluru Urban",

    "bangalore rural": "Bengaluru Rural",
    "bengaluru rural": "Bengaluru Rural",

    "bengaluru south": "Bengaluru South",

    # Bidar
    "bidar": "Bidar",

    # Chamarajanagar
    "chamarajanagar": "Chamarajanagar",
    "chamarajanagar *": "Chamarajanagar",
    "chamrajanagar": "Chamarajanagar",
    "chamrajnagar": "Chamarajanagar",

    # Chikkaballapura
    "chikkaballapur": "Chikkaballapura",

    # Chikkamagaluru
    "chickmagalur": "Chikkamagaluru",
    "chikmagalur": "Chikkamagaluru",
    "chikkamagaluru": "Chikkamagaluru",

    # Chitradurga
    "chitradurga": "Chitradurga",

    # Dakshina Kannada
    "dakshina kannada": "Dakshina Kannada",

    # Davanagere
    "davanagere": "Davanagere",
    "davangere": "Davanagere",

    # Dharwad
    "dharwad": "Dharwad",

    # Gadag
    "gadag": "Gadag",
    "gadag *": "Gadag",

    # Hassan
    "hasan": "Hassan",
    "hassan": "Hassan",

    # Haveri
    "haveri": "Haveri",
    "haveri *": "Haveri",

    # Kalaburagi
    "gulbarga": "Kalaburagi",
    "kalaburagi": "Kalaburagi",

    # Kodagu
    "kodagu": "Kodagu",

    # Kolar
    "kolar": "Kolar",

    # Koppal
    "koppal": "Koppal",

    # Mandya
    "mandya": "Mandya",

    # Mysuru
    "mysore": "Mysuru",
    "mysuru": "Mysuru",

    # Raichur
    "raichur": "Raichur",

    # Shivamogga
    "shimoga": "Shivamogga",
    "shivamogga": "Shivamogga",

    # Tumakuru
    "tumakuru": "Tumakuru",
    "tumkur": "Tumakuru",

    # Udupi
    "udupi": "Udupi",
    "udupi *": "Udupi",

    # Uttara Kannada
    "uttara kannada": "Uttara Kannada",

    # Vijayanagara
    "vijayanagara": "Vijayanagara",

    # Vijayapura
    "bijapur": "Vijayapura",
    "bijapur(kar)": "Vijayapura",
    "vijayapura": "Vijayapura",

    # Yadgir
    "yadgir": "Yadgir"
}


In [462]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "karnataka")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("\ *", "")
            .str.replace_all(r"\s+", " ")
       
            .map_elements(lambda x: karnataka_district_map.get(x, x))
          
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [463]:
df.filter(pl.col("state") == "karnataka") \
  .select(pl.col("district").unique().sort())


district
str
"""Bagalkote"""
"""Ballari"""
"""Belagavi"""
"""Bengaluru Urban"""
"""Bidar"""
"""Chamarajanagar"""
"""Chikkaballapura"""
"""Chikkamagaluru"""
"""Chitradurga"""
"""Davanagere"""


In [464]:
karnataka_extra_map = {
    "bagalkot*": "Bagalkote",

    "bangalorerural": "Bengaluru Rural",
    "bengalururural": "Bengaluru Rural",
    "bengalurusouth": "Bengaluru South",

    "chamarajanagar*": "Chamarajanagar",

    "dakshinakannada": "Dakshina Kannada",

    "gadag*": "Gadag",
    "haveri*": "Haveri",

    "ramanagar": "Ramanagara",

    "udupi*": "Udupi",

    "uttarakannada": "Uttara Kannada"
}


In [465]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "karnataka")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("*", "", literal=True)  # safe
            .str.replace_all(r"\s+", "")
            .map_elements(lambda x: karnataka_extra_map.get(x, x.title()))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [466]:
df.filter(pl.col("state") == "karnataka") \
  .select(pl.col("district").unique().sort())


district
str
"""Bagalkot"""
"""Bagalkote"""
"""Ballari"""
"""Belagavi"""
"""Bengaluru Rural"""
"""Bengaluru South"""
"""Bengaluruurban"""
"""Bidar"""
"""Chamarajanagar"""
"""Chikkaballapura"""


In [467]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "karnataka")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .map_elements(lambda x: {
                "bagalkot": "Bagalkote",
                "bagalkote": "Bagalkote",
                "bengaluruurban": "Bengaluru Urban"
            }.get(x, x.title()))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [468]:
df.filter(pl.col("state") == "karnataka") \
  .select(pl.col("district").unique().sort())


district
str
"""Bagalkote"""
"""Ballari"""
"""Belagavi"""
"""Bengaluru Rural"""
"""Bengaluru South"""
"""Bengaluru Urban"""
"""Bidar"""
"""Chamarajanagar"""
"""Chikkaballapura"""
"""Chikkamagaluru"""


In [469]:
import polars as pl

df = df.with_columns(
    pl.when(
        (pl.col("state") == "karnataka") &
        (pl.col("district") == "Ramanagara")
    )
    .then(pl.lit("Bengaluru South"))
    .otherwise(pl.col("district"))
    .alias("district")
)


In [470]:
df.filter(pl.col("state") == "karnataka") \
  .select(pl.col("district").unique().sort())


district
str
"""Bagalkote"""
"""Ballari"""
"""Belagavi"""
"""Bengaluru Rural"""
"""Bengaluru South"""
"""Bengaluru Urban"""
"""Bidar"""
"""Chamarajanagar"""
"""Chikkaballapura"""
"""Chikkamagaluru"""


kerala

In [471]:
df.filter(pl.col("state") == "kerala") \
  .select(pl.col("district").unique().sort())


district
str
"""Alappuzha"""
"""Ernakulam"""
"""Idukki"""
"""Kannur"""
"""Kasaragod"""
"""Kasargod"""
"""Kollam"""
"""Kottayam"""
"""Kozhikode"""
"""Malappuram"""


In [472]:
kerala_district_map = {
    "alappuzha": "Alappuzha",
    "ernakulam": "Ernakulam",
    "idukki": "Idukki",
    "kannur": "Kannur",

    "kasargod": "Kasaragod",
    "kasaragod": "Kasaragod",

    "kollam": "Kollam",
    "kottayam": "Kottayam",
    "kozhikode": "Kozhikode",
    "malappuram": "Malappuram",
    "palakkad": "Palakkad",
    "pathanamthitta": "Pathanamthitta",
    "thiruvananthapuram": "Thiruvananthapuram",
    "thrissur": "Thrissur",
    "wayanad": "Wayanad"
}


In [473]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "kerala")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: kerala_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [474]:
df.filter(pl.col("state") == "kerala") \
  .select(pl.col("district").unique().sort())


district
str
"""Alappuzha"""
"""Ernakulam"""
"""Idukki"""
"""Kannur"""
"""Kasaragod"""
"""Kollam"""
"""Kottayam"""
"""Kozhikode"""
"""Malappuram"""
"""Palakkad"""


lakshadweep

In [475]:
df.filter(pl.col("state") == "lakshadweep") \
  .select(pl.col("district").unique().sort())

district
str
"""Lakshadweep"""


madhya pradesh

In [476]:
df.filter(pl.col("state") == "madhya pradesh") \
  .select(pl.col("district").unique().sort())

district
str
"""Agar Malwa"""
"""Alirajpur"""
"""Anuppur"""
"""Ashok Nagar"""
"""Ashoknagar"""
"""Balaghat"""
"""Barwani"""
"""Betul"""
"""Bhind"""
"""Bhopal"""


In [477]:
madhya_pradesh_district_map = {
    "agar malwa": "Agar-Malwa",
    "agar-malwa": "Agar-Malwa",

    "alirajpur": "Alirajpur",
    "anuppur": "Anuppur",

    "ashok nagar": "Ashoknagar",
    "ashoknagar": "Ashoknagar",

    "balaghat": "Balaghat",
    "barwani": "Barwani",
    "betul": "Betul",
    "bhind": "Bhind",
    "bhopal": "Bhopal",
    "burhanpur": "Burhanpur",
    "chhatarpur": "Chhatarpur",
    "chhindwara": "Chhindwara",
    "damoh": "Damoh",
    "datia": "Datia",
    "dewas": "Dewas",
    "dhar": "Dhar",
    "dindori": "Dindori",

    "guna": "Guna",
    "gwalior": "Gwalior",

    "harda": "Harda",
    "harda *": "Harda",

    "hoshangabad": "Narmadapuram",
    "narmadapuram": "Narmadapuram",

    "indore": "Indore",
    "jabalpur": "Jabalpur",
    "jhabua": "Jhabua",
    "katni": "Katni",

    "east nimar": "Khandwa (East Nimar)",
    "khandwa": "Khandwa (East Nimar)",

    "west nimar": "Khargone (West Nimar)",
    "khargone": "Khargone (West Nimar)",

    "maihar": "Maihar",
    "mandla": "Mandla",
    "mandsaur": "Mandsaur",
    "mauganj": "Mauganj",
    "morena": "Morena",

    "narsinghpur": "Narsimhapur",
    "narsimhapur": "Narsimhapur",

    "neemuch": "Neemuch",
    "niwari": "Niwari",

    "pandhurna": "Pandhurna",
    "panna": "Panna",
    "raisen": "Raisen",
    "rajgarh": "Rajgarh",
    "ratlam": "Ratlam",
    "rewa": "Rewa",
    "sagar": "Sagar",
    "satna": "Satna",
    "sehore": "Sehore",
    "seoni": "Seoni",
    "shahdol": "Shahdol",
    "shajapur": "Shajapur",
    "sheopur": "Sheopur",
    "shivpuri": "Shivpuri",
    "sidhi": "Sidhi",
    "singrauli": "Singrauli",
    "tikamgarh": "Tikamgarh",
    "ujjain": "Ujjain",
    "umaria": "Umaria",
    "vidisha": "Vidisha"
}


In [478]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "madhya pradesh")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all("*", "", literal=True)
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: madhya_pradesh_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [479]:
df.filter(pl.col("state") == "madhya pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""Agar-Malwa"""
"""Alirajpur"""
"""Anuppur"""
"""Ashoknagar"""
"""Balaghat"""
"""Barwani"""
"""Betul"""
"""Bhind"""
"""Bhopal"""
"""Burhanpur"""


In [480]:
df = df.with_columns(
    pl.when(
        (pl.col("state") == "madhya pradesh") &
        (pl.col("district") == "harda ")
    )
    .then(pl.lit("Harda"))
    .otherwise(pl.col("district"))
    .alias("district")
)


maharashtra

In [481]:
df.filter(pl.col("state") == "maharashtra") \
  .select(pl.col("district").unique().sort())



district
str
"""Ahilyanagar"""
"""Ahmadnagar"""
"""Ahmed Nagar"""
"""Ahmednagar"""
"""Akola"""
"""Amravati"""
"""Aurangabad"""
"""Beed"""
"""Bhandara"""
"""Bid"""


In [482]:
maharashtra_district_map = {
    # Ahilyanagar
    "ahilyanagar": "Ahilyanagar",
    "ahmadnagar": "Ahilyanagar",
    "ahmed nagar": "Ahilyanagar",
    "ahmednagar":"Ahilyanagar",

    # Akola
    "akola": "Akola",

    # Amravati
    "amravati": "Amravati",

    # Beed
    "beed": "Beed",
    "bid": "Beed",

    # Bhandara
    "bhandara": "Bhandara",

    # Buldhana
    "buldana": "Buldhana",
    "buldhana": "Buldhana",

    # Chandrapur
    "chandrapur": "Chandrapur",

    # Chhatrapati Sambhajinagar
    "aurangabad": "Chhatrapati Sambhajinagar",
    "chatrapati sambhaji nagar": "Chhatrapati Sambhajinagar",
    "chhatrapati sambhajinagar": "Chhatrapati Sambhajinagar",

    # Dharashiv
    "osmanabad": "Dharashiv",
    "dharashiv": "Dharashiv",

    # Dhule
    "dhule": "Dhule",

    # Gadchiroli
    "gadchiroli": "Gadchiroli",

    # Gondia
    "gondia": "Gondia",
    "gondiya": "Gondia",
    "gondiya *": "Gondia",

    # Hingoli
    "hingoli": "Hingoli",
    "hingoli *": "Hingoli",

    # Jalgaon
    "jalgaon": "Jalgaon",

    # Jalna
    "jalna": "Jalna",

    # Kolhapur
    "kolhapur": "Kolhapur",

    # Latur
    "latur": "Latur",

    # Mumbai
    "mumbai": "Mumbai",
    "mumbai city": "Mumbai",

    # Mumbai Suburban
    "mumbai suburban": "Mumbai Suburban",
    "mumbai( sub urban )": "Mumbai Suburban",

    # Nagpur
    "nagpur": "Nagpur",

    # Nanded
    "nanded": "Nanded",

    # Nandurbar
    "nandurbar": "Nandurbar",
    "nandurbar *": "Nandurbar",

    # Nashik
    "nashik": "Nashik",

    # Palghar
    "palghar": "Palghar",

    # Parbhani
    "parbhani": "Parbhani",

    # Pune
    "pune": "Pune",

    # Raigad
    "raigad": "Raigad",
    "raigarh": "Raigad",
    "raigarh(mh)": "Raigad",

    # Ratnagiri
    "ratnagiri": "Ratnagiri",

    # Sangli
    "sangli": "Sangli",

    # Satara
    "satara": "Satara",

    # Sindhudurg
    "sindhudurg": "Sindhudurg",

    # Solapur
    "solapur": "Solapur",

    # Thane
    "thane": "Thane",

    # Wardha
    "wardha": "Wardha",

    # Washim
    "washim": "Washim",
    "washim *": "Washim",

    # Yavatmal
    "yavatmal": "Yavatmal"
}


In [483]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "maharashtra")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: maharashtra_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [484]:
df.filter(pl.col("state") == "maharashtra") \
  .select(pl.col("district").unique().sort())


district
str
"""Ahilyanagar"""
"""Akola"""
"""Amravati"""
"""Beed"""
"""Bhandara"""
"""Buldhana"""
"""Chandrapur"""
"""Chhatrapati Sambhajinagar"""
"""Dharashiv"""
"""Dhule"""


manipur

In [485]:
df.filter(pl.col("state") == "manipur") \
  .select(pl.col("district").unique().sort())


district
str
"""Bishnupur"""
"""Chandel"""
"""Churachandpur"""
"""Imphal East"""
"""Imphal West"""
"""Jiribam"""
"""Kakching"""
"""Pherzawl"""
"""Senapati"""
"""Tamenglong"""


In [486]:
manipur_district_map = {
    "bishnupur": "Bishnupur",
    "chandel": "Chandel",
    "churachandpur": "Churachandpur",
    "imphal east": "Imphal East",
    "imphal west": "Imphal West",
    "jiribam": "Jiribam",
    "kakching": "Kakching",
    "pherzawl": "Pherzawl",
    "senapati": "Senapati",
    "tamenglong": "Tamenglong",
    "thoubal": "Thoubal",
    "ukhrul": "Ukhrul"
}


In [487]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "manipur")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")

            .map_elements(lambda x: manipur_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [488]:
df.filter(pl.col("state") == "manipur") \
  .select(pl.col("district").unique().sort())


district
str
"""Bishnupur"""
"""Chandel"""
"""Churachandpur"""
"""Imphal East"""
"""Imphal West"""
"""Jiribam"""
"""Kakching"""
"""Pherzawl"""
"""Senapati"""
"""Tamenglong"""


meghalaya

In [489]:
df.filter(pl.col("state") == "meghalaya") \
  .select(pl.col("district").unique().sort())


district
str
"""East Garo Hills"""
"""East Jaintia Hills"""
"""East Khasi Hills"""
"""Eastern West Khasi Hills"""
"""Jaintia Hills"""
"""Kamrup"""
"""North Garo Hills"""
"""Ri Bhoi"""
"""South Garo Hills"""
"""South West Garo Hills"""


In [490]:
meghalaya_district_map = {
    "east garo hills": "East Garo Hills",
    "east jaintia hills": "East Jaintia Hills",
    "east khasi hills": "East Khasi Hills",

    "eastern west khasi hills": "Eastern West Khasi Hills",

    "north garo hills": "North Garo Hills",
    "ri bhoi": "Ri Bhoi",

    "south garo hills": "South Garo Hills",
    "south west garo hills": "South West Garo Hills",
    "south west khasi hills": "South West Khasi Hills",

    "west garo hills": "West Garo Hills",
    "west jaintia hills": "West Jaintia Hills",
    "west khasi hills": "West Khasi Hills",

    # legacy (pre-split)
    "jaintia hills": "West Jaintia Hills"
}


In [491]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "meghalaya")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: meghalaya_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [492]:
df = df.with_columns(
    pl.when(
        (pl.col("state") == "meghalaya") &
        (pl.col("district") == "kamrup")
    )
    .then(pl.lit("assam"))
    .otherwise(pl.col("state"))
    .alias("state")
)


In [493]:
df.filter(pl.col("state") == "meghalaya") \
  .select(pl.col("district").unique().sort())


district
str
"""East Garo Hills"""
"""East Jaintia Hills"""
"""East Khasi Hills"""
"""Eastern West Khasi Hills"""
"""North Garo Hills"""
"""Ri Bhoi"""
"""South Garo Hills"""
"""South West Garo Hills"""
"""South West Khasi Hills"""
"""West Garo Hills"""


mizoram

In [494]:
df.filter(pl.col("state") == "mizoram") \
  .select(pl.col("district").unique().sort())


district
str
"""Aizawl"""
"""Champhai"""
"""Hnahthial"""
"""Khawzawl"""
"""Kolasib"""
"""Lawngtlai"""
"""Lunglei"""
"""Mamit"""
"""Mammit"""
"""Saiha"""


In [495]:
mizoram_district_map = {
    "aizawl": "Aizawl",
    "champhai": "Champhai",
    "hnahthial": "Hnahthial",
    "khawzawl": "Khawzawl",
    "kolasib": "Kolasib",
    "lawngtlai": "Lawngtlai",
    "lunglei": "Lunglei",

    "mamit": "Mamit",
    "mammit": "Mamit",

    "saitual": "Saitual",
    "serchhip": "Serchhip",

    "saiha": "Siaha",
    "siaha": "Siaha"
}


In [496]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "mizoram")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: mizoram_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [497]:
df.filter(pl.col("state") == "mizoram") \
  .select(pl.col("district").unique().sort())


district
str
"""Aizawl"""
"""Champhai"""
"""Hnahthial"""
"""Khawzawl"""
"""Kolasib"""
"""Lawngtlai"""
"""Lunglei"""
"""Mamit"""
"""Saitual"""
"""Serchhip"""


nagaland

In [498]:
df.filter(pl.col("state") == "nagaland") \
  .select(pl.col("district").unique().sort())


district
str
"""Chumukedima"""
"""Dimapur"""
"""Kiphire"""
"""Kohima"""
"""Longleng"""
"""Meluri"""
"""Mokokchung"""
"""Mon"""
"""Niuland"""
"""Noklak"""


In [499]:
nagaland_district_map = {
    "chumukedima": "Chumoukedima",
    "chumoukedima": "Chumoukedima",

    "dimapur": "Dimapur",
    "kiphire": "Kiphire",
    "kohima": "Kohima",
    "longleng": "Longleng",
    "meluri": "Meluri",
    "mokokchung": "Mokokchung",
    "mon": "Mon",
    "niuland": "Niuland",
    "noklak": "Noklak",
    "peren": "Peren",
    "phek": "Phek",
    "shamator": "Shamator",
    "tseminyu": "Tseminyu",
    "tuensang": "Tuensang",
    "wokha": "Wokha",
    "zunheboto": "Zunheboto"
}


In [500]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "nagaland")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: nagaland_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [501]:
df.filter(pl.col("state") == "nagaland") \
  .select(pl.col("district").unique().sort())


district
str
"""Chumoukedima"""
"""Dimapur"""
"""Kiphire"""
"""Kohima"""
"""Longleng"""
"""Meluri"""
"""Mokokchung"""
"""Mon"""
"""Niuland"""
"""Noklak"""


odisha

In [502]:
df.filter(pl.col("state") == "odisha") \
  .select(pl.col("district").unique().sort())


district
str
"""Angul"""
"""Anugal"""
"""Anugul"""
"""Balangir"""
"""Baleshwar"""
"""Baleswar"""
"""Bargarh"""
"""Baudh"""
"""Bhadrak"""
"""Boudh"""


In [503]:
odisha_district_map = {
    "angul": "Angul",
    "anugal": "Angul",
    "anugul": "Angul",
    "anugul *": "Angul",

    "balangir": "Balangir",

    "baleshwar": "Balasore",
    "baleswar": "Balasore",
    "balasore": "Balasore",

    "bargarh": "Bargarh",

    "bhadrak": "Bhadrak",

    "baudh": "Boudh",
    "boudh": "Boudh",

    "cuttack": "Cuttack",

    "debagarh": "Deogarh",
    "deogarh": "Deogarh",

    "dhenkanal": "Dhenkanal",

    "gajapati": "Gajapati",
    "ganjam": "Ganjam",

    "jagatsinghapur": "Jagatsinghapur",
    "jagatsinghpur": "Jagatsinghapur",

    "jajapur": "Jajpur",
    "jajpur": "Jajpur",

    "jharsuguda": "Jharsuguda",
    "kalahandi": "Kalahandi",
    "kandhamal": "Kandhamal",

    "kendrapara": "Kendrapara",

    "kendujhar": "Keonjhar",
    "keonjhar": "Keonjhar",

    "khorda": "Khordha",
    "khordha": "Khordha",
    "khordha *": "Khordha",

    "koraput": "Koraput",
    "malkangiri": "Malkangiri",
    "mayurbhanj": "Mayurbhanj",

    "nabarangapur": "Nabarangpur",
    "nabarangpur": "Nabarangpur",

    "nayagarh": "Nayagarh",
    "nuapada": "Nuapada",

    "puri": "Puri",
    "rayagada": "Rayagada",
    "sambalpur": "Sambalpur",

    "sonapur": "Sonepur",
    "subarnapur": "Sonepur",
    "sonepur": "Sonepur",

    "sundargarh": "Sundargarh",
    "sundergarh": "Sundargarh"
}


In [504]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "odisha")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(" *", "", literal=True)
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: odisha_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [505]:
df.filter(pl.col("state") == "odisha") \
  .select(pl.col("district").unique().sort())


district
str
"""Angul"""
"""Balangir"""
"""Balasore"""
"""Bargarh"""
"""Bhadrak"""
"""Boudh"""
"""Cuttack"""
"""Deogarh"""
"""Dhenkanal"""
"""Gajapati"""


puducherry

In [506]:
df.filter(pl.col("state") == "puducherry") \
  .select(pl.col("district").unique().sort())


district
str
"""Karaikal"""
"""Pondicherry"""
"""Puducherry"""
"""Yanam"""


In [507]:
import polars as pl

df = df.with_columns([
    # Fix district
    pl.when(pl.col("district").str.to_lowercase().is_in(
        ["pondicherry", "puducherry", "yanam"]
    ))
    .then(pl.lit("Puducherry"))

    .when(pl.col("district").str.to_lowercase() == "karaikal")
    .then(pl.lit("Karaikal"))

    .otherwise(pl.col("district"))
    .alias("district"),

    # Fix state
    pl.when(pl.col("district").str.to_lowercase().is_in(
        ["pondicherry", "puducherry", "yanam", "karaikal"]
    ))
    .then(pl.lit("puducherry"))

    .when(pl.col("district").str.to_lowercase().is_in(
        ["cuddalore", "viluppuram"]
    ))
    .then(pl.lit("tamil nadu"))

    .otherwise(pl.col("state"))
    .alias("state")
])


In [508]:
df.filter(pl.col("state") == "puducherry") \
  .select(pl.col("district").unique().sort())


district
str
"""Karaikal"""
"""Puducherry"""


punjab

In [509]:
df.filter(pl.col("state") == "punjab") \
  .select(pl.col("district").unique().sort())


district
str
"""Amritsar"""
"""Barnala"""
"""Bathinda"""
"""Faridkot"""
"""Fatehgarh Sahib"""
"""Fazilka"""
"""Ferozepur"""
"""Firozpur"""
"""Gurdaspur"""
"""Hoshiarpur"""


In [510]:
punjab_district_map = {
    "amritsar": "Amritsar",
    "barnala": "Barnala",
    "bathinda": "Bathinda",
    "faridkot": "Faridkot",
    "fatehgarh sahib": "Fatehgarh Sahib",
    "fazilka": "Fazilka",

    "ferozepur": "Ferozepur",
    "firozpur": "Ferozepur",

    "gurdaspur": "Gurdaspur",
    "hoshiarpur": "Hoshiarpur",
    "jalandhar": "Jalandhar",
    "kapurthala": "Kapurthala",
    "ludhiana": "Ludhiana",
    "malerkotla": "Malerkotla",
    "mansa": "Mansa",
    "moga": "Moga",

    "muktsar": "Sri Muktsar Sahib",
    "sri muktsar sahib": "Sri Muktsar Sahib",

    "pathankot": "Pathankot",
    "patiala": "Patiala",
    "rupnagar": "Rupnagar",

    "s.a.s nagar(mohali)": "S.A.S Nagar",
    "s.a.s. nagar": "S.A.S Nagar",
    "sas nagar (mohali)": "S.A.S Nagar",
    "s.a.s nagar": "S.A.S Nagar",

    "sangrur": "Sangrur",

    "nawanshahr": "Shaheed Bhagat Singh Nagar",
    "shahid bhagat singh nagar": "Shaheed Bhagat Singh Nagar",
    "shaheed bhagat singh nagar": "Shaheed Bhagat Singh Nagar",

    "tarn taran": "Tarn Taran"
}


In [511]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "punjab")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: punjab_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [512]:
df.filter(pl.col("state") == "punjab") \
  .select(pl.col("district").unique().sort())


district
str
"""Amritsar"""
"""Barnala"""
"""Bathinda"""
"""Faridkot"""
"""Fatehgarh Sahib"""
"""Fazilka"""
"""Ferozepur"""
"""Gurdaspur"""
"""Hoshiarpur"""
"""Jalandhar"""


rajasthan

In [513]:
df.filter(pl.col("state") == "rajasthan") \
  .select(pl.col("district").unique().sort())


district
str
"""Ajmer"""
"""Alwar"""
"""Balotra"""
"""Banswara"""
"""Baran"""
"""Barmer"""
"""Beawar"""
"""Bharatpur"""
"""Bhilwara"""
"""Bikaner"""


In [514]:
rajasthan_district_map = {
    # A
    "ajmer": "Ajmer",
    "alwar": "Alwar",

    # B
    "balotra": "Balotra",
    "banswara": "Banswara",
    "baran": "Baran",
    "barmer": "Barmer",
    "beawar": "Beawar",
    "bharatpur": "Bharatpur",
    "bhilwara": "Bhilwara",
    "bikaner": "Bikaner",
    "bundi": "Bundi",

    # C
    "chittaurgarh": "Chittorgarh",
    "chittorgarh": "Chittorgarh",
    "churu": "Churu",

    # D
    "dausa": "Dausa",
    "deeg": "Deeg",
    "deeg ": "Deeg",
    "dhaulpur": "Dholpur",
    "dholpur": "Dholpur",
    "didwana-kuchaman": "Didwana-Kuchaman",
    "dungarpur": "Dungarpur",

    # G–J
    "ganganagar": "Ganganagar",
    "hanumangarh": "Hanumangarh",
    "jaipur": "Jaipur",
    "jaisalmer": "Jaisalmer",
    "jalor": "Jalore",
    "jalore": "Jalore",
    "jhalawar": "Jhalawar",
    "jhunjhunu": "Jhunjhunu",
    "jhunjhunun": "Jhunjhunu",
    "jodhpur": "Jodhpur",

    # K
    "karauli": "Karauli",
    "khairthal-tijara": "Khairthal-Tijara",
    "kota": "Kota",
    "kotputli-behror": "Kotputli-Behror",

    # N–P
    "nagaur": "Nagaur",
    "pali": "Pali",
    "phalodi": "Phalodi",
    "pratapgarh": "Pratapgarh",

    # R–U
    "rajsamand": "Rajsamand",
    "salumbar": "Salumbar",
    "sawai madhopur": "Sawai Madhopur",
    "sikar": "Sikar",
    "sirohi": "Sirohi",
    "tonk": "Tonk",
    "udaipur": "Udaipur",
}


In [515]:
# rajasthan_district_map = {
#     "ajmer": "Ajmer",
#     "alwar": "Alwar",
#     "balotra": "Balotra",
#     "banswara": "Banswara",
#     "baran": "Baran",
#     "barmer": "Barmer",
#     "beawar": "Beawar",
#     "bharatpur": "Bharatpur",
#     "bhilwara": "Bhilwara",
#     "bikaner": "Bikaner",
#     "bundi": "Bundi",

#     "chittaurgarh": "Chittorgarh",
#     "chittorgarh": "Chittorgarh",

#     "churu": "Churu",
#     "dausa": "Dausa",

#     "deeg": "Deeg",
#     "deeg ": "Deeg",

#     "dhaulpur": "Dholpur",
#     "dholpur": "Dholpur",

#     "didwana-kuchaman": "Didwana-Kuchaman",
#     "dungarpur": "Dungarpur",

#     "ganganagar": "Ganganagar",
#     "hanumangarh": "Hanumangarh",
#     "jaipur": "Jaipur",
#     "jaisalmer": "Jaisalmer",

#     "jalor": "Jalore",
#     "jalore": "Jalore",

#     "jhalawar": "Jhalawar",

#     "jhunjhunu": "Jhunjhunu",
#     "jhunjhunun": "Jhunjhunu",

#     "jodhpur": "Jodhpur",
#     "karauli": "Karauli",

#     "khairthal-tijara": "Khairthal-Tijara",
#     "kota": "Kota",
#     "kotputli-behror": "Kotputli-Behror",
#     "nagaur": "Nagaur",
#     "pali": "Pali",
#     "phalodi": "Phalodi",
#     "pratapgarh": "Pratapgarh",
#     "rajsamand": "Rajsamand",
#     "salumbar": "Salumbar",
#     "sawai madhopur": "Sawai Madhopur",
#     "sikar": "Sikar",
#     "sirohi": "Sirohi",
#     "tonk": "Tonk",
#     "udaipur": "Udaipur"
# }


In [516]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "rajasthan")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: rajasthan_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [517]:
df.filter(pl.col("state") == "rajasthan") \
  .select(pl.col("district").unique().sort())


district
str
"""Ajmer"""
"""Alwar"""
"""Balotra"""
"""Banswara"""
"""Baran"""
"""Barmer"""
"""Beawar"""
"""Bharatpur"""
"""Bhilwara"""
"""Bikaner"""


sikkim

In [518]:
df.filter(pl.col("state") == "sikkim") \
  .select(pl.col("district").unique().sort())


district
str
"""East"""
"""East Sikkim"""
"""Mangan"""
"""Namchi"""
"""North"""
"""North Sikkim"""
"""South"""
"""South Sikkim"""
"""West"""
"""West Sikkim"""


In [519]:
sikkim_district_map = {
    "east": "Gangtok",
    "east sikkim": "Gangtok",

    "north": "Mangan",
    "north sikkim": "Mangan",

    "south": "Namchi",
    "south sikkim": "Namchi",
    "namchi": "Namchi",

    "west": "Gyalshing",
    "west sikkim": "Gyalshing"
}


In [520]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "sikkim")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: sikkim_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [521]:
df.filter(pl.col("state") == "sikkim") \
  .select(pl.col("district").unique().sort())


district
str
"""Gangtok"""
"""Gyalshing"""
"""Mangan"""
"""Namchi"""
"""mangan"""


tamil nadu

In [522]:
df.filter(pl.col("state") == "tamil nadu") \
  .select(pl.col("district").unique().sort())


district
str
"""Ariyalur"""
"""Chengalpattu"""
"""Chennai"""
"""Coimbatore"""
"""Cuddalore"""
"""Dharmapuri"""
"""Dindigul"""
"""Erode"""
"""Kallakurichi"""
"""Kancheepuram"""


In [523]:
tamil_nadu_district_map = {
    "ariyalur": "Ariyalur",
    "chengalpattu": "Chengalpattu",
    "chennai": "Chennai",
    "coimbatore": "Coimbatore",
    "cuddalore": "Cuddalore",
    "dharmapuri": "Dharmapuri",
    "dindigul": "Dindigul",
    "erode": "Erode",
    "kallakurichi": "Kallakurichi",

    "kancheepuram": "Kancheepuram",
    "kanchipuram": "Kancheepuram",

    "kanniyakumari": "Kanniyakumari",
    "kanyakumari": "Kanniyakumari",

    "karur": "Karur",
    "krishnagiri": "Krishnagiri",
    "madurai": "Madurai",
    "mayiladuthurai": "Mayiladuthurai",
    "nagapattinam": "Nagapattinam",
    "namakkal": "Namakkal",
    "namakkal *": "Namakkal",
    "perambalur": "Perambalur",
    "pudukkottai": "Pudukkottai",
    "ramanathapuram": "Ramanathapuram",
    "ranipet": "Ranipet",
    "salem": "Salem",
    "sivaganga": "Sivaganga",
    "tenkasi": "Tenkasi",
    "thanjavur": "Thanjavur",

    "the nilgiris": "The Nilgiris",
    "nilgiris": "The Nilgiris",

    "theni": "Theni",

    "thiruvallur": "Thiruvallur",
    "tiruvallur": "Thiruvallur",

    "thiruvarur": "Thiruvarur",
    "tiruvarur": "Thiruvarur",

    "thoothukkudi": "Thoothukkudi",
    "tuticorin": "Thoothukkudi",

    "tiruchirappalli": "Tiruchirappalli",
    "tirunelveli": "Tirunelveli",

    "tirupathur": "Tirupathur",
    "tirupattur": "Tirupathur",

    "tiruppur": "Tiruppur",
    "tiruvannamalai": "Tiruvannamalai",
    "vellore": "Vellore",

    "viluppuram": "Viluppuram",
    "villupuram": "Viluppuram",

    "virudhunagar": "Virudhunagar"
}


In [524]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "tamil nadu")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: tamil_nadu_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [525]:
df.filter(pl.col("state") == "tamil nadu") \
  .select(pl.col("district").unique().sort())


district
str
"""Ariyalur"""
"""Chengalpattu"""
"""Chennai"""
"""Coimbatore"""
"""Cuddalore"""
"""Dharmapuri"""
"""Dindigul"""
"""Erode"""
"""Kallakurichi"""
"""Kancheepuram"""


telangana

In [526]:

df.filter(pl.col("state") == "telangana") \
  .select(pl.col("district").unique().sort())


district
str
"""Adilabad"""
"""Bhadradri Kothagudem"""
"""Hanumakonda"""
"""Hyderabad"""
"""Jagitial"""
"""Jangaon"""
"""Jangoan"""
"""Jayashankar Bhupalpally"""
"""Jogulamba Gadwal"""
"""Kamareddy"""


In [527]:
telangana_district_map = {
    "adilabad": "Adilabad",

    "bhadradri kothagudem": "Bhadradri Kothagudem",

    "hanumakonda": "Hanumakonda",

    "hyderabad": "Hyderabad",

    "jagitial": "Jagtial",
    "jagtial": "Jagtial",

    "jangoan": "Jangaon",
    "jangaon": "Jangaon",

    "jayashankar bhupalpally": "Jayashankar Bhupalpally",
    "jayashankar bhupalapally": "Jayashankar Bhupalpally",

    "jogulamba gadwal": "Jogulamba Gadwal",

    "kamareddy": "Kamareddy",
    "karimnagar": "Karimnagar",
    "khammam": "Khammam",

    "komaram bheem": "Kumuram Bheem Asifabad",
    "kumuram bheem asifabad": "Kumuram Bheem Asifabad",

    "mahabubabad": "Mahabubabad",
    "mahabubnagar": "Mahabubnagar",

    "mancherial": "Mancherial",
    "medak": "Medak",

    "medchal-malkajgiri": "Medchal Malkajgiri",
    "medchal malkajgiri": "Medchal Malkajgiri",
    "medchal?malkajgiri": "Medchal Malkajgiri",
    "medchal−malkajgiri": "Medchal Malkajgiri",

    "mulugu": "Mulugu",
    "nagarkurnool": "Nagarkurnool",
    "nalgonda": "Nalgonda",
    "narayanpet": "Narayanpet",
    "nirmal": "Nirmal",
    "nizamabad": "Nizamabad",
    "peddapalli": "Peddapalli",

    "rajanna sircilla": "Rajanna Sircilla",

    "ranga reddy": "Ranga Reddy",
    "rangareddy": "Ranga Reddy",

    "sangareddy": "Sangareddy",
    "siddipet": "Siddipet",
    "suryapet": "Suryapet",
    "vikarabad": "Vikarabad",
    "wanaparthy": "Wanaparthy",

    "warangal": "Warangal",
    "warangal urban": "Warangal",
    "warangal rural": "Warangal",
    "warangal (urban)": "Warangal",

    "yadadri": "Yadadri Bhuvanagiri",
    "yadadri.": "Yadadri Bhuvanagiri",
    "yadadri bhuvanagiri": "Yadadri Bhuvanagiri"
}


In [528]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "telangana")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"[−–—]", "-")
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: telangana_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [529]:
df.filter(pl.col("state") == "telangana") \
  .select(pl.col("district").unique().sort())


district
str
"""Adilabad"""
"""Bhadradri Kothagudem"""
"""Hanumakonda"""
"""Hyderabad"""
"""Jagtial"""
"""Jangaon"""
"""Jayashankar Bhupalpally"""
"""Jogulamba Gadwal"""
"""Kamareddy"""
"""Karimnagar"""


tripura

In [530]:
df.filter(pl.col("state") == "tripura") \
  .select(pl.col("district").unique().sort())


district
str
"""Dhalai"""
"""Dhalai *"""
"""Gomati"""
"""Khowai"""
"""North Tripura"""
"""Sepahijala"""
"""South Tripura"""
"""Unakoti"""
"""West Tripura"""


In [531]:
tripura_district_map = {
    "dhalai": "Dhalai",
    "dhalai *":"Dhalai",
    "gomati": "Gomati",
    "khowai": "Khowai",
    "north tripura": "North Tripura",
    "sepahijala": "Sepahijala",
    "south tripura": "South Tripura",
    "unakoti": "Unakoti",
    "west tripura": "West Tripura"
}


In [532]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "tripura")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: tripura_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [533]:
df.filter(pl.col("state") == "tripura") \
  .select(pl.col("district").unique().sort())


district
str
"""Dhalai"""
"""Gomati"""
"""Khowai"""
"""North Tripura"""
"""Sepahijala"""
"""South Tripura"""
"""Unakoti"""
"""West Tripura"""


uttar pradesh

In [534]:
df.filter(pl.col("state") == "uttar pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""Agra"""
"""Aligarh"""
"""Allahabad"""
"""Ambedkar Nagar"""
"""Amethi"""
"""Amroha"""
"""Auraiya"""
"""Ayodhya"""
"""Azamgarh"""
"""Baghpat"""


In [535]:
uttar_pradesh_district_map = {
    "agra": "Agra",
    "aligarh": "Aligarh",

    "allahabad": "Prayagraj",
    "prayagraj": "Prayagraj",

    "ambedkar nagar": "Ambedkar Nagar",
    "amethi": "Amethi",
    "amroha": "Amroha",
    "jyotiba phule nagar": "Amroha",

    "auraiya": "Auraiya",

    "ayodhya": "Ayodhya",
    "faizabad": "Ayodhya",

    "azamgarh": "Azamgarh",

    "baghpat": "Baghpat",
    "bagpat": "Baghpat",

    "bahraich": "Bahraich",
    "ballia": "Ballia",
    "balrampur": "Balrampur",
    "banda": "Banda",

    "bara banki": "Barabanki",
    "barabanki": "Barabanki",

    "bareilly": "Bareilly",
    "basti": "Basti",

    "bhadohi": "Bhadohi",
    "sant ravidas nagar": "Bhadohi",
    "sant ravidas nagar bhadohi": "Bhadohi",

    "bijnor": "Bijnor",
    "budaun": "Budaun",

    "bulandshahar": "Bulandshahr",
    "bulandshahr": "Bulandshahr",

    "chandauli": "Chandauli",
    "chitrakoot": "Chitrakoot",
    "deoria": "Deoria",
    "etah": "Etah",
    "etawah": "Etawah",
    "farrukhabad": "Farrukhabad",
    "fatehpur": "Fatehpur",
    "firozabad": "Firozabad",

    "gautam buddha nagar": "Gautam Buddha Nagar",
    "ghaziabad": "Ghaziabad",
    "ghazipur": "Ghazipur",
    "gonda": "Gonda",
    "gorakhpur": "Gorakhpur",

    "hamirpur": "Hamirpur",
    "hapur": "Hapur",
    "hardoi": "Hardoi",
    "hathras": "Hathras",
    "jalaun": "Jalaun",
    "jaunpur": "Jaunpur",
    "jhansi": "Jhansi",

    "kannauj": "Kannauj",
    "kanpur dehat": "Kanpur Dehat",
    "kanpur nagar": "Kanpur Nagar",
    "kasganj": "Kasganj",
    "kaushambi": "Kaushambi",

    "kheri": "Lakhimpur Kheri",
    "lakhimpur kheri": "Lakhimpur Kheri",

    "kushinagar": "Kushinagar",
    "kushi nagar": "Kushinagar",
    "lalitpur": "Lalitpur",
    "lucknow": "Lucknow",

    "maharajganj": "Mahrajganj",
    "mahrajganj": "Mahrajganj",

    "mahoba": "Mahoba",
    "mainpuri": "Mainpuri",
    "mathura": "Mathura",
    "mau": "Mau",
    "meerut": "Meerut",
    "mirzapur": "Mirzapur",
    "moradabad": "Moradabad",
    "muzaffarnagar": "Muzaffarnagar",

    "pilibhit": "Pilibhit",
    "pratapgarh": "Pratapgarh",

    "rae bareli": "Rae Bareli",
    "raebareli": "Rae Bareli",

    "rampur": "Rampur",
    "saharanpur": "Saharanpur",
    "sambhal": "Sambhal",

    "sant kabir nagar": "Sant Kabir Nagar",

    "shahjahanpur": "Shahjahanpur",
    "shamli": "Shamli",

    "shrawasti": "Shravasti",
    "shravasti": "Shravasti",

    "siddharthnagar": "Siddharthnagar",
    "siddharth nagar": "Siddharthnagar",
    "sitapur": "Sitapur",
    "sonbhadra": "Sonbhadra",
    "sultanpur": "Sultanpur",
    "unnao": "Unnao",
    "varanasi": "Varanasi"
}


In [536]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "uttar pradesh")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(" *", "", literal=True)
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: uttar_pradesh_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [537]:
df.filter(pl.col("state") == "uttar pradesh") \
  .select(pl.col("district").unique().sort())


district
str
"""Agra"""
"""Aligarh"""
"""Ambedkar Nagar"""
"""Amethi"""
"""Amroha"""
"""Auraiya"""
"""Ayodhya"""
"""Azamgarh"""
"""Baghpat"""
"""Bahraich"""


uttarakhand

In [538]:
df.filter(pl.col("state") == "uttarakhand") \
  .select(pl.col("district").unique().sort())


district
str
"""Almora"""
"""Bageshwar"""
"""Chamoli"""
"""Champawat"""
"""Dehradun"""
"""Garhwal"""
"""Hardwar"""
"""Haridwar"""
"""Nainital"""
"""Pauri Garhwal"""


In [539]:
uttarakhand_district_map = {
    "almora": "Almora",
    "bageshwar": "Bageshwar",
    "chamoli": "Chamoli",
    "champawat": "Champawat",
    "dehradun": "Dehradun",

    "haridwar": "Haridwar",
    "hardwar": "Haridwar",

    "nainital": "Nainital",

    "pauri garhwal": "Pauri Garhwal",
    "garhwal": "Pauri Garhwal",

    "pithoragarh": "Pithoragarh",
    "rudraprayag": "Rudraprayag",
    "tehri garhwal": "Tehri Garhwal",
    "udham singh nagar": "Udham Singh Nagar",
    "uttarkashi": "Uttarkashi"
}


In [540]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "uttarakhand")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: uttarakhand_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [541]:
df.filter(pl.col("state") == "uttarakhand") \
  .select(pl.col("district").unique().sort())


district
str
"""Almora"""
"""Bageshwar"""
"""Chamoli"""
"""Champawat"""
"""Dehradun"""
"""Haridwar"""
"""Nainital"""
"""Pauri Garhwal"""
"""Pithoragarh"""
"""Rudraprayag"""


west bengal

In [542]:
df.filter(pl.col("state") == "west bengal") \
  .select(pl.col("district").unique().sort())


district
str
"""24 Paraganas North"""
"""24 Paraganas South"""
"""Alipurduar"""
"""Bankura"""
"""Barddhaman"""
"""Bardhaman"""
"""Birbhum"""
"""Burdwan"""
"""Cooch Behar"""
"""Coochbehar"""


In [543]:
west_bengal_district_map = {
    "alipurduar": "Alipurduar",
    "bankura": "Bankura",
    "birbhum": "Birbhum",

    "cooch behar": "Cooch Behar",
    "coochbehar": "Cooch Behar",
    "koch bihar": "Cooch Behar",

    "dakshin dinajpur": "Dakshin Dinajpur",
    "dinajpur dakshin": "Dakshin Dinajpur",

    "darjeeling": "Darjeeling",
    "darjiling": "Darjeeling",

    "hooghly": "Hooghly",
    "hooghiy": "Hooghly",
    "hugli": "Hooghly",

    "howrah": "Howrah",
    "haora": "Howrah",
    "hawrah": "Howrah",

    "jalpaiguri": "Jalpaiguri",
    "jhargram": "Jhargram",
    "kalimpong": "Kalimpong",
    "kolkata": "Kolkata",

    "malda": "Malda",
    "maldah": "Malda",

    "murshidabad": "Murshidabad",
    "nadia": "Nadia",

    "north 24 parganas": "North 24 Parganas",
    "north twenty four parganas": "North 24 Parganas",

    "uttar dinajpur": "Uttar Dinajpur",
    "north dinajpur": "Uttar Dinajpur",
    "dinajpur uttar": "Uttar Dinajpur",

    "paschim bardhaman": "Paschim Bardhaman",
    "purba bardhaman": "Purba Bardhaman",

    "bardhaman": "Purba Bardhaman",
    "barddhaman": "Purba Bardhaman",
    "burdwan": "Purba Bardhaman",

    "paschim medinipur": "Paschim Medinipur",
    "west medinipur": "Paschim Medinipur",
    "west midnapore": "Paschim Medinipur",
    "medinipur west": "Paschim Medinipur",
    "medinipur": "Paschim Medinipur",

    "purba medinipur": "Purba Medinipur",
    "east midnapore": "Purba Medinipur",
    "east midnapur": "Purba Medinipur",

    "purulia": "Purulia",
    "puruliya": "Purulia",

    "south 24 parganas": "South 24 Parganas",
    "south 24 pargana": "South 24 Parganas",
    "24 paraganas north": "North 24 Parganas",
    "24 paraganas south": "South 24 Parganas",
    "south twenty four parganas": "South 24 Parganas",

    "south dinajpur": "Dakshin Dinajpur",

    # municipality leakage → Howrah / Kolkata
    "bally jagachha": "Howrah",
    "domjur": "Howrah",
    "south dumdum(m)": "North 24 Parganas"
}


In [544]:
import polars as pl

df = df.with_columns(
    pl.when(pl.col("state") == "west bengal")
      .then(
          pl.col("district")
            .str.to_lowercase()
            .str.replace_all(r"\s+", " ")
            .map_elements(lambda x: west_bengal_district_map.get(x, x))
      )
      .otherwise(pl.col("district"))
      .alias("district")
)


In [545]:
df.filter(pl.col("state") == "west bengal") \
  .select(pl.col("district").unique().sort())


district
str
"""Alipurduar"""
"""Bankura"""
"""Birbhum"""
"""Cooch Behar"""
"""Dakshin Dinajpur"""
"""Darjeeling"""
"""Hooghly"""
"""Howrah"""
"""Jalpaiguri"""
"""Jhargram"""


In [546]:
import polars as pl

df = df.with_columns(
    pl.col(pl.Utf8).str.to_lowercase()
)


In [547]:
print(df.count())

shape: (1, 7)
┌─────────┬─────────┬──────────┬─────────┬─────────┬──────────┬────────────────┐
│ date    ┆ state   ┆ district ┆ pincode ┆ age_0_5 ┆ age_5_17 ┆ age_18_greater │
│ ---     ┆ ---     ┆ ---      ┆ ---     ┆ ---     ┆ ---      ┆ ---            │
│ u32     ┆ u32     ┆ u32      ┆ u32     ┆ u32     ┆ u32      ┆ u32            │
╞═════════╪═════════╪══════════╪═════════╪═════════╪══════════╪════════════════╡
│ 1006007 ┆ 1006007 ┆ 1006007  ┆ 1006007 ┆ 1006007 ┆ 1006007  ┆ 1006007        │
└─────────┴─────────┴──────────┴─────────┴─────────┴──────────┴────────────────┘


In [560]:
df.write_csv("normalized_enrollment.csv")


In [549]:
s = df.select("state").unique().to_series().to_list()


In [550]:
print(df)

shape: (1_006_007, 7)
┌────────────┬────────────────┬───────────────────────────┬─────────┬─────────┬──────────┬────────────────┐
│ date       ┆ state          ┆ district                  ┆ pincode ┆ age_0_5 ┆ age_5_17 ┆ age_18_greater │
│ ---        ┆ ---            ┆ ---                       ┆ ---     ┆ ---     ┆ ---      ┆ ---            │
│ str        ┆ str            ┆ str                       ┆ i64     ┆ i64     ┆ i64      ┆ i64            │
╞════════════╪════════════════╪═══════════════════════════╪═════════╪═════════╪══════════╪════════════════╡
│ 02-03-2025 ┆ meghalaya      ┆ east khasi hills          ┆ 793121  ┆ 11      ┆ 61       ┆ 37             │
│ 09-03-2025 ┆ karnataka      ┆ bengaluru urban           ┆ 560043  ┆ 14      ┆ 33       ┆ 39             │
│ 09-03-2025 ┆ uttar pradesh  ┆ kanpur nagar              ┆ 208001  ┆ 29      ┆ 82       ┆ 12             │
│ 09-03-2025 ┆ uttar pradesh  ┆ aligarh                   ┆ 202133  ┆ 62      ┆ 29       ┆ 15             │
│ 09-0

In [559]:
state_district_counts = (
    df.select([
        pl.col("state").str.to_lowercase().alias("state"),
        pl.col("district").str.to_lowercase().alias("district")
    ])
    .drop_nulls()
    .unique()  # unique (state, district)
    .group_by("state")
    .agg(pl.col("district").n_unique().alias("district_count"))
    .sort("state")
)

# Print state-wise counts
for state, count in state_district_counts.iter_rows():
    print(f"{state} -> {count}")

# Print grand total
total_districts = state_district_counts.select(
    pl.sum("district_count")
).item()

print("\nTotal unique districts (India):", total_districts)


andaman and nicobar islands -> 3
andhra pradesh -> 26
arunachal pradesh -> 25
assam -> 35
bihar -> 38
chandigarh -> 1
chhattisgarh -> 33
dadra and nagar haveli and daman and diu -> 3
delhi -> 11
goa -> 2
gujarat -> 34
haryana -> 22
himachal pradesh -> 12
jammu and kashmir -> 20
jharkhand -> 24
karnataka -> 31
kerala -> 14
ladakh -> 2
lakshadweep -> 1
madhya pradesh -> 55
maharashtra -> 36
manipur -> 12
meghalaya -> 12
mizoram -> 11
nagaland -> 17
odisha -> 30
puducherry -> 2
punjab -> 23
rajasthan -> 38
sikkim -> 4
tamil nadu -> 38
telangana -> 33
tripura -> 8
uttar pradesh -> 75
uttarakhand -> 13
west bengal -> 23

Total unique districts (India): 767
