<H1> Cleaning Demographic Dataset <H1>

In [1]:
import pandas as pd
import numpy as np
import os
import openpyxl
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


<H2> Operational Stuffs <H2>

In [2]:
# List all CSV files from the directory
data_dir = r"D:\Project\Hackathons\Aadhar_Hackathon\Data_Sets\api_data_aadhar_demographic"
csv_files = list(Path(data_dir).glob("*.csv"))
print(f"Found {len(csv_files)} CSV files.")


# Load each CSV file and print its shape
dataframes = []
for csv_file in csv_files:
    df_temp = pd.read_csv(csv_file)
    print(f"{csv_file.name}: {df_temp.shape[0]} rows, {df_temp.shape[1]} columns")
    dataframes.append(df_temp)

print(f"\nTotal dataframes loaded: {len(dataframes)}")

# Combine all dataframes
df = pd.concat(dataframes, ignore_index=True)
print(f"\nFinal combined dataframe: {df.shape[0]} rows, {df.shape[1]} columns")

df.to_csv('demographic_combined.csv', index=False)
print("Combined dataframe exported to 'demographic_combined.csv'")

Found 5 CSV files.
api_data_aadhar_demographic_0_500000.csv: 500000 rows, 6 columns
api_data_aadhar_demographic_1000000_1500000.csv: 500000 rows, 6 columns
api_data_aadhar_demographic_1500000_2000000.csv: 500000 rows, 6 columns
api_data_aadhar_demographic_2000000_2071700.csv: 71700 rows, 6 columns
api_data_aadhar_demographic_500000_1000000.csv: 500000 rows, 6 columns

Total dataframes loaded: 5

Final combined dataframe: 2071700 rows, 6 columns
Combined dataframe exported to 'demographic_combined.csv'


In [2]:
# Load the exported combined dataframe
df = pd.read_csv('Data_Sets_Processed/demographic_combined.csv')
print(f"Loaded dataframe: {df.shape[0]} rows, {df.shape[1]} columns")
print(df.head())

Loaded dataframe: 2071700 rows, 6 columns
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22   
2  01-03-2025         Gujarat      Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24   
4  01-03-2025       Rajasthan     Udaipur   313801             45   

   demo_age_17_  
0           529  
1           375  
2           765  
3           314  
4           785  


In [3]:
print(df.dtypes)
print("\nNull Values in the dataframe :- \n",df.isnull().sum())

date             object
state            object
district         object
pincode           int64
demo_age_5_17     int64
demo_age_17_      int64
dtype: object

Null Values in the dataframe :- 
 date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64


In [27]:
# Count rows where state is NaN
nan_state_count = df['state'].isna().sum()
print(f"Number of rows where state is NaN: {nan_state_count}")
nan_count = df['district'].isna().sum()
print(f"Number of rows where district is NaN: {nan_count}")
df = df[df['district'].notna()]
df.to_csv(file_path, index=False)
print("Rows with NaN districts dropped and changes saved to file.")


Number of rows where state is NaN: 0
Number of rows where district is NaN: 17727
Rows with NaN districts dropped and changes saved to file.


<h2> Cleaning the dataset <h2>

<h3>Cleaning on the state column<h3>

In [4]:
# Number of unique states
num_unique_states = df['state'].nunique()
print(f"Number of unique states: {num_unique_states}")

# List all unique states
unique_states = df['state'].unique()
print(f"\nUnique states:\n{sorted(unique_states)}")

Number of unique states: 65

Unique states:
['100000', 'Andaman & Nicobar Islands', 'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'BALANAGAR', 'Bihar', 'Chandigarh', 'Chhatisgarh', 'Chhattisgarh', 'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli', 'Dadra and Nagar Haveli and Daman and Diu', 'Daman & Diu', 'Daman and Diu', 'Darbhanga', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jaipur', 'Jammu & Kashmir', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madanapalle', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Nagpur', 'ODISHA', 'Odisha', 'Orissa', 'Pondicherry', 'Puducherry', 'Punjab', 'Puttenahalli', 'Raja Annamalai Puram', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'Uttaranchal', 'WEST BENGAL', 'WESTBENGAL', 'West  Bengal', 'West Bangal', 'West Bengal', 'West Bengli', 'West bengal', 'Westbengal', 'andhra pradesh', 'od

In [5]:
# List all rows where the state column is '100000'
invalid_state_rows = df[df['state'] == '100000']
print(f"Number of rows with state='100000': {len(invalid_state_rows)}")
print(invalid_state_rows)

Number of rows with state='100000': 2
               date   state district  pincode  demo_age_5_17  demo_age_17_
1507370  20-12-2025  100000   100000   100000              0             1
1866861  23-12-2025  100000   100000   100000              0             1


In [6]:
# Remove rows where state is '100000'
df = df[df['state'] != '100000']
print(f"Dataframe after removing invalid state rows: {df.shape[0]} rows, {df.shape[1]} columns")

Dataframe after removing invalid state rows: 2071698 rows, 6 columns


In [11]:


# Define the standardization mapping
state_mapping = {
    # Andaman & Nicobar Islands (UT)
    'Andaman & Nicobar Islands': 'Andaman and Nicobar Islands',
    'Andaman and Nicobar Islands': 'Andaman and Nicobar Islands',
    
    # Andhra Pradesh (State)
    'Andhra Pradesh': 'Andhra Pradesh',
    'andhra pradesh': 'Andhra Pradesh',
    
    # Arunachal Pradesh (State)
    'Arunachal Pradesh': 'Arunachal Pradesh',
    
    # Assam (State)
    'Assam': 'Assam',
    
    # Bihar (State)
    'Bihar': 'Bihar',
    'Darbhanga': 'Bihar',  # Darbhanga is a city in Bihar
    
    # Chandigarh (UT)
    'Chandigarh': 'Chandigarh',
    
    # Chhattisgarh (State)
    'Chhatisgarh': 'Chhattisgarh',
    'Chhattisgarh': 'Chhattisgarh',
    
    # Dadra and Nagar Haveli and Daman and Diu (UT - merged in 2020)
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra and Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra and Nagar Haveli and Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman & Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    
    # Delhi (UT)
    'Delhi': 'Delhi',
    
    # Goa (State)
    'Goa': 'Goa',
    
    # Gujarat (State)
    'Gujarat': 'Gujarat',
    
    # Haryana (State)
    'Haryana': 'Haryana',
    
    # Himachal Pradesh (State)
    'Himachal Pradesh': 'Himachal Pradesh',
    
    # Jammu and Kashmir (UT - became UT in 2019)
    'Jammu & Kashmir': 'Jammu and Kashmir',
    'Jammu and Kashmir': 'Jammu and Kashmir',
    
    # Jharkhand (State)
    'Jharkhand': 'Jharkhand',
    
    # Karnataka (State)
    'Karnataka': 'Karnataka',
    'Puttenahalli': 'Karnataka',  # Puttenahalli is in Karnataka
    
    # Kerala (State)
    'Kerala': 'Kerala',
    
    # Ladakh (UT - created in 2019)
    'Ladakh': 'Ladakh',
    
    # Lakshadweep (UT)
    'Lakshadweep': 'Lakshadweep',
    
    # Madhya Pradesh (State)
    'Madhya Pradesh': 'Madhya Pradesh',
    
    # Maharashtra (State)
    'Maharashtra': 'Maharashtra',
    'Nagpur': 'Maharashtra',  # Nagpur is a city in Maharashtra
    
    # Manipur (State)
    'Manipur': 'Manipur',
    
    # Meghalaya (State)
    'Meghalaya': 'Meghalaya',
    
    # Mizoram (State)
    'Mizoram': 'Mizoram',
    
    # Nagaland (State)
    'Nagaland': 'Nagaland',
    
    # Odisha (State)
    'ODISHA': 'Odisha',
    'Odisha': 'Odisha',
    'Orissa': 'Odisha',  # Old name for Odisha
    'odisha': 'Odisha',
    
    # Puducherry (UT)
    'Pondicherry': 'Puducherry',  # Old name for Puducherry
    'Puducherry': 'Puducherry',
    
    # Punjab (State)
    'Punjab': 'Punjab',
    
    # Rajasthan (State)
    'Rajasthan': 'Rajasthan',
    'Jaipur': 'Rajasthan',  # Jaipur is the capital of Rajasthan
    
    # Sikkim (State)
    'Sikkim': 'Sikkim',
    
    # Tamil Nadu (State)
    'Tamil Nadu': 'Tamil Nadu',
    'Madanapalle': 'Andhra Pradesh',  # Madanapalle is actually in Andhra Pradesh
    'Raja Annamalai Puram': 'Tamil Nadu',  # Raja Annamalai Puram is in Chennai, Tamil Nadu
    
    # Telangana (State - formed in 2014)
    'Telangana': 'Telangana',
    'BALANAGAR': 'Telangana',  # Balanagar is in Telangana
    
    # Tripura (State)
    'Tripura': 'Tripura',
    
    # Uttar Pradesh (State)
    'Uttar Pradesh': 'Uttar Pradesh',
    
    # Uttarakhand (State)
    'Uttarakhand': 'Uttarakhand',
    'Uttaranchal': 'Uttarakhand',  # Old name for Uttarakhand
    
    # West Bengal (State)
    'WEST BENGAL': 'West Bengal',
    'WESTBENGAL': 'West Bengal',
    'West  Bengal': 'West Bengal',
    'West Bangal': 'West Bengal',
    'West Bengal': 'West Bengal',
    'West Bengli': 'West Bengal',
    'West bengal': 'West Bengal',
    'Westbengal': 'West Bengal',
    'west Bengal': 'West Bengal',
}

# VERIFICATION: Check all 65 original values are mapped
original_values = ['Andaman & Nicobar Islands', 'Andaman and Nicobar Islands', 'Andhra Pradesh', 
                   'Arunachal Pradesh', 'Assam', 'BALANAGAR', 'Bihar', 'Chandigarh', 'Chhatisgarh', 
                   'Chhattisgarh', 'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli', 
                   'Dadra and Nagar Haveli and Daman and Diu', 'Daman & Diu', 'Daman and Diu', 
                   'Darbhanga', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jaipur', 
                   'Jammu & Kashmir', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 
                   'Ladakh', 'Lakshadweep', 'Madanapalle', 'Madhya Pradesh', 'Maharashtra', 
                   'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Nagpur', 'ODISHA', 'Odisha', 
                   'Orissa', 'Pondicherry', 'Puducherry', 'Punjab', 'Puttenahalli', 
                   'Raja Annamalai Puram', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 
                   'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'Uttaranchal', 'WEST BENGAL', 
                   'WESTBENGAL', 'West  Bengal', 'West Bangal', 'West Bengal', 'West Bengli', 
                   'West bengal', 'Westbengal', 'andhra pradesh', 'odisha', 'west Bengal']

print(f"Total original unique values: {len(original_values)}")

# Check if all values are in the mapping
missing_values = [val for val in original_values if val not in state_mapping]
if missing_values:
    print(f"\n❌ MISSING VALUES IN MAPPING: {missing_values}")
else:
    print("\n✓ All 65 values are mapped!")

# Apply the mapping to your dataframe
# Assuming your dataframe is named 'df' and the column is 'state'
df['state'] = df['state'].map(state_mapping)

# Verify the result
print(f"\nUnique states after mapping: {df['state'].nunique()}")
print("\nMapped values:")
for val in sorted(df['state'].unique()):
    print(f"  - {val}")

# Define the 28 states
states = [
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh',
    'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jharkhand',
    'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
    'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
    'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal'
]

# Define the 8 union territories
# Clean and standardize state and district columns
df['state'] = df['state'].str.strip().str.lower().str.replace(' ', '_')
df['district'] = df['district'].str.strip().str.lower().str.replace(' ', '_')

# Save to CSV
df.to_csv('demographic_processed.csv', index=False)
print("✓ Data processed and saved as 'demographic_processed.csv'")
print(f"\nSample of processed data:")
print(df.head())


Total original unique values: 64

✓ All 65 values are mapped!

Unique states after mapping: 36

Mapped values:
  - Andaman and Nicobar Islands
  - Andhra Pradesh
  - Arunachal Pradesh
  - Assam
  - Bihar
  - Chandigarh
  - Chhattisgarh
  - Dadra and Nagar Haveli and Daman and Diu
  - Delhi
  - Goa
  - Gujarat
  - Haryana
  - Himachal Pradesh
  - Jammu and Kashmir
  - Jharkhand
  - Karnataka
  - Kerala
  - Ladakh
  - Lakshadweep
  - Madhya Pradesh
  - Maharashtra
  - Manipur
  - Meghalaya
  - Mizoram
  - Nagaland
  - Odisha
  - Puducherry
  - Punjab
  - Rajasthan
  - Sikkim
  - Tamil Nadu
  - Telangana
  - Tripura
  - Uttar Pradesh
  - Uttarakhand
  - West Bengal
✓ Data processed and saved as 'demographic_processed.csv'

Sample of processed data:
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   uttar_pradesh   gorakhpur   273213             49   
1  01-03-2025  andhra_pradesh    chittoor   517132             22   
2  01-03-2025         gujarat      ra

In [12]:
# Number of unique states
num_unique_states = df['state'].nunique()
print(f"Number of unique states: {num_unique_states}")

# List all unique states
unique_states = df['state'].unique()
print(f"\nUnique states:\n{sorted(unique_states)}")

Number of unique states: 36

Unique states:
['andaman_and_nicobar_islands', 'andhra_pradesh', 'arunachal_pradesh', 'assam', 'bihar', 'chandigarh', 'chhattisgarh', 'dadra_and_nagar_haveli_and_daman_and_diu', 'delhi', 'goa', 'gujarat', 'haryana', 'himachal_pradesh', 'jammu_and_kashmir', 'jharkhand', 'karnataka', 'kerala', 'ladakh', 'lakshadweep', 'madhya_pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tamil_nadu', 'telangana', 'tripura', 'uttar_pradesh', 'uttarakhand', 'west_bengal']


<h3>Change on the district column<h3>

<h4> Processing States and it's Districts dataset <h4>

In [8]:
# Define the file path
xlsx_file = r"D:\Project\Hackathons\Aadhar_Hackathon\Data_Sets_Processed\Districts.xlsx"
csv_file = r"D:\Project\Hackathons\Aadhar_Hackathon\Data_Sets_Processed\Districts.csv"

# Read the Excel file and convert to CSV
df_districts = pd.read_excel(xlsx_file)
df_districts.to_csv(csv_file, index=False)

print(f"✓ Successfully converted Districts.xlsx to Districts.csv")
print(f"Location: {csv_file}")
print(f"Shape: {df_districts.shape[0]} rows, {df_districts.shape[1]} columns")
print(f"\nFirst few rows:")
print(df_districts.head())

✓ Successfully converted Districts.xlsx to Districts.csv
Location: D:\Project\Hackathons\Aadhar_Hackathon\Data_Sets_Processed\Districts.csv
Shape: 788 rows, 2 columns

First few rows:
  https://www.mudranidhi.com/districts-in-india/            Unnamed: 1
0                                          State              District
1                                Andaman Nicobar               Nicobar
2                                Andaman Nicobar  North Middle Andaman
3                                Andaman Nicobar         South Andaman
4                                 Andhra Pradesh  Alluri Sitarama Raju


In [9]:
print(f"Shape: {df_districts.shape[0]} rows, {df_districts.shape[1]} columns")

Shape: 788 rows, 2 columns


In [10]:
# Read the districts file
df_districts = pd.read_csv(csv_file)

# Count unique states and districts
num_unique_states = df_districts['State'].nunique()
num_unique_districts = df_districts['District'].nunique()

print(f"Number of unique states: {num_unique_states}")
print(f"Number of unique districts: {num_unique_districts}")

print("\n" + "="*60)
print("DISTRICTS BY STATE")
print("="*60)

# Group by state and list all districts
for state in sorted(df_districts['State'].unique()):
    districts = df_districts[df_districts['State'] == state]['District'].unique()
    print(f"\n{state}: ({len(districts)} districts)")
    for district in sorted(districts):
        print(f"  - {district}")

Number of unique states: 36
Number of unique districts: 782

DISTRICTS BY STATE

Andaman Nicobar: (3 districts)
  - Nicobar
  - North Middle Andaman
  - South Andaman

Andhra Pradesh: (26 districts)
  - Alluri Sitarama Raju
  - Anakapalli
  - Anantapur
  - Annamaya
  - Bapatla
  - Chittoor
  - East Godavari
  - Eluru
  - Guntur
  - Kadapa
  - Kakinada
  - Konaseema
  - Krishna
  - Kurnool
  - Manyam
  - N T Rama Rao
  - Nandyal
  - Nellore
  - Palnadu
  - Prakasam
  - Sri Balaji
  - Sri Satya Sai
  - Srikakulam
  - Visakhapatnam
  - Vizianagaram
  - West Godavari

Arunachal Pradesh: (27 districts)
  - Anjaw
  - Bichom
  - Changlang
  - Dibang Valley
  - East Kameng
  - East Siang
  - Kamle
  - Keyi Panyor
  - Kra Daadi
  - Kurung Kumey
  - Lepa Rada
  - Lohit
  - Longding
  - Lower Dibang Valley
  - Lower Siang
  - Lower Subansiri
  - Namsai
  - Pakke Kessang
  - Papum Pare
  - Shi Yomi
  - Siang
  - Tawang
  - Tirap
  - Upper Siang
  - Upper Subansiri
  - West Kameng
  - West Siang

A

In [11]:
#These are districts which present in two different states with same district names

# Find districts that appear in multiple states
district_state_mapping = df_districts.groupby('District')['State'].apply(list).to_dict()

# Filter districts that appear in more than one state
duplicate_districts = {district: states for district, states in district_state_mapping.items() if len(states) > 1}

if duplicate_districts:
    print(f"Found {len(duplicate_districts)} districts that appear in multiple states:\n")
    for district, states in sorted(duplicate_districts.items()):
        print(f"  {district}:")
        for state in states:
            print(f"    - {state}")
else:
    print("All districts are unique across states.")

Found 5 districts that appear in multiple states:

  Aurangabad:
    - Bihar
    - Maharashtra
  Balrampur:
    - Chhattisgarh
    - Uttar Pradesh
  Bilaspur:
    - Chhattisgarh
    - Himachal Pradesh
  Hamirpur:
    - Himachal Pradesh
    - Uttar Pradesh
  Pratapgarh:
    - Rajasthan
    - Uttar Pradesh


In [19]:
# Count unique districts in df
num_unique_districts_df = df['district'].nunique()
print(f"Number of unique districts in df: {num_unique_districts_df}")

# List all unique districts
unique_districts_df = sorted(df['district'].unique())
print(f"\nAll unique districts in df:")
for district in unique_districts_df:
    print(f"  - {district}")

Number of unique districts in df: 982

All unique districts in df:
  - 5th cross
  - ANGUL
  - ANUGUL
  - Adilabad
  - Agar Malwa
  - Agra
  - Ahilyanagar
  - Ahmadabad
  - Ahmadnagar
  - Ahmed Nagar
  - Ahmedabad
  - Aizawl
  - Ajmer
  - Akola
  - Alappuzha
  - Aligarh
  - Alipurduar
  - Alirajpur
  - Allahabad
  - Alluri Sitharama Raju
  - Almora
  - Alwar
  - Ambala
  - Ambedkar Nagar
  - Amethi
  - Amravati
  - Amreli
  - Amritsar
  - Amroha
  - Anakapalli
  - Anand
  - Anantapur
  - Ananthapur
  - Ananthapuramu
  - Anantnag
  - Andamans
  - Angul
  - Anjaw
  - Annamayya
  - Anugal
  - Anugul
  - Anuppur
  - Araria
  - Ariyalur
  - Arvalli
  - Arwal
  - Ashok Nagar
  - Auraiya
  - Aurangabad
  - Aurangabad(BH)
  - Aurangabad(bh)
  - Ayodhya
  - Azamgarh
  - Badgam
  - Bagalkot
  - Bagalkot *
  - Bageshwar
  - Baghpat
  - Baghpat *
  - Bagpat
  - Bahraich
  - Bajali
  - Baksa
  - Balaghat
  - Balangir
  - Baleshwar
  - Baleswar
  - Balianta
  - Ballari
  - Ballia
  - Bally Jagachha


<h3> Cleaning the districts <h3>

In [None]:
file_path = r"D:\Project\Hackathons\Aadhar_Hackathon\demographic_processed.csv"

In [304]:
# Count rows where state is NaN
nan_state_count = df['state'].isna().sum()
print(f"Number of rows where state is NaN: {nan_state_count}")
nan_count = df['district'].isna().sum()
print(f"Number of rows where district is NaN: {nan_count}")
df = df[df['district'].notna()]
df.to_csv(file_path, index=False)
print("Rows with NaN districts dropped and changes saved to file.")


Number of rows where state is NaN: 0
Number of rows where district is NaN: 0
Rows with NaN districts dropped and changes saved to file.


In [319]:
mizoram_districts = [
    "aizawl",
    "champhai",
    "hnahthial",
    "khawzawl",
    "kolasib",
    "lawngtlai",
    "lunglei",
    "mamit",
    "saitual",
    "serchhip",
    "saiha"
]

print(len(mizoram_districts))

11


In [320]:
file_path = r"D:\Project\Hackathons\Aadhar_Hackathon\demographic_processed.csv"

state_name = 'mizoram'
# Read the CSV file
df = pd.read_csv(file_path)

# Filter rows where state is Andaman and Nicobar Islands
andaman_df = df[df['state'] == state_name]

# Get unique districts
districts = andaman_df['district'].unique()

# Print district count
print("\nTotal number of districts:", len(districts))



Total number of districts: 11


In [322]:
# Print districts
print(f"Districts in {state_name.replace('_', ' ').title()}:")
for district in districts:
    print("-", district)

Districts in Mizoram:
- aizawl
- serchhip
- kolasib
- lunglei
- mamit
- saiha
- champhai
- lawngtlai
- saitual
- khawzawl
- hnahthial


In [317]:
district_mapping = {
    "aizawl": "aizawl",
    "serchhip": "serchhip",
    "kolasib": "kolasib",
    "lunglei": "lunglei",
    "mamit": "mamit",
    "mammit": "mamit",
    "saiha": "saiha",
    "champhai": "champhai",
    "lawngtlai": "lawngtlai",
    "saitual": "saitual",
    "khawzawl": "khawzawl",
    "hnahthial": "hnahthial"
}


In [318]:
# Apply mapping
mask = df['state'] == state_name
df.loc[mask, 'district'] = df.loc[mask, 'district'].map(district_mapping)

# Save changes to the same CSV
df.to_csv(file_path, index=False)

# Verify
print("District mapping applied and saved successfully.")
print("Updated districts:", df.loc[mask, 'district'].unique())
print("Total districts:", df.loc[mask, 'district'].nunique())

District mapping applied and saved successfully.
Updated districts: ['aizawl' 'serchhip' 'kolasib' 'lunglei' 'mamit' 'saiha' 'champhai'
 'lawngtlai' 'saitual' 'khawzawl' 'hnahthial']
Total districts: 11


In [124]:
ladakh_districts = ['leh', 'kargil','leh_(ladakh)']

In [127]:
# Mask for Telangana districts
tg_mask = df['district'].isin(ladakh_districts)

# Update state
df.loc[tg_mask, 'state'] = 'telangana'
df.to_csv(file_path, index=False)
