In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Location of the scraped workbook that holds the per-year election sheets.
file_path = 'fulldata/scraped/elections_india_data.xlsx'

# Open the workbook once; later cells look up sheet names on this handle and
# parse the individual sheets from it on demand.
excel_data = pd.ExcelFile(file_path)

In [3]:
# Start each consolidated table as its own, independent empty DataFrame
# (one per sheet family in the workbook).
consolidated_sheet1, consolidated_sheet2, consolidated_sheet3 = (
    pd.DataFrame() for _ in range(3)
)

In [4]:
# Election years to consolidate, oldest first. Each year is used to build the
# sheet names (Year_<year>_India_<n>) that are looked up in the workbook.
years = [
    1971, 1977,
    1980, 1984, 1989,
    1991, 1996,
    2001, 2006,
    2011, 2016,
]

In [5]:
# Consolidate the three per-year sheet families into three tables.
#
# For every year the workbook may contain:
#   Year_<year>_India_1  -> summary results
#   Year_<year>_India_2  -> constituency results (keyed by 'A. C. NO.')
#   Year_<year>_India_3  -> party list, split into sections by header rows
#
# The per-year frames are collected in lists and concatenated once at the end.
# This cell therefore rebuilds consolidated_sheet1/2/3 from scratch, so running
# it again does not append the same years a second time.

# Section header text (as it appears in the 'Party Type Abbreviation' column of
# sheet 3) -> category label stamped on the party rows that follow the header.
PARTY_CATEGORY_BY_HEADER = {
    'National Parties': 'National Party',
    'State Parties': 'State Party',
    'Registered(Unrecognised ) Parties': 'Unrecognized Party',
}


def _stringify_without_byepolls(frame):
    """Return `frame` with every cell cast to str and bye-poll rows removed.

    The cast also turns missing cells into literal text ('nan' / 'None').
    Rows are dropped only when an 'Election Type' column exists and its value
    contains 'byepoll' (case-insensitive).
    """
    frame = frame.astype(str)
    if 'Election Type' in frame.columns:
        is_byepoll = frame['Election Type'].str.contains('byepoll', case=False, na=False)
        frame = frame[~is_byepoll]
    return frame


def _party_categories(header_column):
    """Label each row with the category of the closest section header above it.

    Header rows themselves, and rows that appear before the first header, get
    None. The result is an object Series aligned to `header_column`'s index.
    """
    categories = []
    current_category = None
    for label in header_column:
        header_category = PARTY_CATEGORY_BY_HEADER.get(label)
        if header_category is not None:
            # A new section starts here; the header row itself stays unlabelled.
            current_category = header_category
            categories.append(None)
        else:
            categories.append(current_category)
    # dtype=object keeps None as None instead of letting pandas infer a dtype.
    return pd.Series(categories, index=header_column.index, dtype=object)


def _combine(frames):
    """Concatenate the collected frames once; empty DataFrame if none were found."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


sheet1_frames, sheet2_frames, sheet3_frames = [], [], []
available_sheets = set(excel_data.sheet_names)

for year in years:
    # --- Sheet 1: summary results -------------------------------------------
    sheet1_name = f'Year_{year}_India_1'
    if sheet1_name in available_sheets:
        # Some sheets carry the misspelled header 'Paty Name'; normalise it.
        sheet1_data = excel_data.parse(sheet1_name).rename(columns={'Paty Name': 'Party Name'})
        sheet1_data['Election Year'] = str(year)
        sheet1_data = _stringify_without_byepolls(sheet1_data)
        sheet1_frames.append(sheet1_data)

    # --- Sheet 2: constituency results --------------------------------------
    sheet2_name = f'Year_{year}_India_2'
    if sheet2_name in available_sheets:
        sheet2_data = excel_data.parse(sheet2_name)
        if 'A. C. NO.' in sheet2_data.columns:
            # Keep only rows whose constituency number is purely numeric text.
            # NOTE(review): if pandas reads this column as float, values render
            # as '12.0' and fail isnumeric(), dropping the row - confirm the
            # column dtype in the source workbook.
            ac_number = sheet2_data['A. C. NO.'].astype(str)
            has_numeric_id = ac_number.str.isnumeric()
            # .copy() so the column assignments below act on an independent
            # frame rather than on a slice of the parsed sheet.
            sheet2_data = sheet2_data[has_numeric_id].copy()
            sheet2_data['Election Year'] = str(year)
            sheet2_data['Election Unique ID'] = f"{year}_" + ac_number[has_numeric_id].str.zfill(3)
            sheet2_data = _stringify_without_byepolls(sheet2_data)
            sheet2_frames.append(sheet2_data)
        else:
            print(f"Column 'A. C. NO.' not found in {sheet2_name}. Skipping...")

    # --- Sheet 3: party list with section headers ---------------------------
    sheet3_name = f'Year_{year}_India_3'
    if sheet3_name in available_sheets:
        sheet3_data = excel_data.parse(sheet3_name)
        sheet3_data['Party Category'] = _party_categories(sheet3_data['Party Type Abbreviation'])

        # Only rows with a numeric serial number are kept; everything else
        # (including the section header rows used above) is discarded.
        has_serial_number = sheet3_data['S. No.'].astype(str).str.isnumeric()
        sheet3_data = sheet3_data[has_serial_number].copy()
        sheet3_data['Election Year'] = str(year)
        sheet3_data = sheet3_data.rename(columns={
            'S. No.': 'Serial No',
            'Party Type Abbreviation': 'Party Abbreviation',
            'Party': 'Party Name',
            'Party Category': 'Party Type'
        })
        sheet3_data = _stringify_without_byepolls(sheet3_data)
        sheet3_frames.append(sheet3_data)

# One concat per table instead of one per year.
consolidated_sheet1 = _combine(sheet1_frames)
consolidated_sheet2 = _combine(sheet2_frames)
consolidated_sheet3 = _combine(sheet3_frames)


In [6]:
# Order the consolidated tables: sheets 1 and 3 by election year, sheet 2 by
# the per-constituency unique ID (which starts with the year).


def _sort_if_present(frame, column):
    """Stable-sort `frame` by `column`; return it unchanged if the column is absent.

    The guard covers the case where no matching sheets were found and the
    consolidated frame is empty (sorting by a missing column raises KeyError).
    """
    if column not in frame.columns:
        return frame
    # sort_values defaults to quicksort, which may reorder rows that share the
    # same key; mergesort is stable, so rows within one year / one unique ID
    # keep the order they had in the source sheets.
    return frame.sort_values(column, kind='mergesort')


consolidated_sheet1 = _sort_if_present(consolidated_sheet1, 'Election Year')
consolidated_sheet2 = _sort_if_present(consolidated_sheet2, 'Election Unique ID')
consolidated_sheet3 = _sort_if_present(consolidated_sheet3, 'Election Year')

In [7]:
# Write the three consolidated tables to one workbook, one sheet per table.
output_path = 'fulldata/cleaned/Cleaned_Elections_Data.xlsx'

# Create the destination folder up front so the write cannot fail on a missing
# directory after all the processing has already been done.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Output sheet name -> table written under that name (insertion order is the
# sheet order in the workbook).
sheets_to_write = {
    "Consolidated Results": consolidated_sheet1,
    "Election Results": consolidated_sheet2,
    "All Party Categories": consolidated_sheet3,
}

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, table in sheets_to_write.items():
        # index=False: the positional index carries no information.
        table.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Data has been successfully saved to {output_path}")


Data has been successfully saved to fulldata/cleaned/Cleaned_Elections_Data.xlsx
