## Import the necessary libraries

In [None]:
# Import relevant libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests

## Wikipedia URLs for Chhattisgarh and Odisha

In [2]:
# Chhattisgarh
url_chhattisgarh = "https://en.wikipedia.org/wiki/2014_Indian_general_election_in_Chhattisgarh"
# Backward-compatible alias: this URL was originally stored under a name that
# refers to a different state (Jharkhand) even though it points at Chhattisgarh.
url_jharkhand = url_chhattisgarh

# Odisha
url_orissa = "https://en.wikipedia.org/wiki/2014_Indian_general_election_in_Odisha"

# State-level pages to scrape; the order here determines the order of the scraped rows.
url = [url_orissa, url_chhattisgarh]

## Initialize Lists and DataFrames

In [3]:

# Prefix used to build the per-constituency Wikipedia page URLs
base_url = "https://en.wikipedia.org/wiki/"

# Columns scraped from each constituency's results table
col_names = ['party', 'candidate', 'total_votes']

# Derive the state name from the part of each URL after the final "_in_"
# (e.g. "..._general_election_in_Odisha" -> "Odisha"). This avoids relying on
# an exact underscore count, so multi-word states such as "Madhya_Pradesh"
# are not truncated; underscores in the name are turned back into spaces.
states = [a.rsplit("_in_", 1)[-1].replace("_", " ") for a in url]

# Empty results frame and the accumulators for the turnout scrape
df = pd.DataFrame(columns = col_names)
turn_out = []
constituent_with_cat = []

## Scraping the data of voter turnout

In [4]:

# Scrape the per-constituency turnout table from each state-level page.
#
# The accumulators are reset here so that re-running this cell does not
# append the same rows a second time.
turn_out = []
constituent_with_cat = []

# Iterate through each URL
for h in url:
    # Fail fast on network problems or HTTP errors instead of parsing an error page
    page = requests.get(h, timeout=30)
    page.raise_for_status()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed in the environment
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find the tables containing the data
    table = soup.find_all('table', class_ = "sortable wikitable")
    if not table:
        raise ValueError("No 'sortable wikitable' table found on {}".format(h))

    # The turnout data is in the first table
    specific_table = table[0]

    # Extract data from each row of the table, skipping the header row
    for table_row in specific_table.find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in table_row.find_all('td')]

        # Rows without both a constituency cell and a turnout cell
        # (e.g. header-only or spacer rows) carry no data
        if len(cells) < 2:
            continue

        # First cell: constituency name; second cell: turnout
        constituent_with_cat.append(cells[0])
        turn_out.append(cells[1])

## Insert into DataFrame

In [5]:
# Assemble the turnout table: one row per scraped constituency,
# with the constituency name alongside its turnout value
turnout_columns = {
    "pc_name": constituent_with_cat,
    "turnout": turn_out,
}
df_turnout = pd.DataFrame(data=turnout_columns)
df_turnout

Unnamed: 0,pc_name,turnout
0,Bargarh,78.71
1,Sundargarh,71.66
2,Sambalpur,75.92
3,Keonjhar,80.54
4,Mayurbhanj,79.44
5,Balasore,76.84
6,Bhadrak,73.63
7,Jajpur,75.31
8,Dhenkanal,76.43
9,Bolangir,74.92


## Scraping the candidate-level results data

In [6]:
# Scrape candidate-level results for every constituency linked from each
# state-level page, then assemble them into `df`.
#
# `df` is rebuilt from scratch at the end of this cell, so the cell can be
# re-run safely without duplicating rows or hitting a length mismatch.

# Position of the results table used on each constituency page.
# NOTE(review): the page layout is assumed to put the wanted election in the
# third 'wikitable plainrowheaders' table - adjust this as necessary.
RESULT_TABLE_INDEX = 2
REQUEST_TIMEOUT = 30  # seconds

# Initialize empty lists to store data
cons = []
state = []
turnout = []
records = []

# Loop through each state page together with its state name
for state_name, h in zip(states, url):
    # Request the HTML content of the Wikipedia page, failing fast on HTTP errors
    page = requests.get(h, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()

    # Parse the HTML content with an explicitly named parser
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find the table containing constituency data
    table = soup.find('table', class_ = "sortable wikitable")
    if table is None:
        raise ValueError("No 'sortable wikitable' table found on {}".format(h))

    # Find all redirect links within the table
    table_body = table.find_all('a', class_='mw-redirect')

    # Extract the constituency names from the links
    constituents = [a.text.strip() for a in table_body if '(Lok Sabha constituency)' in a.get('title', '')]

    # Loop through each constituency
    for constituency in constituents:
        # Construct the URL of the constituency page
        j = base_url + constituency + "_Lok_Sabha_constituency"

        # Request the HTML content of the constituency page
        page = requests.get(j, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup_ = BeautifulSoup(page.text, 'html.parser')

        # Find all tables containing constituency data
        tables = soup_.find_all('table', class_ = 'wikitable plainrowheaders')
        if len(tables) <= RESULT_TABLE_INDEX:
            raise IndexError(
                "Expected at least {} result tables on {}, found {}".format(
                    RESULT_TABLE_INDEX + 1, j, len(tables)
                )
            )
        raw2014 = tables[RESULT_TABLE_INDEX]

        # Each candidate is one 'vcard' row of the results table
        for candidate_row in raw2014.find_all('tr', class_ = 'vcard'):
            # Find all cells within the row
            row_dat = candidate_row.find_all('td')

            # Drop the first cell and the last two; what remains must line up
            # with `col_names`
            row_data = [cell.text.strip() for cell in row_dat[1:-2]]
            if len(row_data) != len(col_names):
                raise ValueError(
                    "Unexpected row layout on {}: got {} cells, expected {}".format(
                        j, len(row_data), len(col_names)
                    )
                )

            # Record the row together with its constituency and state
            records.append(row_data)
            cons.append(constituency)
            state.append(state_name)

# Build the DataFrame once from all collected rows instead of growing it row by row
df = pd.DataFrame(records, columns=col_names)

# Add the 'pc_name' column to the DataFrame and populate it with data from the 'cons' list
df['pc_name'] = cons

# Add the 'state' column to the DataFrame and populate it with data from the 'state' list
df['state'] = state

In [8]:
# Show the combined results; a bare last expression uses the notebook's
# rich table rendering instead of the plain-text output of print()
df

Unnamed: 0,party,candidate,total_votes,pc_name,state
0,BJD,Prabhas Kumar Singh,383230,Bargarh,Odisha
1,BJP,Subash Chouhan,372052,Bargarh,Odisha
2,INC,Sanjay Bhoi,274610,Bargarh,Odisha
3,PVP,Kulamani Urma,26216,Bargarh,Odisha
4,CPI,Ashok Bisi,21100,Bargarh,Odisha
...,...,...,...,...,...
191,BJP,Vikram Usendi,465215,Kanker,Chhattisgarh
192,INC,Phulo Devi Netam,430057,Kanker,Chhattisgarh
193,NOTA,None of the Above,31917,Kanker,Chhattisgarh
194,CPI,Ramesh Gawde,23482,Kanker,Chhattisgarh


## Export to CSV

In [157]:
# Write each result table to its own CSV file, omitting the index column:
# candidate-level results go to 'missing.csv', turnout figures to 'missing_turnout.csv'
for frame, csv_name in ((df, 'missing.csv'), (df_turnout, 'missing_turnout.csv')):
    frame.to_csv(csv_name, index=False)