In [2]:
import pandas as pd
import numpy as np

# Path of the raw CSV file loaded into `df`.
csv_path = "/content/LS_2.0.csv"

# low_memory=False asks pandas to parse the file in one pass instead of in
# internal chunks (see the pandas read_csv documentation).
df = pd.read_csv(csv_path, low_memory=False)

# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2263 entries, 0 to 2262
Data columns (total 19 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   STATE                                     2263 non-null   object 
 1   CONSTITUENCY                              2263 non-null   object 
 2   NAME                                      2263 non-null   object 
 3   WINNER                                    2263 non-null   int64  
 4   PARTY                                     2263 non-null   object 
 5   SYMBOL                                    2018 non-null   object 
 6   GENDER                                    2018 non-null   object 
 7   CRIMINAL
CASES                            2018 non-null   object 
 8   AGE                                       2018 non-null   float64
 9   CATEGORY                                  2018 non-null   object 
 10  EDUCATION                           

In [3]:
# Display the (rows, columns) dimensions of the raw frame.
df.shape

(2263, 19)

In [4]:
# Preview the first rows of the raw frame to eyeball column contents.
df.head()

Unnamed: 0,STATE,CONSTITUENCY,NAME,WINNER,PARTY,SYMBOL,GENDER,CRIMINAL\nCASES,AGE,CATEGORY,EDUCATION,ASSETS,LIABILITIES,GENERAL\nVOTES,POSTAL\nVOTES,TOTAL\nVOTES,OVER TOTAL ELECTORS \nIN CONSTITUENCY,OVER TOTAL VOTES POLLED \nIN CONSTITUENCY,TOTAL ELECTORS
0,Telangana,ADILABAD,SOYAM BAPU RAO,1,BJP,Lotus,MALE,52.0,52.0,ST,12th Pass,"Rs 30,99,414\n ~ 30 Lacs+","Rs 2,31,450\n ~ 2 Lacs+",376892,482,377374,25.330684,35.468248,1489790
1,Telangana,ADILABAD,Godam Nagesh,0,TRS,Car,MALE,0.0,54.0,ST,Post Graduate,"Rs 1,84,77,888\n ~ 1 Crore+","Rs 8,47,000\n ~ 8 Lacs+",318665,149,318814,21.399929,29.96437,1489790
2,Telangana,ADILABAD,RATHOD RAMESH,0,INC,Hand,MALE,3.0,52.0,ST,12th Pass,"Rs 3,64,91,000\n ~ 3 Crore+","Rs 1,53,00,000\n ~ 1 Crore+",314057,181,314238,21.092771,29.534285,1489790
3,Telangana,ADILABAD,NOTA,0,NOTA,,,,,,,,,13030,6,13036,0.875023,1.225214,1489790
4,Uttar Pradesh,AGRA,Satyapal Singh Baghel,1,BJP,Lotus,MALE,5.0,58.0,SC,Doctorate,"Rs 7,42,74,036\n ~ 7 Crore+","Rs 86,06,522\n ~ 86 Lacs+",644459,2416,646875,33.383823,56.464615,1937690


In [5]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

Unnamed: 0,0
STATE,0
CONSTITUENCY,0
NAME,0
WINNER,0
PARTY,0
SYMBOL,245
GENDER,245
CRIMINAL\nCASES,245
AGE,245
CATEGORY,245


In [8]:
# Columns that must be populated for a row to be kept.
required_fields = ['GENDER', 'AGE', 'EDUCATION', 'ASSETS']

# Build the cleaned frame as a new object; the raw `df` is left untouched so
# the before/after row counts can be compared.
df_clean = df.dropna(subset=required_fields)

rows_dropped = len(df) - len(df_clean)
print(f"Total rows dropped due to incomplete candidate data: {rows_dropped}")
print(f"New clean data entries: {len(df_clean)}")

Total rows dropped due to incomplete candidate data: 245
New clean data entries: 2018


In [9]:
# Categorical text columns whose values are normalised below.
text_cols = ['STATE', 'PARTY', 'GENDER', 'EDUCATION']

# Normalise every selected column in one pass: lowercase first, then trim
# leading/trailing whitespace (same order for each column).
df_clean[text_cols] = df_clean[text_cols].apply(
    lambda column: column.str.lower().str.strip()
)

# Sanity check: list the distinct GENDER values (NaN included) after the cleanup.
print("--- GENDER VALUE COUNTS AFTER STANDARDIZATION ---")
gender_counts = df_clean['GENDER'].value_counts(dropna=False)
print(gender_counts)

--- GENDER VALUE COUNTS AFTER STANDARDIZATION ---
GENDER
male      1760
female     258
Name: count, dtype: int64


In [10]:
# Print dtypes and non-null counts of the cleaned frame.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2018 entries, 0 to 2261
Data columns (total 19 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   STATE                                     2018 non-null   object 
 1   CONSTITUENCY                              2018 non-null   object 
 2   NAME                                      2018 non-null   object 
 3   WINNER                                    2018 non-null   int64  
 4   PARTY                                     2018 non-null   object 
 5   SYMBOL                                    2018 non-null   object 
 6   GENDER                                    2018 non-null   object 
 7   CRIMINAL
CASES                            2018 non-null   object 
 8   AGE                                       2018 non-null   float64
 9   CATEGORY                                  2018 non-null   object 
 10  EDUCATION                                

In [11]:
# Create the Contingency Table (Cross-tabulation) of Gender and Winner status
contingency_table = pd.crosstab(df_clean['GENDER'], df_clean['WINNER'])

print("--- OBSERVED CONTINGENCY TABLE (Gender vs. Winner) ---")
print(contingency_table)

--- OBSERVED CONTINGENCY TABLE (Gender vs. Winner) ---
WINNER     0    1
GENDER           
female   182   76
male    1297  463


In [14]:
# Observed Gender x Winner counts, transcribed by hand from the Task 10.1
# crosstab output.
#   rows    -> female, male
#   columns -> 0 (loss), 1 (win)
gender_labels = ['female', 'male']
outcome_labels = [0, 1]
data = [
    [182, 76],     # female: 182 losses + 76 wins  = 258 candidates
    [1297, 463],   # male:  1297 losses + 463 wins = 1760 candidates
]
contingency_table = pd.DataFrame(data=data, index=gender_labels, columns=outcome_labels)
contingency_table

Unnamed: 0,0,1
female,182,76
male,1297,463


In [13]:
from scipy.stats import chi2_contingency
import pandas as pd

# Chi-square test of independence on the observed table.
# NOTE: scipy's `correction` argument defaults to True, which applies Yates'
# continuity correction when the table has one degree of freedom (e.g. 2x2).
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Observed win rate per gender: wins (column 1) divided by the row total.
wins_by_gender = contingency_table[1]
candidates_by_gender = contingency_table.sum(axis=1)
win_rate_by_gender = wins_by_gender / candidates_by_gender
female_win_rate = win_rate_by_gender['female']
male_win_rate = win_rate_by_gender['male']

# Emit the summary as one block of text, one metric per line.
report_lines = (
    "--- FINAL PROJECT 3 METRICS ---",
    f"Chi-Square Statistic: {chi2:.4f}",
    f"P-value: {p_value:.4f}",
    f"Female Win Rate: {female_win_rate:.2%}",
    f"Male Win Rate: {male_win_rate:.2%}",
)
print("\n".join(report_lines))

--- FINAL PROJECT 3 METRICS ---
Chi-Square Statistic: 0.9857
P-value: 0.3208
Female Win Rate: 29.46%
Male Win Rate: 26.31%
