In [120]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [206]:
url = './elections2019.csv'
# Columns dropped right after loading; none of them is selected as a feature below.
dropped_columns = [
    'Candidate', 'DelimID', 'ENOP', 'pid', 'Party_Type_TCPD', 'Last_Party',
    'Last_Party_ID', 'Last_Constituency_Name', 'TCPD_Prof_Main',
    'TCPD_Prof_Main_Desc', 'TCPD_Prof_Second', 'TCPD_Prof_Second_Desc',
]
election_years = [2014, 2019]
df = pd.read_csv(url, low_memory=False).drop(columns=dropped_columns)
# Keep only the rows that belong to the 2014 and 2019 elections
df = df[df['Year'].isin(election_years)]


# An earlier version assumed that, since we prioritize predicting the winner,
# candidates who were nowhere near winning could be discarded, and kept only
# those who placed in the top 5. The filter is left disabled on purpose:
# keeping every candidate gave better accuracy.
#df = df[df['Position'].between(1, 5)] 

In [238]:
# Categorical features used by the baseline model (column order is significant:
# it is part of what makes the seeded random forest reproducible).
BASELINE_FEATURES = [
    'Party', 'State_Name', 'Candidate_Type', 'Constituency_Type', 'Sub_Region',
    'Same_Constituency', 'Same_Party', 'Turncoat', 'Incumbent', 'last_poll',
    'MyNeta_education', 'Election_Type',
]

# Take an explicit copy so the encoding below writes into X only, instead of
# writing into a slice of df with SettingWithCopyWarning globally silenced.
X = df[BASELINE_FEATURES].copy()

# Convert every categorical column to integer codes. A fresh LabelEncoder per
# column produces the same codes as refitting one shared encoder, without
# leaving behind an encoder that only remembers the last column.
for column in BASELINE_FEATURES:
    X[column] = LabelEncoder().fit_transform(X[column])

# Binary target: 1 for the winner (Position == 1), 0 for everyone else
y = (df['Position'] == 1).astype(int)

# Data Split into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (Random Forest Classifier); fitting happens in the next cell
model = RandomForestClassifier(random_state=42)

In [239]:
%%time
# Fit the random forest on the training split; the %%time cell magic above reports how long it takes.
model.fit(X_train, y_train)

CPU times: user 406 ms, sys: 5.38 ms, total: 412 ms
Wall time: 412 ms


# Performance

In [230]:
# Accuracy on the held-out test split, scaled from a fraction to a percentage
model.score(X_test, y_test) * 100

96.53493699885452

While this seems like an amazing performance, it is important to realize that, since winners are much less common than losers, a model can achieve a similar accuracy (about 94% on this test set) by simply predicting that every candidate is going to lose. 

Thus, in order to effectively gauge the model's performance, we need to look at the accuracy for predicting winners and losers separately. To do this, we can analyze a confusion matrix.

In [231]:
# Predicted labels (1 = winner, 0 = loser) for the held-out test split
y_pred = model.predict(X_test)

In [232]:
# Rows are the true labels and columns the predicted labels, both ordered [0, 1],
# so the matrix reads [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_pred)

array([[3242,   43],
       [  78,  129]])

In [233]:
def find_TP(y, y_hat):
    """Count true positives: positions where the label and the prediction are both 1.

    Args:
        y: Ground-truth labels; any array-like (list, ndarray, pandas Series).
        y_hat: Predicted labels, same length as ``y``; compared by position.

    Returns:
        NumPy integer count of positions with ``y == 1`` and ``y_hat == 1``.
    """
    # np.asarray drops any pandas index, so the comparison is strictly positional
    # and plain lists are accepted too.
    actual, predicted = np.asarray(y), np.asarray(y_hat)
    # Vectorized sum keeps the NumPy integer return type of the original.
    return ((actual == 1) & (predicted == 1)).sum()
def find_FN(y, y_hat):
    """Count false negatives (Type-II errors): label is 1 but the prediction is 0.

    Args:
        y: Ground-truth labels; any array-like (list, ndarray, pandas Series).
        y_hat: Predicted labels, same length as ``y``; compared by position.

    Returns:
        NumPy integer count of positions with ``y == 1`` and ``y_hat == 0``.
    """
    # np.asarray drops any pandas index, so the comparison is strictly positional
    # and plain lists are accepted too.
    actual, predicted = np.asarray(y), np.asarray(y_hat)
    # Vectorized sum keeps the NumPy integer return type of the original.
    return ((actual == 1) & (predicted == 0)).sum()
def find_FP(y, y_hat):
    """Count false positives (Type-I errors): label is 0 but the prediction is 1.

    Args:
        y: Ground-truth labels; any array-like (list, ndarray, pandas Series).
        y_hat: Predicted labels, same length as ``y``; compared by position.

    Returns:
        NumPy integer count of positions with ``y == 0`` and ``y_hat == 1``.
    """
    # np.asarray drops any pandas index, so the comparison is strictly positional
    # and plain lists are accepted too.
    actual, predicted = np.asarray(y), np.asarray(y_hat)
    # Vectorized sum keeps the NumPy integer return type of the original.
    return ((actual == 0) & (predicted == 1)).sum()
def find_TN(y, y_hat):
    """Count true negatives: positions where the label and the prediction are both 0.

    Args:
        y: Ground-truth labels; any array-like (list, ndarray, pandas Series).
        y_hat: Predicted labels, same length as ``y``; compared by position.

    Returns:
        NumPy integer count of positions with ``y == 0`` and ``y_hat == 0``.
    """
    # np.asarray drops any pandas index, so the comparison is strictly positional
    # and plain lists are accepted too.
    actual, predicted = np.asarray(y), np.asarray(y_hat)
    # Vectorized sum keeps the NumPy integer return type of the original.
    return ((actual == 0) & (predicted == 0)).sum()

In [234]:
# Confusion-matrix cells for the current model's test-set predictions
TP = find_TP(y_test, y_pred)
FN = find_FN(y_test, y_pred)
FP = find_FP(y_test, y_pred)
TN = find_TN(y_test, y_pred)

# Report the four counts in the same order as before
for cell_label, cell_count in (('TP:', TP), ('FN:', FN), ('FP:', FP), ('TN:', TN)):
    print(cell_label, cell_count)

# Precision: the share of predicted winners that actually won
predicted_winners = TP + FP
precision = TP / predicted_winners
print('Precision:', precision)

TP: 129
FN: 78
FP: 43
TN: 3242
Precision: 0.75


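As we can observe, although the model predicts losers quite accurately, the same cannot be said about predicting winners: only 75% of the candidates it predicts to win actually won (precision), and it identifies just 129 of the 207 actual winners. While 75% is not terrible, there is certainly room for improvement. 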

Initially, I had removed candidates that hadn't placed in the top 5, as I thought they would add little to the accuracy of the model; however, including them increased accuracy. This suggests that information about how the rest of the field performed is useful, so including vote share percentages should also have a positive impact on the model's accuracy. Let's rebuild the model, this time with the vote share percentages, which will allow the model to better understand the magnitude of the win. 

In [243]:
# Same features as the baseline model plus the numeric vote share (column order
# is significant: it is part of what makes the seeded random forest reproducible).
VOTE_SHARE_FEATURES = [
    'Party', 'State_Name', 'Candidate_Type', 'Constituency_Type', 'Sub_Region',
    'Same_Constituency', 'Vote_Share_Percentage', 'Same_Party', 'Turncoat',
    'Incumbent', 'last_poll', 'MyNeta_education', 'Election_Type',
]
# Already numeric, so it is passed to the model as-is rather than label-encoded
NUMERIC_FEATURES = ['Vote_Share_Percentage']

# NOTE(review): Vote_Share_Percentage looks like it is only known once votes are
# counted, i.e. at the same time as the Position target — confirm that a proxy
# (exit polls, preliminary results) will be available at prediction time.

# Take an explicit copy so the encoding below writes into X only, instead of
# writing into a slice of df with SettingWithCopyWarning globally silenced.
X = df[VOTE_SHARE_FEATURES].copy()

# Convert every categorical column to integer codes. A fresh LabelEncoder per
# column produces the same codes as refitting one shared encoder.
for column in VOTE_SHARE_FEATURES:
    if column not in NUMERIC_FEATURES:
        X[column] = LabelEncoder().fit_transform(X[column])

# Binary target: 1 for the winner (Position == 1), 0 for everyone else
y = (df['Position'] == 1).astype(int)

# Data Split into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (Random Forest Classifier); fitting happens in the next cell
model = RandomForestClassifier(random_state=42)

In [244]:
%%time
# Fit the new random forest on the training split; the %%time cell magic above reports how long it takes.
model.fit(X_train, y_train)

CPU times: user 382 ms, sys: 4.62 ms, total: 387 ms
Wall time: 386 ms


# Performance


In [245]:
# Accuracy on the held-out test split, scaled from a fraction to a percentage
model.score(X_test, y_test) * 100

98.73997709049256

The accuracy has slightly increased, but let's look at the increase in predicting winners by looking at the confusion matrix.

In [246]:
# Predicted labels (1 = winner, 0 = loser) for the held-out test split
y_pred = model.predict(X_test)

# A bare expression in the middle of a cell is evaluated and then discarded, so
# print the matrix explicitly. Layout: rows = true label, columns = predicted label.
print(confusion_matrix(y_test, y_pred))

# Individual confusion-matrix cells
TP = find_TP(y_test, y_pred)
FN = find_FN(y_test, y_pred)
FP = find_FP(y_test, y_pred)
TN = find_TN(y_test, y_pred)
print('TP:',TP)
print('FN:',FN)
print('FP:',FP)
print('TN:',TN)

# Precision: the share of predicted winners that actually won
precision = TP/(TP+FP)
print('Precision:',precision)

TP: 182
FN: 25
FP: 19
TN: 3266
Precision: 0.9054726368159204


This is a significant improvement of about 15 percentage points over the previous 75% precision, which indicates that vote share percentage plays an important role in whether a candidate is a winner or not. Although we would not normally have this statistic when we want to classify, we can use exit polls, preliminary results, or similar statistics in order to predict whether the candidate will win or not. In addition, this model doesn't take into account an extremely important factor: the candidate's wealth. From previous results we can observe that wealth has a significant impact on a candidate's chances of winning; whether the effect is direct or not, it is strongly correlated with a candidate winning. For example, in the 2019 elections, approximately 30% of candidates had a net worth of over a crore, whereas 88% of winners were crorepatis, which is indicative of the strong correlation. (From: https://www.myneta.info/LokSabha2019/)