<a href="https://colab.research.google.com/github/NehaAgarwal2598/election_parameters/blob/master/election_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data from GitHub repo

In [0]:
# Clone the repository that holds the spreadsheets; later cells read them from election_parameters/data/.
# NOTE(review): presumably git refuses to clone into an existing directory on a re-run — confirm this is harmless here.
!git clone https://github.com/ayan59dutta/election_parameters.git

# Load data from file into dataframes

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [0]:
# Columns to read from the candidate-details spreadsheet.
colmn_list = ['Candidate', 'Constituency', 'Criminal Case', 'Education', 'Total Assets', 'Liabilities']

def convert(x):
    """Parse a monetary spreadsheet cell into an ``int``.

    The amount is taken from the second whitespace-separated token of the
    cell text (for illustration, ``'Rs 1,23,456 ~ 1 Lacs+'`` gives ``123456``);
    thousands separators (``,``) and a trailing ``~`` are stripped before
    conversion. A cell holding a single token is parsed from that token.

    Parameters
    ----------
    x : str, int, float or None
        Raw cell value as handed over by the ``pd.read_excel`` converter.

    Returns
    -------
    int
        The parsed amount. Blank cells (``None``, NaN, empty string) and the
        literal ``Nil`` yield ``0``.
        NOTE(review): assumes blank/``Nil`` means "no amount declared" rather
        than "undisclosed" -- confirm against the data source.

    Raises
    ------
    ValueError
        If the amount token is not an integer after cleaning.
    """
    # Blank cells can arrive as None/NaN and numeric cells as int/float;
    # neither has .split(), which made the original one-liner crash.
    if x is None or (isinstance(x, float) and x != x):
        return 0
    if isinstance(x, (int, float)):
        return int(x)

    tokens = str(x).split()
    if not tokens:
        return 0

    # Prefer the second token (text after the currency prefix); fall back to
    # the only token when the cell has no prefix, e.g. 'Nil'.
    raw_amount = tokens[1] if len(tokens) > 1 else tokens[0]
    amount = raw_amount.replace(',', '').replace('~', '')
    if amount.lower() == 'nil':
        return 0
    return int(amount)

# Load the candidate details, derive net assets (assets minus liabilities),
# and rename the columns to the upper-case names used by the other frames.
df1_cand_details = (
    pd.read_excel(
        'election_parameters/data/MyNeta_UP_Assembly_ELections_2017.xlsx',
        usecols=colmn_list,
        converters={
            'Candidate': lambda x: x.upper(),  # upper-case so names match the results sheet
            'Total Assets': convert,
            'Liabilities': convert,
        },
    )
    .assign(**{'Net Assets': lambda frame: frame['Total Assets'] - frame['Liabilities']})
    .drop(columns=['Total Assets', 'Liabilities'])
    .rename(columns={
        'Candidate': 'CAND_NAME',
        'Constituency': 'AC_NAME',
        'Criminal Case': 'C_CASE',
        'Education': 'EDU',
        'Net Assets': 'NET_ASSETS',
    })
)
df1_cand_details

In [0]:
# Columns to read from the 2012 assembly-election results sheet.
colmn_list = [
    'ST_CODE', 'AC_NAME', 'AC_TYPE', 'CAND_NAME', 'CAND_SEX',
    'CAND_CATEGORY', 'CAND_AGE', 'TOTVOTPOLL', 'POSITION',
]

# The column names sit on the second row of the sheet (header=1).
df_ae12 = pd.read_excel(
    'election_parameters/data/AE2012_8913.xls',
    usecols=colmn_list,
    header=1,
    converters={'AC_NAME': lambda x: x.upper()},
)

# Keep only rows with state code 'S24' (presumably Uttar Pradesh -- confirm),
# then discard the now-constant code column and renumber the rows from 0.
df_ae12 = (
    df_ae12.loc[df_ae12['ST_CODE'] == 'S24']
    .drop(columns='ST_CODE')
    .reset_index(drop=True)
)

df_ae12

In [0]:
# Read the 2017 results by column position and upper-case the join keys.
la_2017 = pd.read_excel(
    'election_parameters/data/LA_2017.xls',
    usecols=[1, 6, 7, 8, 9, 10, 11, 13, 14],
    converters={
        'AC_NAME': lambda x: x.upper(),
        'CAND_NAME': lambda x: x.upper(),
    },
)

# Restrict to Uttar Pradesh, drop the now-constant state column, renumber rows from 0.
la_2017 = (
    la_2017.loc[la_2017['ST_NAME'] == 'Uttar Pradesh']
    .drop(columns='ST_NAME')
    .reset_index(drop=True)
)
la_2017

## Anti-Incumbency Check

In [0]:
# Disabled exploratory code: looks for 2012 winners among the 2017 candidate names.
# NOTE(review): `name in la_2017['CAND_NAME']` tests the Series index labels, not its
# values, so the loop below would need a value-based check (e.g. the set intersection
# further down) if this cell is ever re-enabled. Also note df_ae12.CAND_NAME is not
# upper-cased on load while la_2017.CAND_NAME is -- confirm the casing matches.
# winners_ae12 = df_ae12[df_ae12['POSITION'] == 1]
# #winners_ae12 

# no_of_winners_12and17 = 0
# no_of_winners_12not17 = 0
# no_of_winners_12abs17 = 0

# for index, row in winners_ae12.iterrows():
#   #print(row.CAND_NAME)
#   if row.CAND_NAME in la_2017['CAND_NAME']:
#     print(row.CAND_NAME)
#     break
# else:
#   print('Over')
  
# set1 = set(winners_ae12.CAND_NAME)
# set1
# set2 = set(la_2017.CAND_NAME)
# set2
# set12 = set1.intersection(set2)
# set12

In [0]:
# Inspect the candidate names from the 2017 results.
la_2017['CAND_NAME']

## Join the dataframes

In [0]:
# Left join: keep every 2017 result row and attach candidate details where the
# (constituency, candidate name) pair matches; unmatched rows get NaN details.
la17_df = pd.merge(
    la_2017,
    df1_cand_details,
    how='left',
    on=['AC_NAME', 'CAND_NAME'],
)
la17_df

### Dropping NOTA

In [0]:
# NOTA rows are not candidates, so exclude them.
la17_df = la17_df.loc[la17_df['CAND_NAME'].ne('NONE OF THE ABOVE')]
la17_df

### Dropping NaN values

In [0]:
# Keep only rows where every column has a value, then renumber the rows from 0.
la17_df = la17_df.loc[la17_df.notna().all(axis=1)].reset_index(drop=True)
la17_df

### Change datatypes of columns from float to int

In [0]:
# Cast the three numeric columns to int in one call.
la17_df = la17_df.astype(dict.fromkeys(("CAND_AGE", "C_CASE", "NET_ASSETS"), int))
la17_df

## Set Targets

In [0]:
# Binary target: rank 1 (the winner) stays 1, every rank above 1 becomes 0.
# Values equal to 1 are left untouched, so only the ">1" replacement is needed.
la17_df['POSITION'] = la17_df['POSITION'].mask(la17_df['POSITION'] > 1, 0)
la17_df

## Training and Test set

In [0]:
# Model inputs. CAND_NAME is intentionally excluded: it is close to a unique
# identifier per row, so one-hot encoding it would add one column per candidate
# that carries no signal shared between training and test rows.
features_names = ['CAND_SEX', 'CAND_CATEGORY', 'CAND_AGE', 'C_CASE', 'EDU', 'NET_ASSETS']
X = la17_df[features_names]

# Target: 1 for the winning candidate, 0 otherwise.
y = la17_df['POSITION']

# One-hot encode the remaining categorical (string) columns; numeric columns pass through.
X = pd.get_dummies(X)
X

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the metrics below are reproducible on re-run;
# stratify=y keeps the winner/non-winner ratio the same in both splits, since
# winners are a small minority of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)


## K-Nearest Neighbors

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

# Fit a 5-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train);

In [0]:
# Train a 5-NN classifier and predict the held-out rows.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

### Comparing Error Rate with the K Value

In [0]:
# Misclassification rate on the test split for each K from 1 to 39;
# error[j] holds the rate for K = j + 1.
error = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append((pred_i != y_test).mean())

In [0]:
# Plot the test error rate against K.
# The x values are derived from len(error) instead of repeating the loop's range,
# so the plot cannot fall out of sync with the list (error[j] is the rate for K = j + 1).
plt.figure(figsize=(12, 6))
plt.plot(range(1, len(error) + 1), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error');  # trailing ';' hides the Text(...) repr in the cell output