In [None]:
import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Import Data

In [None]:
# Load the training split into a DataFrame and display it.
train_df = pd.read_csv('/kaggle/input/who-is-the-real-winner/train.csv')
train_df

In [None]:
# Load the test split into a DataFrame and display it.
test_df = pd.read_csv('/kaggle/input/who-is-the-real-winner/test.csv')
test_df

# Data Visualization

## Count of Education Labels

In [None]:
# Bar chart of how many training rows carry each Education label,
# with the count written above each bar.
ax = sns.countplot(x="Education", data=train_df)
education_bars = ax.containers[0]
ax.bar_label(education_bars)
ax.set_xticklabels(ax.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Edu_cnt.png")

In [None]:
# Bar chart of how many training rows belong to each Party,
# with the count written above each bar.
ax = sns.countplot(x="Party", data=train_df)
party_bars = ax.containers[0]
ax.bar_label(party_bars)
ax.set_xticklabels(ax.get_xticklabels(), ha="right", rotation=50)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Party_cnt.png")

In [None]:
# Tabulate how many training rows fall in each constituency.
print(train_df['Constituency ∇'].value_counts().to_frame())

#### We need to convert **Total Assets** and **Liabilities** to numerical values for Data Visualization.

# Data preprocessing

#### Converting **Total Assets** and **Liabilities** to numerical values for training data.

In [None]:
# Gather the last whitespace-separated token of every Total Assets string
# (the unit suffix) and list the distinct values that occur.
uniq = [tokens[-1] for tokens in train_df['Total Assets'].str.split()]

np.unique(uniq)

In [None]:
# Multiplier for each unit suffix found in amount strings of the form
# "<integer> <unit>". 'Hund+' is included so that Total Assets is parsed with
# the same set of units as Liabilities.
unit_multipliers = {'Crore+': 10000000, 'Lac+': 100000, 'Thou+': 1000, 'Hund+': 100}

HtagAssets = []
for tokens in train_df['Total Assets'].str.split():
    unit = tokens[-1]
    if unit == '0':
        # A bare "0" has no unit suffix.
        HtagAssets.append(0)
    elif unit in unit_multipliers:
        HtagAssets.append(int(tokens[0]) * unit_multipliers[unit])
    else:
        # Skipping the row would leave the list shorter than the column and
        # fail later with an uninformative length error; fail here instead.
        raise ValueError("Unrecognised unit in 'Total Assets': " + repr(' '.join(tokens)))

train_df['Total Assets'] = HtagAssets
train_df.head()

In [None]:
# Gather the last whitespace-separated token of every Liabilities string
# (the unit suffix) and list the distinct values that occur.
uniq = [tokens[-1] for tokens in train_df['Liabilities'].str.split()]

np.unique(uniq)

In [None]:
# Multiplier for each unit suffix found in amount strings of the form
# "<integer> <unit>".
unit_multipliers = {'Crore+': 10000000, 'Lac+': 100000, 'Thou+': 1000, 'Hund+': 100}

HtagLiabilities = []
for tokens in train_df['Liabilities'].str.split():
    unit = tokens[-1]
    if unit == '0':
        # A bare "0" has no unit suffix.
        HtagLiabilities.append(0)
    elif unit in unit_multipliers:
        HtagLiabilities.append(int(tokens[0]) * unit_multipliers[unit])
    else:
        # Skipping the row would leave the list shorter than the column and
        # fail later with an uninformative length error; fail here instead.
        raise ValueError("Unrecognised unit in 'Liabilities': " + repr(' '.join(tokens)))

train_df['Liabilities'] = HtagLiabilities
train_df.head()

#### Converting **Total Assets** and **Liabilities** to numerical values for testing data.

In [None]:
# Gather the last whitespace-separated token of every test-set Total Assets
# string (the unit suffix) and list the distinct values that occur.
uniq = [tokens[-1] for tokens in test_df['Total Assets'].str.split()]

np.unique(uniq)

In [None]:
# Multiplier for each unit suffix found in amount strings of the form
# "<integer> <unit>". 'Hund+' is included so that Total Assets is parsed with
# the same set of units as Liabilities.
unit_multipliers = {'Crore+': 10000000, 'Lac+': 100000, 'Thou+': 1000, 'Hund+': 100}

HtagAssets = []
for tokens in test_df['Total Assets'].str.split():
    unit = tokens[-1]
    if unit == '0':
        # A bare "0" has no unit suffix.
        HtagAssets.append(0)
    elif unit in unit_multipliers:
        HtagAssets.append(int(tokens[0]) * unit_multipliers[unit])
    else:
        # Skipping the row would leave the list shorter than the column and
        # fail later with an uninformative length error; fail here instead.
        raise ValueError("Unrecognised unit in 'Total Assets': " + repr(' '.join(tokens)))

test_df['Total Assets'] = HtagAssets
test_df.head()

In [None]:
# Gather the last whitespace-separated token of every test-set Liabilities
# string (the unit suffix) and list the distinct values that occur.
uniq = [tokens[-1] for tokens in test_df['Liabilities'].str.split()]

np.unique(uniq)

In [None]:
# Multiplier for each unit suffix found in amount strings of the form
# "<integer> <unit>".
unit_multipliers = {'Crore+': 10000000, 'Lac+': 100000, 'Thou+': 1000, 'Hund+': 100}

HtagLiabilities = []
for tokens in test_df['Liabilities'].str.split():
    unit = tokens[-1]
    if unit == '0':
        # A bare "0" has no unit suffix.
        HtagLiabilities.append(0)
    elif unit in unit_multipliers:
        HtagLiabilities.append(int(tokens[0]) * unit_multipliers[unit])
    else:
        # Skipping the row would leave the list shorter than the column and
        # fail later with an uninformative length error; fail here instead.
        raise ValueError("Unrecognised unit in 'Liabilities': " + repr(' '.join(tokens)))

test_df['Liabilities'] = HtagLiabilities
test_df.head()

# Data Visualization

## Check if any column contains **NaN** values

In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
train_df.info()

#### Therefore, we can see that none of the values are **NaN**, so we do not need to handle it differently.

In [None]:
# Work on an independent copy: plain assignment would only alias train_df, so
# the in-place column drops applied to train_df_mod would also alter train_df.
train_df_mod = train_df.copy()
train_df_mod.head()

In [None]:
# Keep a separate snapshot for plotting: plain assignment would only alias
# train_df, so later in-place edits to that object would change the plot data.
train_df_plot = train_df.copy()
train_df_plot.head() # Used for plotting

In [None]:
# Work on an independent copy: plain assignment would only alias test_df, so
# the in-place column drops applied to test_df_mod would also strip columns
# (including ID) from test_df.
test_df_mod = test_df.copy()
test_df_mod.head()

#### Obviously, Education will not depend on the **ID** and **Candidate** columns. Therefore, we can drop these columns.

In [None]:
# Remove the identifier columns from the training features, in place.
id_columns = ['ID', 'Candidate']
train_df_mod.drop(id_columns, axis=1, inplace=True)
train_df_mod.head()

In [None]:
# Remove the identifier columns from the test features, in place.
id_columns = ['ID', 'Candidate']
test_df_mod.drop(id_columns, axis=1, inplace=True)
test_df_mod.head()

In [None]:
# Compare the number of distinct constituencies with the number of rows.
n_constituencies = len(pd.unique(train_df_mod['Constituency ∇']))
n_rows = len(train_df_mod)
print(f"Unique values in Constituency: {n_constituencies}")
print(f"Total rows of data given: {n_rows}")

#### Constituency ∇ is also almost unique for each data point. Therefore, it won't be very useful for training the model and we can drop it.

In [None]:
# Remove the constituency column from the training features, in place.
train_df_mod.drop(['Constituency ∇'], axis=1, inplace=True)
train_df_mod.head()

In [None]:
# Remove the constituency column from the test features, in place.
test_df_mod.drop(['Constituency ∇'], axis=1, inplace=True)
test_df_mod.head()

## One Hot Encoding

In [None]:
# One-hot encode `state`: append one indicator column per distinct value,
# then discard the original column.
tdf1 = pd.get_dummies(train_df_mod['state'])
train_df_mod = train_df_mod.join(tdf1).drop(columns=['state'])
train_df_mod.head()

In [None]:
# One-hot encode `Party`: append one indicator column per distinct value,
# then discard the original column.
tdf2 = pd.get_dummies(train_df_mod['Party'])
train_df_mod = train_df_mod.join(tdf2).drop(columns=['Party'])
train_df_mod.head()

In [None]:
# One-hot encode the test set's `state`: append one indicator column per
# distinct value, then discard the original column.
tdf1 = pd.get_dummies(test_df_mod['state'])
test_df_mod = test_df_mod.join(tdf1).drop(columns=['state'])
test_df_mod.head()

In [None]:
# One-hot encode the test set's `Party`: append one indicator column per
# distinct value, then discard the original column.
tdf2 = pd.get_dummies(test_df_mod['Party'])
test_df_mod = test_df_mod.join(tdf2).drop(columns=['Party'])
test_df_mod.head()

## Label Encoding

In [None]:
# Replace the Education strings with integer class codes; `le` keeps the
# mapping so the codes can be turned back into labels later.
le = LabelEncoder()
education_codes = le.fit_transform(train_df_mod['Education'])
train_df_mod['Education'] = education_codes

train_df_mod.head()

## Correlation of all the columns

In [None]:
# Correlate every column with the label-encoded Education column, strongest first.
# The [1:] slice skips the top entry — presumably Education's correlation with itself.
train_df_mod.corrwith(train_df_mod['Education']).sort_values(ascending=False)[1:]

# Plots

## Taking inverse of Education column for better labels

In [None]:
# Map the integer class codes back to the original Education strings so the
# plots show readable labels.
education_labels = le.inverse_transform(train_df_mod['Education'])
train_df_mod['Education'] = education_labels
train_df_mod.head()

## Total Assets vs Education

In [None]:
# Scatter of Total Assets for each Education label; x labels are rotated
# vertically so they do not overlap.
ax = sns.scatterplot(data=train_df_mod, x="Education", y="Total Assets")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Total_Ass_vs_Edu.png")

## Liabilities vs Education

In [None]:
# Scatter of Liabilities for each Education label; x labels are rotated
# vertically so they do not overlap.
ax = sns.scatterplot(data=train_df_mod, x="Education", y="Liabilities")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Lia_vs_Edu.png")

## Criminal Case vs Education

In [None]:
# Scatter of Criminal Case counts for each Education label; x labels are
# rotated vertically so they do not overlap.
ax = sns.scatterplot(data=train_df_mod, x="Education", y="Criminal Case")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Crim_case_vs_Edu.png")

## Remove Outliers

In [None]:
# Flag every row that exceeds any of the three outlier thresholds, then
# remove all flagged rows from the training frame in a single in-place drop.
is_outlier = (
    (train_df_mod['Criminal Case'] > 60)
    | (train_df_mod['Liabilities'] > 4e9)
    | (train_df_mod['Total Assets'] > 4e9)
)
train_df_mod.drop(train_df_mod.index[is_outlier], inplace=True)
print(len(train_df_mod))

## Percentage Distribution of Parties with the most criminal records

In [None]:
# Total criminal cases per party, alongside the number of candidates per party.
criminal_records = train_df_plot.groupby('Party')['Criminal Case'].sum().reset_index()
party_counts = train_df_plot['Party'].value_counts().reset_index()
party_counts.columns = ['Party', 'Count']
criminal_records = pd.merge(criminal_records, party_counts, on='Party')
# Each party's share of all criminal cases. The percentage must come from the
# summed 'Criminal Case' column; using 'Count' would chart only the share of
# candidates per party and ignore criminal records entirely.
criminal_records['Percentage'] = (criminal_records['Criminal Case'] / criminal_records['Criminal Case'].sum()) * 100

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Party', y='Percentage', data=criminal_records)
# Two decimals keep the rotated bar labels short.
ax.bar_label(ax.containers[0], fmt='%.2f', rotation=90, padding=5)
plt.xlabel("Party")
plt.ylabel("Percentage")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Fig1.png")

## Percentage Distribution of Parties with the most Wealthy Candidates

In [None]:
# Total declared assets per party, alongside the number of candidates per party.
wealthy_candidates = train_df_plot.groupby('Party')['Total Assets'].sum().reset_index()
party_counts = train_df_plot['Party'].value_counts().reset_index()
party_counts.columns = ['Party', 'Count']
wealthy_candidates = pd.merge(wealthy_candidates, party_counts, on='Party')
# Each party's share of all declared assets. The percentage must come from the
# summed 'Total Assets' column; using 'Count' would chart only the share of
# candidates per party and ignore wealth entirely.
wealthy_candidates['Percentage'] = (wealthy_candidates['Total Assets'] / wealthy_candidates['Total Assets'].sum()) * 100

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Party', y='Percentage', data=wealthy_candidates)
# Two decimals keep the rotated bar labels short.
ax.bar_label(ax.containers[0], fmt='%.2f', rotation=90, padding=5)
plt.xlabel("Party")
plt.ylabel("Percentage")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# ax.figure.savefig("Fig2.png")

## Total Assets vs Liabilities

In [None]:
# Scatter of Total Assets against Liabilities for the training rows.
plt.figure(figsize=(10, 5))
ax = sns.scatterplot(data=train_df_mod, x='Liabilities', y='Total Assets')
plt.xlabel("Liabilities")
plt.ylabel("Total Assets")
plt.tight_layout()
plt.show()
# ax.figure.savefig("Fig3.png")

# Label Encoding

In [None]:
# Refit the encoder on the current Education strings and store the integer
# class codes back in the column.
le.fit(train_df_mod['Education'])
train_df_mod['Education'] = le.transform(train_df_mod['Education'])

train_df_mod.head()

# Preparing training and testing data

In [None]:
# Feature matrix: every column except the Education target.
X = train_df_mod.drop(columns=['Education'])
X.head()

In [None]:
# Target vector: the label-encoded Education column.
y = train_df_mod['Education']
y.head()

In [None]:
# Histogram of the encoded target values.
y.hist(bins=50)

In [None]:
# Fit a StandardScaler on the feature matrix and replace X with the scaled array.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so held-out rows influence the scaling statistics — consider
# fitting on the training rows only.
scaler = StandardScaler()

X = scaler.fit_transform(X)
X

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Optimising number of neighbours

In [None]:
# Column layout of the model-comparison table.
cols_results = [
    'Family',
    'Model',
    'F1 Score',
]
# Empty table with those columns, displayed as the cell output.
results = pd.DataFrame(columns=cols_results)
results

In [None]:
# Evaluate KNN for every neighbour count in kVals and record the micro-averaged
# F1 score on the held-out split.
kVals = range(1,30)
knn_names = ['KNN-'+str(k) for k in kVals]

# Collect plain rows and build the frame once, instead of growing a DataFrame
# with concat on every iteration. Pairing names with k via zip avoids the
# knn_names[k-1] lookup, which is only correct while kVals starts at 1.
knn_rows = []
for knn_name, k in zip(knn_names, kVals):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    knn_rows.append(['KNN', knn_name, f1_score(y_test, y_pred, average='micro')])

knn_results = pd.DataFrame(knn_rows, columns=cols_results)

# Discard KNN rows left by an earlier run of this cell so that re-running it
# does not duplicate entries; rows of any other model family are kept.
other_results = results[results.Family != 'KNN']
if other_results.empty:
    results = knn_results
else:
    results = pd.concat([other_results, knn_results], ignore_index=True)
results[results.Family=='KNN']

In [None]:
# Train the final classifier with 17 neighbours; `model` refers to the same
# fitted estimator object as `knn`.
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)

model = knn
model

# Testing the model

In [None]:
# Predict encoded Education classes for the held-out rows.
y_predict = model.predict(X_test)
y_predict

# Calculating the **F1** score

In [None]:
# Per-class F1 scores (average=None), averaged with equal weight per class.
f1_score(y_test, y_predict, average=None).mean()

# Cross Validation Score

In [None]:
# Mean cross-validation score of the estimator on the training split
# (cross_val_score's default folds and scorer).
cross_val_score(model, X_train, y_train).mean()

# Accuracy

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test, y_predict)

# Creating submission file

In [None]:
# Display the encoded test features that will be fed to the model.
test_df_mod

In [None]:
# The test set was one-hot encoded separately, so its indicator columns can
# differ from the training columns in membership and order. Rebuild the test
# matrix with exactly the training feature columns: categories absent from the
# test set become all-zero columns, and categories never seen in training are
# dropped (the model has no feature for them).
feature_columns = train_df_mod.columns.drop('Education')
X_final = test_df_mod.reindex(columns=feature_columns, fill_value=0)
# Apply the scaling statistics learned from the training data. Calling
# fit_transform here would re-fit the scaler on the test set and put the test
# features on a different scale from the one the model was trained on.
X_final = scaler.transform(X_final)
X_final

In [None]:
# Predict encoded classes for the test features, convert them back to the
# original Education strings, and attach them to test_df.
predicted_codes = model.predict(X_final)
y_final = le.inverse_transform(predicted_codes)
# X_final['Education'] = y_final
# X_final
test_df['Education'] = y_final
test_df.head()

In [None]:
# Fall back to the row index only when test_df no longer has an ID column
# (an in-place drop through an alias of test_df removes it). When the real ID
# column is still present it is left untouched rather than overwritten.
if 'ID' not in test_df.columns:
    test_df['ID'] = test_df.index
test_df.head()

In [None]:
# Write the ID and predicted Education columns to the submission file, without the index.
test_df[['ID','Education']].to_csv('prediction.csv',index=False)