In [1]:
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats as stats
%matplotlib inline

# Load the two-class election sheet, discard the 'Unnamed: 0' column
# (presumably a saved row index -- confirm against the workbook), remove exact
# duplicate rows and renumber the index from 0. Chaining instead of
# inplace=True keeps every step visible in one expression.
df = (
    pd.read_excel('Election_Data.xlsx', sheet_name='Election_Dataset_Two Classes')
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
    .reset_index(drop=True)
)
# Encode the two categorical columns as a single indicator each
# (drop_first=True). dtype=int is explicit because pandas >= 2.0 defaults dummy
# columns to bool, which would no longer give the 0/1 values displayed below.
df = pd.get_dummies(df, columns=['gender', 'vote'], drop_first=True, dtype=int)
df

Unnamed: 0,age,economic.cond.national,economic.cond.household,Blair,Hague,Europe,political.knowledge,gender_male,vote_Labour
0,43,3,3,4,1,2,2,0,1
1,36,4,4,4,4,5,2,1,1
2,35,4,4,5,2,3,2,1,1
3,24,4,2,2,1,4,0,0,1
4,41,2,2,1,1,6,2,1,1
...,...,...,...,...,...,...,...,...,...
1512,67,5,3,2,4,11,3,1,0
1513,73,2,2,4,4,8,2,1,0
1514,37,3,3,5,4,2,2,1,1
1515,61,3,3,1,4,11,2,1,0


In [2]:
# Target is the Labour-vote indicator; every remaining column is a predictor.
y = df["vote_Labour"]
X = df.drop(columns=["vote_Labour"])

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=1,
)

In [16]:
# RandomForestClassifier is imported here because no earlier cell brings it
# into scope; without this line a fresh kernel fails with NameError below.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space. Each list currently holds a single value; the
# trailing comments list alternative values (presumably candidates from
# earlier, wider searches).
grid={'n_estimators':[100],   # 50,150,200,300,500
    'criterion':['gini'],   # 'entropy'
    'max_depth':[5],   # 6,7,8,9
    'min_samples_split':[2],  #5,10,20
    'min_samples_leaf':[1],   #3,5,7,10
    'max_features':[2],   # 3,4,5,6
    'bootstrap':[True],  # False
    'oob_score':[False],  # True
    'warm_start':[False],  # True
    'ccp_alpha':[0.0,]   # 0.2,0.4
}
# Base estimator handed to the grid search; the seed fixes the forest's
# randomness so repeated runs build the same trees.
model = RandomForestClassifier(random_state=1)

In [17]:
# 3-fold cross-validated search over `grid`, ranking candidates by F1 and
# running on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

In [18]:
# Run the search on the training partition only; the test rows stay unseen.
grid_search.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=1), n_jobs=-1,
             param_grid={'bootstrap': [True], 'ccp_alpha': [0.0],
                         'criterion': ['gini'], 'max_depth': [5],
                         'max_features': [2], 'min_samples_leaf': [1],
                         'min_samples_split': [2], 'n_estimators': [100],
                         'oob_score': [False], 'warm_start': [False]},
             scoring='f1')

In [19]:
# Show the winning parameter combination, a blank line, then the estimator
# built from it.
print(grid_search.best_params_, end=' \n\n')
print(grid_search.best_estimator_)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 5, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'oob_score': False, 'warm_start': False} 

RandomForestClassifier(max_depth=5, max_features=2, random_state=1)


In [20]:
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained on
# X_train / y_train. The extra clf.fit(...) only repeated that work with the
# same seed, so it is dropped. Both names are kept for any later cells.
clf = grid_search.best_estimator_
model = clf

In [21]:
# Mean accuracy of the tuned forest on the training partition.
model.score(X=X_train, y=y_train)

0.8623939679547596

In [22]:
# Mean accuracy of the tuned forest on the held-out test partition.
model.score(X=X_test, y=y_test)

0.8662280701754386

In [23]:
# Class-probability estimates for both partitions (one column per class);
# the training-set array is displayed as the cell output.
pred_test_prob = model.predict_proba(X_test)
pred_train_prob = model.predict_proba(X_train)
pred_train_prob

array([[0.16655429, 0.83344571],
       [0.03563459, 0.96436541],
       [0.54404966, 0.45595034],
       ...,
       [0.05923191, 0.94076809],
       [0.67162181, 0.32837819],
       [0.59936067, 0.40063933]])

In [24]:
# Keep only the second probability column (class 1, i.e. vote_Labour == 1).
# Slicing a fresh predict_proba result, rather than pred_train_prob itself,
# keeps this cell re-runnable: slicing the already 1-D array a second time
# raises IndexError.
pred_train_prob = model.predict_proba(X_train)[:, 1]

In [25]:
from sklearn.metrics import accuracy_score,f1_score,recall_score, precision_score


def GetCustomData(custom_prob, probs=None):
    """Convert positive-class probabilities into 0/1 labels at a custom cutoff.

    Parameters
    ----------
    custom_prob : float
        Cutoff. A probability strictly greater than this value is labelled 1;
        anything else, including a probability equal to the cutoff, is 0.
    probs : iterable of float, optional
        Probabilities to threshold. When omitted, the notebook-level
        ``pred_train_prob`` is used, so existing single-argument calls keep
        working unchanged.

    Returns
    -------
    list of int
        One 0/1 label per input probability, in input order.
    """
    if probs is None:
        # Resolved at call time, so the function always sees the current
        # value of the notebook variable.
        probs = pred_train_prob
    return [1 if p > custom_prob else 0 for p in probs]

In [26]:
# Parallel accumulators for the threshold sweep: entry k of every list
# describes the k-th cutoff tried (cutoff, accuracy, F1, recall, precision).
thresh, acc, f1, rec, pre = [], [], [], [], []

In [27]:
# Empty the accumulators first so that re-running this cell replaces the
# previous sweep instead of appending a second copy of every row.
for metric_values in (thresh, acc, f1, rec, pre):
    metric_values.clear()

# Try cutoffs 0.1, 0.2, ..., 0.9: label the training rows at each cutoff and
# record how those labels score against the true training targets.
for j in np.arange(0.1,1,0.1):
    result = GetCustomData(j)
    thresh.append(j)
    acc.append(accuracy_score(y_train,result))
    f1.append(f1_score(y_train,result))
    rec.append(recall_score(y_train,result))
    pre.append(precision_score(y_train,result))


In [28]:
# Gather the sweep results into one table: one row per cutoff, one column per
# metric, in the order the lists were filled.
dat = pd.DataFrame(
    {
        'Threshold': thresh,
        'Accuracy': acc,
        'f1': f1,
        'Recall': rec,
        'Precision': pre,
    }
)
dat

Unnamed: 0,Threshold,Accuracy,f1,Recall,Precision
0,0.1,0.696513,0.821111,1.0,0.696513
1,0.2,0.722903,0.834086,1.0,0.715392
2,0.3,0.784166,0.864256,0.986468,0.768987
3,0.4,0.819039,0.882784,0.978349,0.804227
4,0.5,0.862394,0.904575,0.936401,0.874842
5,0.6,0.864279,0.900552,0.882273,0.919605
6,0.7,0.835061,0.872912,0.813261,0.942006
7,0.8,0.743638,0.778862,0.648173,0.97556
8,0.9,0.592837,0.591682,0.423545,0.981191


In [29]:
# Row(s) of the sweep whose F1 equals the best F1 observed.
dat.loc[dat['f1'].eq(dat['f1'].max())]

Unnamed: 0,Threshold,Accuracy,f1,Recall,Precision
4,0.5,0.862394,0.904575,0.936401,0.874842


In [30]:
# Row(s) of the sweep whose accuracy equals the best accuracy observed.
dat.loc[dat['Accuracy'].eq(dat['Accuracy'].max())]

Unnamed: 0,Threshold,Accuracy,f1,Recall,Precision
5,0.6,0.864279,0.900552,0.882273,0.919605
