# Classification for Prediction

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
%pylab inline

import sklearn as sk

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load affairs.csv; the first CSV column is used as the row index.
df = pd.read_csv('affairs.csv', index_col=0)

In [3]:
df

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,affairs
0,3,32.0,9.0,3.0,3,17,2,0.111111
1,3,27.0,13.0,3.0,1,14,3,3.230769
2,4,22.0,2.5,0.0,1,16,3,1.400000
3,4,37.0,16.5,4.0,3,16,5,0.727273
4,5,27.0,9.0,1.0,1,14,3,4.666666
...,...,...,...,...,...,...,...,...
6361,5,32.0,13.0,2.0,3,17,4,0.000000
6362,4,32.0,13.0,1.0,1,16,5,0.000000
6363,5,22.0,2.5,0.0,2,14,3,0.000000
6364,5,32.0,6.0,1.0,3,14,3,0.000000


In [4]:
# Fraction of rows whose `affairs` value is exactly zero, i.e. the share of the
# negative class once the binary target is built.
df['affairs'].eq(0).mean()

0.6775054979579014

In [5]:
# Binary target as floats: 1.0 when `affairs` is positive, 0.0 otherwise.
df['Cheater'] = (df['affairs'] > 0).astype(float)

In [6]:
df

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,affairs,Cheater
0,3,32.0,9.0,3.0,3,17,2,0.111111,1.0
1,3,27.0,13.0,3.0,1,14,3,3.230769,1.0
2,4,22.0,2.5,0.0,1,16,3,1.400000,1.0
3,4,37.0,16.5,4.0,3,16,5,0.727273,1.0
4,5,27.0,9.0,1.0,1,14,3,4.666666,1.0
...,...,...,...,...,...,...,...,...,...
6361,5,32.0,13.0,2.0,3,17,4,0.000000,0.0
6362,4,32.0,13.0,1.0,1,16,5,0.000000,0.0
6363,5,22.0,2.5,0.0,2,14,3,0.000000,0.0
6364,5,32.0,6.0,1.0,3,14,3,0.000000,0.0


In [7]:
# One-hot encode `occupation` into occupation_<k> indicator columns; the
# original `occupation` column is replaced by them.
# NOTE(review): this rebinds `df`, so re-running this cell on the already
# encoded frame will presumably fail because `occupation` is gone — confirm
# before re-executing out of order.
df = pd.get_dummies(df, columns = ['occupation'])

In [8]:
df

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,affairs,Cheater,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6
0,3,32.0,9.0,3.0,3,17,0.111111,1.0,0,1,0,0,0,0
1,3,27.0,13.0,3.0,1,14,3.230769,1.0,0,0,1,0,0,0
2,4,22.0,2.5,0.0,1,16,1.400000,1.0,0,0,1,0,0,0
3,4,37.0,16.5,4.0,3,16,0.727273,1.0,0,0,0,0,1,0
4,5,27.0,9.0,1.0,1,14,4.666666,1.0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,5,32.0,13.0,2.0,3,17,0.000000,0.0,0,0,0,1,0,0
6362,4,32.0,13.0,1.0,1,16,0.000000,0.0,0,0,0,0,1,0
6363,5,22.0,2.5,0.0,2,14,0.000000,0.0,0,0,1,0,0,0
6364,5,32.0,6.0,1.0,3,14,0.000000,0.0,0,0,1,0,0,0


In [9]:
# Feature matrix: every column except the target and the raw `affairs` value
# the target was derived from (keeping `affairs` would leak the label).
X = df.drop(['Cheater', 'affairs'], axis=1)

In [10]:
# Target vector: the binary Cheater flag built above from `affairs`.
y = df['Cheater']

## Hold-out sample

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.3, random_state = 0)

In [12]:
len(X_train)

4456

### Train on the training set

In [15]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters; only the seed is fixed.
cl = RandomForestClassifier(random_state = 2)

In [16]:
# Fit on the training split only; the test split stays unseen until scoring.
cl.fit(X_train, y_train)

RandomForestClassifier(random_state=2)

### prediction

In [17]:
# Hard class predictions for the held-out rows.
y_pred = cl.predict(X_test)

In [18]:
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [19]:
# Number of misclassified test rows: labels are 0.0/1.0, so the absolute
# difference is 1 for every wrong prediction and 0 otherwise.
np.abs(y_pred - y_test).sum()

613.0

In [20]:
len(y_test)

1910

In [21]:
# Accuracy by hand: correctly classified rows divided by the test-set size.
n_test = len(y_test)
n_errors = np.abs(y_pred - y_test).sum()
(n_test - n_errors) / n_test

0.6790575916230367

### collect scores
#### confusion matrix

In [22]:
from sklearn.metrics import confusion_matrix
# Counts of true label (rows) versus predicted label (columns) on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1045,  258],
       [ 355,  252]], dtype=int64)

#### Accuracy

In [23]:
# Import the submodule explicitly: a bare `import sklearn` only exposes
# `sklearn.metrics` if some earlier cell happened to load it.
import sklearn.metrics
# Library accuracy; should match the hand-computed value above. Compare it with
# the majority-class share from `(df.affairs == 0).mean()` to judge the model.
sklearn.metrics.accuracy_score(y_test, y_pred)

0.6790575916230367

#### precision

In [24]:
# Precision: of the rows predicted as Cheater == 1, the fraction that truly are.
sklearn.metrics.precision_score(y_true=y_test, y_pred=y_pred)

0.49411764705882355

#### recall

In [25]:
# Recall: of the rows that truly are Cheater == 1, the fraction the model found.
sklearn.metrics.recall_score(y_true=y_test, y_pred=y_pred)

0.41515650741350907

#### AUC score

In [26]:
cl.predict_proba(X_test)[:, 1]

array([0.92      , 0.00916667, 0.23824603, ..., 0.4905    , 0.42797222,
       0.13178965])

In [27]:
# Keep the second probability column as the ranking score for ROC AUC.
# NOTE(review): presumably this column is P(Cheater == 1), following the order
# of cl.classes_ — confirm.
class_proba = cl.predict_proba(X_test)
y_proba = class_proba[:, 1]

In [29]:
# Area under the ROC curve, computed from predicted probabilities rather than
# hard labels.
sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_proba)

0.6795444804221914

## Cross-Validation

In [30]:
cl

RandomForestClassifier(random_state=2)

In [31]:
from sklearn.model_selection import KFold
# Ten shuffled folds with a fixed seed, scored by ROC AUC on the full data set;
# the result holds one score per fold.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
sklearn.model_selection.cross_val_score(estimator=cl, X=X, y=y, scoring='roc_auc', cv=kf)

array([0.65229223, 0.68218519, 0.70259027, 0.7293057 , 0.68133356,
       0.70471573, 0.71104008, 0.69956151, 0.64804427, 0.6723662 ])

In [32]:
# Mean ROC AUC over the folds. This repeats the whole 10-fold run from the
# previous cell instead of reusing its scores.
np.mean(sklearn.model_selection.cross_val_score(cl, X, y, cv=kf, scoring='roc_auc'))

0.6883434722984785

## Which classifier obtains the highest performance?
#### Here is a list of classifiers

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Imported explicitly: the list previously reached LogisticRegression through
# `sk.linear_model`, which only resolves if another import happened to load
# that submodule.
from sklearn.linear_model import LogisticRegression

# Candidate models, all with default hyper-parameters (n_jobs=-1 where shown).
# Every entry uses the name imported above rather than a mix of bare names and
# `sk.<module>.<Class>` lookups.
# TODO(review): DecisionTreeClassifier appears twice, while BaggingClassifier
# and KNeighborsClassifier are imported but never evaluated — confirm which
# models were intended.
# NOTE(review): none of these estimators is seeded, so scores of the stochastic
# ones vary between runs.
clfs = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1),
    GaussianNB(),
    LogisticRegression(n_jobs=-1),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(),
    SVC(),
]

Let's find the best classifier in terms of average cross-validated AUC.

In [38]:
# Score every candidate with 10-fold cross-validated ROC AUC and keep the one
# with the highest mean.
#
# The splitter is loop-invariant, so it is built once; with a fixed integer
# seed it yields the same folds for every candidate, exactly as rebuilding it
# per iteration did. The loop also uses its own names (`compare_kf`,
# `candidate`) so it no longer rebinds `kf` and `cl` from the earlier cells,
# which would leave `cl` pointing at the last candidate instead of the fitted
# random forest.
compare_kf = KFold(n_splits=10, random_state=2, shuffle=True)

maxAUC = -1      # below any valid AUC, so the first candidate always wins
bestCL = None    # stays None only when `clfs` is empty
for candidate in clfs:
    auc = sklearn.model_selection.cross_val_score(
        candidate, X, y, cv=compare_kf, scoring='roc_auc'
    ).mean()
    # Strict '>' keeps the earlier candidate when two means tie.
    if auc > maxAUC:
        bestCL = candidate
        maxAUC = auc
    print(str(candidate) + ': ' + str(auc))

print('***************************')
print(str(bestCL) + ': ' + str(maxAUC))

DecisionTreeClassifier(): 0.604639896253186
RandomForestClassifier(n_jobs=-1): 0.6862137412746873
GaussianNB(): 0.7122887878361104
LogisticRegression(n_jobs=-1): 0.7436946508269009
DecisionTreeClassifier(): 0.6057053786011067
AdaBoostClassifier(): 0.7550603488940328
QuadraticDiscriminantAnalysis(): 0.606178350524553




MLPClassifier(): 0.7484898592036606
SVC(): 0.7481168132568637
***************************
AdaBoostClassifier(): 0.7550603488940328
