In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


In [2]:
train = pd.read_csv('./train.csv',sep=',\s+', delimiter=',', encoding="utf-8", skipinitialspace=True)
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [3]:
train.count()

age               20808
workclass         20808
fnlwgt            20808
education         20808
education-num     20808
marital-status    20808
occupation        20808
relationship      20808
race              20808
sex               20808
capital-gain      20807
capital-loss      20807
hours-per-week    20807
native-country    20807
salary            20807
dtype: int64

In [4]:
train.dropna(inplace=True)

In [5]:
train.count()
train['salary'] = train['salary'].map({'>50K': 1, '<=50K': 0})


In [6]:
for c in train.columns:
    if 'object' == train[c].dtype:
        lb_make = LabelEncoder()
        train[c] = lb_make.fit_transform(train[c])

In [7]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,9,13,4,1,1,4,1,2174.0,0.0,40.0,39,0
1,50,6,83311,9,13,2,4,0,4,1,0.0,0.0,13.0,39,0
2,38,4,215646,11,9,0,6,1,4,1,0.0,0.0,40.0,39,0
3,53,4,234721,1,7,2,6,0,2,1,0.0,0.0,40.0,39,0
4,28,4,338409,9,13,2,10,5,2,0,0.0,0.0,40.0,5,0


In [8]:
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(train) 
train.loc[:,:] = scaled_values


In [9]:

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0.30137,0.875,0.044302,0.6,0.8,0.666667,0.071429,0.2,1.0,1.0,0.02174,0.0,0.397959,0.95122,0.0
1,0.452055,0.75,0.048238,0.6,0.8,0.333333,0.285714,0.0,1.0,1.0,0.0,0.0,0.122449,0.95122,0.0
2,0.287671,0.5,0.138113,0.733333,0.533333,0.0,0.428571,0.2,1.0,1.0,0.0,0.0,0.397959,0.95122,0.0
3,0.493151,0.5,0.151068,0.066667,0.4,0.333333,0.428571,0.0,0.5,1.0,0.0,0.0,0.397959,0.95122,0.0
4,0.150685,0.5,0.221488,0.6,0.8,0.333333,0.714286,1.0,0.5,0.0,0.0,0.0,0.397959,0.121951,0.0


In [14]:
from numpy.core.umath_tests import inner1d

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split



In [11]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB()]

In [12]:
y = train.pop('salary')
X = train
X_train,X_test,y_train,y_test = train_test_split(X.index,y,test_size=0.2)


In [13]:
#From https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

#Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train.to_frame(), y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test.to_frame())
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test.to_frame())
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 68.7650%
Log Loss: 4.325163324129155
SVC
****Results****
Accuracy: 76.1893%
Log Loss: 0.5488791101098857
DecisionTreeClassifier
****Results****
Accuracy: 63.7674%
Log Loss: 12.514289957598585
RandomForestClassifier
****Results****
Accuracy: 65.4733%
Log Loss: 4.8874915835369475
GaussianNB
****Results****
Accuracy: 76.1893%
Log Loss: 0.5489250516265934
