# Scikit Learn Library

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


## 1. Read data

In [2]:
train = pd.read_csv('./train.csv',sep=',\s+', delimiter=',', encoding="utf-8", skipinitialspace=True)
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [3]:
train.count()

age               20808
workclass         20808
fnlwgt            20808
education         20808
education-num     20808
marital-status    20808
occupation        20808
relationship      20808
race              20808
sex               20808
capital-gain      20807
capital-loss      20807
hours-per-week    20807
native-country    20807
salary            20807
dtype: int64

## 2. Preprocessing

In [4]:
train.dropna(inplace=True)

In [5]:
train.count()
train['salary'] = train['salary'].map({'>50K': 1, '<=50K': 0})


In [6]:
for c in train.columns:
    print (c, train[c].dtype)

age int64
workclass object
fnlwgt int64
education object
education-num int64
marital-status object
occupation object
relationship object
race object
sex object
capital-gain float64
capital-loss float64
hours-per-week float64
native-country object
salary int64


In [7]:
lb_make = LabelEncoder()
country = ["USA", "S.Korea","Norway","China"]
lb_make.fit_transform(country)


array([3, 2, 1, 0])

In [8]:
lb_make = LabelEncoder()
for c in train.columns:
    if 'object' == train[c].dtype:
        train[c] = lb_make.fit_transform(train[c])

In [9]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,7,77516,9,13,4,1,1,4,1,2174.0,0.0,40.0,39,0
1,50,6,83311,9,13,2,4,0,4,1,0.0,0.0,13.0,39,0
2,38,4,215646,11,9,0,6,1,4,1,0.0,0.0,40.0,39,0
3,53,4,234721,1,7,2,6,0,2,1,0.0,0.0,40.0,39,0
4,28,4,338409,9,13,2,10,5,2,0,0.0,0.0,40.0,5,0


In [10]:
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(train) 
train.loc[:,:] = scaled_values


In [11]:

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0.30137,0.875,0.044302,0.6,0.8,0.666667,0.071429,0.2,1.0,1.0,0.02174,0.0,0.397959,0.95122,0.0
1,0.452055,0.75,0.048238,0.6,0.8,0.333333,0.285714,0.0,1.0,1.0,0.0,0.0,0.122449,0.95122,0.0
2,0.287671,0.5,0.138113,0.733333,0.533333,0.0,0.428571,0.2,1.0,1.0,0.0,0.0,0.397959,0.95122,0.0
3,0.493151,0.5,0.151068,0.066667,0.4,0.333333,0.428571,0.0,0.5,1.0,0.0,0.0,0.397959,0.95122,0.0
4,0.150685,0.5,0.221488,0.6,0.8,0.333333,0.714286,1.0,0.5,0.0,0.0,0.0,0.397959,0.121951,0.0


## 3. Model validation

In [12]:
from numpy.core.umath_tests import inner1d

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split



  """Entry point for launching an IPython kernel.


In [13]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB()
]

In [14]:
y = train.pop('salary')

X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


In [15]:
#From https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

#Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions_1 = clf.predict(X_test)
    print(train_predictions_1.sum())
    acc = accuracy_score(y_test, train_predictions_1)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions_2 = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions_2)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
900.0
Accuracy: 81.1389%
Log Loss: 2.5309490818151357
SVC
****Results****
465.0
Accuracy: 82.3642%
Log Loss: 0.35333418249207227
DecisionTreeClassifier
****Results****
982.0
Accuracy: 80.7064%
Log Loss: 6.663776416413571
RandomForestClassifier
****Results****
775.0
Accuracy: 84.8150%
Log Loss: 0.8988480087229307
GaussianNB
****Results****
483.0
Accuracy: 80.2018%
Log Loss: 0.8081472952059354


# Deep Learning?

In [17]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=5000, alpha=0.0001,
                     solver='sgd', verbose=10,  random_state=0,tol=0.000000001)
clf.fit(X_train, y_train)
name = clf.__class__.__name__

print("="*30)
print(name)

print('****Results****')
train_predictions_1 = clf.predict(X_test)

print("Number of class 1: {:.0%}".format(train_predictions_1.sum()))
acc = accuracy_score(y_test, train_predictions_1)
print("Accuracy: {:.4%}".format(acc))

train_predictions_2 = clf.predict_proba(X_test)
ll = log_loss(y_test, train_predictions_2)
print("Log Loss: {}".format(ll))

log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
log = log.append(log_entry)

Iteration 1, loss = 0.58268328
Iteration 2, loss = 0.54221922
Iteration 3, loss = 0.53477935
Iteration 4, loss = 0.52900407
Iteration 5, loss = 0.52317841
Iteration 6, loss = 0.51728046
Iteration 7, loss = 0.51102721
Iteration 8, loss = 0.50444799
Iteration 9, loss = 0.49761868
Iteration 10, loss = 0.49077800
Iteration 11, loss = 0.48411657
Iteration 12, loss = 0.47771358
Iteration 13, loss = 0.47185836
Iteration 14, loss = 0.46638622
Iteration 15, loss = 0.46129805
Iteration 16, loss = 0.45669658
Iteration 17, loss = 0.45247119
Iteration 18, loss = 0.44825501
Iteration 19, loss = 0.44437647
Iteration 20, loss = 0.44071437
Iteration 21, loss = 0.43706687
Iteration 22, loss = 0.43377928
Iteration 23, loss = 0.43053642
Iteration 24, loss = 0.42751876
Iteration 25, loss = 0.42474153
Iteration 26, loss = 0.42214051
Iteration 27, loss = 0.41955576
Iteration 28, loss = 0.41733814
Iteration 29, loss = 0.41495887
Iteration 30, loss = 0.41299098
Iteration 31, loss = 0.41070663
Iteration 32, los