# Scikit-learn Library

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


## 1. Read data

In [None]:
# Load the comma-separated training data from the working directory.
# Only one of `sep` / `delimiter` may be given (they are aliases, and recent
# pandas raises ValueError when both are set). A plain ',' separator combined
# with skipinitialspace=True already discards the blanks that follow each
# comma, so no regex separator is needed.
train = pd.read_csv(
    './train.csv',
    sep=',',
    encoding="utf-8",
    skipinitialspace=True,
)
# Preview the first rows to check that the file was parsed as expected.
train.head()

In [None]:
# Number of non-missing entries in every column (before any cleaning).
train.notna().sum()

## 2. Preprocessing

In [None]:
# Discard, in place, every row that has at least one missing value.
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Per-column non-missing counts after the rows with gaps were dropped.
train.count()

# Turn the salary label into a binary target: 1 for '>50K', 0 for '<=50K'.
# Labels that match neither key become NaN (Series.map semantics).
salary_codes = {'>50K': 1, '<=50K': 0}
train['salary'] = train['salary'].map(salary_codes)


In [None]:
# List each column next to its dtype to see which ones still hold strings.
for column_name, column_dtype in train.dtypes.items():
    print(column_name, column_dtype)

In [None]:
# Small LabelEncoder demonstration on a hand-written list of country names:
# the encoder learns the distinct labels, then returns one integer code
# per input entry.
country = ["USA", "S.Korea", "Norway", "China"]
lb_make = LabelEncoder()
lb_make.fit(country).transform(country)


In [None]:
# Replace every string (object-dtype) column with integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`
# (column name -> fitted LabelEncoder) so the mapping can later be inverted
# or applied to new data; refitting one shared encoder would discard the
# classes learned for every earlier column.
lb_make = LabelEncoder()  # stays defined even if there is no object column
label_encoders = {}
for c in train.columns:
    if train[c].dtype == 'object':
        lb_make = LabelEncoder()
        train[c] = lb_make.fit_transform(train[c])
        label_encoders[c] = lb_make

In [None]:
# Show the first five rows now that the string columns are integer-coded.
train.head(n=5)

In [None]:
# Rescale every column to the [0, 1] range with min-max scaling.
# NOTE(review): 'salary' is still part of `train` at this point, so the
# target is scaled too, and the scaler is fitted before the train/test
# split below -- confirm both are intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train)
# Build a fresh float DataFrame with the original labels instead of writing
# the floats through `.loc[:, :]` into the existing (partly integer)
# columns, which relies on implicit dtype upcasting that recent pandas
# versions deprecate.
train = pd.DataFrame(scaled_values, columns=train.columns, index=train.index)


In [None]:

# Show the first five rows after scaling.
train.head(n=5)

## 3. Model validation

In [None]:
from numpy.core.umath_tests import inner1d

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split



In [None]:
# Candidate models for the comparison below; everything not listed here is
# left at the scikit-learn default.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    # probability=True enables predict_proba, which the evaluation loop calls.
    SVC(kernel="rbf", probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
]

In [None]:
# Detach the target column; `pop` removes 'salary' from `train` in place,
# so the remaining columns of `train` are the features.
y = train.pop('salary')
X = train

# Hold out 20% of the rows for evaluation (unseeded random split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
)


In [None]:
# Adapted from https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

# Fit every candidate classifier on the training split, report accuracy and
# log loss on the held-out split, and collect one result row per classifier
# in `log` for visual comparison.
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("=" * 30)
    print(name)

    print('****Results****')
    # Hard class predictions on the held-out split; their sum is the number
    # of rows predicted as class 1.
    test_predictions = clf.predict(X_test)
    print(test_predictions.sum())
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Class probabilities are required for the log loss.
    test_probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, test_probabilities)
    print("Log Loss: {}".format(ll))

    log_rows.append([name, acc * 100, ll])

print("=" * 30)

# Build the table once at the end: DataFrame.append was removed in
# pandas 2.0, and this also gives every row a distinct index.
log = pd.DataFrame(log_rows, columns=log_cols)

# Deep Learning?

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 100 units each, trained
# with SGD; evaluated the same way as the classifiers above and added to the
# comparison table `log`.
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=5000,
    alpha=0.0001,
    solver='sgd',
    verbose=10,
    random_state=0,
    tol=1e-9,
)
# X_train is already a DataFrame (DataFrame has no `to_frame` method), so it
# is passed to `fit` directly.
clf.fit(X_train, y_train)
name = clf.__class__.__name__

print("=" * 30)
print(name)

print('****Results****')
# Hard class predictions on the held-out split.
test_predictions = clf.predict(X_test)

# The sum of the predicted labels is a count, so print it as a plain number
# rather than as a percentage.
print("Number of class 1: {:.0f}".format(test_predictions.sum()))
acc = accuracy_score(y_test, test_predictions)
print("Accuracy: {:.4%}".format(acc))

# Class probabilities are required for the log loss.
test_probabilities = clf.predict_proba(X_test)
ll = log_loss(y_test, test_probabilities)
print("Log Loss: {}".format(ll))

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
log = pd.concat([log, log_entry], ignore_index=True)