# Adult Dataset

In [2]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import scipy

#Importing Adult training Dataset
dataset_train = pd.read_csv('adult.data',header=None)
data_train = pd.DataFrame(dataset_train)
X = data_train.iloc[:, 1:14].values
Y = data_train.iloc[:, 14].values

#Importing Adult test Dataset
dataset_test = pd.read_csv('adult.test',skiprows=1)
data_test = pd.DataFrame(dataset_test)
print(data_test)
X_test = data_test.iloc[:, 1:14].values
Y_test = data_test.iloc[:, 14].values

X_train = X
Y_train = Y

print(dataset_train)
print(dataset_test)
print(X.shape)
print(Y.shape)

#Setting up an array describing if a feature's data is discrete(D) or continuous(C)
datatype = np.array(['D','C','D','C','D','D','D','D','D','C','C','C','D'])

       25        Private   226802           11th   7        Never-married  \
0      38        Private    89814        HS-grad   9   Married-civ-spouse   
1      28      Local-gov   336951     Assoc-acdm  12   Married-civ-spouse   
2      44        Private   160323   Some-college  10   Married-civ-spouse   
3      18              ?   103497   Some-college  10        Never-married   
4      34        Private   198693           10th   6        Never-married   
...    ..            ...      ...            ...  ..                  ...   
16275  39        Private   215419      Bachelors  13             Divorced   
16276  64              ?   321403        HS-grad   9              Widowed   
16277  38        Private   374983      Bachelors  13   Married-civ-spouse   
16278  44        Private    83891      Bachelors  13             Divorced   
16279  35   Self-emp-inc   182148      Bachelors  13   Married-civ-spouse   

        Machine-op-inspct        Own-child                Black     Male  \

In [2]:
dataset_train.info()
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
0     32561 non-null int64
1     32561 non-null object
2     32561 non-null int64
3     32561 non-null object
4     32561 non-null int64
5     32561 non-null object
6     32561 non-null object
7     32561 non-null object
8     32561 non-null object
9     32561 non-null object
10    32561 non-null int64
11    32561 non-null int64
12    32561 non-null int64
13    32561 non-null object
14    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16280 entries, 0 to 16279
Data columns (total 15 columns):
25                    16280 non-null int64
 Private              16280 non-null object
 226802               16280 non-null int64
 11th                 16280 non-null object
 7                    16280 non-null int64
 Never-married        16280 non-null object
 Machine-op-inspct    16280 non-null object
 Own-child   

In [3]:
# Preprocessing the discrete data
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for i in range(0,13):
    if datatype[i]=='D':
        X_train[:,i]  = labelencoder.fit_transform(X_train[:,i])
        X_test[:,i]  = labelencoder.fit_transform(X_test[:,i])
Y_train = labelencoder.fit_transform(Y_train)
Y_test = labelencoder.fit_transform(Y_test)

In [4]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [None]:
# Model Accuracy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

names = ['KNeighborsClassifier','SVC', 'DecisionTreeClassifier', 'RandomForestClassifier','AdaBoostClassifier','LogisticRegression','GaussianNB','MLPClassifier']
models = [KNeighborsClassifier(),
          SVC(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          AdaBoostClassifier(),
          LogisticRegression(),
          GaussianNB(),
          MLPClassifier()]

param_distributions = {
    'KNeighborsClassifier': {'n_neighbors': [5,11], 'metric':['minkowski','euclidean']},
    'SVC': {'kernel':['rbf']},
    'DecisionTreeClassifier': {'criterion':['gini','entropy'], 'max_depth': range(1,10,2)},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32], 'learning_rate':[0.8,1]},
    'LogisticRegression': {'max_iter':[100,130,140],'C': np.logspace(-1, 1, 3)},
    'GaussianNB': {},
    'MLPClassifier': {'hidden_layer_sizes': [(100,),(30,20)],'activation':['tanh', 'relu'], 'max_iter': [100,200]}
}

accuracy= np.zeros(8)
for counter, model in enumerate(models):
    gridcv = GridSearchCV(model, param_distributions[names[counter]], verbose=1, n_jobs=4, cv=3)
    gridcv.fit(X_train, Y_train)
    grid_accuracy_test  = sklearn.metrics.accuracy_score(Y_test,  gridcv.best_estimator_.predict(X_test))
    accuracy[counter] = grid_accuracy_test*100
    print("Accuracy for " + names[counter] + ":",accuracy[counter])

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:   34.6s finished


Accuracy for KNeighborsClassifier: 83.63636363636363
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
#Comparision graph between all models
import seaborn as sns
y_pos = np.arange(len(names))
heights = [accuracy[0],accuracy[1],accuracy[2],accuracy[3],accuracy[4],accuracy[5],accuracy[6], accuracy[7]]

fig, ax=plt.subplots(1,1,figsize=(12,6))

plt.xticks(rotation='90')
sns.barplot(x=names, y=heights)
plt.ylabel('accuracy score')
plt.title('Adult dataset models accuracy')