In [0]:
import pandas as pd
import io
import time
from sklearn.model_selection import train_test_split  
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from sklearn import metrics 
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from google.colab import files
import numpy as np
import pickle
import sys

In [8]:

uploaded = files.upload()

Saving adult.csv to adult.csv


In [9]:

preprocessed_data = pd.read_csv(io.StringIO(uploaded['adult.csv'].decode('utf-8')))
print(preprocessed_data.head())


   age  workclass  fnlwgt  ... hours-per-week  native-country income
0   25    Private  226802  ...             40   United-States  <=50K
1   38    Private   89814  ...             50   United-States  <=50K
2   28  Local-gov  336951  ...             40   United-States   >50K
3   44    Private  160323  ...             40   United-States   >50K
4   18          ?  103497  ...             30   United-States  <=50K

[5 rows x 15 columns]


In [0]:
numerical_colName_features =  ['age',  'fnlwgt',  'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
category_colName_features = [ 'workclass',   'marital-status', 'occupation', 'relationship', 'race', 'gender','native-country']

categorical_colName_label = ['income']

onehotencoded_categorical_features =pd.get_dummies(preprocessed_data[category_colName_features], prefix_sep='_', drop_first=True)
onehotencoded_label =  pd.get_dummies(preprocessed_data[categorical_colName_label], prefix_sep='_', drop_first=True)

data = preprocessed_data[numerical_colName_features]
data = data.join(onehotencoded_categorical_features)
data = data.join(onehotencoded_label)

feature_cols = numerical_colName_features 
feature_cols.extend(onehotencoded_categorical_features.columns)
label_cols = onehotencoded_label.columns


In [0]:
train_data, test_data = train_test_split(data, test_size=0.1)

train_data_x = train_data[feature_cols]
train_data_y = train_data[label_cols]
test_data_x = test_data[feature_cols]
test_data_y = test_data[label_cols]


In [0]:

class evaluation_metric:
  def __init__(self, name, training_time, test_time, area_under_curve, memory):
        self.training_time = training_time
        self.test_time = test_time
        self.area_under_curve = area_under_curve
        self.name = name
        self.memory = memory


def model_training(name_of_algorithm,model, train_data_x, train_data_y, test_data_x, test_data_y):
  # training the model with training set
  training_start_time = time.time()
  model.fit(train_data_x , train_data_y.values.ravel() ) 
  training_time = time.time() - training_start_time

  # meomory consumption of the model in bytes
  p = pickle.dumps(model)
  memory_size = sys.getsizeof(p)
  

  # making predictions on the testing set
  testing_start_time = time.time() 
  predicted_y = model.predict(test_data_x) 
  testing_time = time.time() - testing_start_time
  
  area_under_curve = metrics.roc_auc_score(test_data_y, predicted_y)

  return evaluation_metric(name_of_algorithm, training_time,testing_time,area_under_curve, memory_size)


def k_nearest_neigbour_classifier(train_data_x, train_data_y, test_data_x, test_data_y):
  knn_models = []  
  for i in range(1,10):
    knn_models.append(model_training("K-Nearest Neigbour (k="+ str(i)+")", KNeighborsClassifier(n_neighbors=i), train_data_x, train_data_y, test_data_x, test_data_y))

  return max(knn_models, key=lambda x: x.area_under_curve)



In [0]:
result_list =[]

result_list.append(model_training("Gaussian Naive Bayes", GaussianNB(), train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(model_training("Logistic Regression", LogisticRegression(solver='liblinear', random_state=0), train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(k_nearest_neigbour_classifier(train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(model_training("Support Vector Machine", svm.SVC(), train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(model_training("Decision Tree", tree.DecisionTreeClassifier(), train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(model_training("Random Forest", RandomForestClassifier(), train_data_x, train_data_y, test_data_x, test_data_y))
result_list.append(model_training("Extra Tree", ExtraTreesClassifier(), train_data_x, train_data_y, test_data_x, test_data_y))



In [25]:
print('-' * 125)
print( '{:<25s}{:<25s}{:<25s}{:<25s}{:<25s}'.format("Name","Training Time","Testing Time","Memory Consumption","Area Under Curve")) 
print( '{:<25s}{:<25s}{:<25s}{:<25s}{:<25s}'.format("(for "+str(len(feature_cols))+" features)","(for "+ str(len(train_data)) +" data points)","(for "+ str(len(test_data))+" data points)","In Bytes","")) 
print('-' * 125)

result_list.sort(key=sort_area_under_curve, reverse = True)
for obj in result_list:
    print( '{:<25s}{:<25.10f}{:<25.10f}{:>15d}{:>25.10f}'.format(obj.name, obj.training_time, obj.test_time, obj.memory, obj.area_under_curve )) 



-----------------------------------------------------------------------------------------------------------------------------
Name                     Training Time            Testing Time             Memory Consumption       Area Under Curve         
(for 85 features)        (for 43957 data points)  (for 4885 data points)   In Bytes                                          
-----------------------------------------------------------------------------------------------------------------------------
Random Forest            6.6591873169             0.1621749401                   106176566             0.7785054769
Extra Tree               7.5570557117             0.2106199265                   241020324             0.7708447554
Decision Tree            0.5575380325             0.0040352345                      925912             0.7474020760
K-Nearest Neigbour (k=3) 1.6563806534             0.5458877087                    63335707             0.6563818317
Gaussian Naive Bayes     0.10970