In [2]:
# -*- coding: utf-8 -*-

"""
This is an example to perform simple linear regression algorithm on the dataset (weight and height),
where x = weight and y = height.
"""
import pandas as pd
import numpy as np
import datetime
import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from utilities.losses import compute_loss
from utilities.optimizers import gradient_descent, pso, mini_batch_gradient_descent
from sklearn.model_selection import train_test_split

# General settings
from utilities.visualization import visualize_train, visualize_test

# Reproducibility: one seed drives both Python's and NumPy's global generators.
seed = 309
np.random.seed(seed)
random.seed(seed)

# Test-set fraction for a train/test split.
train_test_split_test_size = 0.3

# Training settings: step size and maximum number of iterations.
alpha, max_iters = 0.1, 50

# Registry of the classifiers to benchmark: short name -> estimator instance
# built with its default hyper-parameters. Insertion order is the run order.
algorithms = {
    short_name: estimator_cls()
    for short_name, estimator_cls in (
        ("KNNC", KNeighborsClassifier),
        ("GNB", GaussianNB),
        ("SVM", SVC),
        ("DTC", DecisionTreeClassifier),
        ("RFC", RandomForestClassifier),
        ("ABC", AdaBoostClassifier),
        ("GBC", GradientBoostingClassifier),
        ("LDA", LinearDiscriminantAnalysis),
        ("MLPC", MLPClassifier),
        ("LR", LogisticRegression),
    )
}


def load_data():
    """Read the training and test CSV files into DataFrames.

    Both files are read without a header row; the first line of the test
    file is skipped.

    Returns:
        A ``(df_train, df_test)`` tuple of pandas DataFrames.
    """
    train_path = "../data/adult.data.csv"
    test_path = "../data/adult.test.csv"

    training_frame = pd.read_csv(train_path, header=None)
    testing_frame = pd.read_csv(test_path, header=None, skiprows=[0])
    return training_frame, testing_frame

def _clean_category(value, strip_period=False):
    """Strip surrounding whitespace (and optionally a trailing '.') from a string.

    Non-string values such as numbers or NaN are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    value = value.strip()
    return value.rstrip(".") if strip_period else value


def _is_text_column(column):
    """Return True when *column* holds object- or string-typed data."""
    return (pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column))


def _encode_categoricals(train_data, test_data):
    """Replace text columns by integer codes, in place, with one shared mapping.

    The categories of every text column are learned from ``train_data`` and
    re-used for ``test_data`` so that a given string receives the same code in
    both frames. Test values never seen in training are coded as -1.
    """
    label_col = train_data.columns[-1]
    for col_name in train_data.columns:
        if not _is_text_column(train_data[col_name]):
            continue
        # NOTE(review): the UCI adult.test file is reported to write labels
        # with a trailing '.' (e.g. '>50K.'); it is stripped from the label
        # column only, so both files map to the same classes - verify against
        # the local CSV.
        strip_period = col_name == label_col
        train_cat = (train_data[col_name]
                     .apply(_clean_category, strip_period=strip_period)
                     .astype('category'))
        train_data[col_name] = train_cat.cat.codes
        if col_name in test_data.columns:
            test_values = test_data[col_name].apply(
                _clean_category, strip_period=strip_period)
            test_data[col_name] = pd.Categorical(
                test_values, categories=train_cat.cat.categories).codes

    # Text columns that are only textual in the test frame have no training
    # mapping to follow; they keep the stand-alone encoding.
    for col_name in test_data.columns:
        if _is_text_column(test_data[col_name]):
            test_data[col_name] = test_data[col_name].astype('category').cat.codes


def data_preprocess(train, test):
    """Encode, impute and standardise the train/test frames.

    The last column of each frame is taken as the label; all other columns
    are features. The input frames are not modified.

    Steps:
        1. Text columns are turned into integer category codes using a single
           mapping learned from ``train``.
        2. Missing feature values are filled with the training median of the
           column; feature columns with no observed training value are dropped.
        3. Features are standardised with the training mean and standard
           deviation (a zero standard deviation is treated as 1 so constant
           columns become 0 instead of NaN).
        4. A constant ``intercept_dummy`` column of 1.0 is appended.

    Args:
        train: DataFrame with the training rows.
        test: DataFrame with the test rows, same column layout as ``train``.

    Returns:
        ``(train_data, train_labels, test_data, test_labels,
        train_data_full, test_data_full)`` where the ``*_full`` frames are the
        encoded (but not imputed or standardised) copies including the label
        column.
    """
    # Work on copies so the caller's frames keep their original contents.
    train_data_full = train.copy()
    test_data_full = test.copy()
    _encode_categoricals(train_data_full, test_data_full)

    train_labels = train_data_full.iloc[:, -1]
    test_labels = test_data_full.iloc[:, -1]

    # Going through a float array gives positional 0..k-1 column labels and a
    # fresh 0..n-1 row index for the feature frames.
    train_data = pd.DataFrame(train_data_full.iloc[:, :-1].values.astype(float))
    test_data = pd.DataFrame(test_data_full.iloc[:, :-1].values.astype(float))

    # A column without any observed training value has no median to impute.
    observed = train_data.notnull().any().values
    train_data = train_data.loc[:, observed]
    test_data = test_data.loc[:, observed]
    train_data.columns = range(train_data.shape[1])
    test_data.columns = range(test_data.shape[1])

    # Impute with the training median (the test set never contributes).
    train_median = train_data.median()
    train_data = train_data.fillna(train_median)
    test_data = test_data.fillna(train_median)

    # Standardize the inputs with training statistics only.
    train_mean = train_data.mean()
    train_std = train_data.std().replace(0.0, 1.0)
    train_data = (train_data - train_mean) / train_std
    test_data = (test_data - train_mean) / train_std

    train_data['intercept_dummy'] = pd.Series(1.0, index=train_data.index)
    test_data['intercept_dummy'] = pd.Series(1.0, index=test_data.index)
    return train_data, train_labels, test_data, test_labels, train_data_full, test_data_full

if __name__ == '__main__':
    # Metrics reported for every classifier, as (printed label, scorer) pairs.
    # NOTE(review): all of them, AUC included, are computed from the hard class
    # predictions returned by clf.predict - confirm that is intended for AUC.
    reported_metrics = (
        ("Accuracy score:", accuracy_score),
        ("Precision score:", precision_score),
        ("Recall score:", recall_score),
        ("F1 score:", f1_score),
        ("AUC:", roc_auc_score),
    )

    # Step 1: Load Data
    train, test = load_data()

    # Step 2: Preprocess the data
    train_data, train_labels, test_data, test_labels, train_data_full, test_data_full = data_preprocess(train, test)

    # Step 3: Learning Start
    for clf in algorithms.values():
        start_time = datetime.datetime.now()  # Track learning starting time
        clf.fit(train_data.values, train_labels)
        end_time = datetime.datetime.now()  # Track learning ending time
        # Only the fit is timed; prediction happens after the clock stops.
        execution_time = (end_time - start_time).total_seconds()
        prediction = clf.predict(test_data.values)

        # Step 4: Results presentation
        print(clf)
        print("Learn: execution time={t:.3f} seconds".format(t=execution_time))
        for label, scorer in reported_metrics:
            # Scores are shown rounded to two decimals.
            print(label, float("{0:.2f}".format(scorer(test_labels, prediction))))
        # Blank separator between classifiers. The original passed "\n" to
        # str.format as an extra argument, where it was silently ignored.
        print()

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Learn: execution time=0.634 seconds
Accuracy score: 0.83
Precision score: 0.66
Recall score: 0.57
F1 score: 0.61
AUC: 0.74
GaussianNB(priors=None)
Learn: execution time=0.048 seconds
Accuracy score: 0.8
Precision score: 0.67
Recall score: 0.33
F1 score: 0.44
AUC: 0.64
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Learn: execution time=15.153 seconds
Accuracy score: 0.85
Precision score: 0.76
Recall score: 0.55
F1 score: 0.64
AUC: 0.75
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_le



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
Learn: execution time=0.792 seconds
Accuracy score: 0.82
Precision score: 0.69
Recall score: 0.41
F1 score: 0.51
AUC: 0.67
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
Learn: execution time=6.739 seconds
Accuracy score: 0.85
Precision score: 0.7
Recall score: 0.63
F1 score: 0.67
AUC: 0.77
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_