# Data Cleaning

In [None]:
# imports
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, jaccard_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Adult Dataset

In [None]:
# load the Adult dataset from GitHub; column names are supplied explicitly
# through `names`, and '?' entries are parsed as missing values
original_adult_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/adult.data',
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    sep=',',
    na_values='?',
    skipinitialspace=True,
)
original_adult_df

In [None]:
# 'education-num' is dropped because it repeats the 'education' column in numeric form,
# and 'fnlwgt' is dropped because it is not related to the income
adult_df = original_adult_df.drop(['fnlwgt', 'education-num'], axis=1)
adult_df

In [None]:
# count the rows that contain at least one missing value
# (total rows minus the rows that survive dropna)
missing_rows = len(adult_df) - len(adult_df.dropna())
missing_rows

In [None]:
# discard every row that has a missing value; they make up only 7% of the
# dataset, so plenty of data remains to work with
adult_df = adult_df.dropna(axis=0, how='any')
adult_df

In [None]:
# function to plot class balance for dataset
def plotBalance(df, label):
    """Draw a bar chart of how often each class of ``df[label]`` occurs.

    The chart title lists every class's share of the rows as a percentage,
    in the order returned by ``value_counts`` and joined with ' vs '.

    Args:
        df: DataFrame holding the data.
        label: name of the column whose class balance is plotted.
    """
    label_count = df[label].value_counts()
    total_rows = df.shape[0]
    # build the title from all classes so that a column with a single class
    # (or more than two classes) is handled instead of indexing values[1]
    shares = [str(round(count / total_rows * 100, 2)) + '%' for count in label_count.values]
    label_percentage = ' vs '.join(shares)
    sns.barplot(x=label_count.index, y=label_count.values, alpha=0.9)
    plt.title('Frequency of the classes (' + label_percentage + ')')
    plt.ylabel('Number of Occurrences', fontsize=12)
    plt.xlabel(label, fontsize=12)
    plt.show()
# show the class balance of the 'income' label
plotBalance(df=adult_df, label='income')

The binary variable that we'll be evaluating is "income", which describes whether or not an individual's income exceeds $50,000 per year.

In [None]:
# function to one hot encode categorical variables
def OneHotEncode(df, cols):
    """One hot encode the columns ``cols`` of ``df`` and return a dense array.

    Args:
        df: DataFrame holding the data.
        cols: labels of the categorical columns to encode.

    Returns:
        The dense output of ``OneHotEncoder.fit_transform`` on ``df[cols]``.
    """
    try:
        # scikit-learn 1.2 renamed `sparse` to `sparse_output`, and 1.4 removed the old name
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        # older scikit-learn releases only know the original keyword
        ohe = OneHotEncoder(sparse=False)
    return ohe.fit_transform(df[cols])

# function to scale continuous variables
def StandardScale(df, cols):
    """Fit a fresh StandardScaler on ``df[cols]`` and return the scaled values.

    Args:
        df: DataFrame holding the data.
        cols: labels of the continuous columns to scale.
    """
    return StandardScaler().fit_transform(df[cols])

# function to change label to binary class
def changeBinary(col, zero_class):
    """Encode a label column as a single-column array of 0s and 1s.

    Args:
        col: iterable of label values.
        zero_class: the label value that becomes 0; every other value becomes 1.

    Returns:
        Array with one row per entry of ``col`` and exactly one column.
    """
    flags = np.asarray([int(value != zero_class) for value in col])
    # add a trailing axis so the result can be concatenated column-wise
    return np.expand_dims(flags, axis=1)

# build the final Adult array: scaled continuous columns first, then the one hot
# encoded categorical columns, then the income label (<=50K as 0, anything else as 1)
encode_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
scale_cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
adult_data = np.concatenate(
    [
        StandardScale(adult_df, scale_cols),
        OneHotEncode(adult_df, encode_cols),
        changeBinary(adult_df['income'], '<=50K'),
    ],
    axis=1,
)
adult_data

## 2. Mushroom Dataset

In [None]:
# load the Mushroom dataset from GitHub; column names are supplied explicitly
# through `names`, and '?' entries are parsed as missing values
original_mushroom_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/mushroom.data',
    names=[
        'toxicity', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
        'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
        'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        # was misspelled 'stalk-surface--ring'; named to match its *-above-ring / *-below-ring siblings
        'stalk-surface-below-ring',
        'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
        'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
        'population', 'habitat',
    ],
    sep=',',
    na_values='?',
    skipinitialspace=True,
)
original_mushroom_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(original_mushroom_df) - len(original_mushroom_df.dropna())
missing_rows

In [None]:
# About 30% of the rows have a missing value, and all of them are in the
# stalk-root attribute. Rather than dropping those rows, the missing entries
# are replaced with the character 'm' (for "missing").
mushroom_df = original_mushroom_df.fillna(value='m')
mushroom_df

In [None]:
# show the class balance of the 'toxicity' label
plotBalance(df=mushroom_df, label='toxicity')

The binary variable that we'll be evaluating is "toxicity", which describes whether or not a mushroom is edible.

In [None]:
# one hot encode every feature column (all are categorical, so no scaling is applied)
# and encode the toxicity label with 'e' as 0 and every other value as 1
encode_cols = mushroom_df.columns.drop('toxicity')
mushroom_data = np.concatenate(
    [
        OneHotEncode(mushroom_df, encode_cols),
        changeBinary(mushroom_df['toxicity'], 'e'),
    ],
    axis=1,
)
mushroom_data

## 3. QSAR Oral Toxicity Data Set

In [None]:
# load the QSAR oral toxicity dataset from GitHub (semicolon separated);
# the 1024 feature columns get generated names, followed by the 'toxicity' label
cols = ['binary molecule data ' + str(n) for n in range(1, 1025)]
cols.append('toxicity')
original_qsar_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/qsar_oral_toxicity.csv',
    names=cols,
    sep=';',
    skipinitialspace=True,
)
original_qsar_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(original_qsar_df) - len(original_qsar_df.dropna())
missing_rows

In [None]:
# show the class balance of the 'toxicity' label
plotBalance(df=original_qsar_df, label='toxicity')

In [None]:
# keep the feature columns as they are and append the toxicity label,
# encoded with 'negative' as 0 and every other value as 1
qsar_data = np.concatenate(
    [
        original_qsar_df.drop('toxicity', axis=1).to_numpy(),
        changeBinary(original_qsar_df['toxicity'], 'negative'),
    ],
    axis=1,
)
qsar_data

## 4. EEG Eye State Data Set

In [None]:
# load the EEG Eye State dataset from GitHub; column names are supplied
# explicitly through `names`
original_eeg_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/EEG%20Eye%20State.arff',
    names=[
        'AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2',
        'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4', 'eye detection',
    ],
    sep=',',
    skipinitialspace=True,
)
original_eeg_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(original_eeg_df) - len(original_eeg_df.dropna())
missing_rows

In [None]:
# show the class balance of the 'eye detection' label
plotBalance(df=original_eeg_df, label='eye detection')

In [None]:
# standard scale every feature column and append the 'eye detection' label as the last column
scale_cols = original_eeg_df.columns.drop('eye detection')
eeg_data = np.concatenate(
    [
        StandardScale(original_eeg_df, scale_cols),
        # double brackets keep the label two-dimensional (one column)
        original_eeg_df[['eye detection']].to_numpy(),
    ],
    axis=1,
)
eeg_data

## 5. In-vehicle coupon recommendation Dataset

In [None]:
# load the in-vehicle coupon recommendation dataset from GitHub
original_coupon_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/in-vehicle-coupon-recommendation.csv',
    skipinitialspace=True,
    sep=',',
)
original_coupon_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(original_coupon_df) - len(original_coupon_df.dropna())
missing_rows

In [None]:
# per-column count of missing values, to see which columns are affected
original_coupon_df.isnull().sum()

In [None]:
# the 'car' column is missing for most of the dataset, so the whole column is dropped
coupon_df = original_coupon_df.drop('car', axis=1)
# the remaining rows with missing values are only 4% of the dataset, so they are dropped too
coupon_df = coupon_df.dropna(axis=0, how='any')
coupon_df

In [None]:
# show the class balance of the coupon label 'Y'
plotBalance(df=coupon_df, label='Y')

In [None]:
# one hot encode every feature column and append the label 'Y' as the last column
encoding_cols = coupon_df.columns.drop('Y')
coupon_data = np.concatenate(
    [
        OneHotEncode(coupon_df, encoding_cols),
        # double brackets keep the label two-dimensional (one column)
        coupon_df[['Y']].to_numpy(),
    ],
    axis=1,
)
coupon_data

## 6. Occupancy Detection Dataset

In [None]:
# load the occupancy detection dataset from GitHub
original_occupancy_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/occupancy.txt',
    skipinitialspace=True,
    sep=',',
)
original_occupancy_df

In [None]:
# the 'date' column adds no value to the data, so it is removed
occupancy_df = original_occupancy_df.drop('date', axis=1)
occupancy_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(occupancy_df) - len(occupancy_df.dropna())
missing_rows

In [None]:
# show the class balance of the 'Occupancy' label
plotBalance(df=occupancy_df, label='Occupancy')

In [None]:
# standard scale every feature column and append the 'Occupancy' label as the last column
scale_cols = occupancy_df.columns.drop('Occupancy')
occupancy_data = np.concatenate(
    [
        StandardScale(occupancy_df, scale_cols),
        # double brackets keep the label two-dimensional (one column)
        occupancy_df[['Occupancy']].to_numpy(),
    ],
    axis=1,
)
occupancy_data

## 7. Default of credit card clients Dataset

In [None]:
# load the default of credit card clients dataset from GitHub
original_credit_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/default%20of%20credit%20card%20clients.csv',
    skipinitialspace=True,
    sep=',',
)
original_credit_df

In [None]:
# the 'ID' column adds no value to the data, so it is removed
credit_df = original_credit_df.drop('ID', axis=1)
credit_df

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(credit_df) - len(credit_df.dropna())
missing_rows

In [None]:
# show the class balance of the 'default payment next month' label
plotBalance(df=credit_df, label='default payment next month')

In [None]:
# one hot encode the categorical columns, standard scale all remaining feature
# columns, and append the 'default payment next month' label as the last column
encode_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
# everything that is neither one hot encoded nor the label gets scaled
scale_cols = [
    name for name in credit_df.columns
    if name not in encode_cols and name != 'default payment next month'
]
credit_data = np.concatenate(
    [
        StandardScale(credit_df, scale_cols),
        OneHotEncode(credit_df, encode_cols),
        # double brackets keep the label two-dimensional (one column)
        credit_df[['default payment next month']].to_numpy(),
    ],
    axis=1,
)
credit_data

## 8. MoCap Hand Postures Dataset

In [None]:
# load the MoCap hand postures dataset from GitHub; '?' entries are parsed as missing values
original_mocap_df = pd.read_csv(
    'https://raw.githubusercontent.com/Hellboy1008/COGS118A_Final_Project/master/data/mocap.csv',
    skipinitialspace=True,
    sep=',',
    na_values='?',
)
original_mocap_df

In [None]:
# the 'User' column provides no additional value to the data, so it is removed
mocap_df = original_mocap_df.drop('User', axis=1)

In [None]:
# count the rows that contain at least one missing value
missing_rows = len(mocap_df) - len(mocap_df.dropna())
missing_rows

In [None]:
# per-column count of missing values, to see which columns are affected
mocap_df.isnull().sum()

In [None]:
# the X/Y/Z columns of markers 5 through 11 each have over 10000 missing values,
# so those columns are dropped entirely (X5, Y5, Z5, X6, ..., Z11)
mocap_df = mocap_df.drop(
    columns=[axis + str(marker) for marker in range(5, 12) for axis in 'XYZ']
)
# the remaining rows with missing values are only 3% of the dataset, so they are dropped too
mocap_df = mocap_df.dropna(axis=0, how='any')
mocap_df

In [None]:
# collapse the posture classes into a binary label:
# classes 3 and 4 become pointing (1); classes 1, 2 and 5 become not pointing (0)
mocap_df['Class'] = mocap_df['Class'].replace(
    to_replace=[1, 2, 3, 4, 5],
    value=[0, 0, 1, 1, 0],
)
mocap_df

In [None]:
# show the class balance of the binary 'Class' label
plotBalance(df=mocap_df, label='Class')

In [None]:
# standard scale every feature column and append the 'Class' label as the last column
scale_cols = mocap_df.columns.drop('Class')
mocap_data = np.concatenate(
    [
        StandardScale(mocap_df, scale_cols),
        # double brackets keep the label two-dimensional (one column)
        mocap_df[['Class']].to_numpy(),
    ],
    axis=1,
)
mocap_data

# Training and testing

In [None]:
# function that appends raw results to a text file; the dataset, algorithm and
# description together form the file name, so each trial's output is kept apart
def write(data, dataset, algorithm, description):
    """Append the printed form of ``data`` to a result file.

    The target is ``./results/<dataset>-<algorithm>-<description>.result``.
    It is opened in append mode, so repeated calls with the same names add
    further lines to the same file.

    Args:
        data: any object; it is written exactly as ``print`` would show it.
        dataset: dataset name used in the file name.
        algorithm: algorithm name used in the file name.
        description: free-form suffix used in the file name.
    """
    import os  # local import: the notebook's import cell does not provide os

    # create the output directory on first use instead of failing inside open()
    os.makedirs('./results', exist_ok=True)
    with open('./results/' + dataset + '-' + algorithm + '-' + description + '.result', 'a') as f:
        # print directly into the file rather than swapping sys.stdout, so an
        # error while printing can never leave stdout redirected
        print(data, file=f)

In [None]:
# evaluates every metric on the test set and returns the values as a list of (name, value) pairs
def getMetrics(y_pred, y_true):
    """Score predictions against the true labels with six metrics.

    Args:
        y_pred: predicted labels.
        y_true: true labels.

    Returns:
        List of ``(metric name, value)`` tuples in the order accuracy,
        average-precision-score, f1, jaccard, recall, roc_auc_score.
    """
    # NOTE(review): the callers in this file pass the output of predict(), so
    # average precision and ROC AUC are computed from hard labels, not scores
    scorers = (
        ('accuracy', accuracy_score),
        ('average-precision-score', average_precision_score),
        ('f1', f1_score),
        ('jaccard', jaccard_score),
        ('recall', recall_score),
        ('roc_auc_score', roc_auc_score),
    )
    return [(name, scorer(y_true, y_pred)) for name, scorer in scorers]

In [None]:
# lookup lists shared by all the training cells below
# the cleaned arrays and their display names, in matching order
datasets = [
    adult_data, mushroom_data, qsar_data, eeg_data,
    coupon_data, occupancy_data, credit_data, mocap_data,
]
datasets_str = [
    'ADULT', 'MUSHROOM', 'QSAR', 'EEG',
    'COUPON', 'OCCUPANCY', 'CREDIT', 'MOCAP',
]
# scorer names handed to GridSearchCV
metrics = ['accuracy', 'f1', 'roc_auc', 'average_precision', 'recall', 'jaccard']
# the cv_results_ rank key of each scorer, in alphabetical order of the scorer name
tests = ['rank_test_' + name for name in sorted(metrics)]

In [None]:
# Support Vector Machines
def SVM_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search an SVC on the training split and log test-set metrics.

    The full ``cv_results_`` are written first; then, for every rank key in
    ``tests``, the top-ranked parameters are retrained on the whole training
    split and the metrics on the test split are written to the same file.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    dataset_name = datasets_str[data_index]
    trial_tag = 'trial-' + str(trial_num + 1)
    search = GridSearchCV(
        SVC(),
        {'kernel': ['linear', 'rbf'], 'C': np.logspace(-4, 2, 7)},
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
        refit=False,
        verbose=1,
    )
    search.fit(X_train, y_train)
    cv_results = search.cv_results_
    write(cv_results, dataset_name, 'svm', trial_tag)
    # find the six best models per metric and record the metrics on the test set
    for rank_key in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        winner = cv_results['params'][np.argmin(cv_results[rank_key])]
        model = SVC(**winner)
        model.fit(X_train, y_train)
        write(getMetrics(model.predict(X_test), y_test), dataset_name, 'svm', trial_tag)

# run five SVM trials per dataset; each trial draws a fresh stratified split
# whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    # the last column is the label, everything before it is a feature
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    for trial_num in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=((X.shape[0]-5000.0)/X.shape[0]),
            stratify=y,
        )
        SVM_test(X_train, X_test, y_train, y_test, trial_num, data_index)

In [None]:
# Logistic Regression
def LGR_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search logistic regression on the training split and log test-set metrics.

    The full ``cv_results_`` are written first; then, for every rank key in
    ``tests``, the top-ranked parameters are retrained on the whole training
    split and the metrics on the test split are written to the same file.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    # one iteration limit shared by the grid search and the retrained models
    max_iter = 5000
    c_values = np.logspace(-4, 4, 5)
    lgr_pipe = Pipeline([('classifier', LogisticRegression())])
    lgr_param_grid = [
        {
            'classifier': [LogisticRegression(max_iter=max_iter)],
            'classifier__solver': ['saga'],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': c_values,
        },
        {
            'classifier': [LogisticRegression(max_iter=max_iter)],
            'classifier__solver': ['lbfgs'],
            'classifier__penalty': ['l2'],
            'classifier__C': c_values,
        },
    ]
    lgr_clf = GridSearchCV(lgr_pipe, lgr_param_grid, cv=StratifiedKFold(n_splits=5), scoring=metrics, refit=False, verbose=3)
    lgr_best_model = lgr_clf.fit(X_train, y_train)
    write(lgr_best_model.cv_results_, datasets_str[data_index], 'lgr', 'trial-' + str(trial_num + 1))
    # find the six best models per metric and record the metrics on the test set
    for test in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        best_params = lgr_best_model.cv_results_['params'][np.argmin(lgr_best_model.cv_results_[test])]
        # retrain with the same max_iter as the cross-validated candidates;
        # previously it fell back to the default, so the retrained model could
        # differ from the one the grid search actually selected
        best_model = LogisticRegression(
            C=best_params['classifier__C'],
            solver=best_params['classifier__solver'],
            penalty=best_params['classifier__penalty'],
            max_iter=max_iter,
        )
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        write(getMetrics(y_pred, y_test), datasets_str[data_index], 'lgr', 'trial-' + str(trial_num + 1))

# run five logistic regression trials per dataset; each trial draws a fresh
# stratified split whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    # the last column is the label, everything before it is a feature
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    for trial_num in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=((X.shape[0]-5000.0)/X.shape[0]),
            stratify=y,
        )
        LGR_test(X_train, X_test, y_train, y_test, trial_num, data_index)

In [None]:
# K Nearest Neighbors
def KNN_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search k-nearest neighbors on the training split and log test-set metrics.

    The candidate neighbor counts are 1, 5, 9, ... up to the number of
    features; for data_index 2 ('QSAR' in ``datasets_str``) the upper bound is
    the square root of the number of features instead.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    dataset_name = datasets_str[data_index]
    trial_tag = 'trial-' + str(trial_num + 1)
    feature_count = X_train.shape[1]
    if data_index == 2:
        upper = int(math.sqrt(feature_count)+1)
    else:
        upper = feature_count + 1
    neighbor_counts = np.arange(1, upper, 4)
    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': neighbor_counts, 'algorithm': ['ball_tree', 'kd_tree']},
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
        refit=False,
        verbose=1,
    )
    search.fit(X_train, y_train)
    cv_results = search.cv_results_
    write(cv_results, dataset_name, 'knn', trial_tag)
    # find the six best models per metric and record the metrics on the test set
    for rank_key in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        winner = cv_results['params'][np.argmin(cv_results[rank_key])]
        model = KNeighborsClassifier(**winner)
        model.fit(X_train, y_train)
        write(getMetrics(model.predict(X_test), y_test), dataset_name, 'knn', trial_tag)

# run five KNN trials per dataset; each trial draws a fresh stratified split
# whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    for trial_num in range(5):
        # the features must be taken from the current dataset; without this line
        # a stale X left over from an earlier cell was split against this dataset's y
        X = data[:, 0:-1]
        y = data[:, -1:].flatten()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=((X.shape[0]-5000.0)/X.shape[0]), stratify=y)
        KNN_test(X_train, X_test, y_train, y_test, trial_num, data_index)

In [None]:
# Decision Trees
def DT_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search a decision tree on the training split and log test-set metrics.

    The full ``cv_results_`` are written first; then, for every rank key in
    ``tests``, the top-ranked parameters are retrained on the whole training
    split and the metrics on the test split are written to the same file.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    dataset_name = datasets_str[data_index]
    trial_tag = 'trial-' + str(trial_num + 1)
    # the same seed is used for the grid search and for the retrained trees
    seed = (data_index+1)*10+trial_num
    search = GridSearchCV(
        DecisionTreeClassifier(random_state=seed),
        {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'max_depth': np.arange(1, 41),
        },
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
        refit=False,
        verbose=1,
    )
    search.fit(X_train, y_train)
    cv_results = search.cv_results_
    write(cv_results, dataset_name, 'dt', trial_tag)
    # find the six best models per metric and record the metrics on the test set
    for rank_key in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        winner = cv_results['params'][np.argmin(cv_results[rank_key])]
        model = DecisionTreeClassifier(**winner, random_state=seed)
        model.fit(X_train, y_train)
        write(getMetrics(model.predict(X_test), y_test), dataset_name, 'dt', trial_tag)

# run five decision tree trials per dataset; each trial draws a fresh
# stratified split whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    # the last column is the label, everything before it is a feature
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    for trial_num in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=((X.shape[0]-5000.0)/X.shape[0]),
            stratify=y,
        )
        DT_test(X_train, X_test, y_train, y_test, trial_num, data_index)

In [None]:
# Gaussian Naive Bayes
def NB_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search Gaussian naive Bayes on the training split and log test-set metrics.

    The full ``cv_results_`` are written first; then, for every rank key in
    ``tests``, the top-ranked parameters are retrained on the whole training
    split and the metrics on the test split are written to the same file.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    dataset_name = datasets_str[data_index]
    trial_tag = 'trial-' + str(trial_num + 1)
    search = GridSearchCV(
        GaussianNB(),
        {'var_smoothing': np.logspace(-11, 0, 12)},
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
        refit=False,
        verbose=1,
    )
    search.fit(X_train, y_train)
    cv_results = search.cv_results_
    write(cv_results, dataset_name, 'nb', trial_tag)
    # find the six best models per metric and record the metrics on the test set
    for rank_key in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        winner = cv_results['params'][np.argmin(cv_results[rank_key])]
        model = GaussianNB(**winner)
        model.fit(X_train, y_train)
        write(getMetrics(model.predict(X_test), y_test), dataset_name, 'nb', trial_tag)

# run five naive Bayes trials per dataset; each trial draws a fresh
# stratified split whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    # the last column is the label, everything before it is a feature
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    for trial_num in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=((X.shape[0]-5000.0)/X.shape[0]),
            stratify=y,
        )
        NB_test(X_train, X_test, y_train, y_test, trial_num, data_index)

In [None]:
# Random Forests
def RF_test(X_train, X_test, y_train, y_test, trial_num, data_index):
    """Grid search a random forest on the training split and log test-set metrics.

    The full ``cv_results_`` are written first; then, for every rank key in
    ``tests``, the top-ranked parameters are retrained on the whole training
    split and the metrics on the test split are written to the same file.

    Args:
        X_train, X_test: feature arrays of the two splits.
        y_train, y_test: label arrays of the two splits.
        trial_num: zero-based trial index (files are named with trial_num + 1).
        data_index: index into ``datasets_str`` for the file name.
    """
    dataset_name = datasets_str[data_index]
    trial_tag = 'trial-' + str(trial_num + 1)
    # the same seed is used for the grid search and for the retrained forests
    seed = (data_index+1)*10+trial_num
    search = GridSearchCV(
        RandomForestClassifier(random_state=seed),
        {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'n_estimators': [100, 200, 300],
            'max_depth': np.arange(1, 22, 3),
        },
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
        refit=False,
        verbose=1,
    )
    search.fit(X_train, y_train)
    cv_results = search.cv_results_
    write(cv_results, dataset_name, 'rf', trial_tag)
    # find the six best models per metric and record the metrics on the test set
    for rank_key in tests:
        # rank 1 is the best, so argmin selects the top-ranked parameter set
        winner = cv_results['params'][np.argmin(cv_results[rank_key])]
        model = RandomForestClassifier(**winner, random_state=seed)
        model.fit(X_train, y_train)
        write(getMetrics(model.predict(X_test), y_test), dataset_name, 'rf', trial_tag)

# run five random forest trials per dataset; each trial draws a fresh
# stratified split whose test fraction leaves about 5000 rows for training
for data_index, data in enumerate(datasets):
    # the last column is the label, everything before it is a feature
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    for trial_num in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=((X.shape[0]-5000.0)/X.shape[0]),
            stratify=y,
        )
        RF_test(X_train, X_test, y_train, y_test, trial_num, data_index)