# Machine Learning Exercise 1 - Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV

import time

In [2]:
def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    """Fit, time and evaluate one model per parameter setting.

    Parameters
    ----------
    fit_fun : str or callable
        Either a callable ``fit_fun(p, X_train, y_train)`` that returns a
        fitted estimator, or the name of such a function in this module's
        global namespace (e.g. ``'fit_knn'``).
    params : list of dict
        One keyword-argument dict per model variant to build.
    X_train, X_test : pandas.DataFrame
        Feature matrices of the holdout split.
    y_train, y_test : pandas.Series
        Targets of the holdout split.

    Returns
    -------
    tuple
        ``(timings, scores, params, best_model, ho_accs)`` where
        ``timings`` holds the fit duration in seconds per setting,
        ``scores`` the ``cross_validate`` result dict per setting,
        ``params`` the input list unchanged, ``best_model`` the fitted model
        with the highest holdout accuracy (the string ``'none'`` when
        ``params`` is empty) and ``ho_accs`` the holdout accuracy per setting.

    Raises
    ------
    NameError
        If ``fit_fun`` is a string that names no global object.
    """
    # Resolve the fit function by plain lookup instead of eval(): same result
    # for the names used in this notebook, without executing arbitrary text.
    if callable(fit_fun):
        fit = fit_fun
    else:
        try:
            fit = globals()[fit_fun]
        except KeyError:
            raise NameError(f"name '{fit_fun}' is not defined") from None

    scoring = {'accuracy' : make_scorer(accuracy_score),
               'precision' : make_scorer(precision_score, average = 'macro', zero_division = 0),
               'recall' : make_scorer(recall_score, average = 'macro', zero_division = 0),
               # zero_division = 0 keeps this scorer consistent with precision/recall
               # and avoids an UndefinedMetricWarning for classes that are never predicted.
               'f1_score' : make_scorer(f1_score, average = 'macro', zero_division = 0),
              }

    # The cross-validation data does not depend on the parameter setting, so
    # build it once. pd.concat replaces DataFrame/Series.append, which was
    # removed in pandas 2.0.
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    timings = []
    scores = []
    ho_accs = []
    best_model = 'none'  # sentinel kept for callers when params is empty
    best_ho_acc = float('-inf')

    for p in params:
        # perf_counter is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        model = fit(p, X_train, y_train)
        timings.append(time.perf_counter() - start)

        # 5-fold cross-validation over the complete (train + test) data.
        scores.append(cross_validate(model, X, y, cv = 5, scoring = scoring))

        # holdout accuracy of the model fitted on the training split
        ho_acc = accuracy_score(y_test, model.predict(X_test))
        ho_accs.append(ho_acc)

        # Keep the model that actually performs best on the holdout set
        # (first one wins on ties) instead of whichever was fitted last.
        if ho_acc > best_ho_acc:
            best_model = model
            best_ho_acc = ho_acc

    return timings, scores, params, best_model, ho_accs

In [3]:
# KNN model builder
def fit_knn(params, X_train, y_train):
    """Return a KNeighborsClassifier built from ``params`` and fitted on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier's
    constructor.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return KNeighborsClassifier(**params).fit(X_train, y_train)
    
def build_knn(X_train, X_test, y_train, y_test):
    """Evaluate KNN with 1, 5 and 10 neighbours and return build_generic's result tuple."""
    neighbour_grid = [{'n_neighbors': k} for k in (1, 5, 10)]
    return build_generic('fit_knn', neighbour_grid, X_train, X_test, y_train, y_test)

In [4]:
# Tree model builder
def fit_tree(params, X_train, y_train):
    """Return a DecisionTreeClassifier built from ``params`` and fitted on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier's
    constructor.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return DecisionTreeClassifier(**params).fit(X_train, y_train)
    
def build_tree(X_train, X_test, y_train, y_test):
    """Evaluate three decision-tree settings and return build_generic's result tuple."""
    tree_grid = [
        {'max_depth': 5, 'min_samples_leaf': 4},
        {'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'},
        {'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'},
    ]
    return build_generic('fit_tree', tree_grid, X_train, X_test, y_train, y_test)

In [5]:
# MLP model builder
def fit_mlp(params, X_train, y_train):
    """Return an MLPClassifier built from ``params`` and fitted on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier's
    constructor.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return MLPClassifier(**params).fit(X_train, y_train)
    
def build_mlp(X_train, X_test, y_train, y_test):
    """Evaluate three MLP solver settings and return build_generic's result tuple."""
    mlp_grid = [
        {'early_stopping': True, 'solver': 'adam'},
        {'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'},
        {'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300},
    ]
    return build_generic('fit_mlp', mlp_grid, X_train, X_test, y_train, y_test)

In [6]:
def _summarise_runs(timings, scores, params, ho_accs):
    """Turn one model family's build results into a frame with one row per parameter setting."""
    return pd.DataFrame({
        'params': list(params),
        'time': timings,
        'holdout accuracy': ho_accs,
        # cross_validate reports one value per fold; the table shows the fold mean.
        'accuracy': [s['test_accuracy'].mean() for s in scores],
        'precision': [s['test_precision'].mean() for s in scores],
        'recall': [s['test_recall'].mean() for s in scores],
        'f1_score': [s['test_f1_score'].mean() for s in scores],
    })


def build_models(X_train, X_test, y_train, y_test):
    """Build the KNN, decision-tree and MLP variants and tabulate their results.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Holdout split that is passed unchanged to ``build_knn``,
        ``build_tree`` and ``build_mlp``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter setting. Columns are a two-level MultiIndex:
        model family (``KNN``, ``TREE``, ``MLP``) on top and, below it,
        ``params``, ``time``, ``holdout accuracy`` and the mean
        cross-validated ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``.
    """
    builders = {'KNN': build_knn, 'TREE': build_tree, 'MLP': build_mlp}

    frames = []
    for build in builders.values():
        # The fitted model returned by the builder is not needed for the table.
        timings, scores, params, _model, ho_accs = build(X_train, X_test, y_train, y_test)
        frames.append(_summarise_runs(timings, scores, params, ho_accs))

    # Side-by-side concatenation adapts to however many settings each family
    # defines (no hard-coded row count); shorter families are padded with NaN.
    results = pd.concat(frames, axis = 1, keys = list(builders))
    return results

In [7]:
# Show the full parameter dictionaries in the result tables instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Seed NumPy's global random generator so that a "Restart & Run All" reproduces
# the same splits and models: scikit-learn falls back to this generator whenever
# random_state is left at None.
np.random.seed(42)

# Start of the wall-clock measurement reported in the last cell.
notebook_time = time.time()

## Mushroom Edibility

In [8]:
# Load the mushroom data and integer-encode every column with LabelEncoder.
mushrooms = pd.read_csv('./mushrooms/mushrooms.csv').apply(LabelEncoder().fit_transform)

# Target column.
mushrooms_y = mushrooms['edibility']

# these features are enough to classify the whole dataset, see agaricus-lepiota.names
mushrooms_X = mushrooms.drop(columns='edibility').loc[
    :,
    ['odor', 'spore-print-color', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'habitat', 'cap-color'],
]

In [9]:
# A fixed random_state makes the holdout split identical every time this cell runs.
mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test = train_test_split(
    mushrooms_X, mushrooms_y, random_state=42
)

mushrooms_results = build_models(mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test)
mushrooms_results

Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,...,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.018936,1.0,1.0,1.0,1.0,1.0,"{'max_depth': 5, 'min_samples_leaf': 4}",0.005419,0.996198,...,0.996828,0.9965,0.996657,"{'early_stopping': True, 'solver': 'adam'}",8.586573,1.0,0.990851,0.991267,0.990439,0.990801
1,{'n_neighbors': 5},0.011325,1.0,1.0,1.0,1.0,1.0,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.004126,1.0,...,1.0,1.0,1.0,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",7.747749,0.855038,0.831391,0.830873,0.830287,0.830482
2,{'n_neighbors': 10},0.013205,0.996198,0.998693,0.998717,0.998663,0.998687,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.004667,1.0,...,1.0,1.0,1.0,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",5.720261,1.0,1.0,1.0,1.0,1.0


## Soybeans

In [10]:
soybeans = pd.read_csv('./soybeans/soybean_cleaned.csv')

# handle missing values by dropping, see pdf for more info
# (reassignment instead of inplace=True, which brings no benefit and hinders chaining)
soybeans = soybeans.dropna()

# encode labels
soybeans = soybeans.apply(LabelEncoder().fit_transform)

soybeans_X = soybeans.drop('class', axis=1)
soybeans_y = soybeans['class']

# A fixed random_state makes the holdout split identical every time this cell runs.
soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test = train_test_split(
    soybeans_X, soybeans_y, random_state=42
)

In [11]:
# Build and evaluate all model variants on the soybean split, then display the table.
soybeans_results = build_models(
    X_train=soybeans_X_train,
    X_test=soybeans_X_test,
    y_train=soybeans_y_train,
    y_test=soybeans_y_test,
)
soybeans_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,...,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.007485,0.865248,0.822155,0.878251,0.827593,0.840024,"{'max_depth': 5, 'min_samples_leaf': 4}",0.002577,0.716312,...,0.644369,0.667914,0.630811,"{'early_stopping': True, 'solver': 'adam'}",0.459778,0.829787,0.758028,0.797879,0.711774,0.726799
1,{'n_neighbors': 5},0.002903,0.77305,0.784719,0.850757,0.761667,0.777536,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.002873,0.914894,...,0.871678,0.869113,0.859335,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",0.557056,0.312057,0.439602,0.256412,0.252349,0.228068
2,{'n_neighbors': 10},0.002471,0.730496,0.756242,0.805023,0.706813,0.72201,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.002953,0.921986,...,0.929047,0.906598,0.908504,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",1.984764,0.914894,0.864744,0.919447,0.900517,0.901509


## Breast Cancer Data

In [12]:
# Labelled training file and the separate input file that carries no 'class' column here.
breastcancer_train = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')
breastcancer_sol_input = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.tes.csv')

# The ID column is dropped from both files before modelling.
breastcancer_train = breastcancer_train.drop('ID', axis=1)
breastcancer_sol_input = breastcancer_sol_input.drop('ID', axis=1)

breastcancer_X = breastcancer_train.drop('class', axis=1)
breastcancer_y = breastcancer_train['class']

# A fixed random_state makes the holdout split identical every time this cell runs.
breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test = train_test_split(
    breastcancer_X, breastcancer_y, random_state=42
)

In [13]:
# Build and evaluate all model variants on the breast cancer split, then display the table.
breastcancer_results = build_models(
    X_train=breastcancer_X_train,
    X_test=breastcancer_X_test,
    y_train=breastcancer_y_train,
    y_test=breastcancer_y_test,
)
breastcancer_results

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,...,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.005484,0.888889,0.926316,0.932369,0.907681,0.916178,"{'max_depth': 5, 'min_samples_leaf': 4}",0.004458,0.944444,...,0.912861,0.911657,0.911011,"{'early_stopping': True, 'solver': 'adam'}",0.045944,0.666667,0.866667,0.809878,0.814068,0.802727
1,{'n_neighbors': 5},0.002087,0.902778,0.940351,0.949136,0.920121,0.931283,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.004065,0.930556,...,0.918865,0.911799,0.914454,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",0.252822,0.666667,0.715789,0.648209,0.677105,0.603196
2,{'n_neighbors': 10},0.003385,0.888889,0.915789,0.924919,0.88926,0.902232,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.001987,0.944444,...,0.896783,0.867994,0.87729,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",0.113721,0.666667,0.821053,0.683793,0.756031,0.712404


## Purchase Data

In [14]:
purchase_train = pd.read_csv('./purchase/purchase600-100cls-15k.lrn.csv')
purchase_sol_input = pd.read_csv('./purchase/purchase600-100cls-15k.tes.csv')

# labels do not need to be encoded, inputs are numeric

# The ID column is dropped from both files before modelling.
purchase_train = purchase_train.drop('ID', axis=1)
purchase_sol_input = purchase_sol_input.drop('ID', axis=1)

purchase_X = purchase_train.drop('class', axis=1)
purchase_y = purchase_train['class']

# Half of the rows are held out; a fixed random_state makes the split
# identical every time this cell runs.
purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test = train_test_split(
    purchase_X, purchase_y, test_size = 0.5, random_state=42
)

In [15]:
# Build and evaluate all model variants on the purchase split, then display the table.
purchase_results = build_models(
    X_train=purchase_X_train,
    X_test=purchase_X_test,
    y_train=purchase_y_train,
    y_test=purchase_y_test,
)
purchase_results



Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,...,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.43002,0.192,0.2035,0.208637,0.20045,0.18232,"{'max_depth': 5, 'min_samples_leaf': 4}",0.187971,0.0872,...,0.039902,0.06883,0.041394,"{'early_stopping': True, 'solver': 'adam'}",9.102995,0.6134,0.6865,0.690486,0.658368,0.663109
1,{'n_neighbors': 5},0.352467,0.2316,0.2536,0.259943,0.236063,0.211383,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.648633,0.0914,...,0.088836,0.085955,0.082213,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",15.502464,0.0268,0.6875,0.681452,0.64277,0.651025
2,{'n_neighbors': 10},0.331397,0.2476,0.2764,0.3061,0.255396,0.233079,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.593883,0.091,...,0.084583,0.084115,0.080671,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",19.933641,0.6258,0.7207,0.716441,0.699654,0.700212


In [16]:
# Report the wall-clock time elapsed since notebook_time was recorded.
elapsed_seconds = time.time() - notebook_time
print(f'notebook took this long in seconds: {elapsed_seconds}')

notebook took this long in seconds: 1366.580510377884
