# Machine Learning Exercise 1 - Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

import time

In [2]:
def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    """Fit one model per parameter set, time the fit and cross-validate it.

    Parameters
    ----------
    fit_fun : callable or str
        Function called as ``fit_fun(p, X_train, y_train)`` that returns a
        fitted model. For backward compatibility the name of a module-level
        function (e.g. ``'fit_knn'``) is also accepted; it is looked up by
        name instead of being passed through ``eval``.
    params : sequence of dict
        One keyword-argument dict per model configuration.
    X_train, X_test, y_train, y_test : pandas objects
        Train/test split. The model is fitted (and timed) on the training
        part only; cross-validation runs on train and test stacked together.

    Returns
    -------
    tuple
        ``(timings, scores, params, best_model)`` where ``timings`` holds the
        fit time in seconds per configuration, ``scores`` the mean 5-fold
        cross-validation score per configuration, ``params`` the input
        sequence unchanged, and ``best_model`` the fitted model with the
        highest mean score (the string ``'none'`` when ``params`` is empty).

    Raises
    ------
    NameError
        If ``fit_fun`` is a name that is not defined at module level.
    """
    # Resolve the fit function without eval: a plain name lookup cannot
    # execute arbitrary expressions.
    if callable(fit_fun):
        fit = fit_fun
    else:
        try:
            fit = globals()[fit_fun]
        except KeyError:
            raise NameError(f"name '{fit_fun}' is not defined") from None

    # The combined data does not depend on the parameter set, so build it once.
    # pd.concat replaces DataFrame.append / Series.append (removed in pandas 2.0).
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    timings = []
    scores = []
    best_model = 'none'
    best_score = None

    for p in params:
        start = time.time()
        model = fit(p, X_train, y_train)
        timings.append(time.time() - start)

        score = cross_val_score(model, X, y, cv=5).mean()
        scores.append(score)

        # Keep the model with the highest mean score; on ties the earlier
        # configuration wins.
        if best_score is None or score > best_score:
            best_score = score
            best_model = model

    return timings, scores, params, best_model

In [3]:
# KNN model builder
def fit_knn(params, X_train, y_train):
    """Build a KNeighborsClassifier from ``params`` and fit it on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier
    constructor. The fitted classifier is returned.
    """
    classifier = KNeighborsClassifier(**params)
    # fit() hands back the estimator itself, so the result can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_knn(X_train, X_test, y_train, y_test):
    """Evaluate KNN with 1, 5 and 10 neighbours via ``build_generic``.

    Returns whatever ``build_generic`` returns for the ``'fit_knn'`` fit function.
    """
    neighbour_grid = [{'n_neighbors': k} for k in (1, 5, 10)]
    return build_generic('fit_knn', neighbour_grid, X_train, X_test, y_train, y_test)

In [4]:
# Tree model builder
def fit_tree(params, X_train, y_train):
    """Build a DecisionTreeClassifier from ``params`` and fit it on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier
    constructor. The fitted classifier is returned.
    """
    classifier = DecisionTreeClassifier(**params)
    # fit() hands back the estimator itself, so the result can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_tree(X_train, X_test, y_train, y_test):
    """Evaluate three decision-tree configurations via ``build_generic``.

    The configurations are a depth-5 tree, and depth-20 trees with the
    ``'best'`` and ``'random'`` splitters. Returns whatever ``build_generic``
    returns for the ``'fit_tree'`` fit function.
    """
    shallow = {'max_depth': 5}
    deep_best = {'max_depth': 20, 'splitter': 'best'}
    deep_random = {'max_depth': 20, 'splitter': 'random'}

    tree_grid = [shallow, deep_best, deep_random]
    return build_generic('fit_tree', tree_grid, X_train, X_test, y_train, y_test)

In [5]:
# MLP model builder
def fit_mlp(params, X_train, y_train):
    """Build an MLPClassifier from ``params`` and fit it on the training data.

    ``params`` is a dict of keyword arguments forwarded to the classifier
    constructor. The fitted classifier is returned.
    """
    classifier = MLPClassifier(**params)
    # fit() hands back the estimator itself, so the result can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_mlp(X_train, X_test, y_train, y_test):
    """Evaluate three MLP configurations (adam, sgd, lbfgs) via ``build_generic``.

    Every configuration sets ``early_stopping`` to True; they differ in the
    solver and its solver-specific settings. Returns whatever
    ``build_generic`` returns for the ``'fit_mlp'`` fit function.
    """
    solver_settings = (
        {'solver': 'adam'},
        {'solver': 'sgd', 'learning_rate': 'adaptive'},
        {'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300},
    )
    # 'early_stopping' goes first so the key order of each dict is unchanged.
    mlp_grid = [{'early_stopping': True, **settings} for settings in solver_settings]

    return build_generic('fit_mlp', mlp_grid, X_train, X_test, y_train, y_test)

In [6]:
def build_models(X_train, X_test, y_train, y_test):
    """Run the KNN, decision-tree and MLP builders and tabulate their results.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split, passed unchanged to ``build_knn``, ``build_tree``
        and ``build_mlp``.

    Returns
    -------
    pandas.DataFrame
        One row per configuration index (0, 1, ...). Columns are a two-level
        index: model family (``KNN``, ``TREE``, ``MLP``) by ``time``,
        ``accuracy`` and ``params``.

    Raises
    ------
    ValueError
        If the builders do not all report the same number of configurations,
        since the results could not be laid out row by row.
    """
    families = ['KNN', 'TREE', 'MLP']
    builders = [build_knn, build_tree, build_mlp]

    # Each builder returns (timings, scores, params, model); the model is not tabulated.
    per_family = []
    for build in builders:
        timings, scores, params, _model = build(X_train, X_test, y_train, y_test)
        per_family.append((timings, scores, params))

    # Derive the row count from the results instead of hard-coding 3, so the
    # table follows the parameter grids when they grow or shrink.
    n_rows = len(per_family[0][0])
    for family, columns in zip(families, per_family):
        if any(len(column) != n_rows for column in columns):
            raise ValueError(
                f'{family} reported a different number of configurations '
                f'than expected ({n_rows})'
            )

    idx = pd.MultiIndex.from_product([families, ['time', 'accuracy', 'params']])

    data = []
    for i in range(n_rows):
        row = []
        for timings, scores, params in per_family:
            row.extend([timings[i], scores[i], params[i]])
        data.append(row)

    results = pd.DataFrame(data, columns=idx, index=list(range(n_rows)))
    return results

In [7]:
# Do not truncate column contents in DataFrame output, so the parameter dicts
# in the results tables are shown in full.
pd.set_option('display.max_colwidth', None)

# Wall-clock start time; the last cell subtracts it to report the total runtime.
notebook_time = time.time()

## Mushroom Edibility

In [8]:
# Load the mushroom data and integer-encode every column (target included);
# DataFrame.apply calls the encoder's fit_transform once per column.
mushrooms = pd.read_csv('./mushrooms/mushrooms.csv').apply(LabelEncoder().fit_transform)

# Separate the target from the features.
mushrooms_y = mushrooms['edibility']
mushrooms_X = mushrooms.drop('edibility', axis=1)

# these features are enough to classify the whole dataset, see agaricus-lepiota.names
mushrooms_X = mushrooms_X[[
    'odor',
    'spore-print-color',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'habitat',
    'cap-color',
]]

In [9]:
# Fixed seed so the train/test split (and the row order that is later fed to
# cross-validation) is the same on every Restart & Run All.
mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test = train_test_split(
    mushrooms_X, mushrooms_y, random_state=42
)

# Fit and cross-validate every KNN / decision-tree / MLP configuration.
mushrooms_results = build_models(mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test)
# Bare trailing expression so the notebook renders the results table.
mushrooms_results

Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.017311,1.0,{'n_neighbors': 1},0.007273,0.996317,{'max_depth': 5},3.526148,0.986217,"{'early_stopping': True, 'solver': 'adam'}"
1,0.010543,1.0,{'n_neighbors': 5},0.003886,1.0,"{'max_depth': 20, 'splitter': 'best'}",4.863949,0.831513,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}"
2,0.010284,0.999168,{'n_neighbors': 10},0.003592,1.0,"{'max_depth': 20, 'splitter': 'random'}",3.034235,1.0,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}"


## Soybeans

In [10]:
soybeans = pd.read_csv('./soybeans/soybean_cleaned.csv')

# handle missing values by dropping, see pdf for more info
# (reassignment instead of inplace=True, so the step is an explicit new frame)
soybeans = soybeans.dropna()

# encode labels: every column, target included, is integer-encoded
soybeans = soybeans.apply(LabelEncoder().fit_transform)

soybeans_X = soybeans.drop('class', axis=1)
soybeans_y = soybeans['class']

# Fixed seed so the train/test split is the same on every Restart & Run All.
soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test = train_test_split(
    soybeans_X, soybeans_y, random_state=42
)

In [11]:
# Fit and cross-validate every KNN / decision-tree / MLP configuration on the soybean split.
soybeans_results = build_models(soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test)
# Bare trailing expression so the notebook renders the results table.
soybeans_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.003997,0.839934,{'n_neighbors': 1},0.002589,0.7189,{'max_depth': 5},0.286835,0.70817,"{'early_stopping': True, 'solver': 'adam'}"
1,0.002521,0.793616,{'n_neighbors': 5},0.002788,0.909308,"{'max_depth': 20, 'splitter': 'best'}",0.614347,0.316846,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}"
2,0.002629,0.752718,{'n_neighbors': 10},0.002464,0.905705,"{'max_depth': 20, 'splitter': 'random'}",0.649894,0.877212,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}"


## Breast Cancer Data

In [12]:
# Labelled learning set, plus the separate test-set file that is loaded here
# but not used for model building in this section.
breastcancer_train = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')
breastcancer_sol_input = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.tes.csv')

# Drop the ID column so it is not used as a feature.
breastcancer_train = breastcancer_train.drop('ID', axis=1)
breastcancer_sol_input = breastcancer_sol_input.drop('ID', axis=1)

breastcancer_X = breastcancer_train.drop('class', axis=1)
breastcancer_y = breastcancer_train['class']

# Fixed seed so the train/test split is the same on every Restart & Run All.
breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test = train_test_split(
    breastcancer_X, breastcancer_y, random_state=42
)

In [13]:
# Fit and cross-validate every KNN / decision-tree / MLP configuration on the breast-cancer split.
breastcancer_results = build_models(breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test)
# Bare trailing expression so the notebook renders the results table.
breastcancer_results

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.006145,0.901754,{'n_neighbors': 1},0.004206,0.922807,{'max_depth': 5},0.040752,0.8,"{'early_stopping': True, 'solver': 'adam'}"
1,0.002441,0.915789,{'n_neighbors': 5},0.003934,0.912281,"{'max_depth': 20, 'splitter': 'best'}",0.157472,0.764912,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}"
2,0.001973,0.919298,{'n_neighbors': 10},0.001873,0.912281,"{'max_depth': 20, 'splitter': 'random'}",0.041036,0.929825,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}"


## Purchase Data

In [14]:
# Labelled learning set, plus the separate test-set file that is loaded here
# but not used for model building in this section.
purchase_train = pd.read_csv('./purchase/purchase600-100cls-15k.lrn.csv')
purchase_sol_input = pd.read_csv('./purchase/purchase600-100cls-15k.tes.csv')

# labels do not need to be encoded, inputs are numeric

# Drop the ID column so it is not used as a feature.
purchase_train = purchase_train.drop('ID', axis=1)
purchase_sol_input = purchase_sol_input.drop('ID', axis=1)

purchase_X = purchase_train.drop('class', axis=1)
purchase_y = purchase_train['class']

# Fixed seed so the train/test split is the same on every Restart & Run All.
purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test = train_test_split(
    purchase_X, purchase_y, random_state=42
)

In [15]:
# Fit and cross-validate every KNN / decision-tree / MLP configuration on the purchase split.
purchase_results = build_models(purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test)
# Bare trailing expression so the notebook renders the results table.
purchase_results



Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.500913,0.2081,{'n_neighbors': 1},0.250532,0.0956,{'max_depth': 5},8.224043,0.6766,"{'early_stopping': True, 'solver': 'adam'}"
1,0.458227,0.2552,{'n_neighbors': 5},1.411533,0.0908,"{'max_depth': 20, 'splitter': 'best'}",51.859043,0.6838,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}"
2,0.457258,0.2807,{'n_neighbors': 10},1.323388,0.0937,"{'max_depth': 20, 'splitter': 'random'}",26.748865,0.7252,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}"


In [16]:
# Report the wall-clock seconds elapsed since notebook_time was recorded.
print(f'notebook took this long in seconds: {time.time()-notebook_time}')

notebook took this long in seconds: 1007.4870054721832
