# Machine Learning Exercise 1 - Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

import time

In [50]:
def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    """Fit one model per parameter set, time the fit and cross-validate it.

    Parameters
    ----------
    fit_fun : str or callable
        Either a callable ``fit_fun(params, X_train, y_train)`` that returns
        a fitted estimator, or the name of such a function defined at
        notebook (module) level, e.g. ``'fit_knn'``.
    params : list of dict
        One dict of estimator keyword arguments per configuration to try.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Data used for the timed fit.
    X_test, y_test : pandas.DataFrame, pandas.Series
        Hold-out part of the data. It is only used to rebuild the full data
        set that the 5-fold cross-validation runs on.

    Returns
    -------
    timings : list of float
        Seconds spent in ``fit_fun`` for each configuration.
    scores : list of float
        Mean 5-fold cross-validation score for each configuration.
    params : list of dict
        The ``params`` argument, returned unchanged.
    best_model : estimator or str
        The fitted model with the highest mean CV score (the first one wins
        on ties), or the string ``'none'`` when ``params`` is empty.

    Raises
    ------
    NameError
        If ``fit_fun`` is a string that does not name a module-level object.
    """
    # Resolve the fitting function by lookup instead of eval(): same result
    # for a plain function name, but no arbitrary code execution.
    if callable(fit_fun):
        fit = fit_fun
    else:
        try:
            fit = globals()[fit_fun]
        except KeyError:
            raise NameError(f"name '{fit_fun}' is not defined") from None

    # The full data set does not depend on the parameter set, so build it
    # once. pd.concat replaces DataFrame.append / Series.append, which were
    # removed in pandas 2.0.
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    timings = []
    scores = []
    best_model = 'none'
    best_score = None

    for p in params:
        # perf_counter is a monotonic, high-resolution clock for intervals.
        start = time.perf_counter()
        model = fit(p, X_train, y_train)
        timings.append(time.perf_counter() - start)

        score = cross_val_score(model, X, y, cv=5).mean()
        scores.append(score)

        # Keep the best-scoring model rather than whichever was fitted last.
        if best_score is None or score > best_score:
            best_score = score
            best_model = model

    return timings, scores, params, best_model

In [51]:
# KNN model builder
def fit_knn(params, X_train, y_train):
    """Build a k-nearest-neighbours classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments forwarded to
    ``KNeighborsClassifier``; the fitted classifier is returned.
    """
    classifier = KNeighborsClassifier(**params)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_knn(X_train, X_test, y_train, y_test):
    """Evaluate k-NN for n_neighbors = 1, 5 and 10.

    Returns whatever ``build_generic`` returns for these three parameter
    sets: (timings, scores, params, model).
    """
    neighbor_counts = (1, 5, 10)
    params = [{'n_neighbors': k} for k in neighbor_counts]

    return build_generic('fit_knn', params, X_train, X_test, y_train, y_test)

In [52]:
# Tree model builder
def fit_tree(params, X_train, y_train):
    """Build a decision-tree classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments forwarded to
    ``DecisionTreeClassifier``; the fitted classifier is returned.
    """
    classifier = DecisionTreeClassifier(**params)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_tree(X_train, X_test, y_train, y_test):
    """Evaluate decision trees for max_depth = None, 5 and 20.

    Returns whatever ``build_generic`` returns for these three parameter
    sets: (timings, scores, params, model).
    """
    depth_limits = (None, 5, 20)
    params = [{'max_depth': depth} for depth in depth_limits]

    return build_generic('fit_tree', params, X_train, X_test, y_train, y_test)

In [53]:
# MLP model builder
def fit_mlp(params, X_train, y_train):
    """Build a multi-layer-perceptron classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments forwarded to
    ``MLPClassifier``; the fitted classifier is returned.
    """
    classifier = MLPClassifier(**params)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)
    
def build_mlp(X_train, X_test, y_train, y_test):
    """Evaluate the MLP classifier three times with default settings.

    All three parameter sets are empty dicts, so every run uses the
    estimator's defaults. Returns whatever ``build_generic`` returns:
    (timings, scores, params, model).
    """
    # Three separate (not shared) empty dicts, one per run.
    # NOTE(review): no hyperparameters are varied here yet; confirm whether
    # distinct configurations were intended.
    params = [{} for _ in range(3)]

    return build_generic('fit_mlp', params, X_train, X_test, y_train, y_test)

In [54]:
def build_models(X_train, X_test, y_train, y_test):
    """Run the KNN, tree and MLP builders and tabulate their results.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The train/test split, passed unchanged to ``build_knn``,
        ``build_tree`` and ``build_mlp`` (in that order).

    Returns
    -------
    pandas.DataFrame
        One row per parameter configuration (row ``i`` holds the ``i``-th
        configuration of every model family) and two-level columns
        ``(model, metric)`` with models ``KNN``, ``TREE``, ``MLP`` and
        metrics ``time``, ``accuracy``, ``params``.
    """
    builders = [('KNN', build_knn), ('TREE', build_tree), ('MLP', build_mlp)]
    metrics = ['time', 'accuracy', 'params']

    # For each model family: a list of (time, accuracy, params) triples.
    # The fitted model returned by the builders is not needed for the table.
    per_model = []
    for _, builder in builders:
        timings, scores, params, _model = builder(X_train, X_test, y_train, y_test)
        per_model.append(list(zip(timings, scores, params)))

    # Derive the row count from what the builders returned instead of
    # hard-coding 3. If the families tried different numbers of
    # configurations, only the rows that all of them share are kept.
    n_rows = min(len(triples) for triples in per_model)

    data = []
    for i in range(n_rows):
        row = []
        for triples in per_model:
            row.extend(triples[i])
        data.append(row)

    idx = pd.MultiIndex.from_product([[label for label, _ in builders], metrics])

    results = pd.DataFrame(data, columns=idx, index=list(range(n_rows)))
    return results

In [55]:
# Wall-clock start time; the final cell subtracts it from time.time() to
# report how long the notebook took to run.
notebook_time = time.time()

## Mushroom Edibility

In [56]:
mushrooms = pd.read_csv('./mushrooms/mushrooms.csv')
# Encode labels: every column (features and target alike) is mapped to
# integer codes independently, because fit_transform refits per column.
mushrooms = mushrooms.apply(LabelEncoder().fit_transform)

mushrooms_X = mushrooms.drop('edibility', axis=1)
mushrooms_y = mushrooms['edibility']

# Fixed seed so that "Restart & Run All" reproduces the same split (and
# therefore the same row order for the cross-validation folds).
mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test = train_test_split(
    mushrooms_X, mushrooms_y, random_state=42
)

mushrooms_results = build_models(mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test)
mushrooms_results

Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.01777,1.0,{'n_neighbors': 1},0.008522,1.0,{'max_depth': None},3.990745,1.0,{}
1,0.017367,0.999881,{'n_neighbors': 5},0.008193,0.992871,{'max_depth': 5},3.901125,1.0,{}
2,0.020114,0.997029,{'n_neighbors': 10},0.03118,1.0,{'max_depth': 20},4.73791,1.0,{}


## Soybeans

In [11]:
soybeans = pd.read_csv('./soybeans/soybean.csv')

# Encode labels: every column (features and target alike) is mapped to
# integer codes independently, because fit_transform refits per column.
soybeans = soybeans.apply(LabelEncoder().fit_transform)

# NOTE(review): rows with missing values are NOT dropped in this cell. The
# earlier attempt filtered on `precip == "?"`, which suggests missing values
# are marked with "?"; after the encoding above such a marker is simply one
# more category. See the accompanying pdf for the intended handling.

soybeans_X = soybeans.drop('class', axis=1)
soybeans_y = soybeans['class']

# Fixed seed so that "Restart & Run All" reproduces the same split (and
# therefore the same row order for the cross-validation folds).
soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test = train_test_split(
    soybeans_X, soybeans_y, random_state=42
)

In [12]:
# Benchmark all three model families on the soybean split; the bare name on
# the last line renders the result table as the cell output.
soybeans_results = build_models(
    X_train=soybeans_X_train,
    X_test=soybeans_X_test,
    y_train=soybeans_y_train,
    y_test=soybeans_y_test,
)
soybeans_results



Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.001877,0.862462,{'n_neighbors': 1},0.00233,0.922477,{'max_depth': None},0.630141,0.904873,{}
1,0.001271,0.822907,{'n_neighbors': 5},0.001837,0.730646,{'max_depth': 5},0.734903,0.900462,{}
2,0.001058,0.783405,{'n_neighbors': 10},0.002408,0.922467,{'max_depth': 20},0.977154,0.894622,{}


## Breast Cancer Data

In [13]:
# The .lrn file carries the 'class' column used for training; the .tes file
# is loaded as well but is not used in the cells shown here (presumably it
# is the unlabeled evaluation set -- TODO confirm).
breastcancer_train = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')
breastcancer_sol_input = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.tes.csv')

# 'ID' is dropped from both frames so it is not used as a feature.
breastcancer_train = breastcancer_train.drop('ID', axis=1)
breastcancer_sol_input = breastcancer_sol_input.drop('ID', axis=1)

breastcancer_X = breastcancer_train.drop('class', axis=1)
breastcancer_y = breastcancer_train['class']

# Fixed seed so that "Restart & Run All" reproduces the same split (and
# therefore the same row order for the cross-validation folds).
breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test = train_test_split(
    breastcancer_X, breastcancer_y, random_state=42
)

In [14]:
# Benchmark all three model families on the breast-cancer split; the bare
# name on the last line renders the result table as the cell output.
breastcancer_results = build_models(
    X_train=breastcancer_X_train,
    X_test=breastcancer_X_test,
    y_train=breastcancer_y_train,
    y_test=breastcancer_y_test,
)
breastcancer_results



Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.001979,0.922807,{'n_neighbors': 1},0.003551,0.905263,{'max_depth': None},0.244823,0.912281,{}
1,0.001002,0.915789,{'n_neighbors': 5},0.003303,0.898246,{'max_depth': 5},0.159274,0.929825,{}
2,0.000999,0.915789,{'n_neighbors': 10},0.003897,0.905263,{'max_depth': 20},0.101698,0.908772,{}


## Purchase Data

In [15]:
# The .lrn file carries the 'class' column used for training; the .tes file
# is loaded as well but is not used in the cells shown here (presumably it
# is the unlabeled evaluation set -- TODO confirm).
purchase_train = pd.read_csv('./purchase/purchase600-100cls-15k.lrn.csv')
purchase_sol_input = pd.read_csv('./purchase/purchase600-100cls-15k.tes.csv')

# labels do not need to be encoded, inputs are numeric

# 'ID' is dropped from both frames so it is not used as a feature.
purchase_train = purchase_train.drop('ID', axis=1)
purchase_sol_input = purchase_sol_input.drop('ID', axis=1)

purchase_X = purchase_train.drop('class', axis=1)
purchase_y = purchase_train['class']

# Fixed seed so that "Restart & Run All" reproduces the same split (and
# therefore the same row order for the cross-validation folds).
purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test = train_test_split(
    purchase_X, purchase_y, random_state=42
)

In [16]:
# Benchmark all three model families on the purchase split; the bare name on
# the last line renders the result table as the cell output.
purchase_results = build_models(
    X_train=purchase_X_train,
    X_test=purchase_X_test,
    y_train=purchase_y_train,
    y_test=purchase_y_test,
)
purchase_results

Unnamed: 0_level_0,KNN,KNN,KNN,TREE,TREE,TREE,MLP,MLP,MLP
Unnamed: 0_level_1,time,accuracy,params,time,accuracy,params,time,accuracy,params
0,0.004428,0.2078,{'n_neighbors': 1},0.805958,0.0954,{'max_depth': None},20.404598,0.6716,{}
1,0.003253,0.2521,{'n_neighbors': 5},0.158642,0.0939,{'max_depth': 5},19.96999,0.682,{}
2,0.003426,0.281,{'n_neighbors': 10},0.762953,0.0966,{'max_depth': 20},20.012546,0.6738,{}


In [17]:
# Total wall-clock time since `notebook_time` was recorded.
elapsed_seconds = time.time() - notebook_time
print(f'notebook took this long in seconds: {elapsed_seconds}')

notebook took this long in seconds: 483.1763606071472
