# Model selection

In [20]:
# We would need these libraries to manage our dataset
# Numpy: used for large, multi-dimensional arrays and matrices, and for high-level mathematical functions
# Pandas: used for data manipulation and analysis
# matplotlib: used for visualisation and plotting graph/image/etc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
RANDOM_SEED = 42

In [21]:
# Import the iris dataset from sklearn
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
from sklearn.datasets import load_iris
# load the dataset
iris = load_iris()

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [24]:
from sklearn.model_selection import train_test_split
np.random.seed(RANDOM_SEED)
X = iris.data[:, :2] # .reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, iris.target)
print(X_train.shape, X_test.shape, np.unique(y_train, return_counts=True))

(112, 2) (38, 2) (array([0, 1, 2]), array([35, 39, 38]))


In [25]:
X_training, X_vali, y_training, y_vali = train_test_split(X_train, y_train)

In [26]:
rf_clf = RandomForestClassifier().fit(X_train, y_train)
mlp_clf = MLPClassifier().fit(X_train, y_train)
svc_clf = SVC().fit(X_train, y_train)
knn_clf = KNeighborsClassifier().fit(X_train, y_train)



In [27]:
rf_res = rf_clf.predict(X_vali)
mlp_res = mlp_clf.predict(X_vali)
svc_res = svc_clf.predict(X_vali)
knn_res = knn_clf.predict(X_vali)

In [28]:
from sklearn.metrics import accuracy_score
print('RF ', accuracy_score(rf_res, y_vali))
print('MLP', accuracy_score(mlp_res, y_vali))
print('SVC', accuracy_score(svc_res, y_vali))
print('KNN', accuracy_score(knn_res, y_vali))

RF  0.9285714285714286
MLP 0.7857142857142857
SVC 0.7857142857142857
KNN 0.8214285714285714


In [29]:
test_res = rf_clf.predict(X_test)
print('RF ', accuracy_score(test_res, y_test))

RF  0.7631578947368421


## Hyperparameter selection

In [30]:
from sklearn.model_selection import GridSearchCV
rf_grid = RandomForestClassifier()
parameters = {'n_estimators': [2, 5, 10, 15, 25, 30, 50], 'min_samples_split': range(2,7)}
grid = GridSearchCV(rf_grid, parameters)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)

{'min_samples_split': 6, 'n_estimators': 25} 0.7675889328063241
RF  0.7894736842105263


In [31]:
grid_res = grid.predict(X_test)
print('RF ', accuracy_score(grid_res, y_test))

RF  0.7894736842105263


## Cross-validation

In [40]:
from sklearn.model_selection import cross_validate
rf_cv = RandomForestClassifier(n_estimators=25, min_samples_split=6)
cv_res = cross_validate(rf_cv, X_train, y_train, cv=4)
print('RF ', cv_res['test_score'])

RF  [0.78571429 0.75       0.71428571 0.67857143]


In [62]:
from sklearn.model_selection import LeavePOut
rf_lpo = RandomForestClassifier(n_estimators=25, min_samples_split=6)
lpo = LeavePOut(1)
n_split = lpo.get_n_splits(X_train)
print(n_split)
lpo_res = []
j = 0
for train_idx, test_idx in lpo.split(X_train):
    print(j, end='\r')
    j = j+1
    rf_lpo.fit(X_train[train_idx], y_train[train_idx])
    lpo_res.append(accuracy_score(y_train[test_idx], rf_lpo.predict(X_train[test_idx]]
np.unique(lpo_res, return_counts=True), np.mean(lpo_res)

112
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111] [0]
[  0   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111] [1]
[  0   1   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  2

((array([0, 1, 2]), array([33, 35, 44])), 1.0982142857142858)

  np.array([i for i in lpo.split(X_train)]).shape


(6216, 2)