## Load the data

In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# load the data
dataset1 = pd.read_csv('data_electrical_grid.csv')
dataset1 = dataset1.values
print(dataset1.shape)

dataset2 = pd.read_csv('data_credit_card.csv', skiprows=1)
dataset2 = dataset2.values
dataset2 = np.delete(dataset2, 0, 1)
print(dataset2.shape)

dataset3 = pd.read_csv('data_telescope.csv')
dataset3 = dataset3.values
print(dataset3.shape)

np.random.shuffle(dataset1)
np.random.shuffle(dataset2)
np.random.shuffle(dataset3)


X_data1 = dataset1[:, : 13]
Y_data1 = dataset1[:, 13]

X_data2 = dataset2[:, : 23]
Y_data2 = dataset2[:, 23]

X_data3 = dataset3[:, : 10]
Y_data3 = dataset3[:, 10]

# ---- convert the class names into categorical labels.
le = LabelEncoder()
le.fit(dataset1[:, 13])
Y_data1 = le.transform(Y_data1).reshape(len(Y_data1), 1)

le = LabelEncoder()
le.fit(dataset2[:, 23])
Y_data2 = le.transform(Y_data2).reshape(len(Y_data2), 1)


le = LabelEncoder()
le.fit(dataset3[:, 10])



(10000, 14)
(30000, 24)
(19019, 11)


LabelEncoder()

## 1. Dataset 1; 80% training and 20% testing

In [20]:
train_acc_tree_1 = 0
val_acc_tree_1 = 0
test_acc_tree_1 = 0

train_acc_rf_1 = 0
val_acc_rf_1 = 0
test_acc_rf_1 = 0

train_acc_knn_1 = 0
val_acc_knn_1 = 0
test_acc_knn_1 = 0

for i in range(3):
    # Split the set: 80% training, 20% testing
    X_train_data1_1 = X_data1[:int(0.8*len(X_data1))] # Get features from train + val set.
    X_test_data1_1  = X_data1[int(0.8*len(X_data1)):] # Get features from test set.     
    Y_train_data1_1 = Y_data1[:int(0.8*len(Y_data1))] # Get labels from train + val set.
    Y_test_data1_1  = Y_data1[int(0.8*len(Y_data1)):] # Get labels from test set.
    
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data1_1, Y_train_data1_1)

    train_acc_tree_1 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_1 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_1 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_1)
    classifier_tree_2.fit(X_train_data1_1, Y_train_data1_1)
    test_acc_tree_1 += classifier_tree_2.score(X_test_data1_1, Y_test_data1_1)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data1_1, Y_train_data1_1)
    
    train_acc_rf_1 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_1 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_1 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_1, random_state=0)
    classifier_rf_2.fit(X_train_data1_1, Y_train_data1_1)
    test_acc_rf_1 += classifier_rf_2.score(X_test_data1_1, Y_test_data1_1)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data1_1, Y_train_data1_1)
    
    train_acc_knn_1 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_1 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_1 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_1)
    classifier_knn_2.fit(X_train_data1_1, Y_train_data1_1)
    test_acc_knn_1 += classifier_knn_2.score(X_test_data1_1, Y_test_data1_1)
    
train_acc_tree_1 = train_acc_tree_1 / 3
val_acc_tree_1 = val_acc_tree_1 / 3
test_acc_tree_1 = test_acc_tree_1 / 3
print('average tree training accuracy: ','\n', train_acc_tree_1, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_1,'\n', 
      'average tree testing accuracy: ', test_acc_tree_1)

train_acc_rf_1 = train_acc_rf_1 / 3
val_acc_rf_1 = val_acc_rf_1 / 3
test_acc_rf_1 = test_acc_rf_1 / 3
print('average rf training accuracy: ','\n', train_acc_rf_1, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_1,'\n', 
      'average rf testing accuracy: ', test_acc_rf_1)

train_acc_knn_1 = train_acc_knn_1 / 3
val_acc_knn_1 = val_acc_knn_1 / 3
test_acc_knn_1 = test_acc_knn_1 / 3
print('average knn training accuracy: ','\n', train_acc_knn_1, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_1,'\n', 
      'average knn testing accuracy: ', test_acc_knn_1)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

average tree training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average tree validation accuracy:  
 [[0.999875]
 [0.999875]
 [0.999875]
 [0.999875]
 [0.999875]] 
 average tree testing accuracy:  1.0
average rf training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average rf validation accuracy:  
 [[1.      ]
 [1.      ]
 [0.999875]
 [0.999875]
 [0.999875]] 
 average rf testing accuracy:  1.0
average knn training accuracy:  
 [[1.        ]
 [0.86068765]
 [0.83662498]
 [0.82662499]
 [0.81824991]] 
 average knn validation accuracy:  
 [[0.746125]
 [0.7565  ]
 [0.78225 ]
 [0.790875]
 [0.79025 ]] 
 average knn testing accuracy:  0.786


## 2. Dataset 1; 50% training and 50% testing

In [22]:
train_acc_tree_2 = 0
val_acc_tree_2 = 0
test_acc_tree_2 = 0

train_acc_rf_2 = 0
val_acc_rf_2 = 0
test_acc_rf_2 = 0

train_acc_knn_2 = 0
val_acc_knn_2 = 0
test_acc_knn_2 = 0

for i in range(3):
    # Split the set: 50% training, 50% testing
    X_train_data1_2 = X_data1[:int(0.5*len(X_data1))] # Get features from train + val set.
    X_test_data1_2  = X_data1[int(0.5*len(X_data1)):] # Get features from test set.     
    Y_train_data1_2 = Y_data1[:int(0.5*len(Y_data1))] # Get labels from train + val set.
    Y_test_data1_2  = Y_data1[int(0.5*len(Y_data1)):] # Get labels from test set.
    
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data1_2, Y_train_data1_2)

    train_acc_tree_2 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_2 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_2 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_2)
    classifier_tree_2.fit(X_train_data1_2, Y_train_data1_2)
    test_acc_tree_2 += classifier_tree_2.score(X_test_data1_2, Y_test_data1_2)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data1_2, Y_train_data1_2)
    
    train_acc_rf_2 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_2 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_2 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_2, random_state=0)
    classifier_rf_2.fit(X_train_data1_2, Y_train_data1_2)
    test_acc_rf_2 += classifier_rf_2.score(X_test_data1_2, Y_test_data1_2)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data1_2, Y_train_data1_2)
    
    train_acc_knn_2 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_2 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_2 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_2)
    classifier_knn_2.fit(X_train_data1_2, Y_train_data1_2)
    test_acc_knn_2 += classifier_knn_2.score(X_test_data1_2, Y_test_data1_2)
    
train_acc_tree_2 = train_acc_tree_2 / 3
val_acc_tree_2 = val_acc_tree_2 / 3
test_acc_tree_2 = test_acc_tree_2 / 3
print('average tree training accuracy: ','\n', train_acc_tree_2, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_2,'\n', 
      'average tree testing accuracy: ', test_acc_tree_2)

train_acc_rf_2 = train_acc_rf_2 / 3
val_acc_rf_2 = val_acc_rf_2 / 3
test_acc_rf_2 = test_acc_rf_2 / 3
print('average rf training accuracy: ','\n', train_acc_rf_2, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_2,'\n', 
      'average rf testing accuracy: ', test_acc_rf_2)

train_acc_knn_2 = train_acc_knn_2 / 3
val_acc_knn_2 = val_acc_knn_2 / 3
test_acc_knn_2 = test_acc_knn_2 / 3
print('average knn training accuracy: ','\n', train_acc_knn_2, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_2,'\n', 
      'average knn testing accuracy: ', test_acc_knn_2)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

average tree training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average tree validation accuracy:  
 [[0.9998]
 [0.9998]
 [0.9998]
 [0.9998]
 [0.9998]] 
 average tree testing accuracy:  1.0
average rf training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average rf validation accuracy:  
 [[0.9996]
 [0.9998]
 [0.9996]
 [0.9996]
 [0.9996]] 
 average rf testing accuracy:  1.0
average knn training accuracy:  
 [[1.        ]
 [0.85610081]
 [0.83179895]
 [0.82289941]
 [0.81799895]] 
 average knn validation accuracy:  
 [[0.7374]
 [0.7534]
 [0.782 ]
 [0.7858]
 [0.7888]] 
 average knn testing accuracy:  0.7878


## 3. Dataset 1; 20% training and 80% testing

In [23]:
train_acc_tree_3 = 0
val_acc_tree_3 = 0
test_acc_tree_3 = 0

train_acc_rf_3 = 0
val_acc_rf_3 = 0
test_acc_rf_3 = 0

train_acc_knn_3 = 0
val_acc_knn_3 = 0
test_acc_knn_3 = 0

for i in range(3):
    # Split the set: 20% training, 80% testing
    X_train_data1_3 = X_data1[:int(0.2*len(X_data1))] # Get features from train + val set.
    X_test_data1_3  = X_data1[int(0.2*len(X_data1)):] # Get features from test set.     
    Y_train_data1_3 = Y_data1[:int(0.2*len(Y_data1))] # Get labels from train + val set.
    Y_test_data1_3  = Y_data1[int(0.2*len(Y_data1)):] # Get labels from test set.
    
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data1_3, Y_train_data1_3)

    train_acc_tree_3 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_3 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_3 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_3)
    classifier_tree_2.fit(X_train_data1_3, Y_train_data1_3)
    test_acc_tree_3 += classifier_tree_2.score(X_test_data1_3, Y_test_data1_3)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data1_3, Y_train_data1_3)
    
    train_acc_rf_3 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_3 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_3 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_3, random_state=0)
    classifier_rf_2.fit(X_train_data1_3, Y_train_data1_3)
    test_acc_rf_3 += classifier_rf_2.score(X_test_data1_3, Y_test_data1_3)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data1_3, Y_train_data1_3)
    
    train_acc_knn_3 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_3 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_3 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_3)
    classifier_knn_2.fit(X_train_data1_3, Y_train_data1_3)
    test_acc_knn_3 += classifier_knn_2.score(X_test_data1_3, Y_test_data1_3)
    
train_acc_tree_3 = train_acc_tree_3 / 3
val_acc_tree_3 = val_acc_tree_3 / 3
test_acc_tree_3 = test_acc_tree_3 / 3
print('average tree training accuracy: ','\n', train_acc_tree_3, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_3,'\n', 
      'average tree testing accuracy: ', test_acc_tree_3)

train_acc_rf_3 = train_acc_rf_3 / 3
val_acc_rf_3 = val_acc_rf_3 / 3
test_acc_rf_3 = test_acc_rf_3 / 3
print('average rf training accuracy: ','\n', train_acc_rf_3, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_3,'\n', 
      'average rf testing accuracy: ', test_acc_rf_3)

train_acc_knn_3 = train_acc_knn_3 / 3
val_acc_knn_3 = val_acc_knn_3 / 3
test_acc_knn_3 = test_acc_knn_3 / 3
print('average knn training accuracy: ','\n', train_acc_knn_3, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_3,'\n', 
      'average knn testing accuracy: ', test_acc_knn_3)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


average tree training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average tree validation accuracy:  
 [[0.999]
 [0.999]
 [0.999]
 [0.999]
 [0.999]] 
 average tree testing accuracy:  0.99975
average rf training accuracy:  
 [[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 
 average rf validation accuracy:  
 [[0.9995]
 [0.9995]
 [0.9995]
 [0.9995]
 [0.9995]] 
 average rf testing accuracy:  0.999625
average knn training accuracy:  
 [[1.        ]
 [0.8474978 ]
 [0.82674391]
 [0.8119949 ]
 [0.81249766]] 
 average knn validation accuracy:  
 [[0.73  ]
 [0.755 ]
 [0.7705]
 [0.778 ]
 [0.7835]] 
 average knn testing accuracy:  0.7825000000000001


## 4. Dataset 2; 80% training and 20% testing

In [29]:
train_acc_tree_4 = 0
val_acc_tree_4 = 0
test_acc_tree_4 = 0

train_acc_rf_4 = 0
val_acc_rf_4 = 0
test_acc_rf_4 = 0

train_acc_knn_4 = 0
val_acc_knn_4 = 0
test_acc_knn_4 = 0

for i in range(3):
    # Split the set: 80% training, 20% testing
    X_train_data2_1 = X_data2[:int(0.8*len(X_data2))] # Get features from train + val set.
    X_test_data2_1  = X_data2[int(0.8*len(X_data2)):] # Get features from test set.     
    Y_train_data2_1 = Y_data2[:int(0.8*len(Y_data2))] # Get labels from train + val set.
    Y_test_data2_1  = Y_data2[int(0.8*len(Y_data2)):] # Get labels from test set.
    
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data2_1, Y_train_data2_1)

    train_acc_tree_4 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_4 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_4 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_4)
    classifier_tree_2.fit(X_train_data2_1, Y_train_data2_1)
    test_acc_tree_4 += classifier_tree_2.score(X_test_data2_1, Y_test_data2_1)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data2_1, Y_train_data2_1)
    
    train_acc_rf_4 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_4 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_4 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_4, random_state=0)
    classifier_rf_2.fit(X_train_data2_1, Y_train_data2_1)
    test_acc_rf_4 += classifier_rf_2.score(X_test_data2_1, Y_test_data2_1)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data2_1, Y_train_data2_1)
    
    train_acc_knn_4 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_4 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_4 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_4)
    classifier_knn_2.fit(X_train_data2_1, Y_train_data2_1)
    test_acc_knn_4 += classifier_knn_2.score(X_test_data2_1, Y_test_data2_1)
    
train_acc_tree_4 = train_acc_tree_4 / 3
val_acc_tree_4 = val_acc_tree_4 / 3
test_acc_tree_4 = test_acc_tree_4 / 3
print('average tree training accuracy: ','\n', train_acc_tree_4, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_4,'\n', 
      'average tree testing accuracy: ', test_acc_tree_4)
        
train_acc_rf_4 = train_acc_rf_4 / 3
val_acc_rf_4 = val_acc_rf_4 / 3
test_acc_rf_4 = test_acc_rf_4 / 3
print('average rf training accuracy: ','\n', train_acc_rf_4, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_4,'\n', 
      'average rf testing accuracy: ', test_acc_rf_4)

train_acc_knn_4 = train_acc_knn_4 / 3
val_acc_knn_4 = val_acc_knn_4 / 3
test_acc_knn_4 = test_acc_knn_4 / 3
print('average knn training accuracy: ','\n', train_acc_knn_4, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_4,'\n', 
      'average knn testing accuracy: ', test_acc_knn_4)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

average tree training accuracy:  
 [[0.86117361]
 [0.87127083]
 [0.882125  ]
 [0.89314583]
 [0.90360417]] 
 average tree validation accuracy:  
 [[0.801625  ]
 [0.79669444]
 [0.79108333]
 [0.7875    ]
 [0.77934722]] 
 average tree testing accuracy:  0.810888888888889
average rf training accuracy:  
 [[0.87539583]
 [0.88583333]
 [0.89427083]
 [0.90264583]
 [0.91054167]] 
 average rf validation accuracy:  
 [[0.820125  ]
 [0.81875   ]
 [0.81991667]
 [0.818875  ]
 [0.818125  ]] 
 average rf testing accuracy:  0.8178333333333333
average knn training accuracy:  
 [[0.999625  ]
 [0.81535417]
 [0.796875  ]
 [0.79177083]
 [0.78804167]] 
 average knn validation accuracy:  
 [[0.6945    ]
 [0.76625   ]
 [0.77      ]
 [0.772375  ]
 [0.77345833]] 
 average knn testing accuracy:  0.7796666666666666


## 5. Dataset 2; 50% training and 50% testing

In [30]:
train_acc_tree_5 = 0
val_acc_tree_5 = 0
test_acc_tree_5 = 0

train_acc_rf_5 = 0
val_acc_rf_5 = 0
test_acc_rf_5 = 0

train_acc_knn_5 = 0
val_acc_knn_5 = 0
test_acc_knn_5 = 0

for i in range(3):
    # Split the set: 50% training, 50% testing
    X_train_data2_2 = X_data2[:int(0.5*len(X_data2))] # Get features from train + val set.
    X_test_data2_2  = X_data2[int(0.5*len(X_data2)):] # Get features from test set.     
    Y_train_data2_2 = Y_data2[:int(0.5*len(Y_data2))] # Get labels from train + val set.
    Y_test_data2_2  = Y_data2[int(0.5*len(Y_data2)):] # Get labels from test set.
        
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data2_2, Y_train_data2_2)

    train_acc_tree_5 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_5 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_5 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_5)
    classifier_tree_2.fit(X_train_data2_2, Y_train_data2_2)
    test_acc_tree_5 += classifier_tree_2.score(X_test_data2_2, Y_test_data2_2)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data2_2, Y_train_data2_2)
    
    train_acc_rf_5 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_5 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_5 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_5, random_state=0)
    classifier_rf_2.fit(X_train_data2_2, Y_train_data2_2)
    test_acc_rf_5 += classifier_rf_2.score(X_test_data2_2, Y_test_data2_2)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data2_2, Y_train_data2_2)
    
    train_acc_knn_5 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_5 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_5 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_5)
    classifier_knn_2.fit(X_train_data2_2, Y_train_data2_2)
    test_acc_knn_5 += classifier_knn_2.score(X_test_data2_2, Y_test_data2_2)
    
train_acc_tree_5 = train_acc_tree_5 / 3
val_acc_tree_5 = val_acc_tree_5 / 3
test_acc_tree_5 = test_acc_tree_5 / 3
print('average tree training accuracy: ','\n', train_acc_tree_5, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_5,'\n', 
      'average tree testing accuracy: ', test_acc_tree_5)
        
train_acc_rf_5 = train_acc_rf_5 / 3
val_acc_rf_5 = val_acc_rf_5 / 3
test_acc_rf_5 = test_acc_rf_5 / 3
print('average rf training accuracy: ','\n', train_acc_rf_5, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_5,'\n', 
      'average rf testing accuracy: ', test_acc_rf_5)

train_acc_knn_5 = train_acc_knn_5 / 3
val_acc_knn_5 = val_acc_knn_5 / 3
test_acc_knn_5 = test_acc_knn_5 / 3
print('average knn training accuracy: ','\n', train_acc_knn_5, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_5,'\n', 
      'average knn testing accuracy: ', test_acc_knn_5)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

average tree training accuracy:  
 [[0.86994444]
 [0.88082222]
 [0.89181111]
 [0.902     ]
 [0.9131    ]] 
 average tree validation accuracy:  
 [[0.79635556]
 [0.79217778]
 [0.78686667]
 [0.78155556]
 [0.77568889]] 
 average tree testing accuracy:  0.801
average rf training accuracy:  
 [[0.88576667]
 [0.89513333]
 [0.9039    ]
 [0.91236667]
 [0.9231    ]] 
 average rf validation accuracy:  
 [[0.81686667]
 [0.8176    ]
 [0.81693333]
 [0.8166    ]
 [0.81466667]] 
 average rf testing accuracy:  0.8191333333333334
average knn training accuracy:  
 [[0.9998    ]
 [0.815     ]
 [0.79773333]
 [0.7919    ]
 [0.78776667]] 
 average knn validation accuracy:  
 [[0.69606667]
 [0.76453333]
 [0.76873333]
 [0.77306667]
 [0.776     ]] 
 average knn testing accuracy:  0.7760666666666666


## 6. Dataset 2; 20% training and 80% testing

In [31]:
train_acc_tree_6 = 0
val_acc_tree_6 = 0
test_acc_tree_6 = 0

train_acc_rf_6 = 0
val_acc_rf_6 = 0
test_acc_rf_6 = 0

train_acc_knn_6 = 0
val_acc_knn_6 = 0
test_acc_knn_6 = 0

for i in range(3):
    # Split the set: 20% training, 80% testing
    X_train_data2_3 = X_data2[:int(0.2*len(X_data2))] # Get features from train + val set.
    X_test_data2_3  = X_data2[int(0.2*len(X_data2)):] # Get features from test set.     
    Y_train_data2_3 = Y_data2[:int(0.2*len(Y_data2))] # Get labels from train + val set.
    Y_test_data2_3  = Y_data2[int(0.2*len(Y_data2)):] # Get labels from test set.
        
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data2_3, Y_train_data2_3)

    train_acc_tree_6 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_6 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_6 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_6)
    classifier_tree_2.fit(X_train_data2_3, Y_train_data2_3)
    test_acc_tree_6 += classifier_tree_2.score(X_test_data2_3, Y_test_data2_3)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data2_3, Y_train_data2_3)
    
    train_acc_rf_6 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_6 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_6 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_6, random_state=0)
    classifier_rf_2.fit(X_train_data2_3, Y_train_data2_3)
    test_acc_rf_6 += classifier_rf_2.score(X_test_data2_3, Y_test_data2_3)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data2_3, Y_train_data2_3)
    
    train_acc_knn_6 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_6 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_6 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_6)
    classifier_knn_2.fit(X_train_data2_3, Y_train_data2_3)
    test_acc_knn_6 += classifier_knn_2.score(X_test_data2_3, Y_test_data2_3)
    
train_acc_tree_6 = train_acc_tree_6 / 3
val_acc_tree_6 = val_acc_tree_6 / 3
test_acc_tree_6 = test_acc_tree_6 / 3
print('average tree training accuracy: ','\n', train_acc_tree_6, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_6,'\n', 
      'average tree testing accuracy: ', test_acc_tree_6)
        
train_acc_rf_6 = train_acc_rf_6 / 3
val_acc_rf_6 = val_acc_rf_6 / 3
test_acc_rf_6 = test_acc_rf_6 / 3
print('average rf training accuracy: ','\n', train_acc_rf_6, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_6,'\n', 
      'average rf testing accuracy: ', test_acc_rf_6)

train_acc_knn_6 = train_acc_knn_6 / 3
val_acc_knn_6 = val_acc_knn_6 / 3
test_acc_knn_6 = test_acc_knn_6 / 3
print('average knn training accuracy: ','\n', train_acc_knn_6, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_6,'\n', 
      'average knn testing accuracy: ', test_acc_knn_6)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_p

average tree training accuracy:  
 [[0.89544444]
 [0.90988889]
 [0.92369444]
 [0.93480556]
 [0.94758333]] 
 average tree validation accuracy:  
 [[0.76805556]
 [0.764     ]
 [0.75988889]
 [0.74994444]
 [0.74405556]] 
 average tree testing accuracy:  0.7915972222222222
average rf training accuracy:  
 [[0.90408333]
 [0.91216667]
 [0.9245    ]
 [0.93758333]
 [0.95158333]] 
 average rf validation accuracy:  
 [[0.81333333]
 [0.8125    ]
 [0.81116667]
 [0.812     ]
 [0.81116667]] 
 average rf testing accuracy:  0.8180416666666667
average knn training accuracy:  
 [[0.99975   ]
 [0.80658333]
 [0.78916667]
 [0.78133333]
 [0.77758333]] 
 average knn validation accuracy:  
 [[0.67766667]
 [0.75866667]
 [0.76116667]
 [0.76433333]
 [0.7655    ]] 
 average knn testing accuracy:  0.7729583333333333


## 7. Dataset 3; 80% training and 20% testing

In [36]:
train_acc_tree_7 = 0
val_acc_tree_7 = 0
test_acc_tree_7 = 0

train_acc_rf_7 = 0
val_acc_rf_7 = 0
test_acc_rf_7 = 0

train_acc_knn_7 = 0
val_acc_knn_7 = 0
test_acc_knn_7 = 0

for i in range(3):
    # Split the set: 80% training, 20% testing
    X_train_data3_1 = X_data3[:int(0.8*len(X_data3))] # Get features from train + val set.
    X_test_data3_1  = X_data3[int(0.8*len(X_data3)):] # Get features from test set.     
    Y_train_data3_1 = Y_data3[:int(0.8*len(Y_data3))] # Get labels from train + val set.
    Y_test_data3_1  = Y_data3[int(0.8*len(Y_data3)):] # Get labels from test set.
        
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data3_1, Y_train_data3_1)

    train_acc_tree_7 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_7 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_7 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_7)
    classifier_tree_2.fit(X_train_data3_1, Y_train_data3_1)
    test_acc_tree_7 += classifier_tree_2.score(X_test_data3_1, Y_test_data3_1)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data3_1, Y_train_data3_1)
    
    train_acc_rf_7 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_7 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_7 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_7, random_state=0)
    classifier_rf_2.fit(X_train_data3_1, Y_train_data3_1)
    test_acc_rf_7 += classifier_rf_2.score(X_test_data3_1, Y_test_data3_1)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data3_1, Y_train_data3_1)
    
    train_acc_knn_7 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_7 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_7 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_7)
    classifier_knn_2.fit(X_train_data3_1, Y_train_data3_1)
    test_acc_knn_7 += classifier_knn_2.score(X_test_data3_1, Y_test_data3_1)
    
train_acc_tree_7 = train_acc_tree_7 / 3
val_acc_tree_7 = val_acc_tree_7 / 3
test_acc_tree_7 = test_acc_tree_7 / 3
print('average tree training accuracy: ','\n', train_acc_tree_7, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_7,'\n', 
      'average tree testing accuracy: ', test_acc_tree_7)
        
train_acc_rf_7 = train_acc_rf_7 / 3
val_acc_rf_7 = val_acc_rf_7 / 3
test_acc_rf_7 = test_acc_rf_7 / 3
print('average rf training accuracy: ','\n', train_acc_rf_7, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_7,'\n', 
      'average rf testing accuracy: ', test_acc_rf_7)

train_acc_knn_7 = train_acc_knn_7 / 3
val_acc_knn_7 = val_acc_knn_7 / 3
test_acc_knn_7 = test_acc_knn_7 / 3
print('average knn training accuracy: ','\n', train_acc_knn_7, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_7,'\n', 
      'average knn testing accuracy: ', test_acc_knn_7)



average tree training accuracy:  
 [[0.92521634]
 [0.93840504]
 [0.95181289]
 [0.96279987]
 [0.97165081]] 
 average tree validation accuracy:  
 [[0.83288422]
 [0.82845876]
 [0.82725381]
 [0.82249973]
 [0.82039654]] 
 average tree testing accuracy:  0.8559411146161935
average rf training accuracy:  
 [[0.92651987]
 [0.9388104 ]
 [0.94857047]
 [0.957279  ]
 [0.96500163]] 
 average rf validation accuracy:  
 [[0.86644758]
 [0.86960237]
 [0.86960237]
 [0.87163983]
 [0.87098258]] 
 average rf testing accuracy:  0.8748685594111462
average knn training accuracy:  
 [[1.        ]
 [0.844857  ]
 [0.82997042]
 [0.82178764]
 [0.8170884 ]] 
 average knn validation accuracy:  
 [[0.77075255]
 [0.79487348]
 [0.80269471]
 [0.80308906]
 [0.80335196]] 
 average knn testing accuracy:  0.818086225026288


## 8. Dataset 3; 50% training and 50% testing

In [38]:
train_acc_tree_8 = 0
val_acc_tree_8 = 0
test_acc_tree_8 = 0

train_acc_rf_8 = 0
val_acc_rf_8 = 0
test_acc_rf_8 = 0

train_acc_knn_8 = 0
val_acc_knn_8 = 0
test_acc_knn_8 = 0

for i in range(3):
    # Split the set: 50% training, 50% testing
    X_train_data3_2 = X_data3[:int(0.5*len(X_data3))] # Get features from train + val set.
    X_test_data3_2  = X_data3[int(0.5*len(X_data3)):] # Get features from test set.     
    Y_train_data3_2 = Y_data3[:int(0.5*len(Y_data3))] # Get labels from train + val set.
    Y_test_data3_2  = Y_data3[int(0.5*len(Y_data3)):] # Get labels from test set.
        
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data3_2, Y_train_data3_2)

    train_acc_tree_8 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_8 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_8 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_8)
    classifier_tree_2.fit(X_train_data3_2, Y_train_data3_2)
    test_acc_tree_8 += classifier_tree_2.score(X_test_data3_2, Y_test_data3_2)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data3_2, Y_train_data3_2)
    
    train_acc_rf_8 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_8 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_8 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_8, random_state=0)
    classifier_rf_2.fit(X_train_data3_2, Y_train_data3_2)
    test_acc_rf_8 += classifier_rf_2.score(X_test_data3_2, Y_test_data3_2)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data3_2, Y_train_data3_2)
    
    train_acc_knn_8 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_8 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_8 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_8)
    classifier_knn_2.fit(X_train_data3_2, Y_train_data3_2)
    test_acc_knn_8 += classifier_knn_2.score(X_test_data3_2, Y_test_data3_2)
    
train_acc_tree_8 = train_acc_tree_8 / 3
val_acc_tree_8 = val_acc_tree_8 / 3
test_acc_tree_8 = test_acc_tree_8 / 3
print('average tree training accuracy: ','\n', train_acc_tree_8, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_8,'\n', 
      'average tree testing accuracy: ', test_acc_tree_8)
        
train_acc_rf_8 = train_acc_rf_8 / 3
val_acc_rf_8 = val_acc_rf_8 / 3
test_acc_rf_8 = test_acc_rf_8 / 3
print('average rf training accuracy: ','\n', train_acc_rf_8, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_8,'\n', 
      'average rf testing accuracy: ', test_acc_rf_8)

train_acc_knn_8 = train_acc_knn_8 / 3
val_acc_knn_8 = val_acc_knn_8 / 3
test_acc_knn_8 = test_acc_knn_8 / 3
print('average knn training accuracy: ','\n', train_acc_knn_8, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_8,'\n', 
      'average knn testing accuracy: ', test_acc_knn_8)



average tree training accuracy:  
 [[0.94107347]
 [0.95374577]
 [0.96596214]
 [0.97537422]
 [0.98206966]] 
 average tree validation accuracy:  
 [[0.83012585]
 [0.83065166]
 [0.82662039]
 [0.8208364 ]
 [0.82293967]] 
 average tree testing accuracy:  0.8344199088678583
average rf training accuracy:  
 [[0.93632358]
 [0.94820695]
 [0.95772422]
 [0.96576928]
 [0.97434015]] 
 average rf validation accuracy:  
 [[0.8682301 ]
 [0.86949206]
 [0.87117468]
 [0.87338311]
 [0.87590704]] 
 average rf testing accuracy:  0.8687346652646338
average knn training accuracy:  
 [[1.        ]
 [0.8447789 ]
 [0.82742673]
 [0.81864561]
 [0.81412365]] 
 average knn validation accuracy:  
 [[0.76653696]
 [0.78672836]
 [0.79713955]
 [0.79934799]
 [0.79756021]] 
 average knn testing accuracy:  0.8017875920084122


## 9. Dataset 3; 20% training and 80% testing

In [39]:
train_acc_tree_9 = 0
val_acc_tree_9 = 0
test_acc_tree_9 = 0

train_acc_rf_9 = 0
val_acc_rf_9 = 0
test_acc_rf_9 = 0

train_acc_knn_9 = 0
val_acc_knn_9 = 0
test_acc_knn_9 = 0

for i in range(3):
    # Split the set: 20% training, 80% testing
    X_train_data3_3 = X_data3[:int(0.2*len(X_data3))] # Get features from train + val set.
    X_test_data3_3  = X_data3[int(0.2*len(X_data3)):] # Get features from test set.     
    Y_train_data3_3 = Y_data3[:int(0.2*len(Y_data3))] # Get labels from train + val set.
    Y_test_data3_3  = Y_data3[int(0.2*len(Y_data3)):] # Get labels from test set.
        
    # Decision Tree Classifier
    classifier_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    depth_list = [11, 12, 13, 14, 15]
    parameters_tree = {'max_depth': depth_list}
    clf_tree = GridSearchCV(classifier_tree, parameters_tree, cv = 3)
    clf_tree.fit(X_train_data3_3, Y_train_data3_3)

    train_acc_tree_9 += clf_tree.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_tree_9 += clf_tree.cv_results_['mean_test_score'].reshape(5, 1)


    opt_depth_9 = depth_list[clf_tree.best_index_]
    classifier_tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = opt_depth_9)
    classifier_tree_2.fit(X_train_data3_3, Y_train_data3_3)
    test_acc_tree_9 += classifier_tree_2.score(X_test_data3_3, Y_test_data3_3)
    
    # Random Forest Classifier
    classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_depth_list = [11, 12, 13, 14, 15]
    parameters_rf = {'max_depth': rf_depth_list}
    clf_rf = GridSearchCV(classifier_rf, parameters_rf, cv = 3)
    clf_rf.fit(X_train_data3_3, Y_train_data3_3)
    
    train_acc_rf_9 += clf_rf.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_rf_9 += clf_rf.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_rf_depth_9 = rf_depth_list[clf_tree.best_index_]
    classifier_rf_2 = RandomForestClassifier(n_estimators=100, max_depth = opt_rf_depth_9, random_state=0)
    classifier_rf_2.fit(X_train_data3_3, Y_train_data3_3)
    test_acc_rf_9 += classifier_rf_2.score(X_test_data3_3, Y_test_data3_3)
    
    # K-Nearest Neighborhood Classifier
    classifier_knn = KNeighborsClassifier()
    k_list = [1, 4, 8, 12, 16]
    parameters_knn = {'n_neighbors': k_list}
    clf_knn = GridSearchCV(classifier_knn, parameters_knn, cv = 3)
    clf_knn.fit(X_train_data3_3, Y_train_data3_3)
    
    train_acc_knn_9 += clf_knn.cv_results_['mean_train_score'].reshape(5, 1)
    val_acc_knn_9 += clf_knn.cv_results_['mean_test_score'].reshape(5, 1)
    
    opt_k_9 = k_list[clf_knn.best_index_]
    classifier_knn_2 = KNeighborsClassifier(n_neighbors=opt_k_9)
    classifier_knn_2.fit(X_train_data3_3, Y_train_data3_3)
    test_acc_knn_9 += classifier_knn_2.score(X_test_data3_3, Y_test_data3_3)
    
train_acc_tree_9 = train_acc_tree_9 / 3
val_acc_tree_9 = val_acc_tree_9 / 3
test_acc_tree_9 = test_acc_tree_9 / 3
print('average tree training accuracy: ','\n', train_acc_tree_9, '\n', 
      'average tree validation accuracy: ','\n', val_acc_tree_9,'\n', 
      'average tree testing accuracy: ', test_acc_tree_9)
        
train_acc_rf_9 = train_acc_rf_9 / 3
val_acc_rf_9 = val_acc_rf_9 / 3
test_acc_rf_9 = test_acc_rf_9 / 3
print('average rf training accuracy: ','\n', train_acc_rf_9, '\n', 
      'average rf validation accuracy: ','\n', val_acc_rf_9,'\n', 
      'average rf testing accuracy: ', test_acc_rf_9)

train_acc_knn_9 = train_acc_knn_9 / 3
val_acc_knn_9 = val_acc_knn_9 / 3
test_acc_knn_9 = test_acc_knn_9 / 3
print('average knn training accuracy: ','\n', train_acc_knn_9, '\n', 
      'average knn validation accuracy: ','\n', val_acc_knn_9,'\n', 
      'average knn testing accuracy: ', test_acc_knn_9)



average tree training accuracy:  
 [[0.95897951]
 [0.96923384]
 [0.9777798 ]
 [0.98382786]
 [0.98939374]] 
 average tree validation accuracy:  
 [[0.81532124]
 [0.81312999]
 [0.81041283]
 [0.80865983]
 [0.80883513]] 
 average tree testing accuracy:  0.8240886785839466
average rf training accuracy:  
 [[0.95950509]
 [0.96778762]
 [0.97633447]
 [0.98422288]
 [0.99171665]] 
 average rf validation accuracy:  
 [[0.86011044]
 [0.86195109]
 [0.86247699]
 [0.86195109]
 [0.86195109]] 
 average rf testing accuracy:  0.8610672975814931
average knn training accuracy:  
 [[1.        ]
 [0.83250174]
 [0.80712707]
 [0.79831889]
 [0.7930596 ]] 
 average knn validation accuracy:  
 [[0.75729687]
 [0.77964765]
 [0.7793847 ]
 [0.78122535]
 [0.78069945]] 
 average knn testing accuracy:  0.7908123028391167
