# Part - 1: Generating h<sup>*</sup>(D) model

## Step - 1: Importing respective libraries and meta-dataset.csv

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./meta-dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7,label
0,0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0,1
1,1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0,1
2,2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0,1
3,3,0.085299,0.132486,0.328494,0.453721,0.0,0.0,0.0,0.0,1
4,4,0.026059,0.262215,0.288274,0.423453,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
221,221,0.000000,0.335975,0.664025,0.000000,0.0,0.0,0.0,0.0,1
222,222,0.000000,0.322368,0.677632,0.000000,0.0,0.0,0.0,0.0,1
223,223,0.062405,0.138508,0.512938,0.286149,0.0,0.0,0.0,0.0,1
224,224,0.001678,0.318792,0.677852,0.001678,0.0,0.0,0.0,0.0,1


In [3]:
column_headers = df.columns.values.tolist()
column_headers

for column in column_headers: 
    print(f"{column} = {df[column].corr(df['label'])}")

Unnamed: 0 = 0.034580032509154354
beta_0 = 0.09342538075102236
beta_1 = -0.12233882935886431
beta_2 = -0.17387561852802072
beta_3 = 0.07023988688013583
beta_4 = 0.04900796623224159
beta_5 = 0.0023708454277309174
beta_6 = 0.0054813855785419115
beta_7 = 0.1342352819275331
label = 1.0


In [4]:
# features = df.iloc[:, [2,3,8]]
features = df.iloc[:, 1:-1]
labels = df.iloc[:, [-1]]
scaler = StandardScaler()

In [5]:
features

Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7
0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0
1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0
2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0
3,0.085299,0.132486,0.328494,0.453721,0.0,0.0,0.0,0.0
4,0.026059,0.262215,0.288274,0.423453,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
221,0.000000,0.335975,0.664025,0.000000,0.0,0.0,0.0,0.0
222,0.000000,0.322368,0.677632,0.000000,0.0,0.0,0.0,0.0
223,0.062405,0.138508,0.512938,0.286149,0.0,0.0,0.0,0.0
224,0.001678,0.318792,0.677852,0.001678,0.0,0.0,0.0,0.0


In [6]:
features_std = scaler.fit_transform(features)
features_std

array([[ 1.21829768, -0.3165277 , -0.12547885, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [ 0.83158988,  0.37134376, -0.16543095, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [ 0.24924362, -0.10666632,  0.7737579 , ..., -0.31748475,
        -0.14503491, -0.21312473],
       ...,
       [ 0.14387595, -0.28374088,  0.90336384, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [-1.06545201,  1.35767216,  1.62631896, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [-1.09886505,  1.59307727,  1.52768379, ..., -0.31748475,
        -0.14503491, -0.21312473]])

In [7]:
labels 

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
221,1
222,1
223,1
224,1


In [8]:
lable_count = labels.value_counts()
lable_count

label
1        148
2         58
0         20
dtype: int64

## Step-2: Splitting the dataset to training data and testing data respectively

In [9]:
features_train, features_test, labels_train, labels_test = train_test_split(features_std, labels, test_size = 0.35)
print(f'features_train = \n{features_train}')
print(f'features_test = \n{features_test}')
print(f'labels_train = \n{labels_train}')
print(f'labels_test = \n{labels_test}')

features_train = 
[[ 1.13614231 -1.2118509   0.33529539 ... -0.31748475 -0.14503491
  -0.21312473]
 [ 1.32527128 -0.1361286  -0.24805806 ... -0.31748475 -0.14503491
  -0.21312473]
 [-1.09886505  1.07515501  0.39956681 ... -0.31748475 -0.14503491
  -0.21312473]
 ...
 [-1.07628663 -0.39898742  1.40332228 ... -0.31748475 -0.14503491
  -0.21312473]
 [ 1.53851979 -0.79336988  0.02626097 ... -0.31748475 -0.14503491
  -0.21312473]
 [-1.09886505 -0.02389797  2.02536136 ... -0.31748475 -0.14503491
  -0.21312473]]
features_test = 
[[-6.92254074e-02 -1.96766896e-01 -8.91937578e-01  1.56632415e+00
  -3.67731411e-01 -3.17484754e-01 -1.45034912e-01 -2.13124728e-01]
 [-1.09886505e+00  2.62318126e+00  1.03169578e+00 -1.26108279e+00
  -3.67731411e-01 -3.17484754e-01 -1.45034912e-01 -2.13124728e-01]
 [ 1.62213971e+00 -1.21671233e+00 -3.51330514e-01  1.17588028e+00
  -3.67731411e-01 -3.17484754e-01 -1.45034912e-01 -2.13124728e-01]
 [-1.09886505e+00  3.93037075e-01  2.10549491e+00 -1.26108279e+00
  -3.677

## Step-3: Instantiating K-Fold Cross-Validation Algorithm

In [10]:
# RepeatedStratifiedKFold_inner = KFold(n_splits=10)
# RepeatedStratifiedKFold_outer = KFold(n_splits=10)
# RepeatedStratifiedKFold_inner = KFold(n_splits=10,shuffle=True)
# RepeatedStratifiedKFold_outer = KFold(n_splits=5,shuffle=True)

#5,
KFold_param = KFold(n_splits=5, shuffle=True)
# RepeatedStratifiedKFold = KFold(n_splits=5, shuffle=True)

## Step-4: Performing Tradional Grid Search on both the Decision Tree model and the K-Nearest-Neighbour model
We perform grid first search on the decision tree model and the K-nearest-neighbour model seperately, as we first find the most optimal variation of each of the models seperately with its respective parameters and then the compare both. The best model is then chosen as our h<sup>*</sup>(D)

### Step-4.1: Initializing Decision-Tree model and selecting most relavant hyperparametes

The following hyperparameters were choosen to be used in the for the decision tree model:
1. Criterion: 
1. Splitter:
1. Max-Depth: The depth will detemine how well the model behaves on unseen data.
1. Min-Samples-Split: 
1. Class-weight: We have choosen the value of the class-weight parameter to be None because we assume there is no noise in the dataset. The reason for this assumption is because the it in mentioned in the project proposal that the data-preprpcessing stages for the pipeline have already been conducted for us.

In [11]:
decision_tree = DecisionTreeClassifier()
criterion_values = ['entropy','gini','log_loss']
splitter_values = ['best', 'random']
max_depth_values = range(5,30)
min_samples_split_values = range(2,4)
min_samples_leaf_values = range(1,3)
max_features_values = range(2,8)
decision_tree_parameters = {'criterion':criterion_values, 'splitter':splitter_values, 'max_depth':max_depth_values, 
                            'min_samples_split':min_samples_split_values, 'class_weight':['balanced'], 'min_samples_leaf':min_samples_leaf_values,
                            'max_features':max_features_values} # Need to verify if the class-weight parameter is important because there is no noise in the dataset


### Step-4.2: Performing Grid Search to find the optimal set of hyperparameters for the decision tree model

In [12]:
decision_tree_classifier = GridSearchCV(decision_tree, decision_tree_parameters, cv=KFold_param)
for _ in range(20):
    decision_tree_classifier.fit(features_train,labels_train)

In [13]:
decision_tree_classifier.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 29,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [14]:
labels_predict = decision_tree_classifier.predict(features_test)
print(classification_report(labels_test, labels_predict))

              precision    recall  f1-score   support

           0       0.18      0.29      0.22         7
           1       0.77      0.83      0.80        52
           2       0.23      0.14      0.18        21

    accuracy                           0.60        80
   macro avg       0.39      0.42      0.40        80
weighted avg       0.58      0.60      0.58        80



### Step-4.3: Performing Grid Search to find the optimal set of hyperparameters for the K-Nearest-Neighbour model

In [15]:
knn = KNeighborsClassifier()
knn_neighbours_range = range(4,113)
knn_parameters = {'n_neighbors': knn_neighbours_range, 'weights':('uniform', 'distance'), 'p':[2], # p=2 for euclidian
                    'metric':['minkowski']}

In [16]:
knn_classifier = GridSearchCV(knn, knn_parameters, cv=KFold_param, refit=True)
knn_classifier.fit(features_train, labels_train)

NameError: name 'kfold_cv' is not defined

In [None]:
labels_predict = knn_classifier.predict(features_test)
print(accuracy_score(labels_test, labels_predict))
print("\n The best parameters across ALL searched params:\n",knn_classifier.best_params_)