# Generation of H'(D)

In [1]:
from scipy.stats import ttest_ind
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from mlxtend.evaluate import paired_ttest_5x2cv
import numpy as np
import pandas as pd
import pickle

In this section we will be using the meta features that were generated for the dataset D and the model we selected back in part 1 to derive select us a model for the respective dataset D. We will then analyse the performance of the hyperparameter tuned model and the derived model through means of t-testing.

In [2]:
dataset_meta_features =np.array([0.33185840707964603, 0.01327433628318584, 0.0, 0.12389380530973451, 0.5309734513274337, 0.0, 0.0, 0.0])

In [3]:
dataset_meta_features 

array([0.33185841, 0.01327434, 0.        , 0.12389381, 0.53097345,
       0.        , 0.        , 0.        ])

In [4]:
loaded_model = pickle.load(open('h_star_d.sav', 'rb'))

In [5]:
h_star_d = loaded_model

In [6]:
h_prime_d = h_star_d.predict([dataset_meta_features])
h_prime_d

array([1])

As you can see from the above model the, predicted label is 1. This implies that the model selected for this particular dataset is a K-Nearest Neighbour algorithm.

## Performing evaluation test for the h'(D) model = KNN()

# Importing Meta-Learning Dataset 

In [7]:
dataset = pd.read_csv('./meta-dataset.csv')
dataset = dataset.iloc[:,1:]
dataset

Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7,label
0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0,1
1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0,1
2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0,1
3,0.085299,0.132486,0.328494,0.453721,0.0,0.0,0.0,0.0,1
4,0.026059,0.262215,0.288274,0.423453,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
221,0.000000,0.335975,0.664025,0.000000,0.0,0.0,0.0,0.0,1
222,0.000000,0.322368,0.677632,0.000000,0.0,0.0,0.0,0.0,1
223,0.062405,0.138508,0.512938,0.286149,0.0,0.0,0.0,0.0,1
224,0.001678,0.318792,0.677852,0.001678,0.0,0.0,0.0,0.0,1


In [8]:
features = dataset.iloc[:,:-1]
features

Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7
0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0
1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0
2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0
3,0.085299,0.132486,0.328494,0.453721,0.0,0.0,0.0,0.0
4,0.026059,0.262215,0.288274,0.423453,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
221,0.000000,0.335975,0.664025,0.000000,0.0,0.0,0.0,0.0
222,0.000000,0.322368,0.677632,0.000000,0.0,0.0,0.0,0.0
223,0.062405,0.138508,0.512938,0.286149,0.0,0.0,0.0,0.0
224,0.001678,0.318792,0.677852,0.001678,0.0,0.0,0.0,0.0


In [9]:
labels = dataset.iloc[:,-1]
labels

0      1
1      1
2      1
3      1
4      1
      ..
221    1
222    1
223    1
224    1
225    1
Name: label, Length: 226, dtype: int64

In [19]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [20]:
h_prime_d = KNeighborsClassifier()
h_prime_d.fit(x_train, y_train)

In [21]:
y_pred = h_prime_d.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.69      0.94      0.79        31
           2       0.00      0.00      0.00        10

    accuracy                           0.63        46
   macro avg       0.23      0.31      0.26        46
weighted avg       0.47      0.63      0.54        46



### Performing t-testing on the selected model and the predicted model, that h_star_d and h_prime_d

To compare the selected and predicted model, we apply t-testing that compares the mean of the two models. A value of alpha = 0.05 implies that the null hypothesis is rejected 5 % of the time when it is in fact true. Therefore, if the p metric evaluated from t-testing between the two models is less than 0.05 then it is considered significant (the models are not similar) else it is considered insignificant (the models are similar).

In our case, we were able to get p as 0.617 which is larger than alpha thus confirming that the models are similar.

We also noted that the manually tuned model (KNN) is superior based on the 'accuracy' and 'weighted avg' comparison. However, t-testing has shown us that the difference between the manually tuned and predicted model is insignificant. We believe that the dataset being too small is a huge factor in this experiment.

In [24]:
t, p = paired_ttest_5x2cv(estimator1=h_star_d,estimator2=h_prime_d,X=features,
                          y=labels)
alpha = 0.05

print('t statistic: %.3f' % t)
print('aplha ', alpha)
print('p value: %.3f' % p)

if p > alpha:
    print("Fail to reject null hypotesis")
else:
    print("Reject null hypotesis")

t statistic: 0.533
aplha  0.05
p value: 0.617
Fail to reject null hypotesis
