# 1. Installing packages

In [87]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# 2. Extract meta-data from csv file

## meta-dataset csv:

In [88]:
df = pd.read_csv('./meta-dataset.csv')
print(df)

     Unnamed: 0    beta_0    beta_1    beta_2    beta_3  beta_4  beta_5  \
0             0  0.116358  0.134907  0.278246  0.470489     0.0     0.0   
1             1  0.096939  0.210459  0.269133  0.423469     0.0     0.0   
2             2  0.067696  0.157957  0.483373  0.290974     0.0     0.0   
3             3  0.085299  0.132486  0.328494  0.453721     0.0     0.0   
4             4  0.026059  0.262215  0.288274  0.423453     0.0     0.0   
..          ...       ...       ...       ...       ...     ...     ...   
221         221  0.000000  0.335975  0.664025  0.000000     0.0     0.0   
222         222  0.000000  0.322368  0.677632  0.000000     0.0     0.0   
223         223  0.062405  0.138508  0.512938  0.286149     0.0     0.0   
224         224  0.001678  0.318792  0.677852  0.001678     0.0     0.0   
225         225  0.000000  0.344648  0.655352  0.000000     0.0     0.0   

     beta_6  beta_7  label  
0       0.0     0.0      1  
1       0.0     0.0      1  
2       0.0 

## Split csv data into features and labels

In [70]:
# features = df.iloc[:, 1:-1].values
# labels = df.iloc[:, [-1]].values.reshape(-1, 1)
features = df.iloc[:, 1:-1]
labels = df.iloc[:, [-1]]
scaler = StandardScaler()

In [71]:
features

Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7
0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0
1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0
2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0
3,0.085299,0.132486,0.328494,0.453721,0.0,0.0,0.0,0.0
4,0.026059,0.262215,0.288274,0.423453,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
221,0.000000,0.335975,0.664025,0.000000,0.0,0.0,0.0,0.0
222,0.000000,0.322368,0.677632,0.000000,0.0,0.0,0.0,0.0
223,0.062405,0.138508,0.512938,0.286149,0.0,0.0,0.0,0.0
224,0.001678,0.318792,0.677852,0.001678,0.0,0.0,0.0,0.0


In [72]:
features_std = scaler.fit_transform(features)
features_std

array([[ 1.21829768, -0.3165277 , -0.12547885, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [ 0.83158988,  0.37134376, -0.16543095, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [ 0.24924362, -0.10666632,  0.7737579 , ..., -0.31748475,
        -0.14503491, -0.21312473],
       ...,
       [ 0.14387595, -0.28374088,  0.90336384, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [-1.06545201,  1.35767216,  1.62631896, ..., -0.31748475,
        -0.14503491, -0.21312473],
       [-1.09886505,  1.59307727,  1.52768379, ..., -0.31748475,
        -0.14503491, -0.21312473]])

In [73]:
labels

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
221,1
222,1
223,1
224,1


In [74]:
lable_count = labels.value_counts()
lable_count

label
1        148
2         58
0         20
dtype: int64

In [75]:
# counter = Counter(labels)
# for k, v in counter.items():
#     dist = v / len(labels.values.reshape(-1, 1)) * 100
#     print(f"Class={k}, n={v} ({dist}%)")

# 3. Split features and Labels into train and test datasets

In [76]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.35, random_state=42)

## 4. Train with Decision Tree Classifier

### Initial f1_score accuracy before balancing data

In [77]:
# decision_tree = DecisionTreeClassifier()
# # decision_tree.fit(features_train, labels_train)
# scores = cross_validate(decision_tree, features_train, labels_train, cv=5, n_jobs=-1, scoring="f1_macro")
# scores

### Oversampling minority class Undersampling majority class

In [78]:
oversample = SMOTE()
undersample = RandomUnderSampler()
steps = [("o", oversample), ("u", undersample)]
pipeline = Pipeline(steps=steps)
# Transform the dataset
features_train, labels_train = pipeline.fit_resample(features_train, labels_train)

In [79]:
### Test again

In [80]:
decision_tree = DecisionTreeClassifier()
# decision_tree.fit(features_train, labels_train)
# scores = cross_validate(decision_tree, features_train, labels_train, cv=5, n_jobs=-1, scoring="f1_macro")
# scores

In [81]:
# labels_predict = decision_tree.predict(features_test)
# accuracy_score(labels_test, labels_predict)

In [82]:
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(5,30),
    
}

In [83]:
grid = GridSearchCV(decision_tree, param_grid=param_dist, cv=5, n_jobs=-1)
grid.fit(features_train, labels_train)

In [84]:
grid.best_estimator_

In [85]:
grid.best_score_

0.7542937853107345

# 5. Train with Knn classifier

In [89]:
knn = KNeighborsClassifier()
knn_neighbours_range = range(4,113)
knn_parameters = {'n_neighbors': knn_neighbours_range, 'weights':('uniform', 'distance'), 'p':[2], # p=2 for euclidian
                    'metric':['minkowski']}
knn_classifier = GridSearchCV(knn, knn_parameters, cv=5, n_jobs=-1, refit=True)
knn_classifier.fit(features_train, labels_train)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2287643669.py, line 5)