In [20]:
!pip install imblearn




In [21]:
# General
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ML

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier



# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.Outlier_Detection as Outlier_Detection
import Components.Feature_Selection as Feature_Selection
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching
import Components.Data_Augmentation as Data_Augmentation
import Components.wrapper as wrapper
import Components.two_step_utils as two_step_utils




# CAREFUL:
# If you make changes to a custom module, you have to reload it, i.e rerun this cell
import importlib
importlib.reload(Data_Augmentation)
importlib.reload(Outlier_Detection)
importlib.reload(Feature_Selection)
importlib.reload(Normalisation)
importlib.reload(data_fetching)
importlib.reload(wrapper)
importlib.reload(two_step_utils)

<module 'Components.two_step_utils' from '../Components/two_step_utils.py'>

### Data Imports

In [22]:
X, y = data_fetching.get_train_data()
x_test = data_fetching.get_test_data()



### Normalisation

In [23]:
X = Normalisation.gaussian(X)
x_test = Normalisation.gaussian(x_test)

# TASK 1

In [24]:
# get X_1 y_1 etc:
X_1, y_1, X_2, y_2 = two_step_utils.transform(X,y)

### Pipeline setup

In [25]:
# we need: -scaling transformer -weighted loss function -multi-class model 
pipe = Pipeline([
                 ('classification',
                  SVC(C=1.0,kernel='rbf',gamma='scale',shrinking=True,cache_size=1000,class_weight='balanced'))
                ])
pipe_2 = Pipeline([('classification',
                   RandomForestClassifier())])

In [34]:
gamma_range = np.logspace(-5, -3, 3)
parameters = [{'classification__kernel': ['rbf'], 'classification__gamma': gamma_range,'classification__C': [0.1,0.5,1.0,5.0]},
              {'classification__kernel': ['linear'], 'classification__C': np.logspace(-5,-3,3)}]

param_grid = {
    'classification__bootstrap': [True,False],
    'classification__max_depth': [80, 100, 120],
    'classification__max_features': [2, 3],
    'classification__min_samples_leaf': [3, 4, 5],
    'classification__min_samples_split': [8, 10, 12],
    'classification__n_estimators': [100, 500, 1000, 1500]
}

clf = GridSearchCV(pipe_2, param_grid,cv=10,n_jobs=-1,scoring='balanced_accuracy', verbose=10)




In [35]:
clf.fit(X_1, y_1)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classification',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'classification__bootstrap': [True, False],
                         'classification__max_depth': [80, 100, 120],
                         'classification__max_features': [2, 3],
                         'classification__min_samples_leaf': [3, 4, 5],
                         'classification__min_samples_split': [8, 10, 12],
                         'classification__n_estimators': [100, 500, 1000,
                                                          1500]},
             scoring='balanced_accuracy', verbose=10)

In [36]:
print(clf.best_params_)
print(clf.best_score_)

{'classification__bootstrap': False, 'classification__max_depth': 120, 'classification__max_features': 3, 'classification__min_samples_leaf': 4, 'classification__min_samples_split': 8, 'classification__n_estimators': 100}
0.7308333333333333


In [37]:
results = pd.DataFrame(clf.cv_results_)
pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth",200) 
results[["params","mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 8, 'classificati...",0.713056
1,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 8, 'classificati...",0.715
2,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 8, 'classificati...",0.712917
3,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 8, 'classificati...",0.712083
4,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 10, 'classificat...",0.717361
5,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 10, 'classificat...",0.719861
6,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 10, 'classificat...",0.7125
7,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 10, 'classificat...",0.714167
8,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 12, 'classificat...",0.707222
9,"{'classification__bootstrap': True, 'classification__max_depth': 80, 'classification__max_features': 2, 'classification__min_samples_leaf': 3, 'classification__min_samples_split': 12, 'classificat...",0.709583


# TASK 2

In [None]:
clf.fit(X_1, y_1)

In [None]:
print(clf.best_params_)
print(clf.best_score_)