## Imports

In [186]:
# !pip install -U imbalanced-learn
# !pip install xgboost

# General
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ML
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb

# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.data_fetching as data_fetching
import Components.MultiSample as MultiSample

# CAREFUL:
# If you change a custom module, you have to reload it, i.e. rerun this cell.
import importlib
importlib.reload(data_fetching)
importlib.reload(MultiSample)

<module 'Components.MultiSample' from '../Components/MultiSample.py'>

## Data Preprocessing

### Data Import

In [138]:
# Fetch the training features together with their labels.
X, y_raw = data_fetching.get_train_data()
# Flatten the labels into a 1-D array.
y = np.ravel(y_raw)
# Fetch the test features that the final prediction cell consumes.
x_test = data_fetching.get_test_data()

## Model

### Pipeline Setup

In [86]:
# Two-stage pipeline: standardise the features, then hand them to the
# MultiDownSyndrome classifier. The step names ('scaling', 'classification')
# are the prefixes used by the grid-search parameter names.
pipeline_steps = [
    ('scaling', StandardScaler()),
    ('classification', MultiSample.MultiDownSyndrome(verbose=True)),
]
pipe = Pipeline(pipeline_steps)

### CV Score Test

In [179]:
# Cross-validated balanced accuracy of the tuned XGB ensemble.
#
# The scaler is a pipeline step, so it is re-fitted on the training part of
# every fold; fitting it on all of X beforehand leaks statistics of the
# validation folds into training. X itself is deliberately NOT overwritten:
# later cells pass X through `pipe`, which has its own scaler, and re-running
# this cell must not change its input.
multi = MultiSample.MultiDownSyndrome(clf_type='xgb',max_depth=5,learning_rate=0.1,n_estimators=200,n_clf=10,verbose=True)
cv_pipe = Pipeline([('scaling', StandardScaler()),
                    ('classification', multi)])
cv_score = cross_val_score(cv_pipe, X, y, cv=10, scoring='balanced_accuracy')
print(cv_score)
print(np.mean(cv_score))

Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Predicting with  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,
Training  10 estimators
1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,

In [190]:
# Hold-out sanity check: confusion matrix of the ensemble on a random split
# (train_test_split's default test share). The split is seeded so the matrix
# is reproducible, and every name used here is defined in this cell, so it
# runs on a fresh kernel.
x_train, x_holdout, y_train, y_holdout = train_test_split(X, y, random_state=0)

# The scaler is a pipeline step so that it is fitted on the training part only.
# NOTE: the pipeline does not clone its steps, so this fits `multi` in place.
holdout_pipe = Pipeline([('scaling', StandardScaler()),
                         ('classification', multi)])
holdout_pipe.fit(x_train, y_train)
y_holdout_pred = holdout_pipe.predict(x_holdout)
confusion_matrix(y_holdout, y_holdout_pred)

array([[ 99,  15,  26],
       [164, 660,  87],
       [ 29,   7, 113]])

#### Results of CV Score Test

Avg: **0.664286331** 
```python
multi = MultiSample.MultiDownSyndrome(clf_type='xgb',n_clf=5,verbose=True)
cv_score = cross_val_score(multi, X, y, cv=10, scoring='balanced_accuracy')
# cv_score = [0.70543071, 0.63651397, 0.70547798, 0.64895006, 0.66499338, 0.69224811, 0.6446569, 0.66709336, 0.63348891, 0.64400993]
```

Avg: **0.6546216998246621**
```python
multi = MultiSample.MultiDownSyndrome(clf_type='xgb',n_clf=10,verbose=True)
cv_score = cross_val_score(multi, X, y, cv=10, scoring='balanced_accuracy')
# cv_score = [0.6889318  0.62538747 0.69853016 0.64994673 0.65943782 0.68943121 0.61976481 0.6437276  0.63873324 0.63232614]
```

Voting strategy comparison (CV score):

- Average voting: 0.6639131742464246
- Most-common voting: 0.6806365835931366
- Most-common voting with better parameters: 0.6856723397392932

```python
multi = MultiSample.MultiDownSyndrome(clf_type='xgb',max_depth=5,learning_rate=0.1,n_estimators=200,n_clf=10,verbose=True)
```




### GridSearch

In [136]:
from collections import Counter
from random import randint
# Toy input for the voting helper: one inner list of class votes per sample.
lst = [[1,0,2,2,1],[1,1,1,2,1],[0,0,1,2,0]]

def most_common(lst):
    """Return the majority label of every sub-list, breaking ties at random.

    Parameters
    ----------
    lst : iterable of iterables of hashable
        One inner sequence of class votes per sample.

    Returns
    -------
    list
        One label per inner sequence, in input order. When several labels
        share the highest count, one of them is drawn uniformly at random.

    Raises
    ------
    ValueError
        If an inner sequence is empty, since there is no label to pick.
    """
    winners = []
    for position, votes in enumerate(lst):
        # Rank once per sample so the sort is not repeated for every label.
        ranked = Counter(votes).most_common()
        if not ranked:
            raise ValueError(
                f"most_common: sub-list at index {position} is empty, no label to pick"
            )
        top_count = ranked[0][1]
        # All labels tied for first place, in most_common() order.
        tied = [label for label, count in ranked if count == top_count]
        # With a unique winner this is randint(0, 0) == 0, i.e. deterministic.
        winners.append(tied[randint(0, len(tied) - 1)])
    return winners

# The first sample has two 1s and two 2s, so its winner is drawn at random and
# the printed result can differ between runs.
print(most_common(lst))

[2, 1, 0]


In [133]:
# Candidate values for the 'classification' step of `pipe`.
clf_type = ['xgb']
n_clf = [10]
max_depth = [3,5,7]
learning_rate = [0.1,0.05]
n_estimators_model = [100,200]

# The `classification__` prefix routes each entry to the pipeline step of that
# name. 1 * 1 * 3 * 2 * 2 = 12 candidates, each evaluated with 10-fold CV.
parameters = dict(classification__n_clf=n_clf,
                  classification__clf_type=clf_type,
                  classification__max_depth=max_depth,
                  classification__learning_rate=learning_rate,
                  classification__n_estimators=n_estimators_model)

clf = GridSearchCV(pipe, parameters, cv=10, n_jobs=-1, verbose=10, scoring='balanced_accuracy')
clf.fit(X, y)

# View The Best Parameters
print(clf.best_params_)
print(clf.best_score_)

# Best combination noted from an earlier run:
#   max_depth = 5, learning_rate = 0.1, n_estimators = 200
#   best_score ~ 0.68

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Collect the grid-search results into a DataFrame, one row per candidate.
results = pd.DataFrame(clf.cv_results_)

# Lift pandas' display limits so the parameter dicts are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

results[["params", "mean_test_score"]]  # .query('mean_test_score >= 0.69')

## Final prediction

In [None]:
# Balanced accuracy on the same data the grid search was fitted on, so this is
# an optimistic figure rather than a generalisation estimate.
train_pred = clf.predict(X)
print("Train Score:", balanced_accuracy_score(y, train_pred))

# Predict the test set and plot a histogram of the predicted classes.
y_pred = clf.predict(x_test)
plt.hist(y_pred)

# Single-column frame ("y") consumed by the export cell.
y_pred_pd = pd.DataFrame(columns=["y"], data=y_pred)


In [None]:
# Write the predictions to CSV; the index column is labelled 'id'.
submission_path = '../../P/XGB_MultiDown.csv'
y_pred_pd.to_csv(submission_path, index_label='id')