<a href="https://colab.research.google.com/github/lblum95/AML/blob/master/task3/GradientBoostClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GradientBoostingClassifier

## Connect to My Drive

In [None]:
# Colab-only setup: mount Google Drive, then make "My Drive" the working
# directory so that relative paths are resolved against it.
import os
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/My Drive')

## Import libraries

In [None]:
#general
import pandas as pd
import numpy as np

#sklearn
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import  HistGradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

## Prepare data

### Import and combine data

In [None]:
# Load both feature sets (train and test) and the training labels. The first
# CSV column becomes the row index of every table.
feature_csv_opts = dict(index_col=0, header=0, low_memory=False)
x_train = pd.read_csv("data/X_train_features.csv", **feature_csv_opts)
x_train_yanick = pd.read_csv("data/X_train_features_yanick.csv", **feature_csv_opts)
y_train = pd.read_csv("data/y_train.csv", index_col=0, header=0)
x_test = pd.read_csv("data/X_test_features.csv", **feature_csv_opts)
x_test_yanick = pd.read_csv("data/X_test_features_yanick.csv", **feature_csv_opts)

# These are aliases, not copies: the column assignments below therefore also
# modify x_train and x_test.
normal_features = x_train
tescht = x_test

# Append the last 202 columns of each "yanick" table to the matching base
# table.
extra_train_cols = x_train_yanick.columns[-202:]
normal_features[extra_train_cols] = x_train_yanick[extra_train_cols]
extra_test_cols = x_test_yanick.columns[-202:]
tescht[extra_test_cols] = x_test_yanick[extra_test_cols]

### Preprocess data

In [None]:
# Clean the combined feature tables, then impute missing values and keep only
# the features chosen by a random-forest based selector.
#
# NOTE: `tescht` is rebound to a NumPy array further down, so this cell cannot
# be re-run on its own; re-run the data loading cell first.

# Columns removed from both the training and the test features.
dropped_columns = ['HRV_ULF','HRV_VLF','HRV_LF','HRV_LFHF','HRV_LFn']
norm = normal_features.drop(dropped_columns, axis=1)
tescht = tescht.drop(dropped_columns, axis=1)

# Replace +inf / -inf by the finite stand-ins 10 / -10 (the imputer below only
# fills missing values, it does not handle infinities).
norm = norm.replace(np.inf, 10).replace(-np.inf, -10)
tescht = tescht.replace(np.inf, 10).replace(-np.inf, -10)

y_tr = y_train
selected = norm

#impute and select
imputer = SimpleImputer(strategy='median')
# A fixed seed makes the selected feature subset (and every score computed
# from it) reproducible between runs.
feature_sel = SelectFromModel(RandomForestClassifier(random_state=42))

# Fit on the training features only; apply the fitted transform to the test set.
selected = imputer.fit_transform(selected)
tescht = imputer.transform(tescht)

# np.ravel hands the forest 1-D labels instead of a single-column DataFrame.
# NOTE(review): the selector is fit on the full training set, so later
# cross-validation on `selected` is not independent of the feature selection.
selected = feature_sel.fit_transform(selected, np.ravel(y_tr))
tescht = feature_sel.transform(tescht)

## Model

### Create model

In [None]:
def get_model():
    """Build a new, unfitted soft-voting ensemble.

    The ensemble combines three HistGradientBoostingClassifier variants with
    two RandomForestClassifier pipelines, each preceded by a median imputer.

    Returns:
        VotingClassifier: unfitted classifier with ``voting='soft'`` and
        ``n_jobs=-1``.
    """
    # Gradient-boosting members; they differ in regularisation, leaf settings
    # and seed.
    hgb_l2 = HistGradientBoostingClassifier(
        l2_regularization=3,
        max_leaf_nodes=None,
        min_samples_leaf=20,
        scoring='f1_micro',
        random_state=2,
    )
    hgb_full_steam = HistGradientBoostingClassifier(
        min_samples_leaf=20,
        scoring='f1_micro',
        random_state=1,
    )
    hgb_default = HistGradientBoostingClassifier(
        scoring='f1_micro',
        random_state=0,
    )

    # Random-forest members. The step names 'scaler' and 'svc' are kept as they
    # are, although the steps are a median imputer and a random forest.
    forest_large = Pipeline([
        ('scaler', SimpleImputer(strategy='median')),
        ('svc', RandomForestClassifier(n_estimators=500, min_samples_leaf=2, random_state=36)),
    ])
    forest_small = Pipeline([
        ('scaler', SimpleImputer(strategy='median')),
        ('svc', RandomForestClassifier(n_estimators=100, random_state=42)),
    ])

    members = [
        ('l2', hgb_l2),
        ('full_steam', hgb_full_steam),
        ('20', hgb_default),
        ('rfc', forest_large),
        ('rf_feat', forest_small),
    ]
    return VotingClassifier(
        estimators=members,
        voting='soft',
        flatten_transform=True,
        n_jobs=-1,
    )

### Train model

In [None]:
# 5-fold stratified cross-validation of the ensemble on the selected features.
# For every fold a fresh model is fitted and its confusion matrix and
# micro-averaged F1 score on the held-out part are printed.
skf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Flatten the labels to a 1-D array so the fold indices select rows by
# position. Indexing a DataFrame with `y_tr[train_index]` would look the
# integers up as column labels and raise a KeyError.
y_all = np.ravel(y_tr)

for train_index, test_index in skf1.split(selected, y_all):
    X_train, X_test = selected[train_index], selected[test_index]
    Y_train, Y_test = y_all[train_index], y_all[test_index]

    model = get_model()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    print(confusion_matrix(Y_test, Y_pred))
    print('Score')
    print(f1_score(Y_test, Y_pred, average='micro'))
    print()
    print()

[[568   4  33   1]
 [  9  60  19   1]
 [ 78   5 210   2]
 [ 13   2   5  14]]
Score
0.83203125


[[572   1  31   2]
 [  7  69  12   1]
 [ 72   8 211   4]
 [  7   0   5  22]]
Score
0.853515625






[[575   4  26   1]
 [  6  58  20   4]
 [ 81   9 201   4]
 [ 10   1   4  19]]
Score
0.833822091886608


[[570   0  35   1]
 [  3  65  19   1]
 [ 76  13 206   0]
 [ 12   2   3  17]]
Score
0.8387096774193549






[[557   3  39   7]
 [  3  68  18   0]
 [ 70   9 213   2]
 [ 17   0   2  15]]
Score
0.833822091886608




### Train final model

In [None]:
# Fit a fresh ensemble on all training rows and predict the test set.
model = get_model()
# np.ravel passes the labels as a 1-D array instead of a single-column table,
# which is the shape the classifier expects for y.
model.fit(selected, np.ravel(y_tr))
y_pred = model.predict(tescht)

## Save predictions

In [None]:
# Write the test-set predictions to a CSV file with the columns "id" (row
# position of the prediction) and "y" (predicted class).
df = pd.DataFrame(data=y_pred)
df.to_csv(
    'Jannik_All_features.csv',
    index_label='id',
    header=['y'],
)