# Notebook to Implement Model Training - LGBM

---

### 1) Setup

In [1]:
# Imports
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import balanced_accuracy_score, make_scorer, f1_score, recall_score,precision_score
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, cross_validate

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Random seed passed to the classifier and the cross-validation splitter below.
SEED = 42

# "onlynormalized" dataset: CSV files resolved relative to the working directory.
TRAIN_CLINICAL_FILENAME = "train_set_clinical.csv"
TEST_CLINICAL_FILENAME = "test_set_clinical.csv"

# "artigo" dataset: directory holding the CSV files. The default is the original
# author's local Windows folder; set the ARTIGO_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
ARTIGO_DATA_DIR = os.environ.get("ARTIGO_DATA_DIR", "E:\\work\\MAC5832_prognostico_covid")

TRAIN_CLINICAL_ARTIGO_FILENAME = os.path.join(ARTIGO_DATA_DIR, "train_set_clinical.csv")
TEST_CLINICAL_ARTIGO_FILENAME = os.path.join(ARTIGO_DATA_DIR, "test_set_clinical.csv")

---

### 2) Read and Preprocess Data

In [3]:
# (train path, test path) for every dataset variant used in this notebook.
_CSV_SOURCES = {
    'onlynormalized': (TRAIN_CLINICAL_FILENAME, TEST_CLINICAL_FILENAME),
    'artigo': (TRAIN_CLINICAL_ARTIGO_FILENAME, TEST_CLINICAL_ARTIGO_FILENAME),
}

# datasets[<variant>][<'train' | 'test'>] -> DataFrame indexed by the "ID" column.
# All files are semicolon-separated.
datasets = {
    variant: {
        split: pd.read_csv(csv_path, sep=";", index_col="ID")
        for split, csv_path in zip(('train', 'test'), csv_paths)
    }
    for variant, csv_paths in _CSV_SOURCES.items()
}


In [4]:
##### Preprocessing all datasets
# Each split is rebound to a cleaned copy instead of being mutated in place, and the
# Sex encoding is guarded, so re-running this cell leaves the data unchanged.
for d_key, d_splits in datasets.items():
    for d_type, raw_df in d_splits.items():

        # Drop rows containing any NaN value
        clean_df = raw_df.dropna()

        # Convert Sex column to an integer flag ("F": 1, anything else: 0).
        # Skipped when the column is already numeric: comparing an encoded 0/1
        # column against "F" would silently turn every row into 0.
        if not pd.api.types.is_numeric_dtype(clean_df["Sex"]):
            clean_df = clean_df.assign(Sex=np.where(clean_df["Sex"] == "F", 1, 0))

        # Replacing the value of an existing key is safe while iterating .items()
        d_splits[d_type] = clean_df

---

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

def evaluate_test(groundtruth, predicted, print_result=True):
    """Score binary predictions against the ground truth.

    Parameters
    ----------
    groundtruth : array-like
        True labels.
    predicted : array-like
        Labels predicted by the model.
    print_result : bool, default True
        When true, print balanced accuracy, accuracy, specificity and
        sensitivity with four decimals.

    Returns
    -------
    tuple
        ``(accuracy, specificity, sensitivity)``. Balanced accuracy is computed
        and printed but is not part of the returned tuple.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the unpacking below fails).
    """
    balanced = balanced_accuracy_score(groundtruth, predicted)
    plain = accuracy_score(groundtruth, predicted)

    # Row 0 of the matrix is unpacked as (tn, fp) and row 1 as (fn, tp).
    (tn, fp), (fn, tp) = confusion_matrix(groundtruth, predicted)
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    if print_result:
        report_lines = [
            "\n [test:]",
            f'Balanced accuracy: {balanced:.4f}',
            f'Accuracy: {plain:.4f}',
            f'Specificity:  {specificity:.4f}',
            f'Sensitivity:  {sensitivity:.4f}',
        ]
        print("\n".join(report_lines))

    return (plain, specificity, sensitivity)

# Split a dataset variant into feature matrices and target vectors
def getFeaturesTargets(dataset_name):
    """Return ``(X, y, X_test, y_test)`` for the dataset stored under `dataset_name`.

    The frames are read from the notebook-level ``datasets`` dict. The target is
    the "Group" column; the features are every other column.
    """
    splits = datasets[dataset_name]
    train_df, test_df = splits['train'], splits['test']
    return (
        train_df.drop(columns="Group"),
        train_df["Group"],
        test_df.drop(columns="Group"),
        test_df["Group"],
    )

### 3) Baseline Model Training and CV

In [189]:
# Dataset variant (a key of `datasets`) used for this experiment
dataset_name = 'artigo'

# Features selected by Boruta
selectedFeatures = [
    'Freq.1324.07107187346',
    'Freq.1399.46591504505',
    'Freq.1522.99914751846',
    'Freq.1715.83254187774',
    'Freq.1794.21713030157',
    'Freq.2032.98713905056',
    'Freq.2100.44990262345',
    'Freq.2182.54012190969',
    'Freq.2187.26929655148',
    'Freq.2241.01398322552',
    'Freq.2395.92741519698',
    'Freq.2461.57721259156',
    'Freq.2822.17822957638',
    'Freq.2981.05105455515',
    'Freq.3083.87231952593',
    'Freq.3795.45160708473',
    'Freq.3983.61553339652',
    'Freq.4266.97846896688',
    'Freq.4283.95182164633',
    'Freq.4307.03317519015',
    'Freq.4395.11277752994',
    'Freq.4495.09063766933',
    'Freq.4659.55667096198',
    'Freq.4823.08140765752',
    'Freq.5084.14952111257',
    'Freq.5224.84719303067',
    'Freq.5272.6322523475',
    'Freq.5433.53206707083',
    'Freq.5485.69282171011',
    'Freq.5720.79450801948',
    'Freq.7738.2889532685',
    'Freq.8943.8000787644',
    'Freq.9098.3102509794',
]

# LightGBM hyperparameters, forwarded to LGBMClassifier via **param.
# Currently not set (commented out in earlier experiments): num_leaves,
# feature_fraction, num_boost_round, learning_rate.
param = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'lambda_l1': 1e-12,  # earlier value tried: 3.1642271775352635e-08
    'lambda_l2': 0.0027,
    'bagging_fraction': 0.7239607431842036,
    'bagging_freq': 4,
    'n_estimators': 57,
    'max_depth': 5,
    # `min_child_samples` is an alias of `min_data_in_leaf` in LightGBM. This dict
    # used to set both (11 and 23); only one can take effect, and with
    # 'verbosity': -1 the conflict warning is hidden. Only the primary name is kept.
    # NOTE(review): LightGBM is expected to honor the primary name (23) when both
    # are given — confirm against the installed version.
    'min_data_in_leaf': 23,
}

# Train/test matrices restricted to the selected feature columns
X, y, X_test, y_test = getFeaturesTargets(dataset_name)
X, X_test = X[selectedFeatures], X_test[selectedFeatures]

# Classifier (or pipeline) under evaluation
clf = lgb.LGBMClassifier(random_state=SEED, **param)

# 5-fold cross-validation repeated 10 times (50 fits in total)
# NOTE(review): RepeatedKFold does not stratify by class; RepeatedStratifiedKFold
# (already imported) may be a better match for balanced accuracy — confirm.
# NOTE(review): `selectedFeatures` is fixed before the CV loop; if it was derived
# from this same training set, the validation score may be optimistic — confirm.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

# Balanced accuracy is the score reported for both train and validation folds
metric_scorer = make_scorer(balanced_accuracy_score)

# Empty grid: a single candidate, i.e. `clf` exactly as configured above
parameters = {}

# GridSearchCV is used instead of cross_val_score / cross_validate because it also
# exposes `best_estimator_`, which is then evaluated on the test set.
search = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring=metric_scorer,
    cv=rkf,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)
search.fit(X, y)

cv_results = search.cv_results_
print(f"Dataset: {dataset_name}")
print(f"Balanced accuracy mean_train: {cv_results['mean_train_score'][0]}, mean_val: {cv_results['mean_test_score'][0]} ")

# Held-out test set evaluation
predicted = search.best_estimator_.predict(X_test)
test_score = evaluate_test(y_test, predicted)



Fitting 50 folds for each of 1 candidates, totalling 50 fits
Dataset: artigo
Balanced accuracy mean_train: 0.9749733175303306, mean_val: 0.8288611096505833 

 [test:]
Balanced accuracy: 0.6775
Accuracy: 0.7188
Specificity:  0.8095
Sensitivity:  0.5455


### Resultados




**dataset do artigo**

De: 
```
Balanced accuracy mean_train: 1.0, mean_val: 0.778942822657219 

 [test:]
Balanced accuracy: 0.5390
Accuracy: 0.5938
Specificity:  0.7143
Sensitivity:  0.3636
```


Para:
```
Balanced accuracy mean_train: 0.9749733175303306, mean_val: 0.8288611096505833 

 [test:]
Balanced accuracy: 0.6775
Accuracy: 0.7188
Specificity:  0.8095
Sensitivity:  0.5455
```
