In [53]:
#data science and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#creation of dataset
import _lib.ml_workflow.create_dataset as cds
from _lib.export import to_csv
from _lib.raman_lib.misc import load_data

#quality control
import _lib.ml_workflow.quality_control as qc
from _lib.raman_lib.preprocessing import RangeLimiter
from _lib.raman_lib.visualization import plot_spectra_peaks
from _lib.raman_lib.spectra_scoring import score_names

#preprocessing
from _lib.ml_workflow.preprocess_data import preprocess

#model creation
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import FeatureAgglomeration
from sklearn.decomposition import PCA, NMF
from sklearn.pipeline import Pipeline
from _lib.raman_lib.preprocessing import PeakPicker
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
#file handling
from pathlib import Path
import os

In [54]:
# define the paths to all experiment data
path_group_0 = "/Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Control"
path_group_1 = "/Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Etoposide"

# Define parameters
In order to function properly, the provided code depends on predefined parameters like output paths, limits and thresholds for the quality control, ...
## Define data paths
Define the location of the data, and where quality-controlled and preprocessed data should be stored. Both of them rely on a unique file-prefix that describes the data being analyzed.

In [55]:
FILE_PREFIX = "Enterobacter"
DATASET_OUT = "./03_StratifiedK/data/" + FILE_PREFIX + ".csv"
RESULT_DIR = "./03_StratifiedK/result/" + FILE_PREFIX
QC_OUT = RESULT_DIR + "/" + FILE_PREFIX + "_qc.csv"
PREP_OUT = RESULT_DIR + "/" + FILE_PREFIX + "_preprocessed.csv"
LDA_DIR = RESULT_DIR + "/" + "lda_dim_reduction"
REG_DIR = RESULT_DIR + "/" + "regularized_models/"
TREE_DIR = RESULT_DIR + "/" + "tree_based_models/"

## Define quality scoring parameters
The quality control only uses peaks in a given interval, recognizes peaks via a filter (Sav-Gol) and scores them based on some metrics. Finally, the best N spectra are selected.
### Spectral Range Limits

In [56]:
QC_LIM_LOW = 450
QC_LIM_HIGH = 1650

### Peak Detection

In [57]:
QC_WINDOW = 35
QC_THRESHOLD = 0.001
QC_MIN_HEIGHT = 50

### Scoring

In [58]:
QC_SCORE = 1
QC_PEAKS = 1

### Number of spectra to keep

In [59]:
QC_NUM = 300

## Define Preprocessing Parameter
### Spectral Range Limits

In [60]:
PREP_LIM_LOW = QC_LIM_LOW
PREP_LIM_HIGH = QC_LIM_HIGH

### Window-width for smoothing

In [61]:
PREP_WINDOW = 15

## Settings for Cross Validation

In [62]:
SCORING = ['accuracy', 'f1']
N_TRIALS = 20
N_FOLDS = 5
N_CORES = -1

# Define param for number of cores/threads to use

In [63]:
cnt_jobs = 40

# Create the dataset
Create the dataset using the implementation provided by D. Zimmermann.
For the creation of the dataset, the two source dirs, as well as the desired labels are needed.
Furthermore, an output directory is needed, to store the created dataset 

In [64]:
datadir = Path(DATASET_OUT).parent
if not os.path.exists(datadir):
    os.makedirs(datadir)

dataset = cds.create_dataset([path_group_0, path_group_1], ['d_E._aerogenes', 'e_E._cloacae'])
dataset.to_csv(DATASET_OUT, index=False)

root - INFO - Loading data
root - INFO - Loading data
root - INFO - Loading data
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Control
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Control
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Control
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Etoposide
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Etoposide
root - INFO - Loading files from /Users/Praktikum/Documents/HL428/Roiss_L-428_aggregated/Etoposide
root - INFO - Finished loading data.
root - INFO - Finished loading data.
root - INFO - Finished loading data.


# Do quality control
Asses the spectra based on their quality, and remove low quality spectra

In [65]:
path_in = Path(DATASET_OUT)
path_out = Path(RESULT_DIR)

if not os.path.exists(path_out):
    os.makedirs(path_out)

path_out_data = path_out / (path_in.stem + "_qc.csv")
path_out_scores = path_out / (path_in.stem + "_qc_scores.csv")

data = pd.read_csv(path_in)

data_out, _, score_dict = qc.score_sort_spectra(data,
                                                n=QC_NUM,
                                                limits=[QC_LIM_LOW, QC_LIM_HIGH],
                                                bl_method="asls",
                                                sg_window=QC_WINDOW,
                                                threshold=QC_THRESHOLD,
                                                min_height=QC_MIN_HEIGHT,
                                                score_measure=QC_SCORE,
                                                n_peaks_influence=QC_PEAKS,
                                                detailed=True)

visualize = False
if visualize:
    data_vis = data.drop(columns=["label", "file"]).values.astype(float)
    wns_vis = data.drop(columns=["label", "file"]).columns.astype(float)

    rl = RangeLimiter(lim=[QC_LIM_LOW, QC_LIM_HIGH],
                      reference=wns_vis)

    data_rl = rl.fit_transform(data_vis)
    wns_rl = wns_vis[rl.lim_[0]:rl.lim_[1]]

    plot_spectra_peaks(wns_rl,
                       data_rl,
                       score_dict["peak_pos"],
                       labels=score_dict["total_scores"])

data_out.to_csv(path_out_data, index=False)

pd.DataFrame({score_names[QC_SCORE]: score_dict["intensity_scores"],
              "N Peaks": score_dict["peak_scores"]}).to_csv(
    path_out_scores, index=False
)

Analyzed 835 spectra in 5.77 seconds.
Mean Score: 13553

1st Quartile: 6171
Median Score: 11007
3rd Quartile: 17458

Min Score: 0
Max Score: 78737


# Preprocess the data

In [66]:
path_in = Path(QC_OUT)
path_out = Path(RESULT_DIR)

filename = path_in.stem.removesuffix("_qc")

if not os.path.exists(path_out):
    os.makedirs(path_out)

path_out = path_out / (filename + "_preprocessed.csv")

data = load_data(QC_OUT)

data_prep = preprocess(data, limits=[PREP_LIM_LOW, PREP_LIM_HIGH], sg_window=PREP_WINDOW)

data_prep.to_csv(path_out, index=False)

[CV 6/10] END pca__n_components=1; accuracy: (train=0.874, test=0.950) f1: (train=0.862, test=0.947) total time=   0.6s
[CV 4/10] END pca__n_components=21; accuracy: (train=0.952, test=0.983) f1: (train=0.951, test=0.984) total time=   4.3s
[CV 9/10] END agglo__n_clusters=5; accuracy: (train=0.848, test=0.933) f1: (train=0.844, test=0.931) total time=   0.4s
[CV 3/10] END agglo__n_clusters=40; accuracy: (train=0.970, test=0.850) f1: (train=0.970, test=0.836) total time=   0.6s
[CV 4/10] END peaks__min_dist=40; accuracy: (train=0.950, test=1.000) f1: (train=0.949, test=1.000) total time=   0.1s
[CV 8/10] END logreg__C=0.01; accuracy: (train=0.898, test=0.917) f1: (train=0.888, test=0.909) total time=   0.4s
[CV 9/10] END logreg__C=0.015848931924611134; accuracy: (train=0.920, test=0.950) f1: (train=0.915, test=0.947) total time=   0.4s
[CV 5/10] END logreg__C=0.1; accuracy: (train=0.978, test=0.983) f1: (train=0.978, test=0.983) total time=   1.1s
[CV 10/10] END logreg__C=0.630957344480

# Implement Stratified KFold CV

In [67]:
path_in = PREP_OUT

data = load_data(path_in)

X = data.drop(columns=["label", "file"])
wns = np.asarray(X.columns.astype(float))
X = np.asarray(X)
y = np.array(data.label)
y, y_key = pd.factorize(y)

stk = StratifiedKFold(n_splits=10)

[CV] END  accuracy: (train=1.000, test=0.883) f1: (train=1.000, test=0.877) total time=   1.7s
[CV 3/10] END pca__n_components=21; accuracy: (train=0.970, test=0.800) f1: (train=0.970, test=0.778) total time=   0.3s
[CV 8/10] END pca__n_components=41; accuracy: (train=0.965, test=0.967) f1: (train=0.964, test=0.967) total time=   0.4s
[CV 5/10] END pca__n_components=21; accuracy: (train=0.954, test=0.983) f1: (train=0.953, test=0.983) total time=   4.3s
[CV 7/10] END agglo__n_clusters=10; accuracy: (train=0.880, test=0.950) f1: (train=0.868, test=0.947) total time=   0.4s
[CV 5/10] END agglo__n_clusters=35; accuracy: (train=0.952, test=0.983) f1: (train=0.951, test=0.983) total time=   0.7s
[CV 5/10] END peaks__min_dist=20; accuracy: (train=0.948, test=0.950) f1: (train=0.947, test=0.947) total time=   0.0s
[CV 1/10] END peaks__min_dist=80; accuracy: (train=0.937, test=0.683) f1: (train=0.935, test=0.655) total time=   0.0s
[CV 10/10] END peaks__min_dist=80; accuracy: (train=0.926, tes

## LDA Dimensionality Reduction
### Baseline with LDA alone

In [68]:
clf = LinearDiscriminantAnalysis()
result = cross_validate(clf, X, y, cv=stk, scoring=SCORING, return_train_score=True, verbose=3, n_jobs=cnt_jobs)
to_csv(result, path=LDA_DIR + "/lda", scoring=SCORING, param_opt=False)

[CV 9/10] END pca__n_components=21; accuracy: (train=0.961, test=0.967) f1: (train=0.960, test=0.966) total time=   1.0s
[CV 2/10] END pca__n_components=41; accuracy: (train=0.954, test=0.983) f1: (train=0.952, test=0.983) total time=   7.4s
[CV 8/10] END agglo__n_clusters=15; accuracy: (train=0.913, test=0.917) f1: (train=0.907, test=0.912) total time=   0.3s
[CV 6/10] END agglo__n_clusters=35; accuracy: (train=0.959, test=0.983) f1: (train=0.958, test=0.983) total time=   0.6s
[CV 5/10] END peaks__min_dist=30; accuracy: (train=0.944, test=0.950) f1: (train=0.943, test=0.947) total time=   0.0s
[CV 6/10] END logreg__C=0.06309573444801933; accuracy: (train=0.967, test=0.983) f1: (train=0.966, test=0.983) total time=   1.0s
[CV 8/10] END logreg__C=0.3981071705534973; accuracy: (train=0.998, test=0.950) f1: (train=0.998, test=0.949) total time=   0.6s
[CV 6/10] END logreg__C=2.5118864315095824; accuracy: (train=1.000, test=0.983) f1: (train=1.000, test=0.984) total time=   0.6s
[CV 3/10]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[CV 1/10] END pca__n_components=31; accuracy: (train=0.967, test=0.800) f1: (train=0.966, test=0.778) total time=   1.1s
[CV 9/10] END pca__n_components=11; accuracy: (train=0.948, test=0.967) f1: (train=0.947, test=0.966) total time=   2.4s
[CV 8/10] END pca__n_components=41; accuracy: (train=0.952, test=0.950) f1: (train=0.950, test=0.949) total time=   6.0s
[CV 1/10] END agglo__n_clusters=25; accuracy: (train=0.959, test=0.750) f1: (train=0.958, test=0.706) total time=   1.2s
[CV 8/10] END peaks__min_dist=20; accuracy: (train=0.952, test=0.917) f1: (train=0.951, test=0.912) total time=   0.1s
[CV 6/10] END logreg__C=0.01; accuracy: (train=0.902, test=0.917) f1: (train=0.892, test=0.915) total time=   0.5s
[CV 5/10] END logreg__C=0.06309573444801933; accuracy: (train=0.967, test=0.967) f1: (train=0.966, test=0.966) total time=   1.2s
[CV 7/10] END logreg__C=0.6309573444801934; accuracy: (train=1.000, test=0.967) f1: (train=1.000, test=0.966) total time=   0.8s
[CV 9/10] END logreg__C



[CV 2/10] END pca__n_components=11; accuracy: (train=0.950, test=0.967) f1: (train=0.948, test=0.966) total time=   0.7s
[CV 9/10] END pca__n_components=21; accuracy: (train=0.956, test=0.967) f1: (train=0.955, test=0.966) total time=   4.9s
[CV 2/10] END agglo__n_clusters=25; accuracy: (train=0.935, test=0.950) f1: (train=0.933, test=0.949) total time=   1.1s
[CV 1/10] END agglo__n_clusters=35; accuracy: (train=0.965, test=0.733) f1: (train=0.964, test=0.692) total time=   0.9s
[CV 5/10] END peaks__min_dist=10; accuracy: (train=0.948, test=0.950) f1: (train=0.947, test=0.947) total time=   0.1s
[CV 9/10] END peaks__min_dist=20; accuracy: (train=0.948, test=0.967) f1: (train=0.947, test=0.966) total time=   0.4s
[CV 4/10] END peaks__min_dist=70; accuracy: (train=0.915, test=0.967) f1: (train=0.911, test=0.967) total time=   0.0s
[CV 5/10] END peaks__min_dist=80; accuracy: (train=0.930, test=0.933) f1: (train=0.926, test=0.931) total time=   0.2s
[CV 4/10] END peaks__min_dist=110; accur

[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.5s remaining:    8.1s





[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.2s remaining:    1.8s




[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.6s finished


### Feature Selection with PCA followed by LDA

In [69]:
param_grid = {"pca__n_components": range(
    1, 51, 10
)}
clf = Pipeline([("pca", PCA()),
                ("lda", LinearDiscriminantAnalysis())])
grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy',
                       return_train_score=True, verbose=3, n_jobs=cnt_jobs).fit(X, y)

to_csv(grid_rf.cv_results_, scoring=SCORING, path=LDA_DIR + "/pca_lda", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 5 candidates, totalling 50 fits









 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('pca', PCA(n_components=31)),
                ('lda', LinearDiscriminantAnalysis())])

 The best score across ALL searched params:
 0.945

 The best parameters across ALL searched params:
 {'pca__n_components': 31}


### Feature Selection with Nonnegative Matric Factorization followed by LDA

In [70]:
param_grid = {"pca__n_components": range(
    1, 51, 10
)}
clf = Pipeline([("pca", NMF(init="nndsvda", tol=1e-2, max_iter=5000)),
                ("lda", LinearDiscriminantAnalysis())])
grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy',
                       return_train_score=True, verbose=3, n_jobs=cnt_jobs).fit(X, y)

to_csv(grid_rf.cv_results_, scoring=SCORING, path=LDA_DIR + "/nmf_lda", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)


### Feature Selection with Feature Agglomeration followed by LDA

Fitting 10 folds for each of 5 candidates, totalling 50 fits



















































 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('pca',
                 NMF(init='nndsvda', max_iter=5000, n_components=21, tol=0.01)),
                ('lda', LinearDiscriminantAnalysis())])

 The best score across ALL searched params:
 0.9383333333333332

 The best parameters across ALL searched params:
 {'pca__n_components': 21}


### Feature Selection with Feature Agglomeration followed by LDA

In [71]:
param_grid = {"agglo__n_clusters": range(
    5, 41, 5
)}
clf = Pipeline([("agglo", FeatureAgglomeration(connectivity=np.diag(np.ones(len(wns))) +
                                                            np.diag(np.ones(len(wns) - 1), 1) +
                                                            np.diag(np.ones(len(wns) - 1), -1))),
                ("lda", LinearDiscriminantAnalysis())])
grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy',
                       return_train_score=True, verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=LDA_DIR + "/fa_lda", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('agglo',
                 FeatureAgglomeration(connectivity=array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.]]),
                                      n_clusters=40)),
                ('lda', LinearDiscriminantAnalysis())])

 The best score across ALL searched params:
 0.9416666666666667

 The best parameters across ALL searched params:
 {'agglo__n_clusters': 40}


### Feature Selection with PeakPicker followed by LDA

In [72]:
param_grid = {"peaks__min_dist": range(
    10, 151, 10
)}
clf = Pipeline([("peaks", PeakPicker()),
                ("lda", LinearDiscriminantAnalysis())])
grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=LDA_DIR + "/peak_lda", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)


Fitting 10 folds for each of 15 candidates, totalling 150 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('peaks', PeakPicker(min_dist=20)),
                ('lda', LinearDiscriminantAnalysis())])

 The best score across ALL searched params:
 0.9366666666666665

 The best parameters across ALL searched params:
 {'peaks__min_dist': 20}


## Regularized Models
### Logistic Regression L1 Penalty

In [73]:
param_grid = {
    "logreg__C": np.logspace(-2, 1, 16)
}
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(solver="liblinear",
    penalty="l1", max_iter=1000, random_state=41))
])

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=REG_DIR + "/logreg_l1", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('logreg',
                 LogisticRegression(C=0.6309573444801934, max_iter=1000,
                                    penalty='l1', random_state=41,
                                    solver='liblinear'))])

 The best score across ALL searched params:
 0.9616666666666667

 The best parameters across ALL searched params:
 {'logreg__C': 0.6309573444801934}


### Logistic Regression L2 Penalty

In [74]:
param_grid = {
    "logreg__C": np.logspace(-5, 1, 13)
}
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(solver="liblinear",
                                  penalty="l2", max_iter=1000, random_state=51))
])

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=REG_DIR + "/logreg_l2", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 13 candidates, totalling 130 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('logreg',
                 LogisticRegression(C=0.01, max_iter=1000, random_state=51,
                                    solver='liblinear'))])

 The best score across ALL searched params:
 0.945

 The best parameters across ALL searched params:
 {'logreg__C': 0.01}


### Linear SVM L1 Penalty

In [75]:
param_grid = {
    "svm__C": np.logspace(-3, 0, 16)
}
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", LinearSVC(penalty="l1", dual=False, max_iter=15000))
])

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=REG_DIR + "/svm_l1", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('svm',
                 LinearSVC(C=0.06309573444801933, dual=False, max_iter=15000,
                           penalty='l1'))])

 The best score across ALL searched params:
 0.9633333333333333

 The best parameters across ALL searched params:
 {'svm__C': 0.06309573444801933}


### Linear SVM L2 Penalty

In [76]:
param_grid = {
    "svm__C": np.logspace(-5, -1, 13)
}
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", LinearSVC(penalty="l2", max_iter=5000))
])

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=REG_DIR + "/svm_l2", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 13 candidates, totalling 130 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('svm', LinearSVC(C=0.00021544346900318823, max_iter=5000))])

 The best score across ALL searched params:
 0.9466666666666667

 The best parameters across ALL searched params:
 {'svm__C': 0.00021544346900318823}


## Tree-based Models
### Basic Decision Tree

In [77]:
param_grid = {
    "ccp_alpha": np.logspace(-3, -1, 9)
}
clf = DecisionTreeClassifier(random_state=653)

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=TREE_DIR + "/decision_tree", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
 Results from Grid Search 

 The best estimator across ALL searched params:
 DecisionTreeClassifier(ccp_alpha=0.01, random_state=653)

 The best score across ALL searched params:
 0.9099999999999999

 The best parameters across ALL searched params:
 {'ccp_alpha': 0.01}


### Random Forest

In [78]:
param_grid = {
    "colsample_bytree": np.linspace(0.01, 0.2, 20)
}
clf = LGBMClassifier(boosting_type="rf",
                     subsample=0.8,
                     subsample_freq=1,
                     max_bin=10,
                     max_depth=8,
                     random_state=2434)

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=TREE_DIR + "/random_forest", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10] END pca__n_components=1; accuracy: (train=0.906, test=0.667) f1: (train=0.897, test=0.583) total time=   0.1s
[CV 2/10] END pca__n_components=1; accuracy: (train=0.700, test=0.733) f1: (train=0.657, test=0.692) total time=   7.8s
[CV 4/10] END pca__n_components=11; accuracy: (train=0.943, test=0.967) f1: (train=0.941, test=0.966) total time=   0.6s
[CV 9/10] END pca__n_components=11; accuracy: (train=0.948, test=0.967) f1: (train=0.947, test=0.966) total time=   0.5s
[CV 5/10] END pca__n_components=21; accuracy: (train=0.952, test=0.983) f1: (train=0.951, test=0.983) total time=   0.9s
[CV 2/10] END pca__n_components=31; accuracy: (train=0.948, test=0.983) f1: (train=0.947, test=0.984) total time=   1.3s
[CV 8/10] END pca__n_components=31; accuracy: (train=0.957, test=0.950) f1: (train=0.956, test=0.949) total time=   1.0s
[CV 3/10] END pca__n_components=41; accuracy: (train=0.974, test=0.817) f1: (train=0.974, te



[CV] END  accuracy: (train=1.000, test=0.867) f1: (train=1.000, test=0.857) total time=   0.7s
[CV] END  accuracy: (train=1.000, test=0.767) f1: (train=1.000, test=0.750) total time=   0.7s
[CV 8/10] END pca__n_components=1; accuracy: (train=0.885, test=0.883) f1: (train=0.874, test=0.877) total time=   0.1s
[CV 5/10] END pca__n_components=11; accuracy: (train=0.948, test=1.000) f1: (train=0.946, test=1.000) total time=   0.1s
[CV 7/10] END pca__n_components=21; accuracy: (train=0.957, test=0.967) f1: (train=0.957, test=0.966) total time=   0.1s
[CV 8/10] END pca__n_components=21; accuracy: (train=0.957, test=0.950) f1: (train=0.956, test=0.949) total time=   0.1s
[CV 1/10] END pca__n_components=41; accuracy: (train=0.967, test=0.750) f1: (train=0.966, test=0.706) total time=   0.2s
[CV 2/10] END pca__n_components=41; accuracy: (train=0.963, test=0.983) f1: (train=0.962, test=0.983) total time=   0.2s
[CV 8/10] END pca__n_components=1; accuracy: (train=0.711, test=0.650) f1: (train=0.6

### Gradient-boosted Decision Trees

In [79]:
param_grid = {
    "learning_rate": np.linspace(0.01, 0.2, 20)
}
clf = LGBMClassifier(colsample_bytree=0.2,
                     max_bin=10,
                     max_depth=5,
                     random_state=6233)

grid_rf = GridSearchCV(clf, param_grid=param_grid, cv=stk, scoring=SCORING, refit='accuracy', return_train_score=True,
                       verbose=3, n_jobs=cnt_jobs).fit(X, y)
to_csv(grid_rf.cv_results_, scoring=SCORING, path=TREE_DIR + "/gbdt", param_opt=True)

print(" Results from Grid Search ")
print("\n The best estimator across ALL searched params:\n", grid_rf.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_rf.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_rf.best_params_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END  accuracy: (train=1.000, test=0.900) f1: (train=1.000, test=0.889) total time=   0.6s
[CV 2/10] END pca__n_components=1; accuracy: (train=0.881, test=0.883) f1: (train=0.870, test=0.868) total time=   0.1s
[CV 9/10] END pca__n_components=1; accuracy: (train=0.872, test=0.933) f1: (train=0.859, test=0.929) total time=   0.1s
[CV 6/10] END pca__n_components=11; accuracy: (train=0.939, test=1.000) f1: (train=0.937, test=1.000) total time=   0.1s
[CV 9/10] END pca__n_components=21; accuracy: (train=0.961, test=0.967) f1: (train=0.960, test=0.966) total time=   0.1s
[CV 10/10] END pca__n_components=21; accuracy: (train=0.954, test=0.983) f1: (train=0.952, test=0.983) total time=   0.1s
[CV 9/10] END pca__n_components=31; accuracy: (train=0.957, test=0.967) f1: (train=0.956, test=0.966) total time=   0.2s
[CV 10/10] END pca__n_components=31; accuracy: (train=0.957, test=0.983) f1: (train=0.956, test=0.984) total time=   



 Results from Grid Search 

 The best estimator across ALL searched params:
 LGBMClassifier(colsample_bytree=0.2, learning_rate=0.2, max_bin=10, max_depth=5,
               random_state=6233)

 The best score across ALL searched params:
 0.9716666666666667

 The best parameters across ALL searched params:
 {'learning_rate': 0.2}
