In [15]:
import pandas as pd
import multiprocessing
from numpy import mean
from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as IMBLpipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline as SKLpipeline

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold

In [16]:
# df_alg = pd.read_pickle("objects/df_alg")
df = pd.read_pickle("objects/df_cons4-HAB_preprocessing_4_2")

# labeled and unlabeled part
dfl = df[df['lipophylic_toxins'].notnull()]
dfu = df[df['lipophylic_toxins'].isnull()]

data = dfl.drop(columns=["ASP", "Chl-a", "salinity", "T", "SECCHI", "NH4-N", "NO2-N", "NO3-N",
                        "PO4-P", "SiO3-Si", "O2", "pH"]).dropna()

X = data.drop("lipophylic_toxins", axis=1)
y = data["lipophylic_toxins"]

le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(f"class encoding: ['neg','poz'] -> {le.transform(['neg','poz'])}")

class encoding: ['neg','poz'] -> [0 1]


### Simple Random forest: 60/40 split (shuffle is True by default) and 5-fold CV with shuffling

In [17]:
# print(metrics.get_scorer_names())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
lrn = RandomForestClassifier()
# lrn = DecisionTreeClassifier()

model = lrn.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), '\n\n')

cvfolds = 5
cv_results = cross_validate(lrn, X, y, scoring=['recall','precision','f1', 'roc_auc'], 
                            cv=StratifiedKFold(n_splits=cvfolds, shuffle=True), 
                            n_jobs=-1)
print(f'CV {cvfolds} folds')
dfr = pd.DataFrame(cv_results)
dfr.loc['mean'] = dfr.mean()
display(dfr)

              precision    recall  f1-score   support

         neg       0.91      0.99      0.95       355
         poz       0.83      0.22      0.35        45

    accuracy                           0.91       400
   macro avg       0.87      0.61      0.65       400
weighted avg       0.90      0.91      0.88       400

AUC-ROC: 0.8534272300469483 


CV 5 folds


Unnamed: 0,fit_time,score_time,test_recall,test_precision,test_f1,test_roc_auc
0,0.176957,0.039498,0.304348,0.777778,0.4375,0.898428
1,0.177017,0.043546,0.347826,0.888889,0.5,0.928028
2,0.183318,0.039654,0.391304,0.75,0.514286,0.86674
3,0.186851,0.03613,0.5,0.846154,0.628571,0.909346
4,0.193092,0.031413,0.227273,0.833333,0.357143,0.800591
mean,0.183447,0.038048,0.35415,0.819231,0.4875,0.880627


### Simple Random forest, SMOTE and under-sampling, testing on 60/40 split

In [20]:
pipeline = IMBLpipeline([('smt', SMOTE()), ('under', RandomUnderSampler()), ('RF',RandomForestClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
pipeline.fit(X_train, y_train) 

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, pipeline.predict_proba(X_test)[:,1]))

              precision    recall  f1-score   support

         neg       0.93      0.94      0.94       355
         poz       0.49      0.47      0.48        45

    accuracy                           0.89       400
   macro avg       0.71      0.70      0.71       400
weighted avg       0.88      0.89      0.88       400

AUC-ROC: 0.8434741784037558


### Random forest with grid search for parameters, testing on 5-fold CV with shuffling

In [21]:
pd.set_option("display.max_rows", 50)

pipeline = IMBLpipeline([
    ('smt', SMOTE()), ('under', RandomUnderSampler()), ('clf', RandomForestClassifier())
])

parameters = {
              'clf__n_estimators': [150,300,500],
              'clf__criterion': ['gini', 'entropy'],
              'clf__class_weight': ['balanced', 'balanced_subsample', None],
               'smt__sampling_strategy': [ 0.1, 0.2, 0.3],
               'under__sampling_strategy': [0.6, 0.7, 0.8],
                'smt__k_neighbors': [1, 3, 5]
             }
nfolds = 3
scores = ['recall', 'precision', 'f1', 'roc_auc']
gscv = GridSearchCV(pipeline, 
                    parameters, 
                    scoring=scores,
                    cv=StratifiedKFold(n_splits=5, shuffle=True),
                    return_train_score=False, 
                    verbose=1, 
                    refit=False,
                    n_jobs=-1)
resultsGSCV = gscv.fit(X, y)
results = pd.DataFrame(resultsGSCV.cv_results_)
display(results.sort_values(by=[f'rank_test_recall']).transpose())

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/imblearn/pipeline.py", line 268, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/imblearn/pipeline.py", line 226, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/mmarzi/.pyenv/versi

Unnamed: 0,14,293,482,410,437,265,104,77,374,400,...,297,298,299,74,306,307,308,73,289,0
mean_fit_time,0.489764,1.460099,1.906826,0.614685,1.081449,0.758222,0.592608,1.777803,1.241367,2.115797,...,0.006666,0.004437,0.004915,0.004543,0.005295,0.004302,0.004257,0.005181,0.004254,0.004288
std_fit_time,0.010819,0.025931,0.017565,0.007975,0.010602,0.006755,0.009492,0.01477,0.04767,0.034397,...,0.002577,0.000537,0.000399,0.000523,0.001636,0.000179,0.000406,0.001278,0.000288,0.000191
mean_score_time,0.082916,0.20476,0.314828,0.110945,0.179958,0.098027,0.097497,0.312757,0.231758,0.314271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std_score_time,0.003175,0.027096,0.017566,0.003376,0.002642,0.002917,0.000904,0.013963,0.020018,0.02023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
param_clf__class_weight,balanced,balanced_subsample,,,,balanced_subsample,balanced,balanced,,,...,balanced_subsample,balanced_subsample,balanced_subsample,balanced,balanced_subsample,balanced_subsample,balanced_subsample,balanced,balanced_subsample,balanced
param_clf__criterion,gini,entropy,entropy,entropy,entropy,entropy,entropy,gini,gini,gini,...,entropy,entropy,entropy,gini,entropy,entropy,entropy,gini,entropy,gini
param_clf__n_estimators,150,300,500,150,300,150,150,500,300,500,...,500,500,500,500,500,500,500,500,300,150
param_smt__k_neighbors,3,5,5,1,1,5,5,5,5,5,...,1,1,1,5,3,3,3,5,5,1
param_smt__sampling_strategy,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
param_under__sampling_strategy,0.8,0.8,0.8,0.8,0.8,0.7,0.8,0.8,0.8,0.7,...,0.6,0.7,0.8,0.8,0.6,0.7,0.8,0.7,0.7,0.6
