In [1]:
import pandas as pd
import multiprocessing
from numpy import mean
from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as IMBLpipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline as SKLpipeline

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold

In [2]:
# df_alg = pd.read_pickle("objects/df_alg")
df = pd.read_pickle("objects/df_cons4-HAB_preprocessing_4_2")

# labeled and unlabeled part
dfl = df[df['lipophylic_toxins'].notnull()]
dfu = df[df['lipophylic_toxins'].isnull()]

data = dfl.drop(columns=["ASP", "Chl-a", "salinity", "T", "SECCHI", "NH4-N", "NO2-N", "NO3-N",
                        "PO4-P", "SiO3-Si", "O2", "pH"]).dropna()

X = data.drop("lipophylic_toxins", axis=1)
y = data["lipophylic_toxins"]

le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(f"class encoding: ['neg','poz'] -> {le.transform(['neg','poz'])}")

class encoding: ['neg','poz'] -> [0 1]


### Simple Random forest: 60/40 split (shuffle is True by default) and 5-fold CV with shuffling

In [3]:
# print(metrics.get_scorer_names())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
lrn = RandomForestClassifier()
# lrn = DecisionTreeClassifier()

model = lrn.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), '\n\n')

cvfolds = 5
cv_results = cross_validate(lrn, X, y, scoring=['recall','precision','f1', 'roc_auc'], 
                            cv=StratifiedKFold(n_splits=cvfolds, shuffle=True), 
                            n_jobs=-1)
print(f'CV {cvfolds} folds')
dfr = pd.DataFrame(cv_results)
dfr.loc['mean'] = dfr.mean()
display(dfr)

              precision    recall  f1-score   support

         neg       0.91      0.99      0.95       355
         poz       0.82      0.20      0.32        45

    accuracy                           0.91       400
   macro avg       0.86      0.60      0.64       400
weighted avg       0.90      0.91      0.88       400

AUC-ROC: 0.8839123630672927 


CV 5 folds


Unnamed: 0,fit_time,score_time,test_recall,test_precision,test_f1,test_roc_auc
0,0.184929,0.028797,0.347826,0.8,0.484848,0.914763
1,0.185522,0.029275,0.391304,0.818182,0.529412,0.922378
2,0.185275,0.028254,0.26087,0.6,0.363636,0.815524
3,0.186521,0.030896,0.272727,1.0,0.428571,0.923391
4,0.183289,0.029598,0.409091,0.818182,0.545455,0.903955
mean,0.185107,0.029364,0.336364,0.807273,0.470385,0.896002


### Simple Random forest, SMOTE and under-sampling, testing on 60/40 split

In [4]:
pipeline = IMBLpipeline([('smt', SMOTE()), ('under', RandomUnderSampler()), ('RF',RandomForestClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
pipeline.fit(X_train, y_train) 

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, pipeline.predict_proba(X_test)[:,1]))

              precision    recall  f1-score   support

         neg       0.94      0.96      0.95       355
         poz       0.63      0.49      0.55        45

    accuracy                           0.91       400
   macro avg       0.78      0.73      0.75       400
weighted avg       0.90      0.91      0.91       400

AUC-ROC: 0.8951486697965572


### Random forest with grid search for parameters, testing on 5-fold CV with shuffling

In [8]:
pd.set_option("display.max_rows", 50)

pipeline = IMBLpipeline([
    ('smt', SMOTE()), ('under', RandomUnderSampler()), ('clf', RandomForestClassifier())
])

parameters = {
              'clf__n_estimators': [150,300,500],
              'clf__criterion': ['gini', 'entropy'],
              'clf__class_weight': ['balanced', 'balanced_subsample', None],
               'smt__sampling_strategy': [ 0.1, 0.2, 0.3],
               'under__sampling_strategy': [0.6, 0.7, 0.8],
                'smt__k_neighbors': [1, 3, 5]
             }
nfolds = 3
scores = ['recall', 'precision', 'f1', 'roc_auc']
gscv = GridSearchCV(pipeline, 
                    parameters, 
                    scoring=scores,
                    cv=StratifiedKFold(n_splits=5, shuffle=True),
                    return_train_score=False, 
                    verbose=1, 
                    refit=False,
                    n_jobs=-1)
resultsGSCV = gscv.fit(X, y)
results = pd.DataFrame(resultsGSCV.cv_results_)
display(results.sort_values(by=[f'rank_test_recall']).transpose())

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/imblearn/pipeline.py", line 268, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/mmarzi/.pyenv/versions/3.8.0/envs/hab/lib/python3.8/site-packages/imblearn/pipeline.py", line 226, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/mmarzi/.pyenv/versi

Unnamed: 0,419,50,158,302,140,482,212,5,347,95,...,135,333,334,335,128,127,126,342,344,0
mean_fit_time,0.563759,0.859637,2.13169,2.353418,2.008276,1.738012,1.540568,0.482846,0.617647,0.526936,...,0.003968,0.004248,0.004472,0.003645,0.003841,0.006186,0.004375,0.004863,0.003914,0.006306
std_fit_time,0.006196,0.00964,0.032763,0.143673,0.105019,0.041451,0.044802,0.061568,0.010662,0.00886,...,0.000348,0.000935,0.000715,0.000128,0.000325,0.002118,0.000577,0.001736,0.000105,0.000581
mean_score_time,0.098659,0.145744,0.341661,0.299194,0.325948,0.27786,0.195038,0.088019,0.098769,0.087904,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std_score_time,0.003562,0.004497,0.020754,0.005522,0.005259,0.009974,0.009826,0.011273,0.009066,0.002996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
param_clf__class_weight,,balanced,balanced,balanced_subsample,balanced,,balanced_subsample,balanced,,balanced,...,balanced,,,,balanced,balanced,balanced,,,balanced
param_clf__criterion,entropy,gini,entropy,entropy,entropy,entropy,gini,gini,gini,entropy,...,entropy,gini,gini,gini,entropy,entropy,entropy,gini,gini,gini
param_clf__n_estimators,150,300,500,500,500,500,300,150,150,150,...,500,150,150,150,300,300,300,150,150,150
param_smt__k_neighbors,3,5,5,1,1,5,5,1,5,3,...,1,3,3,3,5,5,5,5,5,1
param_smt__sampling_strategy,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
param_under__sampling_strategy,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.6,0.6,0.7,0.8,0.8,0.7,0.6,0.6,0.8,0.6
