In [1]:
import pandas as pd
import multiprocessing
from numpy import mean
from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as IMBLpipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline as SKLpipeline

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold

In [2]:
# df_alg = pd.read_pickle("objects/df_alg")
df = pd.read_pickle("objects/df_cons4-HAB_preprocessing_4_2")

# labeled and unlabeled part
dfl = df[df['lipophylic_toxins'].notnull()]
dfu = df[df['lipophylic_toxins'].isnull()]

data = dfl.drop(columns=["ASP", "Chl-a", "salinity", "T", "SECCHI", "NH4-N", "NO2-N", "NO3-N",
                        "PO4-P", "SiO3-Si", "O2", "pH"]).dropna()

X = data.drop("lipophylic_toxins", axis=1)
y = data["lipophylic_toxins"]

le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(f"class encoding: ['neg','poz'] -> {le.transform(['neg','poz'])}")

class encoding: ['neg','poz'] -> [0 1]


### Simple Random forest: 60/40 split (shuffle is True by default) and 5-fold CV with shuffling

In [29]:
# print(metrics.get_scorer_names())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
lrn = RandomForestClassifier()
# lrn = DecisionTreeClassifier()

model = lrn.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), '\n\n')

cvfolds = 5
cv_results = cross_validate(lrn, X, y, scoring=['recall','precision','f1', 'roc_auc'], 
                            cv=StratifiedKFold(n_splits=cvfolds, shuffle=True), 
                            n_jobs=-1)
print(f'CV {cvfolds} folds')
dfr = pd.DataFrame(cv_results)
dfr.loc['mean'] = dfr.mean()
display(dfr)

              precision    recall  f1-score   support

         neg       0.91      0.98      0.95       355
         poz       0.63      0.27      0.38        45

    accuracy                           0.90       400
   macro avg       0.77      0.62      0.66       400
weighted avg       0.88      0.90      0.88       400

AUC-ROC: 0.8980281690140846 


CV 5 folds


Unnamed: 0,fit_time,score_time,test_recall,test_precision,test_f1,test_roc_auc
0,0.182682,0.028374,0.347826,1.0,0.516129,0.898796
1,0.204304,0.023506,0.347826,0.8,0.484848,0.915377
2,0.215873,0.037488,0.26087,0.857143,0.4,0.915746
3,0.2016,0.02308,0.409091,0.9,0.5625,0.912283
4,0.217033,0.036403,0.227273,0.833333,0.357143,0.838084
mean,0.204298,0.02977,0.318577,0.878095,0.464124,0.896057


### Simple Random forest, SMOTE and under-sampling, testing on 60/40 split

In [30]:
pipeline = IMBLpipeline([('smt', SMOTE()), ('under', RandomUnderSampler()), ('RF',RandomForestClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.40)
pipeline.fit(X_train, y_train) 

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['neg','poz']))
print('AUC-ROC:', metrics.roc_auc_score(y_test, pipeline.predict_proba(X_test)[:,1]))

              precision    recall  f1-score   support

         neg       0.92      0.96      0.94       355
         poz       0.52      0.36      0.42        45

    accuracy                           0.89       400
   macro avg       0.72      0.66      0.68       400
weighted avg       0.88      0.89      0.88       400

AUC-ROC: 0.8694209702660407


### Random forest with grid search for parameters, testing on 5-fold CV with shuffling

In [32]:
pd.set_option("display.max_rows", 50)

pipeline = SKLpipeline([
    ('clf', RandomForestClassifier())
])

parameters = {
              'clf__n_estimators': [50,100,150,200,500],
              'clf__criterion': ['gini', 'entropy'],
              'clf__class_weight': ['balanced', 'balanced_subsample', None]
             }
nfolds = 3
scores = ['recall', 'precision', 'f1', 'roc_auc']
gscv = GridSearchCV(pipeline, 
                    parameters, 
                    scoring=scores,
                    cv=StratifiedKFold(n_splits=5, shuffle=True),
                    return_train_score=False, 
                    verbose=1, 
                    refit=False,
                    n_jobs=-1)
resultsGSCV = gscv.fit(X, y)
results = pd.DataFrame(resultsGSCV.cv_results_)
display(results.sort_values(by=[f'rank_test_recall']).transpose())

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Unnamed: 0,2,16,28,23,19,26,29,18,22,20,...,13,17,9,21,3,5,6,1,10,11
mean_fit_time,0.345875,0.297216,0.55691,0.489411,1.595968,0.253679,1.003689,0.58373,0.340518,0.11307,...,0.557424,0.485184,1.267103,0.238392,0.459758,0.1289,0.273697,0.247223,0.16423,0.285956
std_fit_time,0.024225,0.01738,0.048778,0.024081,0.073769,0.02272,0.08919,0.024983,0.007102,0.004459,...,0.039798,0.060492,0.040173,0.017209,0.014042,0.009237,0.030334,0.030926,0.039123,0.013647
mean_score_time,0.053501,0.037897,0.067889,0.070556,0.160989,0.038604,0.097414,0.067292,0.074306,0.022804,...,0.072395,0.05437,0.17683,0.048341,0.068237,0.024414,0.037782,0.038514,0.024503,0.041321
std_score_time,0.003099,0.006234,0.00236,0.007603,0.005924,0.003522,0.003924,0.003862,0.038749,0.001243,...,0.007219,0.004529,0.025707,0.021631,0.005757,0.003192,0.001458,0.002855,0.004907,0.001918
param_clf__class_weight,balanced,balanced_subsample,,,balanced_subsample,,,balanced_subsample,,,...,balanced_subsample,balanced_subsample,balanced,,balanced,balanced,balanced,balanced,balanced_subsample,balanced_subsample
param_clf__criterion,gini,entropy,entropy,gini,entropy,entropy,entropy,entropy,gini,gini,...,gini,entropy,entropy,gini,gini,entropy,entropy,gini,gini,gini
param_clf__n_estimators,150,100,200,200,500,100,500,200,150,50,...,200,150,500,100,200,50,100,100,50,100
params,"{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': None, 'clf__criterion': ...",...,"{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': None, 'clf__criterion': ...","{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': 'balanced', 'clf__criter...","{'clf__class_weight': 'balanced_subsample', 'c...","{'clf__class_weight': 'balanced_subsample', 'c..."
split0_test_recall,0.478261,0.478261,0.434783,0.391304,0.478261,0.478261,0.434783,0.478261,0.434783,0.347826,...,0.391304,0.391304,0.478261,0.391304,0.434783,0.434783,0.391304,0.391304,0.391304,0.347826
split1_test_recall,0.217391,0.217391,0.26087,0.26087,0.173913,0.173913,0.217391,0.173913,0.217391,0.217391,...,0.173913,0.173913,0.086957,0.217391,0.173913,0.173913,0.086957,0.173913,0.130435,0.086957
