In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
from dateutil.parser import parse
import datetime
from dateutil.parser import parse
import math
from numpy import mean

from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline as SKLpipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from dtreeviz.trees import dtreeviz 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as IMBLpipeline

from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# Cap how much of a DataFrame pandas prints: 30 rows and 30 columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 30)

In [2]:
# Load the two pickled DataFrames this notebook works on.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
ALG_PICKLE_PATH = "objects/df_alg-HAB_preprocessing_5_1"
DATA_PICKLE_PATH = "data/preprocessed/hab_org-data-HAB_part2-preprocessing-5_2"
# Alternative input that was tried (presumably an interpolated variant -- confirm):
# DATA_PICKLE_PATH = "data/preprocessed/hab_interp_data-HAB_part2-preprocessing-5_2"

df_alg = pd.read_pickle(ALG_PICKLE_PATH)
data = pd.read_pickle(DATA_PICKLE_PATH)

# Optional column drops that are currently disabled:
# data.drop(columns=["sampling station", "date"], inplace=True)
# data.drop(columns=["date"], inplace=True)

# Number of missing values per column.
data.isna().sum()

date                       0
sampling station           0
DSP                        1
Dinophysis caudata         1
Dinophysis fortii          1
Phalacroma rotundatum      1
Dinophysis sacculus        1
Dinophysis tripos          1
sun [h]                    0
air temp                   0
wind strength              0
precipitation              0
Chl-a                    661
salinity                 191
T                        244
SECCHI                   696
DIN                      542
PO4-P                    534
Soca                       0
month                      0
lipophylic_toxins        320
dtype: int64

In [3]:
# Drop the four columns with the most missing values (each has 500+ nulls in
# the count shown by the previous cell).
SPARSE_COLUMNS = ["Chl-a", "SECCHI", "DIN", "PO4-P"]

# Rebind instead of mutating in place, and tolerate already-removed columns,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns=SPARSE_COLUMNS, errors="ignore")

# Remaining missing values per column.
data.isnull().sum()

date                       0
sampling station           0
DSP                        1
Dinophysis caudata         1
Dinophysis fortii          1
Phalacroma rotundatum      1
Dinophysis sacculus        1
Dinophysis tripos          1
sun [h]                    0
air temp                   0
wind strength              0
precipitation              0
salinity                 191
T                        244
Soca                       0
month                      0
lipophylic_toxins        320
dtype: int64

In [4]:
# Prepare the data for ML in scikit-learn.
TARGET_COLUMN = "lipophylic_toxins"
ID_COLUMNS = ["sampling station", "date"]

# Split into the labeled part (target present) and unlabeled part (target missing).
data_l = data[data[TARGET_COLUMN].notnull()]
data_ul = data[data[TARGET_COLUMN].isnull()]

# Remove rows with missing values.
data_l = data_l.dropna(how="any")
# Every unlabeled row has a null target by construction, so a plain
# dropna(how="any") would discard all of them and leave data_ul empty.
# Only require the non-target columns to be complete.
non_target_columns = [col for col in data_ul.columns if col != TARGET_COLUMN]
data_ul = data_ul.dropna(how="any", subset=non_target_columns)

# Keep a copy that still carries the station/date columns, so that rows
# selected by position later can be traced back to their origin.
data_l_full = data_l.copy()

# The identifier columns are not used as model features.
data_l = data_l.drop(columns=ID_COLUMNS)
data_ul = data_ul.drop(columns=ID_COLUMNS)

print("class distribution:")
print(data_l[TARGET_COLUMN].value_counts(dropna=False))

# Feature matrix and target vector for the labeled rows.
X = data_l.drop(TARGET_COLUMN, axis=1)
y = data_l[TARGET_COLUMN]

# sklearn label encoding: map the string classes to integers.
le = LabelEncoder()
y = le.fit_transform(y)
print(f"class encoding: ['neg','poz'] -> {le.transform(['neg','poz'])}")

class distribution:
neg    839
poz    114
Name: lipophylic_toxins, dtype: int64
class encoding: ['neg','poz'] -> [0 1]


In [5]:
# Search many shuffled stratified K-fold partitions for the single fold on
# which a random forest scores highest and the one on which it scores lowest,
# remembering the (train_indices, test_indices) pair of each.
nfolds = 5
nrepeats = 10000  # NOTE: nrepeats * nfolds forest fits -- expensive to re-run
score = 'recall'
seed = 0  # base seed; repeat i uses seed + i so the search is reproducible

# Start from +/-inf so the first repeat always records a split. With the
# previous sentinels (1 and 0) a run where every fold scored 0 (or 1) left the
# corresponding split as None and broke the export cell that follows.
worst_score = np.inf
worst_split = None
best_score = -np.inf
best_split = None

all_scores = []
for i in range(nrepeats):
    # A distinct but fixed seed per repeat gives a different partition each
    # time while keeping the whole run repeatable.
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed + i)
    # Materialise the splits so the indices of an extreme fold can be kept.
    train_test_indices = list(skf.split(X, y))
    model = RandomForestClassifier(random_state=seed + i)
    fold_scores = cross_val_score(model, X, y, scoring=score, cv=train_test_indices, n_jobs=-1)
    all_scores.append(fold_scores)
    imin = np.argmin(fold_scores)
    imax = np.argmax(fold_scores)

    if fold_scores[imin] < worst_score:
        worst_score = fold_scores[imin]
        worst_split = train_test_indices[imin]

    if fold_scores[imax] > best_score:
        best_score = fold_scores[imax]
        best_split = train_test_indices[imax]
print(best_score, worst_score)

0.6956521739130435 0.0


In [6]:
# Export the rows behind the worst- and best-scoring folds.
# Each split is a (train_indices, test_indices) pair of positional indices;
# data_l_full has the same rows as the feature matrix but still carries the
# "sampling station" and "date" columns.
worst_train_idx, worst_test_idx = worst_split[0], worst_split[1]
best_train_idx, best_test_idx = best_split[0], best_split[1]

worst_train_data = data_l_full.iloc[worst_train_idx]
worst_test_data = data_l_full.iloc[worst_test_idx]
best_train_data = data_l_full.iloc[best_train_idx]
best_test_data = data_l_full.iloc[best_test_idx]

# Write each subset to its own CSV file (index included, pandas default).
for _frame, _csv_path in (
    (worst_train_data, 'data/worst_train.csv'),
    (worst_test_data, 'data/worst_test.csv'),
    (best_train_data, 'data/best_train.csv'),
    (best_test_data, 'data/best_test.csv'),
):
    _frame.to_csv(_csv_path)