In [996]:
from datetime import datetime
import pandas as pd
import numpy as np
from dateutil.parser import parse
pd.set_option('max_rows', 15)
pd.set_option('max_columns', 15)
import datetime
from dateutil.parser import parse
import math

from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

### Research Context and Goal
The goal of the modelling at hand is to predict toxicity test values for the toxin type DSP in seashell based on the abundance of micro-algae in combination with available environmental data. Marine biologists are also interested to understand which species of micro-algae affects the accumulation of toxins in the seashells. 

The monitoring of abundance of micro-algae is carried out by the National Institute of Biology, Marine Biological Station Piran. The toxicity tests are performed by the National Veterinary Institute, Faculty of Veterinary Medicine (UL), which depending on the results of these tests administers bans on sales of shellfish. 

# Data preprocessing

## Micro-algae dataset

In [997]:
# Create micro-algae dataframe (empty cells -> NaN)
df_alg = pd.read_csv("data/Algae_Podatki_1994_onwards_24062021_Martin_1_0.csv", na_values = ["NA", "?"], sep=";")

# Keep only columns: PSP,  DSP,  Dinophysis species of special interest,  DSP_like,  ASP
df_alg = df_alg[["date", "sampling station", "sampling depth", "sampling method","PSP", "DSP", "DSP_like", "ASP", "Dinophysis caudata", "Dinophysis fortii", "Phalacroma rotundatum", "Dinophysis sacculus", "Dinophysis tripos"]]
#replace , with .
df_alg.replace(",", ".", regex=True, inplace=True)

# change str numbers to floats
df_alg.loc[:,"PSP":"Dinophysis tripos"] = df_alg.loc[:,"PSP":"Dinophysis tripos"].astype(float)

# create timestamp (Option to use parameter dayfirst=True, but seems unreliable)
df_alg["date"] = pd.to_datetime(df_alg["date"])

# sort by date attribute
df_alg = df_alg.sort_values('date')

# change str numbers to floats
df_alg.loc[:,"PSP":"Dinophysis tripos"] = df_alg.loc[:,"PSP":"Dinophysis tripos"].astype(float)

# pd.set_option('max_rows', 15)
df_alg

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0
1,1994-05-17,35,12,Niskin,31.0,16.0,,,13.0,0.0,3.0,0.0,0.0
2,1994-05-17,35,5,Niskin,0.0,41.0,,,38.0,0.0,3.0,0.0,0.0
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0
4,1994-05-17,24,12,Niskin,222.0,2.0,,,0.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1577,2020-11-27,24,integrated,hose sampler,10.0,10.0,0.0,13000.0,0.0,0.0,0.0,0.0,0.0
1579,2020-11-27,0DB2,integrated,hose sampler,10.0,10.0,0.0,5000.0,0.0,0.0,10.0,0.0,0.0
1581,2020-12-15,35,integrated,hose sampler,0.0,20.0,0.0,1800.0,0.0,0.0,0.0,0.0,0.0
1580,2020-12-15,24,integrated,hose sampler,10.0,20.0,0.0,1200.0,0.0,0.0,10.0,0.0,0.0


In [998]:
# Rescale values of samples with sampling method type "integrated phytoplankton net" by factor
phyto_net_factor = 100
idxs_net = df_alg[df_alg["sampling method"] == "phytoplankton net"].index
df_alg.loc[idxs_net, "PSP":"Dinophysis tripos"] = df_alg.loc[idxs_net, "PSP":"Dinophysis tripos"].loc[:,"PSP":"Dinophysis tripos"]*phyto_net_factor

# # Solution with loop
# df_alg_loop = df_alg.copy()
# for index, row in df_alg.iterrows():
#     if df_alg["sampling method"][index] == "phytoplankton net":
#         df_alg_loop.loc[index, "PSP":"Dinophysis tripos"] = df_alg.loc[index, "PSP":"Dinophysis tripos"]*100
# df_alg = df_alg_loop.copy()

In [999]:
# Resolve multiple samples from the same day and location with different sampling depth by keeping only those with the highest abundace value of priority microalgae type (DSP).
df_alg_depth = df_alg.copy()
df_alg_depth["org_index"] = df_alg_depth.index
df_max_DSP = df_alg_depth.groupby(["date", "sampling station"], as_index=False).agg({"DSP": lambda x: np.argmax(x), "org_index": lambda x: tuple(x)})
# df_max_DSP

# Use max value index of tuple consisting of original indexes to find the maxid (for DSP) of original dataset
maxid_list = []
for df_idx, group_idxs in df_max_DSP.org_index.iteritems():
    tuple_idx = df_max_DSP.loc[df_idx, "DSP"]
    maxid_list.append(group_idxs[tuple_idx]) 

df_alg = df_alg.iloc[maxid_list]

In [1000]:
# Count of missing values
df_alg.isnull().sum()

date                       0
sampling station           0
sampling depth             0
sampling method            0
PSP                        5
DSP                        1
DSP_like                  60
ASP                      344
Dinophysis caudata         1
Dinophysis fortii          1
Phalacroma rotundatum      1
Dinophysis sacculus        1
Dinophysis tripos          1
dtype: int64

In [1001]:
pd.set_option("max_rows", 15)
df_alg

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0
6,1994-06-06,0DB2,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0
7,1994-06-13,24,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0
8,1994-06-21,35,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,2020-11-27,35,integrated,hose sampler,0.0,0.0,0.0,10800.0,0.0,0.0,0.0,0.0,0.0
1577,2020-11-27,24,integrated,hose sampler,10.0,10.0,0.0,13000.0,0.0,0.0,0.0,0.0,0.0
1582,2020-12-15,0DB2,integrated,hose sampler,0.0,20.0,0.0,2600.0,0.0,0.0,20.0,0.0,0.0
1581,2020-12-15,35,integrated,hose sampler,0.0,20.0,0.0,1800.0,0.0,0.0,0.0,0.0,0.0


## Toxins dataset

In [1004]:
# Create toxins dataframe
# empty cells [test not available, not in the program of testing] -> NaN
df_tox = pd.read_csv("data/Toxins_Podatki_1994_onwards_24062021_Martin_1_0.csv", na_values=["x", "? (no data)", "unreliable", 
            "niso več delali analiz, ker nimajo školjk konzumne velikosti"], sep=";")  
df_tox = df_tox.drop(columns=["data source", "PSP toxins", "ASP toxins", "yessotoxins"])

# create timestamp
df_tox["date"] = pd.to_datetime(df_tox["date"])

# sort by date attribute
df_tox = df_tox.sort_values('date')

# set DateTime as index
df_tox.set_index('date', inplace=True)

pd.set_option("max_rows", 15)
df_tox

Unnamed: 0_level_0,sampling station,lipophylic toxins,lipophylic toxins (OA (µg/kg)),lipophylic toxins (DTX2 (µg/kg)),ban start,ban stop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1989-09-15,35,poz,,,,
1989-09-15,24,poz,,,,
1989-09-25,35,,,,y,
1989-09-25,24,,,,y,
1989-09-25,0DB2,poz,,,y,
...,...,...,...,...,...,...
2019-12-20,35,< MD,< MD,< MD,,
2019-12-20,0DB2,< MD,< MD,< MD,,
2019-12-30,24,< MD,< MD,< MD,,
2019-12-30,35,< MD,< MD,< MD,,


In [1005]:
# Adjustments to the dataset values ["< MD", "< 3 mg DA/kg", "< 0,2 mg DA/kg", "0,2 mg DA/kg (neg)", "0.29 mg/kg", "neg (na meji)"] -> neg
df_tox.replace(to_replace=["< MD", "< 3 mg DA/kg", "< 0,2 mg DA/kg", "0,2 mg DA/kg (neg)", "0.29 mg/kg", "neg (na meji)"], value="neg", inplace=True)
# ">320, ocena 920" -> 920 [estimated value], 
df_tox.replace(">320, ocena 920", 920, inplace=True)
#>320, ocena 1580" -> 1580
df_tox.replace(">320, ocena 1580", 1580, inplace=True)
#">320, ocena 1880" -> 1880
df_tox.replace(">320, ocena 1880", 1880, inplace=True)
#">320, ocena 336" -> 336
df_tox.replace(">320, ocena 336", 336, inplace=True)
# ">320, ocena 470" -> 470, 
df_tox.replace(">320, ocena 470", 470, inplace=True)
#">240, ocena 510" -> 510, 
df_tox.replace(">240, ocena 510", 510, inplace=True)

# at index 947: yessotoxins="0.29 mg/kg" [need to conver to poz / neg); Or should we ignore yesstoxinsv since only one 4 values?

In [1006]:
# Standardise station names
df_tox["sampling station"].replace("0035", 35, inplace=True)
df_tox["sampling station"].replace("0024", 24, inplace=True)

# Change station names to str
df_tox["sampling station"] = df_tox["sampling station"].astype(str)

In [947]:
# Mapping num values -> "poz" or "neg" based on treshold (173 µg/kg = legal limit concentration for DSP toxins, that induce shellfish harvesting ban)
numbers = [str(x) for x in range(10)]
for idx, value in df_tox["lipophylic toxins (OA (µg/kg))"].iteritems():
    if any(x in str(value) for x in numbers):
        if int(value) >= 173:
            #print(f"old value: {value}")
            value = "poz"
            #print(f"new value: {value}")
            df_tox["lipophylic toxins (OA (µg/kg))"][idx] = value
        else:
            #print(f"old value: {value}")
            value = "neg"
            #print(f"new value: {value}")
            df_tox["lipophylic toxins (OA (µg/kg))"][idx] = value

In [948]:
## NExt time make a loop and make new colum and chek if poz or neg ...
# Joining lipophylic toxins into one column.
# Just copying values from "lipophylic toxins (OA (µg/kg))"[930:] to "lipophylic toxins" and keep the latter as "lipophylic toxins (DTX2 (µg/kg))" has no positive values.
df_tox["lipophylic toxins"][930:] = df_tox["lipophylic toxins (OA (µg/kg))"][930:]

# Remove columns "lipophylic toxins (OA (µg/kg))" and "lipophylic toxins (DTX2 (µg/kg))" from dataframe
df_tox.drop(columns=["lipophylic toxins (OA (µg/kg))", "lipophylic toxins (DTX2 (µg/kg))"], inplace=True)
print(f"lipophylic toxins classes:") 
df_tox["lipophylic toxins"].value_counts(dropna=False)



lipophylic toxins classes:


neg    1066
NaN     159
poz     148
Name: lipophylic toxins, dtype: int64

In [949]:
# #Old code: Loop to map to neg/poz, discuss with Vid!
# df_tox_OA_cat = df_tox["lipophylic toxins (OA (µg/kg))"].applymap(
#     lambda x: "poz" if type(x) == int or type(x) == float and x >= 173 else("neg" if type(x) == int or type(x) == float and x < 173)
# )
# df_tox_OA_cat

In [950]:
pd.set_option('max_rows', 15)
df_tox

Unnamed: 0_level_0,sampling station,lipophylic toxins,ban start,ban stop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1989-09-15,35,poz,,
1989-09-15,24,poz,,
1989-09-25,35,,y,
1989-09-25,24,,y,
1989-09-25,0DB2,poz,y,
...,...,...,...,...
2019-12-20,35,neg,,
2019-12-20,0DB2,neg,,
2019-12-30,24,neg,,
2019-12-30,35,neg,,


In [951]:
df_tox.describe()

Unnamed: 0,sampling station,lipophylic toxins,ban start,ban stop
count,1373,1214,83,83
unique,3,2,1,1
top,24,neg,y,y
freq,515,1066,83,83


In [952]:
# Show Missing values
df_tox.isnull().sum()

sampling station        0
lipophylic toxins     159
ban start            1290
ban stop             1290
dtype: int64

# Consolidating datasets 

Divide algae dataset by locations 35, 24 and 0DB2.

In [953]:
# df_alg for station 35
df_alg_35 = df_alg[df_alg["sampling station"] == "35"].copy()
# df for station 24
df_alg_24 = df_alg[df_alg["sampling station"] == "24"].copy()
# df for station 0DB2
df_alg_0DB2 = df_alg[df_alg["sampling station"] == "0DB2"].copy()

Divide toxin dataset by locations 35, 24 and 0DB2.

In [954]:
# df_tox for station 35
df_tox_35 = df_tox[df_tox["sampling station"] == "35"].copy()
# df_tox for station 24
df_tox_24 = df_tox[df_tox["sampling station"] == "24"].copy()
# df_tox for station 0DB2
df_tox_0DB2 = df_tox[df_tox["sampling station"] == "0DB2"].copy()

Prepare df for consolidated datasets by locations 35, 24 and 0DB2.

In [955]:
# df_cons for station 35
df_cons_35 = df_alg_35.copy()
df_cons_35["lipophylic_toxins"] = np.nan

# df_cons for station 24
df_cons_24 = df_alg_24.copy()
df_cons_24["lipophylic_toxins"] = np.nan

# df_cons for station 0DB2 
df_cons_0DB2 = df_alg_0DB2.copy()
df_cons_0DB2["lipophylic_toxins"] = np.nan

#### Consolidation of location

In [956]:
# Loop to add appropriate toxin test results to algae sample; add first toxin test after timestamp of an algae sample but no older tests then 30 days.
max_range = pd.Timedelta(30, unit="day")
min_range = pd.Timedelta(0, unit="day")

df_cons_list = [df_cons_35, df_cons_24, df_cons_0DB2]
df_tox_list = [df_tox_35, df_tox_24, df_tox_0DB2]


for _ in range(3):           
    matches = 0
    # Iterate over consolidated df and select a date of the algae sample
    for id_alg, date_alg in df_cons_list[_]["date"].iteritems():
        # Iterate over toxin df and select the date of test 
        for date_tox in df_tox_list[_].index:
            # Calculate the timespan between algae sample and toxin test 
            timespan = date_tox - date_alg
            # If timespan within 30 days
            if min_range <= timespan <= max_range:
                # The toxin result on first acceptable toxin test day (date_tox) 
                tox_result = df_tox_list[_].loc[date_tox, "lipophylic toxins"]
                # Use only non-NaN tox_result (neg/pos)
                if type(tox_result) == str:
                    df_cons_list[_].loc[id_alg, "lipophylic_toxins"] = tox_result
                    matches += 1
                    break # problem: a tox_result should be used multiple times as long as it fits the time_frame (I think they do!) ampak
                    # problem je določat katero vrednost vzame kadar je več možnih (sedaj vzame najbližjo datumu).
    print(f"Found matching test dates for {_}: {matches}")

Found matching test dates for 0: 397
Found matching test dates for 1: 391
Found matching test dates for 2: 286


In [987]:
# # ALternative loop prefering poz values (-5, *5 days), Not done yet!
# # Loop to add appropriate toxin test results to algae sample; add first toxin test after timestamp of an algae sample but no older tests then 30 days.
# max_range = pd.Timedelta(30, unit="day")
# min_range = pd.Timedelta(0, unit="day")

# df_cons_list = [df_cons_35, df_cons_24, df_cons_0DB2]
# df_tox_list = [df_tox_35, df_tox_24, df_tox_0DB2]

# for _ in range(3):           
#     matches = 0
#     # Iterate over consolidated df and select a date of the algae sample
#     for id_alg, date_alg in df_cons_list[_]["date"].iteritems():
#         # Iterate over toxin df and select the date of test 
#         for date_tox in df_tox_list[_].index:
#             # Calculate the timespan between algae sample and toxin test 
#             timespan = date_tox - date_alg
#             # If timespan within 30 days
#             if min_range <= timespan <= max_range:
#                 # The toxin result on first acceptable toxin test day (date_tox) 
#                 tox_result = df_tox_list[_].loc[date_tox, "lipophylic toxins"]
#                 # Use only non-NaN tox_result (neg/pos)
#                 if tox_result == "poz":
#                     df_cons_list[_].loc[id_alg, "lipophylic_toxins"] = tox_result
#                     matches += 1
#                     break 
#                 elif tox_result == "neg":
                    
#                     # problem: a tox_result should be used multiple times as long as it fits the time_frame (I think they do!) ampak
#                     # problem je določat katero vrednost vzame kadar je več možnih (sedaj vzame najbližjo datumu).
#     print(f"Found matching test dates for {_}: {matches}")

IndentationError: expected an indented block (4072970307.py, line 30)

#### Consolidate datasets

In [873]:
# Concatenated dataframe from each location 
df_cons = pd.concat([df_cons_35, df_cons_24, df_cons_0DB2])

# sort by date attribute
df_cons = df_cons.sort_index()

pd.set_option("max_rows", 15)
df_cons

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos,lipophylic_toxins
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0,
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0,
6,1994-06-06,0DB2,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0,
7,1994-06-13,24,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0,
8,1994-06-21,35,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,2020-11-27,35,integrated,hose sampler,0.0,0.0,0.0,10800.0,0.0,0.0,0.0,0.0,0.0,
1579,2020-11-27,0DB2,integrated,hose sampler,10.0,10.0,0.0,5000.0,0.0,0.0,10.0,0.0,0.0,
1580,2020-12-15,24,integrated,hose sampler,10.0,20.0,0.0,1200.0,0.0,0.0,10.0,0.0,0.0,
1581,2020-12-15,35,integrated,hose sampler,0.0,20.0,0.0,1800.0,0.0,0.0,0.0,0.0,0.0,


In [874]:
# slice df to end of 2019 for when toxicity tests are available
df_cons.drop(list(range(1523, 1583)), inplace=True)
df_cons

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos,lipophylic_toxins
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0,
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0,
6,1994-06-06,0DB2,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0,
7,1994-06-13,24,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0,
8,1994-06-21,35,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,2019-12-16,35,integrated,hose sampler,0.0,0.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,neg
1519,2019-12-16,0DB2,integrated,hose sampler,0.0,20.0,0.0,2500.0,0.0,10.0,0.0,10.0,0.0,neg
1520,2020-01-15,24,integrated,hose sampler,0.0,10.0,0.0,1000.0,0.0,0.0,10.0,0.0,0.0,
1521,2020-01-15,35,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,


In [877]:
df_cons.lipophylic_toxins.value_counts(dropna=False)

neg    948
NaN    249
poz    126
Name: lipophylic_toxins, dtype: int64

In [1019]:
pd.set_option("max_rows", 15)
df_cons

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos,lipophylic_toxins
0,1994-05-17,Seca,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0,
3,1994-05-17,Strunjan,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0,
6,1994-06-06,Debeli_rtic,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0,
7,1994-06-13,Strunjan,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0,
8,1994-06-21,Seca,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518,2019-12-16,Seca,integrated,hose sampler,0.0,0.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,neg
1519,2019-12-16,Debeli_rtic,integrated,hose sampler,0.0,20.0,0.0,2500.0,0.0,10.0,0.0,10.0,0.0,neg
1520,2020-01-15,Strunjan,integrated,hose sampler,0.0,10.0,0.0,1000.0,0.0,0.0,10.0,0.0,0.0,
1521,2020-01-15,Seca,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,


# Prepare exports for modelling

In [1014]:
# Change sampling station names
df_cons["sampling station"].replace("35", "Seca", inplace=True)
df_cons["sampling station"].replace("24", "Strunjan", inplace=True)
df_cons["sampling station"].replace("0DB2", "Debeli_rtic", inplace=True)

In [1017]:
# Export csv
df_cons.to_csv("HAB_1_0.csv", sep=",", na_rep="?", index=False)

In [1028]:
# For covnerting sampling depth to nominal in Weka
dict(df_cons["sampling depth"].value_counts()).keys()

dict_keys(['integrated', '0', '10', '2.5', '0.5', '6', '1', '14', '12'])

![J48](tree.J48-C0.25-M4-10F.png)