In [160]:
from datetime import datetime
import pandas as pd
import numpy as np
from dateutil.parser import parse
#pd.set_option('max_rows', 10)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Cilj raziskave
Namen modeliranja je napovedati preseženo vrednost toksinov v školjkah na podlagi abundance mikroalg v kombinaciji z okoljskimi parametri. (Pristojni se odločajo o zapori prodaje školjk na podlagi testov toksičnosti.)

# Data preprocessing

## Micro-algae dataset

In [140]:
# Load the micro-algae counts; "NA" and "?" cells are read as NaN (as are empty cells).
algae_csv_path = "data/Algae_Podatki_1994_onwards_24062021.csv"
df_alg = pd.read_csv(algae_csv_path, sep=";", na_values=["NA", "?"])

# Keep the sampling metadata, the toxin-producer groups (PSP, DSP, DSP_like, ASP)
# and the individual species of special interest.
algae_columns = [
    "date",
    "sampling station",
    "sampling depth",
    "sampling method",
    "PSP",
    "DSP",
    "DSP_like",
    "ASP",
    "Dinophysis caudata",
    "Dinophysis fortii",
    "Phalacroma rotundatum",
    "Dinophysis sacculus",
    "Dinophysis tripos",
]
df_alg = df_alg[algae_columns]
df_alg

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,...,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos
0,17/05/1994,35,0,Niskin,1206,...,27,0,21,0,0
1,17/05/1994,35,12,Niskin,31,...,13,0,3,0,0
2,17/05/1994,35,5,Niskin,0,...,38,0,3,0,0
3,17/05/1994,24,0,Niskin,4188,...,8,0,0,0,0
4,17/05/1994,24,12,Niskin,222,...,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1578,27/11/2020,35,integrated,hose sampler,0,...,0,0,0,0,0
1579,27/11/2020,0DB2,integrated,hose sampler,10,...,0,0,10,0,0
1580,15/12/2020,24,integrated,hose sampler,10,...,0,0,10,0,0
1581,15/12/2020,35,integrated,hose sampler,0,...,0,0,0,0,0


In [141]:
# Number of NaN cells in each column of the micro-algae dataframe
df_alg.isna().sum()

date                     0
sampling station         0
sampling depth           0
sampling method          0
PSP                      6
                        ..
Dinophysis caudata       1
Dinophysis fortii        1
Phalacroma rotundatum    1
Dinophysis sacculus      1
Dinophysis tripos        1
Length: 13, dtype: int64

## Toxins dataset

In [None]:
# Load the toxin test results.
# Markers meaning "no usable result" (test not available, not in the testing
# programme, ...) are read as NaN, in addition to empty cells.
toxin_na_markers = [
    "x",
    "? (no data)",
    "unreliable",
    "niso več delali analiz, ker nimajo školjk konzumne velikosti",
]
df_tox = pd.read_csv(
    "data/Toxins_Podatki_1994_onwards_24062021.csv",
    sep=";",
    na_values=toxin_na_markers,
).drop(columns=["data source", "comments"])  # free-text columns are not used
df_tox

In [None]:
# Normalise free-text result cells across the whole toxins dataframe.

# Below-detection-limit / borderline-negative wordings all become the class "neg".
negative_wordings = [
    "< MD",
    "< 3 mg DA/kg",
    "< 0,2 mg DA/kg",
    "0,2 mg DA/kg (neg)",
    "0.29 mg/kg",
    "neg (na meji)",
]
df_tox.replace(to_replace=negative_wordings, value="neg", inplace=True)

# Readings above the measuring range carry an estimate ("ocena") in the text;
# keep only that estimated number.
estimated_readings = {
    ">320, ocena 920": 920,
    ">320, ocena 1580": 1580,
    ">320, ocena 1880": 1880,
    ">320, ocena 336": 336,
    ">320, ocena 470": 470,
    ">240, ocena 510": 510,
}
for raw_text, estimate in estimated_readings.items():
    df_tox.replace(raw_text, estimate, inplace=True)

# TODO: the original note points at index 947, where yessotoxins == "0.29 mg/kg"
# (mapped to "neg" above). Open question: should the yessotoxins column be
# ignored altogether, since it holds only a handful of values?

In [None]:
# Map measured OA concentrations to the classes "poz" / "neg".
# 173 µg/kg is the legal limit concentration for DSP toxins that induces a
# shellfish harvesting ban; readings at or above it are "poz", below it "neg".
OA_COLUMN = "lipophylic toxins (OA (µg/kg))"
OA_LEGAL_LIMIT = 173  # µg/kg

# Numeric view of the column. Anything that is not a number (NaN, labels that
# are already "neg"/"poz", other free text) becomes NaN here and is therefore
# left untouched below, which also makes this cell safe to re-run.
oa_numeric = pd.to_numeric(df_tox[OA_COLUMN], errors="coerce")
is_measured = oa_numeric.notna()

# Use object dtype so string labels can be stored even if the column was read
# as purely numeric, then write through a single .loc call (no chained
# indexing, so the assignment is guaranteed to reach df_tox).
df_tox[OA_COLUMN] = df_tox[OA_COLUMN].astype(object)
df_tox.loc[is_measured, OA_COLUMN] = np.where(
    oa_numeric[is_measured] >= OA_LEGAL_LIMIT, "poz", "neg"
)

In [161]:
# Class balance of each toxin column.
#
# Disabled steps, kept for reference: joining the lipophylic toxins into one
# column by copying "lipophylic toxins (OA (µg/kg))"[930:] into
# "lipophylic toxins" ("lipophylic toxins (DTX2 (µg/kg))" has no positive
# values), then dropping the two source columns.
#df_tox["lipophylic toxins"][930:] = df_tox["lipophylic toxins (OA (µg/kg))"][930:]
#df_tox.drop(columns=["lipophylic toxins (OA (µg/kg))", "lipophylic toxins (DTX2 (µg/kg))"], inplace=True)

# Each bare expression is displayed because ast_node_interactivity is "all".
print("lipophylic toxins classes:")
df_tox["lipophylic toxins"].value_counts()

print("PSP toxins classes:")
df_tox["PSP toxins"].value_counts()

print("ASP toxins classes:")
df_tox["ASP toxins"].value_counts()

print("yessotoxins classes:")
df_tox["yessotoxins"].value_counts()

lipophylic toxins classes:


neg    1062
poz     152
Name: lipophylic toxins, dtype: int64

PSP toxins classes:


neg    465
Name: PSP toxins, dtype: int64

ASP toxins classes:


neg    591
Name: ASP toxins, dtype: int64

yessotoxins classes:


neg    4
poz    1
Name: yessotoxins, dtype: int64

In [118]:
# #Old code: Loop to map to neg/poz, discuss with Vid!
# df_tox_OA_cat = df_tox["lipophylic toxins (OA (µg/kg))"].applymap(
#     lambda x: "poz" if type(x) == int or type(x) == float and x >= 173 else("neg" if type(x) == int or type(x) == float and x < 173)
# )
# df_tox_OA_cat

In [167]:
# Limit dataframe previews to 10 rows. The fully qualified option name is
# required: the short form 'max_rows' also matches other option keys (e.g. the
# Styler's max_rows) and is rejected as ambiguous by recent pandas versions.
pd.set_option('display.max_rows', 10)
df_tox

Unnamed: 0,date,sampling station,lipophylic toxins,PSP toxins,ASP toxins,yessotoxins,ban start,ban stop
0,15/09/1989,0035,poz,,,,,
1,15/09/1989,0024,poz,,,,,
2,25/09/1989,0035,,,,,y,
3,25/09/1989,0024,,,,,y,
4,25/09/1989,0DB2,poz,,,,y,
...,...,...,...,...,...,...,...,...
1368,20/12/2019,0024,neg,,,,,
1369,20/12/2019,0DB2,neg,,,,,
1370,30/12/2019,0035,neg,,,,,
1371,30/12/2019,0024,neg,,,,,


In [165]:
# Summary statistics of the toxins dataframe (pandas' default column selection)
df_tox.describe()

Unnamed: 0,date,sampling station,lipophylic toxins,PSP toxins,ASP toxins,yessotoxins,ban start,ban stop
count,1373,1373,1214,465,591,5,83,83
unique,757,3,2,1,1,2,1,1
top,17/08/2010,24,neg,neg,neg,neg,y,y
freq,4,515,1062,465,591,4,83,83
