In [62]:
from datetime import datetime
import pandas as pd
import numpy as np
from dateutil.parser import parse
pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", 30)
import datetime
from dateutil.parser import parse
import math

from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

### Research Context and Goal
The goal of the modelling at hand is to predict toxicity test values for the toxin type DSP in seashell based on the abundance of micro-algae in combination with available environmental data. Marine biologists are also interested to understand which species of micro-algae affects the accumulation of toxins in the seashells. 

The monitoring of abundance of micro-algae is carried out by the National Institute of Biology, Marine Biological Station Piran. The toxicity tests are performed by the National Veterinary Institute, Faculty of Veterinary Medicine (UL), which depending on the results of these tests administers bans on sales of shellfish. 

# Data preprocessing

## Micro-algae dataset

In [2]:
# Create micro-algae dataframe (empty cells -> NaN)
df_alg = pd.read_csv("data/Algae_Podatki_1994_onwards_24062021_Martin_1_0.csv", na_values = ["NA", "?"], sep=";")

# Keep only columns: PSP,  DSP,  Dinophysis species of special interest,  DSP_like,  ASP
df_alg = df_alg[["date", "sampling station", "sampling depth", "sampling method","PSP", "DSP", 
                 "DSP_like", "ASP", "Dinophysis caudata", "Dinophysis fortii", "Phalacroma rotundatum", 
                 "Dinophysis sacculus", "Dinophysis tripos"]]
#replace , with .
df_alg.replace(",", ".", regex=True, inplace=True)

# create timestamp (Option to use parameter dayfirst=True, but seems unreliable)
df_alg["date"] = pd.to_datetime(df_alg["date"])

# sort by date attribute
df_alg = df_alg.sort_values('date')

# change str numbers to floats
df_alg.loc[:,"PSP":"Dinophysis tripos"] = df_alg.loc[:,"PSP":"Dinophysis tripos"].astype(float)

df_alg

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0
1,1994-05-17,35,12,Niskin,31.0,16.0,,,13.0,0.0,3.0,0.0,0.0
2,1994-05-17,35,5,Niskin,0.0,41.0,,,38.0,0.0,3.0,0.0,0.0
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0
4,1994-05-17,24,12,Niskin,222.0,2.0,,,0.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1577,2020-11-27,24,integrated,hose sampler,10.0,10.0,0.0,13000.0,0.0,0.0,0.0,0.0,0.0
1579,2020-11-27,0DB2,integrated,hose sampler,10.0,10.0,0.0,5000.0,0.0,0.0,10.0,0.0,0.0
1581,2020-12-15,35,integrated,hose sampler,0.0,20.0,0.0,1800.0,0.0,0.0,0.0,0.0,0.0
1580,2020-12-15,24,integrated,hose sampler,10.0,20.0,0.0,1200.0,0.0,0.0,10.0,0.0,0.0


In [3]:
# Rescale values of samples with sampling method type "integrated phytoplankton net" by factor
phyto_net_factor = 100
idxs_net = df_alg[df_alg["sampling method"] == "phytoplankton net"].index
df_alg.loc[idxs_net, "PSP":"Dinophysis tripos"] = df_alg.loc[idxs_net, "PSP":"Dinophysis tripos"]*phyto_net_factor

In [4]:
# Resolve multiple samples from the same day and location with different sampling depth by keeping only 
# those with the highest abundace value of priority microalgae type (DSP).
df_alg_depth = df_alg.copy()
df_alg_depth["org_index"] = df_alg_depth.index
df_max_DSP = df_alg_depth.groupby(["date", "sampling station"], 
                                  as_index=False).agg({"DSP": lambda x: np.argmax(x), "org_index": lambda x: tuple(x)})
# df_max_DSP

# Use max value index of tuple consisting of original indexes to find the maxid (for DSP) of original dataset
maxid_list = []
for df_idx, group_idxs in df_max_DSP.org_index.iteritems():
    tuple_idx = df_max_DSP.loc[df_idx, "DSP"]
    maxid_list.append(group_idxs[tuple_idx]) 

df_alg = df_alg.iloc[maxid_list]

In [5]:
# Count of missing values
df_alg.isnull().sum()

date                       0
sampling station           0
sampling depth             0
sampling method            0
PSP                        5
DSP                        1
DSP_like                  60
ASP                      344
Dinophysis caudata         1
Dinophysis fortii          1
Phalacroma rotundatum      1
Dinophysis sacculus        1
Dinophysis tripos          1
dtype: int64

## Toxins dataset

In [6]:
# Create toxins dataframe
# empty cells [test not available, not in the program of testing] -> NaN
df_tox = pd.read_csv("data/Toxins_Podatki_1994_onwards_24062021_Martin_1_0.csv", 
                     na_values=["x", "? (no data)", "unreliable", 
                                "niso več delali analiz, ker nimajo školjk konzumne velikosti"], sep=";")  
df_tox = df_tox.drop(columns=["data source", "PSP toxins", "ASP toxins", "yessotoxins"])

# # create timestamp
df_tox["date"] = pd.to_datetime(df_tox["date"], infer_datetime_format=True)

# # sort by date attribute
df_tox = df_tox.sort_values('date')

# # set DateTime as index
df_tox.set_index('date', inplace=True)

# df_tox

In [7]:
# Adjustments to the dataset values ["< MD", "< 3 mg DA/kg", "< 0,2 mg DA/kg", 
# "0,2 mg DA/kg (neg)", "0.29 mg/kg", "neg (na meji)"] -> neg
df_tox.replace(to_replace=["< MD", "< 3 mg DA/kg", "< 0,2 mg DA/kg", "0,2 mg DA/kg (neg)", 
                           "0.29 mg/kg", "neg (na meji)"], value="neg", inplace=True)
# ">320, ocena 920" -> 920 [estimated value], 
df_tox.replace(">320, ocena 920", 920, inplace=True)
#>320, ocena 1580" -> 1580
df_tox.replace(">320, ocena 1580", 1580, inplace=True)
#">320, ocena 1880" -> 1880
df_tox.replace(">320, ocena 1880", 1880, inplace=True)
#">320, ocena 336" -> 336
df_tox.replace(">320, ocena 336", 336, inplace=True)
# ">320, ocena 470" -> 470, 
df_tox.replace(">320, ocena 470", 470, inplace=True)
#">240, ocena 510" -> 510, 
df_tox.replace(">240, ocena 510", 510, inplace=True)

# at index 947: yessotoxins="0.29 mg/kg" [need to convert to poz / neg); 
# Or should we ignore yesstoxins since only one 4 values?

In [8]:
# Standardise station names
df_tox["sampling station"].replace("0035", 35, inplace=True)
df_tox["sampling station"].replace("0024", 24, inplace=True)

# Change station names to str
df_tox["sampling station"] = df_tox["sampling station"].astype(str)

In [9]:
# Mapping num values -> "poz" or "neg" based on treshold (173 µg/kg = legal limit concentration for DSP toxins, 
# that induce shellfish harvesting ban)
numbers = [str(x) for x in range(10)]
for idx, value in df_tox["lipophylic toxins (OA (µg/kg))"].iteritems():
    if any(x in str(value) for x in numbers):
        if int(value) >= 173:
            #print(f"old value: {value}")
            value = "poz"
            #print(f"new value: {value}")
            df_tox["lipophylic toxins (OA (µg/kg))"][idx] = value
        else:
            #print(f"old value: {value}")
            value = "neg"
            #print(f"new value: {value}")
            df_tox["lipophylic toxins (OA (µg/kg))"][idx] = value

In [10]:
# Joining lipophylic toxins into one column.
# Just copying values from "lipophylic toxins (OA (µg/kg))"[930:] to "lipophylic toxins" and 
# keep the latter as "lipophylic toxins (DTX2 (µg/kg))" has no positive values.
df_tox["lipophylic toxins"][930:] = df_tox["lipophylic toxins (OA (µg/kg))"][930:]

# Remove columns "lipophylic toxins (OA (µg/kg))" and "lipophylic toxins (DTX2 (µg/kg))" from dataframe
df_tox.drop(columns=["lipophylic toxins (OA (µg/kg))", "lipophylic toxins (DTX2 (µg/kg))"], inplace=True)
print(f"lipophylic toxins classes:") 
df_tox["lipophylic toxins"].value_counts(dropna=False)



lipophylic toxins classes:


neg    1066
NaN     159
poz     148
Name: lipophylic toxins, dtype: int64

In [11]:
# Print out toxins dataset
pd.set_option('display.max_rows', 15)
df_tox

Unnamed: 0_level_0,sampling station,lipophylic toxins,ban start,ban stop
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1989-09-15,35,poz,,
1989-09-15,24,poz,,
1989-09-25,35,,y,
1989-09-25,24,,y,
1989-09-25,0DB2,poz,y,
...,...,...,...,...
2019-12-20,35,neg,,
2019-12-20,0DB2,neg,,
2019-12-30,24,neg,,
2019-12-30,35,neg,,


In [12]:
# Descriptive statistics of toxins dataset
df_tox.describe()

Unnamed: 0,sampling station,lipophylic toxins,ban start,ban stop
count,1373,1214,83,83
unique,3,2,1,1
top,24,neg,y,y
freq,515,1066,83,83


In [13]:
# Show Missing values
df_tox.isnull().sum()

sampling station        0
lipophylic toxins     159
ban start            1290
ban stop             1290
dtype: int64

# Consolidating datasets 

In order to learn a model that can make predictions about the toxicity tests' results from the algae abundance we need to alocate to the algae abundace instances a matching toxicity test, while the datasets have different temporal resolution and frequency of sampling. We will do this separately for each of the three locations.

Divide algae dataset by locations 35, 24 and 0DB2.

In [14]:
# df_alg for station 35
df_alg_35 = df_alg[df_alg["sampling station"] == "35"].copy()
# df for station 24
df_alg_24 = df_alg[df_alg["sampling station"] == "24"].copy()
# df for station 0DB2
df_alg_0DB2 = df_alg[df_alg["sampling station"] == "0DB2"].copy()

Divide toxin dataset by locations 35, 24 and 0DB2.

In [15]:
# df_tox for station 35
df_tox_35 = df_tox[df_tox["sampling station"] == "35"].copy()
# df_tox for station 24
df_tox_24 = df_tox[df_tox["sampling station"] == "24"].copy()
# df_tox for station 0DB2
df_tox_0DB2 = df_tox[df_tox["sampling station"] == "0DB2"].copy()

Prepare new dataframes for consolidated datasets by locations 35, 24 and 0DB2.

In [16]:
# df_cons for station 35
df_cons_35 = df_alg_35.copy()
df_cons_35["lipophylic_toxins"] = np.nan

# df_cons for station 24
df_cons_24 = df_alg_24.copy()
df_cons_24["lipophylic_toxins"] = np.nan

# df_cons for station 0DB2 
df_cons_0DB2 = df_alg_0DB2.copy()
df_cons_0DB2["lipophylic_toxins"] = np.nan

#### Consolidation of abundance and toxin data

In [17]:
# Loop to add appropriate toxin test results to algae samples; add first toxin test 
# after timestamp of an algae sample but no older tests then 30 days.
max_range = pd.Timedelta(30, unit="day")
min_range = pd.Timedelta(0, unit="day")

df_cons_list = [df_cons_35, df_cons_24, df_cons_0DB2]
df_tox_list = [df_tox_35, df_tox_24, df_tox_0DB2]


for _ in range(3):           
    matches = 0
    # Iterate over consolidated df and select a date of the algae sample
    for id_alg, date_alg in df_cons_list[_]["date"].iteritems():
        # Iterate over toxin df and select the date of test 
        for date_tox in df_tox_list[_].index:
            # Calculate the timespan between algae sample and toxin test 
            timespan = date_tox - date_alg
            # If timespan within 30 days
            if min_range <= timespan <= max_range:
                # The toxin result on first acceptable toxin test day (date_tox) 
                tox_result = df_tox_list[_].loc[date_tox, "lipophylic toxins"]
                # Use only non-NaN tox_result (neg/pos)
                if type(tox_result) == str:
                    df_cons_list[_].loc[id_alg, "lipophylic_toxins"] = tox_result
                    matches += 1
                    break # problem: a tox_result should be used multiple times as long as 
                    # it fits the time_frame (I think they do!) ampak
                    # problem je določat katero vrednost vzame kadar je več možnih (sedaj vzame najbližjo datumu).
    print(f"Found matching test dates for {_}: {matches}")

Found matching test dates for 0: 397
Found matching test dates for 1: 391
Found matching test dates for 2: 286


#### Consolidate datasets from each location

In [18]:
# Concatenated dataframe from each location to get final consolidated dataset
df_cons = pd.concat([df_cons_35, df_cons_24, df_cons_0DB2])

# sort by date attribute
df_cons = df_cons.sort_values('date')

# slice df to end of 2019 for when toxicity tests are available
df_cons.drop(list(range(1523, 1583)), inplace=True)
df_cons

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos,lipophylic_toxins
0,1994-05-17,35,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0,
3,1994-05-17,24,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0,
6,1994-06-06,0DB2,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0,
7,1994-06-13,24,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0,
8,1994-06-21,35,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,2019-12-16,0DB2,integrated,hose sampler,0.0,20.0,0.0,2500.0,0.0,10.0,0.0,10.0,0.0,neg
1517,2019-12-16,24,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,neg
1522,2020-01-15,0DB2,integrated,hose sampler,20.0,40.0,0.0,200.0,10.0,20.0,10.0,0.0,0.0,
1521,2020-01-15,35,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,


In [19]:
# Check for unique values and missing values in the final consolidated dataset.
df_cons.lipophylic_toxins.value_counts(dropna=False)

neg    948
NaN    249
poz    126
Name: lipophylic_toxins, dtype: int64

# Prepare exports for modelling

In [20]:
# Change sampling station names
df_cons["sampling station"].replace("35", "Seca", inplace=True)
df_cons["sampling station"].replace("24", "Strunjan", inplace=True)
df_cons["sampling station"].replace("0DB2", "Debeli_rtic", inplace=True)

In [21]:
# Export csv
df_cons.to_csv("data/HAB_bio-tox_0.csv", sep=",", na_rep="?", index=False)

In [22]:
# Create pickle
df_cons.to_pickle("objects/df_cons")

# Adding environmental and chemical data

## River inflow dataset

In [23]:
import openpyxl

# Create Soca dataframe
df_soca = pd.read_excel("data/ORIGINAL/okoljski/Pretok Soča 1985-2020.xlsx", usecols=["Year_month", "Soca"])  
df_soca["Year_month"] = pd.to_datetime(df_soca["Year_month"], format="%Y%m")
df_soca = df_soca.set_index("Year_month")
df_soca = df_soca.sort_values('Year_month')
df_soca = df_soca.to_period()
# df_soca["Year"] = df_soca["Year_month"].dt.year
# df_soca["Day_of_the_Year"] = df_soca["Year_month"].dt.dayofyear

df_soca

Unnamed: 0_level_0,Soca
Year_month,Unnamed: 1_level_1
1985-01,203.988065
1985-02,107.671071
1985-03,105.638710
1985-04,163.196667
1985-05,188.430645
...,...
2020-08,59.099935
2020-09,116.928467
2020-10,219.806032
2020-11,58.129567


In [24]:
# Create Po dataframe
df_po = pd.read_excel("data/ORIGINAL/okoljski/Pretok_Pad_1985-2020.xlsx", usecols=["Year_month", "Po"])  
df_po["Year_month"] = pd.to_datetime(df_po["Year_month"], format="%Y%m")
df_po = df_po.set_index("Year_month")
df_po = df_po.to_period()
df_po = df_po.sort_values('Year_month')
# df_po["Year"] = df_po["Year_month"].dt.year
df_po

Unnamed: 0_level_0,Po
Year_month,Unnamed: 1_level_1
1985-01,1345.387097
1985-02,1613.214286
1985-03,2799.032258
1985-04,1630.666667
1985-05,2705.419355
...,...
2020-08,753.224194
2020-09,1201.106333
2020-10,2358.285806
2020-11,1218.319000


## Sun radiation dataset

In [25]:
# Global sun radiation in kWh/m3
df_sun_kWh = pd.read_excel("data/ORIGINAL/okoljski/Soncno sevanje 2000-2016.xlsx", header=3)

## ARSO dataset

In [26]:
df_arso = pd.read_csv("data/ORIGINAL/ARSO/ARSO 1993-2022.txt") 
df_arso = df_arso.loc[:, " valid":].copy()
df_arso.rename(columns = {" valid":"date", "trajanje sonca [h]":"sun [h]"}, inplace = True)
df_arso["date"] = pd.to_datetime(df_arso["date"])
df_arso = df_arso.set_index("date")
df_arso = df_arso.sort_values('date')
df_arso

Unnamed: 0_level_0,povp. dnevna T [°C],povp. veter [m/s],povp. rel. vla. [%],povp. tlak [hPa],količina padavin [mm],sun [h]
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-01-01,0.7,6.3,43,1026,0.0,6.6
1993-01-02,-3.4,6.6,46,1027,0.0,4.0
1993-01-03,-1.7,6.5,47,1025,0.0,1.6
1993-01-04,-1.0,3.6,46,1031,0.0,6.4
1993-01-05,-1.4,2.8,59,1036,0.0,7.2
...,...,...,...,...,...,...
2022-06-26,25.3,2.7,57,1015,0.0,14.3
2022-06-27,26.8,2.1,54,1014,0.0,13.6
2022-06-28,27.4,3.2,53,1013,0.0,8.0
2022-06-29,25.7,2.7,66,1010,0.5,11.2


### Adding sun radiation to dataset

In [27]:
# Loop to add appropriate summed sun radiation period before timestamp of an algae sample in the consolidated dataset.
df_cons2 = df_cons.copy()
df_cons2["sun [h]"] = np.nan

max_range = pd.Timedelta(20, unit="day")
min_range = pd.Timedelta(0, unit="day")

# Iterate over consolidated df and select a date of the algae sample
for id_alg, date_alg in df_cons["date"].iteritems():
    # Iterate over ARSO df and selecet and instances in period and sum radiation values
    rad_values = []
    for date_sun in df_arso.index:
        # Calculate the timespan between algae sample and sun radiation
        timespan = date_alg - date_sun
        # If timespan within radiation period
        if min_range <= timespan <= max_range:
            # Append value to rad_values list
            rad_values.append(df_arso.loc[date_sun, "sun [h]"])
        # Add sum of rad values to df_cons once all period values are found
        if len(rad_values) > 20:
            sum_sun = sum(rad_values)
            df_cons2.loc[id_alg, "sun [h]"] = sum_sun
            break

## Seawater (chemical) dataset

In [40]:
pd.set_option("display.max_rows", 20)


In [41]:
df_sea = pd.read_excel("data/ORIGINAL/kemijski/Fi_Ke_1994-2019_checked.xlsx")
# filter out our three stations and choose 0 depth for ODB2 (but many NaN, so decide if consider all locations!)
df_sea = df_sea[((df_sea["station_id"] == "0DB2") | (df_sea["station_id"] == "0035") | (df_sea["station_id"] == "0024")) & (df_sea["depth"] == 0)]

df_sea["datetime"] = pd.to_datetime(df_sea["datetime"])
df_sea = df_sea.set_index("datetime")
df_sea = df_sea.sort_values('datetime')
df_sea

Unnamed: 0_level_0,station_id,depth,Chl-a,salinity,T,SECCHI,NH4-N,...,SiO3-Si,Org-N,TOT-N,TOT-P,O2,O2_sat,pH
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1997-04-23 02:00:00,0024,0,1.58,36.00,11.60,,2.05,...,,28.000000,34.889999,0.64,,,
1997-04-23 02:00:00,0035,0,0.92,35.75,11.60,,2.67,...,,23.370001,28.700001,0.36,,,
1997-05-14 02:00:00,0024,0,0.47,35.25,16.30,,2.00,...,,20.309999,27.080000,0.31,,,
1997-05-14 02:00:00,0035,0,0.65,35.25,16.60,,1.77,...,,15.500000,21.110001,0.27,,,
1997-06-12 02:00:00,0024,0,1.34,33.75,23.20,,3.94,...,,3.050000,14.190000,0.26,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-16 10:55:00,0DB2,0,0.43,37.50,19.90,8.0,0.64,...,2.03,,,,208.970591,92.080002,8.19
2019-11-21 08:25:00,0035,0,,36.69,16.71,4.0,,...,,,,,212.577029,87.389999,8.20
2019-11-21 10:00:00,0024,0,,36.72,16.41,4.0,,...,,,,,215.875342,88.519997,8.20
2019-11-21 11:55:00,0DB2,0,0.23,35.84,16.05,3.5,1.26,...,6.99,,,,220.518203,89.309998,8.20


In [30]:
pd.set_option("display.max_rows", 20)
df_sea.isnull().sum()

datetime         0
station_id       0
depth            0
Chl-a          895
salinity        14
T              115
SECCHI        1779
NH4-N          529
NO2-N          499
NO3-N          496
PO4-P          514
SiO3-Si        610
Org-N         1796
TOT-N         1146
TOT-P         1146
O2             182
O2_sat         768
pH             386
dtype: int64

In [31]:
df_sea.columns

Index(['datetime', 'station_id', 'depth', 'Chl-a', 'salinity', 'T', 'SECCHI',
       'NH4-N', 'NO2-N', 'NO3-N', 'PO4-P', 'SiO3-Si', 'Org-N', 'TOT-N',
       'TOT-P', 'O2', 'O2_sat', 'pH'],
      dtype='object')

### Adding chemical data to dataset

In [63]:
# Loop to add appropriate summed sun radiation period before timestamp of an algae sample in the consolidated dataset.
df_cons3 = df_cons2.copy()
df_cons3['Chl-a'], df_cons3['salinity'], df_cons3['T'], df_cons3['SECCHI'], df_cons3['NH4-N'], df_cons3['NO2-N'], df_cons3['NO3-N'], df_cons3['PO4-P'], df_cons3['SiO3-Si'], df_cons3['Org-N'], df_cons3['TOT-N'], df_cons3['TOT-P'], df_cons3['O2'], df_cons3['O2_sat'], df_cons3['pH'] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]


max_range = pd.Timedelta(20, unit="day")
min_range = pd.Timedelta(0, unit="day")

#NEED TO ADDAPT:
# # Iterate over consolidated df and select a date of the algae sample
# for id_alg, date_alg in df_cons["date"].iteritems():
#     # Iterate over ARSO df and selecet and instances in period and sum radiation values
#     rad_values = []
#     for date_sun in df_arso.index:
#         # Calculate the timespan between algae sample and sun radiation
#         timespan = date_alg - date_sun
#         # If timespan within radiation period
#         if min_range <= timespan <= max_range:
#             # Append value to rad_values list
#             rad_values.append(df_arso.loc[date_sun, "sun [h]"])
#         # Add sum of rad values to df_cons once all period values are found
#         if len(rad_values) > 20:
#             sum_sun = sum(rad_values)
#             df_cons2.loc[id_alg, "sun [h]"] = sum_sun
#             break

In [64]:
pd.set_option("display.max_rows", 15)
df_cons3

Unnamed: 0,date,sampling station,sampling depth,sampling method,PSP,DSP,DSP_like,ASP,Dinophysis caudata,Dinophysis fortii,Phalacroma rotundatum,Dinophysis sacculus,Dinophysis tripos,lipophylic_toxins,sun [h],Chl-a,salinity,T,SECCHI,NH4-N,NO2-N,NO3-N,PO4-P,SiO3-Si,Org-N,TOT-N,TOT-P,O2,O2_sat,pH
0,1994-05-17,Seca,0,Niskin,1206.0,68.0,,,27.0,0.0,21.0,0.0,0.0,,197.1,,,,,,,,,,,,,,,
3,1994-05-17,Strunjan,0,Niskin,4188.0,17.0,,,8.0,0.0,0.0,0.0,0.0,,197.1,,,,,,,,,,,,,,,
6,1994-06-06,Debeli_rtic,0,bucket,0.0,27.0,,,16.0,3.0,5.0,0.0,0.0,,166.3,,,,,,,,,,,,,,,
7,1994-06-13,Strunjan,0,bucket,324.0,23.0,,,3.0,0.0,0.0,0.0,0.0,,179.1,,,,,,,,,,,,,,,
8,1994-06-21,Seca,0,bucket,0.0,20.0,,,8.0,0.0,0.0,0.0,0.0,,157.5,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,2019-12-16,Debeli_rtic,integrated,hose sampler,0.0,20.0,0.0,2500.0,0.0,10.0,0.0,10.0,0.0,neg,64.3,,,,,,,,,,,,,,,
1517,2019-12-16,Strunjan,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,neg,64.3,,,,,,,,,,,,,,,
1522,2020-01-15,Debeli_rtic,integrated,hose sampler,20.0,40.0,0.0,200.0,10.0,20.0,10.0,0.0,0.0,,124.8,,,,,,,,,,,,,,,
1521,2020-01-15,Seca,integrated,hose sampler,0.0,0.0,0.0,2100.0,0.0,0.0,0.0,0.0,0.0,,124.8,,,,,,,,,,,,,,,
