In [44]:
import pandas as pd
import numpy as np

# Data coupling for abiotic and biotic data based on location and date.
***
_Authors: Rijk van der Meer and Ardjano Mark (Team jam.rs)._

_Data: Dr. Louis Peperzak (NIOZ) and Rijkswaterstaat._

_Project: Coupling plankton and abiotic long-term data sets._


## Importing and preparing data.
***

In [45]:
# Load the three source workbooks; each dataset is read from one named sheet.
# The relative paths are resolved against the notebook's working directory.
abio_df = pd.read_excel('../../data/ABIO.xlsx', sheet_name='ABIO_SURF')  # abiotic data
phyto_df = pd.read_excel('../../data/PHYTO.xlsx', sheet_name='PHYTO_SURF')  # phytoplankton data
irradiance_df = pd.read_excel('../../data/Irradiance.xlsx', sheet_name='PAR')  # irradiance data

In [46]:
# Preview the first rows of every raw dataset, each preceded by its label.
display("Abiotic dataframe", abio_df.head())
display("Phytoplankton dataframe", phyto_df.head())
display("Irradiance dataframe", irradiance_df.head())

'Abiotic dataframe'

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
1,GROOTGND,EDG,1-4-2015 14:10:00,20150401,'1410,2015,4,1,2,2,Zwevende stof,WATSGL,ZS,,732.0,732.0,732.0,0,mg/l
2,GROOTGND,EDG,19-4-2016 15:38:00,20160419,'1538,2016,4,19,2,2,Zwevende stof,WATSGL,ZS,,712.0,712.0,712.0,0,mg/l
3,HUIBGOT,EDH,21-10-2004 05:59:00,20041021,'0559,2004,10,21,4,4,Zwevende stof,WATSGL,ZS,,710.0,,,56,mg/l
4,GROOTGND,EDG,13-10-1997 11:37:00,19971013,'1137,1997,10,13,4,4,Zwevende stof,WATSGL,ZS,,677.0,677.0,677.0,50,mg/l


'Phytoplankton dataframe'

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATE_SMP,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


'Irradiance dataframe'

Unnamed: 0,YYYYMMDD,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,19900101,49,1990,1,220500.0,220.5,,
1,19900102,161,1990,1,724500.0,724.5,,
2,19900103,75,1990,1,337500.0,337.5,,
3,19900104,51,1990,1,229500.0,229.5,,
4,19900105,196,1990,1,882000.0,882.0,,


### Prepare abiotic data
***
_The abiotic dataset is prepared in the following steps_:

- Extracting the unique variables measured in the dataset.
- Extracting the different measurement locations in the dataset.
- Move all measurements taken at the same time and location into a single row.

In [47]:
# Distinct parameter names (PAROMS) and variable codes (VAR), in order of first appearance.
unique_PAROMS = list(abio_df.PAROMS.unique())
unique_VARS = list(abio_df.VAR.unique())

# Take the (PAROMS, VAR) pairs from the rows themselves. Zipping the two
# independently computed unique lists pairs them by position only, which
# silently misaligns (and truncates) if the mapping is not strictly one-to-one.
unique_tuple = list(
    abio_df[["PAROMS", "VAR"]].drop_duplicates().itertuples(index=False, name=None)
)
display(unique_tuple)

[('Zwevende stof', 'ZS'),
 ('Doorzicht', 'ZICHT'),
 ('Temperatuur', 'T'),
 ('silicaat', 'SiO2'),
 ('Saliniteit', 'SALNTT'),
 ('orthofosfaat', 'PO4'),
 ('Zuurgraad', 'pH'),
 ('nitraat', 'NO3'),
 ('nitriet', 'NO2'),
 ('ammonium', 'NH4'),
 ('Extinctiecoefficient', 'E'),
 ('chlorofyl-a', 'CHLFa')]

In [48]:
# Distinct location codes (LOC_CODE) and abbreviations (LOC), in order of first appearance.
locations = list(abio_df["LOC_CODE"].unique())
locations_abv = list(abio_df["LOC"].unique())

# Take the (LOC_CODE, LOC) pairs from the rows themselves. Zipping the two
# independently computed unique lists pairs them by position only, which
# silently misaligns (and truncates) if the mapping is not strictly one-to-one.
locations_tuple = list(
    abio_df[["LOC_CODE", "LOC"]].drop_duplicates().itertuples(index=False, name=None)
)
locations_tuple

[('GROOTGND', 'EDG'),
 ('HUIBGOT', 'EDH'),
 ('SCHAARVODDL', 'WSO'),
 ('DANTZGT', 'WZD'),
 ('VLISSGBISSVH', 'WSV'),
 ('MARSDND', 'WZM'),
 ('HANSWGL', 'WSH'),
 ('GOERE6', 'GOE6'),
 ('WALCRN2', 'WA2'),
 ('NOORDWK20', 'NW20'),
 ('ROTTMPT3', 'RP3'),
 ('SOELKKPDOT', 'VMS'),
 ('NOORDWK2', 'NW2'),
 ('WALCRN70', 'WA70'),
 ('NOORDWK70', 'NW70'),
 ('TERSLG10', 'TS10'),
 ('NOORDWK10', 'NW10'),
 ('WALCRN20', 'WA20'),
 ('TERSLG4', 'TS4'),
 ('ROTTMPT70', 'RP70'),
 ('LODSGT', 'OSL'),
 ('TERSLG135', 'TS135'),
 ('ROTTMPT50', 'RP50'),
 ('TERSLG235', 'TS235'),
 ('DREISR', 'GMD'),
 ('TERSLG100', 'TS100'),
 ('TERSLG175', 'TS175')]

#### Combine all rows from a particular time and location into a single row.

In [49]:
def from_df_to_row_df(df: pd.DataFrame, location: str, time: str, col_list: list) -> pd.DataFrame:
    """
    Collapse one location/time group of long-format rows into a single wide row.

    Parameters:
    df: Rows belonging to one location and one time. Must contain every
        column named in col_list.
    location: Value stored in the output column named col_list[0].
    time: Value stored in the output column named col_list[1].
    col_list: Columns to use. Entries 0 and 1 name the location and time
        columns, entry 2 supplies the output column labels and entry 3 the
        values. Any further entries are selected but do not reach the output.

    Returns:
    A one-row pd.DataFrame (index 0) with one column per distinct label,
    followed by the location and time columns. If a label occurs more than
    once in df, the value from its last row is kept.
    """
    selected = df[col_list]

    # Pair every label with its value; building a dict keeps the last value per label.
    label_value_pairs = zip(*(selected[name] for name in col_list[2:4]))
    record = pd.Series(dict(label_value_pairs))

    # Append the group keys after the measurement columns.
    record[col_list[0]] = location
    record[col_list[1]] = time

    return record.to_frame().T

In [50]:
# Flatten the abiotic data: one output row per (LOC_CODE, DATUMTIJDWAARDE) group.
# The empty template frame fixes the leading column order of the result.
new_col_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + unique_VARS
abio_flattened = pd.DataFrame(columns=new_col_list)

abio_grouped = abio_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Collect the single-row frames first and concatenate once at the end.
# Calling pd.concat inside the loop copies the accumulated frame on every
# iteration, which makes the cell quadratic in the number of groups.
abio_row_frames = []
for (location, time), group_df in abio_grouped:
    # 'location' and 'time' are the current group keys,
    # 'group_df' holds the rows of the current group.
    group = from_df_to_row_df(group_df, location, time, col_list=["LOC_CODE", "DATUMTIJDWAARDE", 'VAR',"VALUE", "KWC", "EHD"])
    abio_row_frames.append(group)

# Template first, so the column order matches the incremental version.
# Every row keeps index 0, as produced by from_df_to_row_df.
abio_flattened = pd.concat([abio_flattened, *abio_row_frames], axis=0, join='outer')


#### Datumtijdwaarde
***
- Turn col='datumtijdwaarde' into datetime type. 
- Extract time to a different column

In [51]:
# Parse the day-first timestamp strings once, then derive the separate
# time-of-day and calendar-date columns from the parsed values.
abio_timestamps = pd.to_datetime(abio_flattened["DATUMTIJDWAARDE"], format='mixed', dayfirst=True)

abio_flattened["DATUMTIJDWAARDE"] = abio_timestamps
abio_flattened['TIJD'] = abio_timestamps.dt.time
abio_flattened['DATUM'] = abio_timestamps.dt.date

In [52]:
# Check the parsed timestamp and the derived TIJD/DATUM columns on the first rows.
display(abio_flattened.head())

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,TIJD,DATUM
0,DANTZGT,2000-10-01 09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0,09:30:00,2000-10-01
0,DANTZGT,2002-10-01 09:30:00,,10.0,15.17,,31.18,,8.06,,,,,,09:30:00,2002-10-01
0,DANTZGT,2003-10-01 07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6,07:11:00,2003-10-01
0,DANTZGT,2009-10-01 10:19:00,,,15.0,,31.4,,,,,,,,10:19:00,2009-10-01
0,DANTZGT,2010-10-01 08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0,08:02:00,2010-10-01


In [53]:
# Drop the combined timestamp (DATUM and TIJD replace it) and put the key
# columns first, followed by one column per measured variable.
new_order = ['LOC_CODE', 'DATUM', 'TIJD', *unique_VARS]

abio_flattened = abio_flattened.drop(columns='DATUMTIJDWAARDE').loc[:, new_order]
abio_flattened.head()

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa
0,DANTZGT,2000-10-01,09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0
0,DANTZGT,2002-10-01,09:30:00,,10.0,15.17,,31.18,,8.06,,,,,
0,DANTZGT,2003-10-01,07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6
0,DANTZGT,2009-10-01,10:19:00,,,15.0,,31.4,,,,,,,
0,DANTZGT,2010-10-01,08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0


#### Append the units to the variable columns

In [54]:
# Map every variable code to "<code> [<unit>]". The unit is read from the
# first row of abio_df that carries that variable code.
var_unit_dict = {}
for var in unique_VARS:
    unit = abio_df.loc[abio_df["VAR"] == var, "EHD"].iloc[0]
    var_unit_dict[var] = f"{var} [{unit}]"

In [55]:
# Show the flattened abiotic table with the units appended to the variable column names.
# NOTE(review): rename() returns a new frame and the result is not assigned, so
# abio_flattened itself keeps the unit-less column names. Confirm this is intended.
abio_flattened.rename(columns=var_unit_dict)

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],NO3 [umol/L],NO2 [umol/L],NH4 [umol/L],E [/m],CHLFa [ug/l]
0,DANTZGT,2000-10-01,09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0
0,DANTZGT,2002-10-01,09:30:00,,10.0,15.17,,31.18,,8.06,,,,,
0,DANTZGT,2003-10-01,07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6
0,DANTZGT,2009-10-01,10:19:00,,,15.0,,31.4,,,,,,,
0,DANTZGT,2010-10-01,08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,1992-04-09,09:19:00,5.0,,7.9,0.5,35.31,0.080645,8.3,3.642857,0.071429,0.642857,,4.64
0,WALCRN70,2001-04-09,23:16:00,6.0,,8.338,1.035714,34.67,0.129032,8.227,3.142857,0.142857,0.142857,,9.2
0,WALCRN70,1994-06-09,03:08:00,2.0,,12.1,0.321429,34.83,0.709677,8.5,0.178571,0.035714,0.285714,,2.6
0,WALCRN70,1999-08-09,19:15:00,2.0,,18.12,1.25,35.08,0.129032,7.989,0.107143,0.214286,0.5,,1.18


#### Combine multiple measurements on a day to one per day

### Prepare phytoplankton data

In [56]:
# Rename the sampling-date column so the phytoplankton frame uses the same
# timestamp column name as the abiotic frame. Reassigning instead of using
# inplace=True matches the irradiance rename further down and avoids mutating
# the frame that was loaded and displayed earlier.
phyto_df = phyto_df.rename(columns={'DATE_SMP': 'DATUMTIJDWAARDE'})
phyto_df.head()

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATUMTIJDWAARDE,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


#### Use function from_df_to_row_df() from before to flatten the dataset based on Abbreviations.

In [57]:
# Recompute the 'LcL' column as log10(cL + 1).
phyto_df['LcL'] = np.log10(phyto_df['cL'] + 1)

# Extract the unique species abbreviations.
unique_SPEC = phyto_df['SPEC'].unique()

# Flatten the phytoplankton data: one output row per (LOC_CODE, DATUMTIJDWAARDE) group.
# The empty template frame fixes the leading column order of the result.
new_col_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + list(unique_SPEC)
phyto_flattened = pd.DataFrame(columns=new_col_list)

phyto_grouped = phyto_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Collect the single-row frames first and concatenate once at the end.
# Calling pd.concat inside the loop copies the accumulated frame on every
# iteration, which makes the cell quadratic in the number of groups.
phyto_row_frames = []
for (location, time), group_df in phyto_grouped:
    # 'location' and 'time' are the current group keys,
    # 'group_df' holds the rows of the current group.
    group = from_df_to_row_df(group_df, location, time, col_list=["LOC_CODE", "DATUMTIJDWAARDE", 'SPEC', "LcL"])
    phyto_row_frames.append(group)

# Template first, so the column order matches the incremental version.
# Every row keeps index 0, as produced by from_df_to_row_df.
phyto_flattened = pd.concat([phyto_flattened, *phyto_row_frames], axis=0, join='outer')

phyto_flattened.head()

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,Cdeb,...,Rst,Rte,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat
0,DANTZGT,1990-04-04 00:00:00,,,,3.271842,,,,,...,,,,4.669596,3.572755,3.572755,,4.475787,,
0,DANTZGT,1990-04-24 00:00:00,,,,4.590418,,,,,...,,,2.448706,3.447933,,,,3.447933,,
0,DANTZGT,1990-05-09 00:00:00,,,,4.669596,,,,,...,,,,,,,,4.012035,1.973128,
0,DANTZGT,1990-05-23 00:00:00,,,,,,,,,...,,,,,,,,3.590842,,
0,DANTZGT,1990-06-07 00:00:00,,,2.267172,5.300487,2.444045,,,,...,,,,3.566909,1.968483,3.442009,,3.496376,1.968483,


In [58]:
# Parse the sampling timestamps once, then split them into a calendar-date
# column and a time-of-day column.
phyto_timestamps = pd.to_datetime(phyto_flattened['DATUMTIJDWAARDE'])
phyto_flattened['DATUMTIJDWAARDE'] = phyto_timestamps

phyto_flattened['DATUM'] = phyto_timestamps.dt.date
phyto_flattened['TIJD'] = phyto_timestamps.dt.time

In [59]:
# The combined timestamp is no longer needed now that DATUM and TIJD exist.
phyto_flattened = phyto_flattened.drop(columns='DATUMTIJDWAARDE')

In [60]:
# Show the flattened phytoplankton table with the new DATUM and TIJD columns.
display(phyto_flattened)

Unnamed: 0,LOC_CODE,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,Cdeb,Cden,...,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat,DATUM,TIJD
0,DANTZGT,,,,3.271842,,,,,,...,,4.669596,3.572755,3.572755,,4.475787,,,1990-04-04,00:00:00
0,DANTZGT,,,,4.590418,,,,,,...,2.448706,3.447933,,,,3.447933,,,1990-04-24,00:00:00
0,DANTZGT,,,,4.669596,,,,,,...,,,,,,4.012035,1.973128,,1990-05-09,00:00:00
0,DANTZGT,,,,,,,,,,...,,,,,,3.590842,,,1990-05-23,00:00:00
0,DANTZGT,,,2.267172,5.300487,2.444045,,,,,...,,3.566909,1.968483,3.442009,,3.496376,1.968483,,1990-06-07,00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,,2.034717,,2.333737,,,,,,...,,2.809514,,,,,2.333737,,2019-06-13,00:00:00
0,WALCRN70,,,,,,,,,,...,,,,,,,2.961517,,2019-07-15,00:00:00
0,WALCRN70,,2.008643,,,,,,,,...,,2.959085,,,,,3.004795,,2019-08-15,11:37:00
0,WALCRN70,,,,,,2.980947,3.032049,2.980947,,...,,2.380246,,,,,3.28175,,2019-09-12,13:29:00


In [61]:
# Look for location/date combinations that occur more than once;
# keep=False marks every member of a duplicated combination.
is_repeated = phyto_flattened.duplicated(subset=['LOC_CODE', 'DATUM'], keep=False)
dups = phyto_flattened[is_repeated]
display(dups)

Unnamed: 0,LOC_CODE,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,Cdeb,Cden,...,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat,DATUM,TIJD


In [69]:
# Put the key columns first, followed by one column per species abbreviation.
new_order = ['LOC_CODE', 'DATUM', 'TIJD', *unique_SPEC]

phyto_flattened = phyto_flattened.loc[:, new_order]
phyto_flattened.head()

Unnamed: 0,LOC_CODE,DATUM,TIJD,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,...,Rst,Rte,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat
0,DANTZGT,1990-04-04,00:00:00,,,,3.271842,,,,...,,,,4.669596,3.572755,3.572755,,4.475787,,
0,DANTZGT,1990-04-24,00:00:00,,,,4.590418,,,,...,,,2.448706,3.447933,,,,3.447933,,
0,DANTZGT,1990-05-09,00:00:00,,,,4.669596,,,,...,,,,,,,,4.012035,1.973128,
0,DANTZGT,1990-05-23,00:00:00,,,,,,,,...,,,,,,,,3.590842,,
0,DANTZGT,1990-06-07,00:00:00,,,2.267172,5.300487,2.444045,,,...,,,,3.566909,1.968483,3.442009,,3.496376,1.968483,


### Prepare irradiance data

In [62]:
# Inspect the raw irradiance table before converting its date column.
irradiance_df.head()

Unnamed: 0,YYYYMMDD,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,19900101,49,1990,1,220500.0,220.5,,
1,19900102,161,1990,1,724500.0,724.5,,
2,19900103,75,1990,1,337500.0,337.5,,
3,19900104,51,1990,1,229500.0,229.5,,
4,19900105,196,1990,1,882000.0,882.0,,


In [63]:
# Rename the YYYYMMDD column to DATUM and convert its values to date objects,
# so it uses the same name and type as the DATUM column of the other frames.
irradiance_df = (
    irradiance_df
    .rename(columns={'YYYYMMDD': 'DATUM'})
    .assign(DATUM=lambda frame: pd.to_datetime(frame['DATUM'], format='%Y%m%d').dt.date)
)
display(irradiance_df.head())

Unnamed: 0,DATUM,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,1990-01-01,49,1990,1,220500.0,220.5,,
1,1990-01-02,161,1990,1,724500.0,724.5,,
2,1990-01-03,75,1990,1,337500.0,337.5,,
3,1990-01-04,51,1990,1,229500.0,229.5,,
4,1990-01-05,196,1990,1,882000.0,882.0,,


## Merge prepared data into one dataset
***
_We merge the datasets by date and location_

### Sort all datasets by location and date (date only for irradiance)

In [72]:
# Order the abiotic and phytoplankton tables by location, then date.
# The irradiance table has no location column and is ordered by date only.
location_date_keys = ['LOC_CODE', 'DATUM']
abio_sorted = abio_flattened.sort_values(by=location_date_keys)
phyto_sorted = phyto_flattened.sort_values(by=location_date_keys)
irradiance_sorted = irradiance_df.sort_values(by='DATUM')

# Preview each sorted table under its label.
for label, sorted_frame in (
    ("abio_sorted dataset", abio_sorted),
    ("phyto_sorted dataset", phyto_sorted),
    ("irradiance_sorted dataset", irradiance_sorted),
):
    display(label, sorted_frame.head())

'abio_sorted dataset'

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,37.571429,3.714286,14.071429,,1.3
0,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,,27.37,,,,,,,
0,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,89.285714,2.071429,8.642857,,21.1
0,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,40.0,2.0,6.428571,,25.0
0,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,0.214286,0.142857,1.928571,,10.2


'phyto_sorted dataset'

Unnamed: 0,LOC_CODE,DATUM,TIJD,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,...,Rst,Rte,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat
0,DANTZGT,1990-04-04,00:00:00,,,,3.271842,,,,...,,,,4.669596,3.572755,3.572755,,4.475787,,
0,DANTZGT,1990-04-24,00:00:00,,,,4.590418,,,,...,,,2.448706,3.447933,,,,3.447933,,
0,DANTZGT,1990-05-09,00:00:00,,,,4.669596,,,,...,,,,,,,,4.012035,1.973128,
0,DANTZGT,1990-05-23,00:00:00,,,,,,,,...,,,,,,,,3.590842,,
0,DANTZGT,1990-06-07,00:00:00,,,2.267172,5.300487,2.444045,,,...,,,,3.566909,1.968483,3.442009,,3.496376,1.968483,


'irradiance_sorted dataset'

Unnamed: 0,DATUM,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,1990-01-01,49,1990,1,220500.0,220.5,,
1,1990-01-02,161,1990,1,724500.0,724.5,,
2,1990-01-03,75,1990,1,337500.0,337.5,,
3,1990-01-04,51,1990,1,229500.0,229.5,,
4,1990-01-05,196,1990,1,882000.0,882.0,,


## Calculate extra columns based on recommendations from Dr. Louis Peperzak
***
_Variables:_

- 
