In [57]:
import pandas as pd
import numpy as np

# Data coupling for abiotic and biotic data based on location and date.
***
_Authors: Rijk van der Meer and Ardjano Mark (Team jam.rs)._

_Data: Dr. Louis Peperzak (NIOZ) and Rijkswaterstaat._

_Project: Coupling plankton and abiotic long-term data sets._


## Importing and preparing data.
***

In [58]:
abio_df = pd.read_excel('../../data/ABIO.xlsx', sheet_name='ABIO_SURF')
phyto_df = pd.read_excel('../../data/PHYTO.xlsx', sheet_name='PHYTO_SURF')
irradiance_df = pd.read_excel('../../data/Irradiance.xlsx', sheet_name='PAR')

In [59]:
display("Abiotic dataframe")
display(abio_df.head())
display("Phytoplankton dataframe")
display(phyto_df.head())
display("Irradiance dataframe")
display(irradiance_df.head())

'Abiotic dataframe'

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
1,GROOTGND,EDG,1-4-2015 14:10:00,20150401,'1410,2015,4,1,2,2,Zwevende stof,WATSGL,ZS,,732.0,732.0,732.0,0,mg/l
2,GROOTGND,EDG,19-4-2016 15:38:00,20160419,'1538,2016,4,19,2,2,Zwevende stof,WATSGL,ZS,,712.0,712.0,712.0,0,mg/l
3,HUIBGOT,EDH,21-10-2004 05:59:00,20041021,'0559,2004,10,21,4,4,Zwevende stof,WATSGL,ZS,,710.0,,,56,mg/l
4,GROOTGND,EDG,13-10-1997 11:37:00,19971013,'1137,1997,10,13,4,4,Zwevende stof,WATSGL,ZS,,677.0,677.0,677.0,50,mg/l


'Phytoplankton dataframe'

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATE_SMP,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


'Irradiance dataframe'

Unnamed: 0,YYYYMMDD,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,19900101,49,1990,1,220500.0,220.5,,
1,19900102,161,1990,1,724500.0,724.5,,
2,19900103,75,1990,1,337500.0,337.5,,
3,19900104,51,1990,1,229500.0,229.5,,
4,19900105,196,1990,1,882000.0,882.0,,


### Prepare abiotic data
***
_Preparing the abiotic datasets on the following points_:

- Extracting the unique variables measured in the dataset.
- Extracting the different measurement locations in the dataset.
- Move all measurements on a particular time and in a row together. 

In [60]:
unique_PAROMS = list(abio_df.PAROMS.unique())
unique_VARS = list(abio_df.VAR.unique())
unique_tuple = list(zip(unique_PAROMS, unique_VARS))
display(unique_tuple)

[('Zwevende stof', 'ZS'),
 ('Doorzicht', 'ZICHT'),
 ('Temperatuur', 'T'),
 ('silicaat', 'SiO2'),
 ('Saliniteit', 'SALNTT'),
 ('orthofosfaat', 'PO4'),
 ('Zuurgraad', 'pH'),
 ('nitraat', 'NO3'),
 ('nitriet', 'NO2'),
 ('ammonium', 'NH4'),
 ('Extinctiecoefficient', 'E'),
 ('chlorofyl-a', 'CHLFa')]

In [61]:
locations = list(abio_df["LOC_CODE"].unique())
locations_abv = list(abio_df["LOC"].unique())
locations_tuple = list(zip(locations, locations_abv))
locations_tuple

[('GROOTGND', 'EDG'),
 ('HUIBGOT', 'EDH'),
 ('SCHAARVODDL', 'WSO'),
 ('DANTZGT', 'WZD'),
 ('VLISSGBISSVH', 'WSV'),
 ('MARSDND', 'WZM'),
 ('HANSWGL', 'WSH'),
 ('GOERE6', 'GOE6'),
 ('WALCRN2', 'WA2'),
 ('NOORDWK20', 'NW20'),
 ('ROTTMPT3', 'RP3'),
 ('SOELKKPDOT', 'VMS'),
 ('NOORDWK2', 'NW2'),
 ('WALCRN70', 'WA70'),
 ('NOORDWK70', 'NW70'),
 ('TERSLG10', 'TS10'),
 ('NOORDWK10', 'NW10'),
 ('WALCRN20', 'WA20'),
 ('TERSLG4', 'TS4'),
 ('ROTTMPT70', 'RP70'),
 ('LODSGT', 'OSL'),
 ('TERSLG135', 'TS135'),
 ('ROTTMPT50', 'RP50'),
 ('TERSLG235', 'TS235'),
 ('DREISR', 'GMD'),
 ('TERSLG100', 'TS100'),
 ('TERSLG175', 'TS175')]

#### Move all rows from particular time and location into 1 row together.

In [62]:
def from_df_to_row_df(df: pd.DataFrame, location: str, time: str, col_list: list) -> pd.DataFrame:
    """
    Transforms a batch of rows to a single row. Both as pd.DataFrame type.

    Variables:
    df: Group of same location and same time in pd.DataFrame type.
    location: Location 
    time: Time 
    col_list = list of columns to use. The first two are the location and time.

    Output:
    A single row dataframe of type pd. DataFrame.
    """
    row_df = df
    row_df = row_df[col_list]

    measurements = list(zip(*(row_df[col] for col in col_list[2:])))
    measurements_dict = dict([t[:2] for t in measurements])
    loc, date = location, time

    row = pd.Series(measurements_dict)
    row[col_list[0]], row[col_list[1]] = loc, date

    final_df = row.to_frame().T

    return final_df

In [63]:
# group the DataFrame by 'LOC_CODE' and 'DATUMTIJDWAARDE'
# create a column list for the new dataframe
new_col_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + unique_VARS
abio_flattened = pd.DataFrame(columns=new_col_list)

abio_grouped = abio_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Iterate over the groups
for (location, time), group_df in abio_grouped:
    # 'location' and 'time' are the current group keys
    # 'group_df' is the DataFrame for the current group
    # Process the group_df as needed
    # print(group_df)
    group = from_df_to_row_df(group_df, location, time, col_list=["LOC_CODE", "DATUMTIJDWAARDE", 'VAR',"VALUE", "KWC", "EHD"])
    abio_flattened = pd.concat([abio_flattened, group], axis=0, join='outer')


#### Datumtijdwaarde
***
- Turn col='datumtijdwaarde' into datetime type. 
- Extract time to a different column

In [64]:
abio_flattened["DATUMTIJDWAARDE"] = pd.to_datetime(abio_flattened["DATUMTIJDWAARDE"], format='mixed', dayfirst=True)

abio_flattened['TIJD'] = abio_flattened["DATUMTIJDWAARDE"].dt.time
abio_flattened['DATUM'] = abio_flattened["DATUMTIJDWAARDE"].dt.date

In [65]:
display(abio_flattened.head())

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,TIJD,DATUM
0,DANTZGT,2000-10-01 09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0,09:30:00,2000-10-01
0,DANTZGT,2002-10-01 09:30:00,,10.0,15.17,,31.18,,8.06,,,,,,09:30:00,2002-10-01
0,DANTZGT,2003-10-01 07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6,07:11:00,2003-10-01
0,DANTZGT,2009-10-01 10:19:00,,,15.0,,31.4,,,,,,,,10:19:00,2009-10-01
0,DANTZGT,2010-10-01 08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0,08:02:00,2010-10-01


In [66]:
# abio_flattened = abio_flattened.drop('DATUMTIJDWAARDE', axis=1)
new_order = ['LOC_CODE', 'DATUM', 'TIJD'] + unique_VARS + ['DATUMTIJDWAARDE']

abio_flattened = abio_flattened[new_order]
abio_flattened.head()

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,DATUMTIJDWAARDE
0,DANTZGT,2000-10-01,09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0,2000-10-01 09:30:00
0,DANTZGT,2002-10-01,09:30:00,,10.0,15.17,,31.18,,8.06,,,,,,2002-10-01 09:30:00
0,DANTZGT,2003-10-01,07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6,2003-10-01 07:11:00
0,DANTZGT,2009-10-01,10:19:00,,,15.0,,31.4,,,,,,,,2009-10-01 10:19:00
0,DANTZGT,2010-10-01,08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0,2010-10-01 08:02:00


#### Append the units to the variable columns

In [67]:
var_unit_dict = {}
for var in unique_VARS:
    var_df = abio_df.loc[abio_df["VAR"] == var]
    first_row = var_df.iloc[0]
    unit = first_row['EHD']
    var_unit_dict[var] = str(var) + f" [{unit}]"

In [68]:
abio_flattened = abio_flattened.rename(columns=var_unit_dict)

#### Combine multiple measurements on a day to one per day

In [69]:
abio_unconflicting = abio_flattened

duplicates = abio_unconflicting[abio_unconflicting.duplicated(subset=['DATUM', 'LOC_CODE'], keep=False)].sort_values('DATUM')

no_duplicates = abio_unconflicting[~abio_unconflicting.duplicated(subset=['DATUM', 'LOC_CODE'], keep=False)].drop("DATUMTIJDWAARDE", axis=1)

display(duplicates.info())
display(no_duplicates.info())

<class 'pandas.core.frame.DataFrame'>
Index: 3073 entries, 0 to 0
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   LOC_CODE         3073 non-null   object        
 1   DATUM            3073 non-null   object        
 2   TIJD             3073 non-null   object        
 3   ZS [mg/l]        1726 non-null   object        
 4   ZICHT [dm]       205 non-null    object        
 5   T [oC]           1400 non-null   object        
 6   SiO2 [umol/L]    1279 non-null   object        
 7   SALNTT [DIMSLS]  1377 non-null   object        
 8   PO4 [umol/L]     1269 non-null   object        
 9   pH [DIMSLS]      1374 non-null   object        
 10  NO3 [umol/L]     1243 non-null   object        
 11  NO2 [umol/L]     1265 non-null   object        
 12  NH4 [umol/L]     1268 non-null   object        
 13  E [/m]           972 non-null    object        
 14  CHLFa [ug/l]     1411 non-null   object        


None

<class 'pandas.core.frame.DataFrame'>
Index: 12220 entries, 0 to 0
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   LOC_CODE         12220 non-null  object
 1   DATUM            12220 non-null  object
 2   TIJD             12220 non-null  object
 3   ZS [mg/l]        11450 non-null  object
 4   ZICHT [dm]       5708 non-null   object
 5   T [oC]           11309 non-null  object
 6   SiO2 [umol/L]    11353 non-null  object
 7   SALNTT [DIMSLS]  10676 non-null  object
 8   PO4 [umol/L]     11384 non-null  object
 9   pH [DIMSLS]      11086 non-null  object
 10  NO3 [umol/L]     11048 non-null  object
 11  NO2 [umol/L]     11396 non-null  object
 12  NH4 [umol/L]     11369 non-null  object
 13  E [/m]           3982 non-null   object
 14  CHLFa [ug/l]     11180 non-null  object
dtypes: object(15)
memory usage: 1.5+ MB


None

#### Duplicates

#### Delete duplicate values if conflicting

In [70]:
NAN_THRESHOLD = 5 # max nan values if conflict exists
COLS_START = 3 

In [71]:
grouped = duplicates.copy()

def check_conflicts(group):
    """
    Returns string that indicates the type of conflicting
    measurement on the same day.
    
    """
    conflicting_columns = []
    for column in group.columns[COLS_START:-1]:
        non_nan_count = group[column].notna().sum()
        if non_nan_count > 1:
            conflicting_columns.append(column)
    return ', '.join(conflicting_columns)


# adds a 'conflicts' column to grouped
conflicts = grouped.groupby(['LOC_CODE', 'DATUM']).apply(check_conflicts).reset_index(name='Conflicts')
grouped = pd.merge(grouped, conflicts, on=['LOC_CODE', 'DATUM'], how='left')


  conflicts = grouped.groupby(['LOC_CODE', 'DATUM']).apply(check_conflicts).reset_index(name='Conflicts')


In [72]:
mask_nan = grouped.isnull().sum(axis=1) > NAN_THRESHOLD 
mask_conflicts = grouped["Conflicts"] != ''

fixed_conflicts = grouped[~(mask_nan & mask_conflicts)].reset_index(drop=True)

In [73]:
# check for what's being filtered out; reverse filter_within_time > -> < for it to work
# filtered = duplicates2.groupby(['LOC_CODE', 'DATUM']).apply(lambda group: filter_within_time(group, minutes=120)).reset_index(drop=True)

# display(filtered)

# result = pd.merge(duplicates2, filtered[['LOC_CODE', 'DATUM']], on=['LOC_CODE', 'DATUM'], how='inner')

# display(result[57:])

#### Retains only values withing timeframe

In [74]:
TIMEFRAME = 120 # minutes

In [75]:
def main_measurement(group):
    """Finds row with least null values."""
    return group.isnull().sum(axis=1).idxmin()

def filter_within_time(group, minutes):
    """
    Finds all measurements that fall within the defined time range,
    relative to the main measurement.
    
    """
    seconds = 60 * minutes
    main_idx = main_measurement(group)
    main_time = group.loc[main_idx, 'DATUMTIJDWAARDE']

    delta_time = (group['DATUMTIJDWAARDE'] - main_time).dt.total_seconds().abs()
    # reverse this to check how many will be eliminated
    is_in_time = delta_time < seconds

    return group[is_in_time]

filtered = fixed_conflicts.groupby(['LOC_CODE', 'DATUM']).apply(lambda group: filter_within_time(group, minutes=TIMEFRAME)).reset_index(drop=True).sort_values('DATUM')

# display(filtered)


  filtered = fixed_conflicts.groupby(['LOC_CODE', 'DATUM']).apply(lambda group: filter_within_time(group, minutes=TIMEFRAME)).reset_index(drop=True).sort_values('DATUM')


#### Combining duplicate abiotic values

In [76]:
filtered_notime = filtered.drop(['DATUMTIJDWAARDE', 'TIJD', 'Conflicts'], axis=1).reset_index(drop=True)
n_groups = filtered_notime.groupby(["LOC_CODE", "DATUM"]).ngroups

In [77]:
combined_duplicates = filtered_notime.groupby(["LOC_CODE", "DATUM"], as_index=False).first()

for col in combined_duplicates.columns:
    if combined_duplicates[col].dtype == 'object':
            combined_duplicates[col] = combined_duplicates[col].replace({None: np.nan})

assert n_groups == combined_duplicates.shape[0], "Doesn't match" # sanity check

  combined_duplicates[col] = combined_duplicates[col].replace({None: np.nan})


#### Final merge with rest of data

In [78]:
final_abio = pd.concat([no_duplicates, combined_duplicates]).sort_values(by=["LOC_CODE", "DATUM"])

display(final_abio)

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],NO3 [umol/L],NO2 [umol/L],NH4 [umol/L],E [/m],CHLFa [ug/l]
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,37.571429,3.714286,14.071429,,1.3
0,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,,27.37,,,,,,,
0,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,89.285714,2.071429,8.642857,,21.1
0,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,40.0,2.0,6.428571,,25.0
0,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,0.214286,0.142857,1.928571,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.0,0.577419,8.0,0.357143,0.035714,0.178571,,0.77
0,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275,34.8,0.190323,7.97,2.278571,0.114286,0.485714,0.12,1.47
0,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.9,0.206452,8.11,0.357143,0.242857,0.364286,,2.02
0,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.0,0.43871,8.13,6.442857,0.842857,0.178571,,1.59


### Prepare phytoplankton data

In [79]:
# rename the date_smp column
phyto_df['DATE_SMP'] = pd.to_datetime(phyto_df['DATE_SMP'].dt.date)
phyto_df.rename(columns={'DATE_SMP':'DATUM'}, inplace=True)

display(phyto_df)

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATUM,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.000000,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.000000,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.000000,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105277,HANSWGL,WSH,WESTERSCHELDE,ESTUARINE,1994-11-22,,,,,4,WATSGL,Torodinium robustum,Tor,DINO,,127.000000,
105278,VLISSGBISSVH,WSV,WESTERSCHELDE,ESTUARINE,1994-11-22,,,,,4,WATSGL,Torodinium robustum,Tor,DINO,,150.000000,
105279,HANSWGL,WSH,WESTERSCHELDE,ESTUARINE,2000-12-19,,,,,1,OW,Torodinium robustum,Tor,DINO,1.0,1000.000000,
105280,VLISSGBISSVH,WSV,WESTERSCHELDE,ESTUARINE,2006-12-12,,,,,1,OW,Torodinium robustum,Tor,DINO,1.0,769.000000,


#### Handle duplicates

In [80]:
# Rhizosolenia delicatula with Guinardia
phyto_df["SPEC"] = phyto_df["SPEC"].replace({"Rde": "Gde" })

In [81]:
# Check that everything went right
spec_to_species_df = phyto_df.groupby('SPEC')['SPECIES'].unique().reset_index()
spec_to_species_df.columns = ['Abrv','Species']

# display_all(spec_to_species_df)

phyto_duplicates = spec_to_species_df[spec_to_species_df['Species'].apply(len) > 1]
print(phyto_duplicates)

   Abrv                                            Species
2   Agl  [Asterionella glacialis, Asterionellopsis glac...
13  Cha                 [Chattonella, Chattonella antiqua]
31  Gde    [Guinardia delicatula, Rhizosolenia delicatula]


In [82]:
# conflict between species/group after merge, keep only relevant columns
phyto_df.drop([col for col in phyto_df.columns if col not in ["LOC_CODE", "SPEC", "DATUM", "cL"]], axis=1, inplace=True)

display(phyto_df)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,GROOTGND,2005-01-21,Acn,5000.000000
1,GROOTGND,1994-01-25,Acn,2000.000000
2,GROOTGND,1994-02-24,Acn,485.000000
3,GROOTGND,2001-03-22,Acn,833.333333
4,GROOTGND,2002-03-13,Acn,4000.000000
...,...,...,...,...
105277,HANSWGL,1994-11-22,Tor,127.000000
105278,VLISSGBISSVH,1994-11-22,Tor,150.000000
105279,HANSWGL,2000-12-19,Tor,1000.000000
105280,VLISSGBISSVH,2006-12-12,Tor,769.000000


#### Aggregating measurements on same day

In [83]:
phyto_duplicates = phyto_df[phyto_df.duplicated(subset=["LOC_CODE", "DATUM", "SPEC"], keep=False)].sort_values(by=["LOC_CODE", "DATUM", "SPEC"]).reset_index(drop=True)
phyto_non_duplicates = phyto_df[~phyto_df.duplicated(subset=["LOC_CODE", "DATUM", "SPEC"], keep=False)]

display(phyto_duplicates)
# display(phyto_non_duplicates)

n_duplicates = len(phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).size())
display(n_duplicates)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,2000-02-15,Oau,5000.000000
1,DANTZGT,2000-02-15,Oau,5000.000000
2,DANTZGT,2000-03-28,Oau,135000.000000
3,DANTZGT,2000-03-28,Oau,4000.000000
4,DANTZGT,2000-05-29,Oau,62500.000000
...,...,...,...,...
5067,WALCRN70,2019-07-15,Pha,14937.927834
5068,WALCRN70,2019-08-15,Rse,1111.111111
5069,WALCRN70,2019-08-15,Rse,707.070707
5070,WALCRN70,2019-09-12,Pha,39543.651986


2355

In [84]:
# distinct_counts = phyto_duplicates.groupby(['LOC_CODE', 'DATUM', 'SPEC']).agg(lambda x: x.nunique())
# display(distinct_counts)

In [85]:
display(phyto_duplicates['SPEC'].value_counts())

SPEC
Pha    4466
Oau     310
Rse     290
Gsp       2
Kgl       2
Tor       2
Name: count, dtype: int64

In [86]:
def spec_aggregate(group):
    if any(spec in group["SPEC"].values for spec in ["Agl", "Cha", "Oau", "Pha", "Gde"]):
        return group['cL'].sum()
    else:
        return group['cL'].mean()

phyto_aggregated = phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).apply(spec_aggregate).reset_index(name='cL')

display(phyto_aggregated)

  phyto_aggregated = phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).apply(spec_aggregate).reset_index(name='cL')


Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,2000-02-15,Oau,1.000000e+04
1,DANTZGT,2000-03-28,Oau,1.390000e+05
2,DANTZGT,2000-05-29,Oau,9.375000e+04
3,DANTZGT,2000-05-29,Pha,4.103343e+07
4,DANTZGT,2000-06-09,Pha,1.971079e+06
...,...,...,...,...
2350,WALCRN70,2019-05-16,Pha,1.581818e+07
2351,WALCRN70,2019-06-13,Pha,9.864151e+04
2352,WALCRN70,2019-07-15,Pha,7.842412e+04
2353,WALCRN70,2019-08-15,Rse,9.090909e+02


In [87]:
final_phyto = pd.concat([phyto_non_duplicates, phyto_aggregated]).sort_values(by=["LOC_CODE", "DATUM", "SPEC"]).reset_index(drop=True)


display(final_phyto)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,1990-04-04,Agl,1869.000000
1,DANTZGT,1990-04-04,Dbr,56075.000000
2,DANTZGT,1990-04-04,Ezo,3738.000000
3,DANTZGT,1990-04-04,Gde,3738.000000
4,DANTZGT,1990-04-04,Oau,20561.000000
...,...,...,...,...
102560,WALCRN70,2019-10-30,Kgl,5123.014487
102561,WALCRN70,2019-10-30,Nsi,102.021174
102562,WALCRN70,2019-10-30,Tni,714.148219
102563,WALCRN70,2019-10-30,Tor,8050.451336


### Use function from_df_to_row_df() from before to flatten the dataset based on Abbreviations.

In [88]:
# recompute 'LcL' column
final_phyto['LcL'] = np.log10(final_phyto['cL'] + 1)

# Extract unique species
unique_SPEC = final_phyto['SPEC'].unique()

# Group the DataFrame by 'LOC_CODE' and 'DATUM'
new_col_list = ["LOC_CODE", "DATUM"] + list(unique_SPEC)
phyto_flattened = pd.DataFrame(columns=new_col_list)

phyto_grouped = final_phyto.groupby(['LOC_CODE', 'DATUM'])

# Iterate over the groups
for (location, time), group_df in phyto_grouped:
    # 'location' and 'time' are the current group keys
    # 'group_df' is the DataFrame for the current group
    # Process the group_df as needed
    # print(group_df)
    group = from_df_to_row_df(group_df, location, time, col_list=["LOC_CODE", "DATUM", 'SPEC', "LcL"])
    phyto_flattened = pd.concat([phyto_flattened, group], axis=0, join='outer')
     
phyto_flattened.head()

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-04-04 00:00:00,3.271842,4.748777,3.572755,3.572755,4.313065,3.748808,3.572755,3.873785,...,,,,,,,,,,
0,DANTZGT,1990-04-24 00:00:00,4.590418,4.669596,3.447933,5.170191,,3.669689,2.670246,2.574031,...,,,,,,,,,,
0,DANTZGT,1990-05-09 00:00:00,4.669596,,3.271842,3.970672,,2.670246,1.973128,,...,,,,,,,,,,
0,DANTZGT,1990-05-23 00:00:00,,1.991226,,4.067889,2.292256,2.292256,,2.466868,...,,,,,,,,,,
0,DANTZGT,1990-06-07 00:00:00,5.300487,1.968483,,3.307282,,2.664642,1.968483,,...,,,,,,,,,,


In [89]:
# Extract date and time
# phyto_flattened['DATUMTIJDWAARDE'] = pd.to_datetime(phyto_flattened['DATUMTIJDWAARDE'])

# phyto_flattened['DATUM'] = phyto_flattened['DATUMTIJDWAARDE'].dt.date
# phyto_flattened['TIJD'] = phyto_flattened['DATUMTIJDWAARDE'].dt.time

In [90]:
# drop 'datumtijdwaarde'

# phyto_flattened = phyto_flattened.drop('DATUMTIJDWAARDE', axis=1)

In [91]:
display(phyto_flattened)

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-04-04 00:00:00,3.271842,4.748777,3.572755,3.572755,4.313065,3.748808,3.572755,3.873785,...,,,,,,,,,,
0,DANTZGT,1990-04-24 00:00:00,4.590418,4.669596,3.447933,5.170191,,3.669689,2.670246,2.574031,...,,,,,,,,,,
0,DANTZGT,1990-05-09 00:00:00,4.669596,,3.271842,3.970672,,2.670246,1.973128,,...,,,,,,,,,,
0,DANTZGT,1990-05-23 00:00:00,,1.991226,,4.067889,2.292256,2.292256,,2.466868,...,,,,,,,,,,
0,DANTZGT,1990-06-07 00:00:00,5.300487,1.968483,,3.307282,,2.664642,1.968483,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,2019-06-13 00:00:00,2.333737,,,,,,,,...,,,,,,,,,,
0,WALCRN70,2019-07-15 00:00:00,,,,2.852508,,,,,...,2.863132,,,,,,,,,
0,WALCRN70,2019-08-15 00:00:00,,,,4.462177,,,,,...,,,,,,,,,,
0,WALCRN70,2019-09-12 00:00:00,,,,3.974579,,,,,...,2.081022,,,,,,,,,


In [92]:
# check for duplicates
dups = phyto_flattened[phyto_flattened[['LOC_CODE','DATUM']].duplicated(keep=False)]
display(dups)

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat


In [93]:
# reorder phyto_flattened columns

new_order = ['LOC_CODE', 'DATUM'] + list(unique_SPEC)

phyto_flattened = phyto_flattened[new_order]
phyto_flattened.head()

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-04-04 00:00:00,3.271842,4.748777,3.572755,3.572755,4.313065,3.748808,3.572755,3.873785,...,,,,,,,,,,
0,DANTZGT,1990-04-24 00:00:00,4.590418,4.669596,3.447933,5.170191,,3.669689,2.670246,2.574031,...,,,,,,,,,,
0,DANTZGT,1990-05-09 00:00:00,4.669596,,3.271842,3.970672,,2.670246,1.973128,,...,,,,,,,,,,
0,DANTZGT,1990-05-23 00:00:00,,1.991226,,4.067889,2.292256,2.292256,,2.466868,...,,,,,,,,,,
0,DANTZGT,1990-06-07 00:00:00,5.300487,1.968483,,3.307282,,2.664642,1.968483,,...,,,,,,,,,,


In [94]:
# drop tijd
# phyto_flattened = phyto_flattened.drop(columns='TIJD', axis=1)

In [95]:
phyto_flattened

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-04-04 00:00:00,3.271842,4.748777,3.572755,3.572755,4.313065,3.748808,3.572755,3.873785,...,,,,,,,,,,
0,DANTZGT,1990-04-24 00:00:00,4.590418,4.669596,3.447933,5.170191,,3.669689,2.670246,2.574031,...,,,,,,,,,,
0,DANTZGT,1990-05-09 00:00:00,4.669596,,3.271842,3.970672,,2.670246,1.973128,,...,,,,,,,,,,
0,DANTZGT,1990-05-23 00:00:00,,1.991226,,4.067889,2.292256,2.292256,,2.466868,...,,,,,,,,,,
0,DANTZGT,1990-06-07 00:00:00,5.300487,1.968483,,3.307282,,2.664642,1.968483,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,2019-06-13 00:00:00,2.333737,,,,,,,,...,,,,,,,,,,
0,WALCRN70,2019-07-15 00:00:00,,,,2.852508,,,,,...,2.863132,,,,,,,,,
0,WALCRN70,2019-08-15 00:00:00,,,,4.462177,,,,,...,,,,,,,,,,
0,WALCRN70,2019-09-12 00:00:00,,,,3.974579,,,,,...,2.081022,,,,,,,,,


### Prepare irradiance data

In [96]:
irradiance_df.head()

Unnamed: 0,YYYYMMDD,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,19900101,49,1990,1,220500.0,220.5,,
1,19900102,161,1990,1,724500.0,724.5,,
2,19900103,75,1990,1,337500.0,337.5,,
3,19900104,51,1990,1,229500.0,229.5,,
4,19900105,196,1990,1,882000.0,882.0,,


In [97]:
irradiance_df = irradiance_df.rename(columns={'YYYYMMDD': 'DATUM'})
irradiance_df["DATUM"] = pd.to_datetime(irradiance_df['DATUM'], format='%Y%m%d').dt.date
display(irradiance_df.head())

Unnamed: 0,DATUM,Q,YEAR,MONTH,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,1990-01-01,49,1990,1,220500.0,220.5,,
1,1990-01-02,161,1990,1,724500.0,724.5,,
2,1990-01-03,75,1990,1,337500.0,337.5,,
3,1990-01-04,51,1990,1,229500.0,229.5,,
4,1990-01-05,196,1990,1,882000.0,882.0,,


In [98]:
irradiance_df = irradiance_df.drop(columns=['YEAR', 'MONTH'], axis=1)

## Merge prepared data into one dataset
***
_We merge the datasets by date and location_

### Sort al datasets on date and location (only date for Irradiance)

In [99]:
abio_sorted = final_abio.sort_values(by=['LOC_CODE', 'DATUM'])
phyto_sorted = phyto_flattened.sort_values(by=['LOC_CODE', 'DATUM'])
irradiance_sorted = irradiance_df.sort_values(by='DATUM')

display("abio_sorted dataset", abio_sorted.head())
display("phyto_sorted dataset", phyto_sorted.head())
display("irradiance_sorted dataset", irradiance_sorted.head())

'abio_sorted dataset'

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],NO3 [umol/L],NO2 [umol/L],NH4 [umol/L],E [/m],CHLFa [ug/l]
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,37.571429,3.714286,14.071429,,1.3
0,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,,27.37,,,,,,,
0,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,89.285714,2.071429,8.642857,,21.1
0,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,40.0,2.0,6.428571,,25.0
0,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,0.214286,0.142857,1.928571,,10.2


'phyto_sorted dataset'

Unnamed: 0,LOC_CODE,DATUM,Agl,Dbr,Ezo,Gde,Oau,Omo,Orh,Osi,...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-04-04 00:00:00,3.271842,4.748777,3.572755,3.572755,4.313065,3.748808,3.572755,3.873785,...,,,,,,,,,,
0,DANTZGT,1990-04-24 00:00:00,4.590418,4.669596,3.447933,5.170191,,3.669689,2.670246,2.574031,...,,,,,,,,,,
0,DANTZGT,1990-05-09 00:00:00,4.669596,,3.271842,3.970672,,2.670246,1.973128,,...,,,,,,,,,,
0,DANTZGT,1990-05-23 00:00:00,,1.991226,,4.067889,2.292256,2.292256,,2.466868,...,,,,,,,,,,
0,DANTZGT,1990-06-07 00:00:00,5.300487,1.968483,,3.307282,,2.664642,1.968483,,...,,,,,,,,,,


'irradiance_sorted dataset'

Unnamed: 0,DATUM,Q,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,1990-01-01,49,220500.0,220.5,,
1,1990-01-02,161,724500.0,724.5,,
2,1990-01-03,75,337500.0,337.5,,
3,1990-01-04,51,229500.0,229.5,,
4,1990-01-05,196,882000.0,882.0,,


In [100]:
merged_abio = pd.merge(abio_sorted, irradiance_sorted, on='DATUM', how='left').sort_values(by=['LOC_CODE', 'DATUM'])
merged_abio

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],NO3 [umol/L],NO2 [umol/L],NH4 [umol/L],E [/m],CHLFa [ug/l],Q,PAR [J/m2d],PAR [kJ/m2d],kPAR_7d,kPAR_14d
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,37.571429,3.714286,14.071429,,1.3,100,450000.0,450.0,513.000000,
1,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,,27.37,,,,,,,,603,2713500.0,2713.5,1737.000000,1497.857143
2,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,89.285714,2.071429,8.642857,,21.1,543,2443500.0,2443.5,2949.428571,2865.857143
3,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,40.0,2.0,6.428571,,25.0,1491,6709500.0,6709.5,6508.285714,5926.821429
4,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,0.214286,0.142857,1.928571,,10.2,1290,5805000.0,5805.0,9470.571429,9416.892857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.0,0.577419,8.0,0.357143,0.035714,0.178571,,0.77,2238,10071000.0,10071.0,9970.071429,10099.285714
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275,34.8,0.190323,7.97,2.278571,0.114286,0.485714,0.12,1.47,1377,6196500.0,6196.5,6591.857143,5372.678571
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.9,0.206452,8.11,0.357143,0.242857,0.364286,,2.02,654,2943000.0,2943.0,2649.857143,2390.785714
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.0,0.43871,8.13,6.442857,0.842857,0.178571,,1.59,259,1165500.0,1165.5,1021.500000,1665.321429


In [101]:
# merge abiotic data with biotic data
phyto_sorted['DATUM'] = pd.to_datetime(phyto_sorted['DATUM']).dt.date
merged_df = pd.merge(merged_abio, phyto_sorted, on=['LOC_CODE', 'DATUM'], how='left')

In [102]:
merged_df

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,...,,,,,,,,,,
1,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,,27.37,,,...,,,,,,,,,,
2,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,...,,,,,,,,,,
3,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,...,,,,,,,,,,
4,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.0,0.577419,8.0,...,,,,,,,,,,
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275,34.8,0.190323,7.97,...,,,,,,,,,,
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.9,0.206452,8.11,...,,,,,,,,,,
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.0,0.43871,8.13,...,,,,,,,,,,


### Write temporary merged_df to excel

In [103]:
# filename = '../../data/MERGED_DATA_TEMP.xlsx'
# sheetname = 'MERGE'

# with pd.ExcelWriter(filename, mode='w') as writer:  
#     merged_df.to_excel(writer, sheet_name=sheetname, index=False)

# print(f"Data written to sheet '{sheetname}' in '{filename}'")

## Calculate extra columns based on recomendations from Dr. Louis Peperzak
***
_Variables:_
To be calculated:		
- DIN (NH4+NO3+NO2), and nutrient ratios (DIN:SRP, DIN:Si).		
- Kd:	Extinction coefficient (1/m), calculated from Sal,  TSS and Chla (model 2012a) or from Secchi (model 2012b)
> KD = 3.504 - 0.098 * Sal + 0.025 * TSS + 0.006 * CHla or  Log Kd = -0.673*logSecchi + 0.854, r2=0.75

- Im (Jm2d)	Daily Water Column Irradiance (J/m².d) (Peperzak 1993)	
- Im (kJm2d)	Daily Water Column Irradiance (kJ/m².d) (Peperzak 1993)	
- Interpolation for one or more missing values		


#### DIN (NH4+NO3+NO2), and nutrient ratios (DIN:SRP, DIN:Si).	

In [104]:
merged_df["DIN"] = merged_df['NO3 [umol/L]'] + merged_df['NH4 [umol/L]'] + merged_df['NO2 [umol/L]']
merged_df['DIN:SRP'] = merged_df['DIN']/merged_df['PO4 [umol/L]']
merged_df['DIN:SI'] = merged_df['DIN']/merged_df['SiO2 [umol/L]']
merged_df.head(20)

print(list(merged_df.columns))

['LOC_CODE', 'DATUM', 'TIJD', 'ZS [mg/l]', 'ZICHT [dm]', 'T [oC]', 'SiO2 [umol/L]', 'SALNTT [DIMSLS]', 'PO4 [umol/L]', 'pH [DIMSLS]', 'NO3 [umol/L]', 'NO2 [umol/L]', 'NH4 [umol/L]', 'E [/m]', 'CHLFa [ug/l]', '    Q', 'PAR [J/m2d]', 'PAR [kJ/m2d]', 'kPAR_7d', 'kPAR_14d', 'Agl', 'Dbr', 'Ezo', 'Gde', 'Oau', 'Omo', 'Orh', 'Osi', 'Ram', 'Rse', 'Tec', 'Tle', 'Tni', 'Tro', 'Dle', 'Etr', 'Gfl', 'Gsp', 'Nsc', 'Pbi', 'Pbr', 'Pha', 'Stu', 'Kgl', 'Oro', 'Tor', 'Cdi', 'Cra', 'Ore', 'Ata', 'Cfu', 'Cgr', 'Lan', 'Pcl', 'Pmi', 'Pos', 'Pse', 'Cden', 'Aco', 'Dip', 'Csu', 'Mnu', 'Pco', 'Cdeb', 'Cwa', 'Pba', 'Dac', 'Ptr', 'Lun', 'Nsi', 'Rst', 'Pst', 'Acn', 'Tno', 'Ccu', 'Pan', 'Gfa', 'Hta', 'Dsp', 'Psu', 'Cei', 'Ndi', 'Cda', 'Dro', 'Cha', 'Pac', 'Cau', 'Coc', 'Pte', 'Edu', 'Mpe', 'Pde', 'Plo', 'Dpu', 'Rte', 'Fja', 'Hak', 'Mhe', 'Dno', 'Dat', 'DIN', 'DIN:SRP', 'DIN:SI']


#### Kd:	Extinction coefficient (1/m), calculated from Sal,  TSS and Chla (model 2012a) or from Secchi (model 2012b)
> KD = 3.504 - 0.098 * Sal + 0.025 * TSS + 0.006 * CHla or  Log Kd = -0.673*logSecchi + 0.854, r2=0.75

In [105]:
def calculate_E(row):
    if pd.isna(row['E [/m]']):
        if not pd.isna(row['SALNTT [DIMSLS]']) and not pd.isna(row['T [oC]']) and not pd.isna(row['CHLFa [ug/l]']):
            row['E [/m]'] = 3.504 - 0.098 * row['SALNTT [DIMSLS]'] + 0.025 * row['T [oC]'] + 0.006 * row['CHLFa [ug/l]']
        elif not pd.isna(row['ZICHT [dm]']):
            log_kd = -0.673 * np.log10(row['ZICHT [dm]']) + 0.854
            row['E [/m]'] = 10 ** log_kd
    return row

# Apply the function to each row
merged_df = merged_df.apply(calculate_E, axis=1)

  log_kd = -0.673 * np.log10(row['ZICHT [dm]']) + 0.854


#### Im (Jm2d)	Daily Water Column Irradiance (J/m².d) (Peperzak 1993)	
- Im = DI (PAR) x (1 - EXP(-Kd x h)/(Kd x h)


In [106]:
# merged_df['IM [Jm2d]'] = merged_df['PAR [J/m2d]'] * (1-np.exp((-1 * [merged_df['E [/m]']]) * h))
merged_df['IM [Jm2d]'] = np.NaN
merged_df

  merged_df['IM [Jm2d]'] = np.NaN


Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Rte,Fja,Hak,Mhe,Dno,Dat,DIN,DIN:SRP,DIN:SI,IM [Jm2d]
0,DANTZGT,1990-01-10,15:00:00,135.00,2.0,4.0,20.178571,29.19,1.645161,7.80,...,,,,,,,55.357143,33.648459,2.743363,
1,DANTZGT,1990-02-06,13:40:00,295.00,0.5,6.0,,27.37,,,...,,,,,,,,,,
2,DANTZGT,1990-03-08,13:45:00,103.00,3.0,7.3,19.428571,24.99,0.709677,8.00,...,,,,,,,100.000000,140.909091,5.147059,
3,DANTZGT,1990-04-04,10:00:00,113.00,3.0,8.2,6.285714,28.79,0.806452,8.10,...,,,,,,,48.428571,60.051429,7.704545,
4,DANTZGT,1990-05-09,15:30:00,20.00,11.0,17.4,1.714286,33.28,1.161290,8.30,...,,,,,,,2.285714,1.968254,1.333333,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.00,0.577419,8.00,...,,,,,,,0.571429,0.989625,0.903955,
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275000,34.80,0.190323,7.97,...,,,,,,,2.878571,15.124697,2.257703,
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.90,0.206452,8.11,...,,,,,,,0.964286,4.670759,0.573248,
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.00,0.438710,8.13,...,,,,,,,7.464286,17.014181,2.069307,


#### Move abiotic columns to the front


In [107]:
abiotic_columns = ['LOC_CODE', 'DATUM', 'TIJD', 'ZS [mg/l]', 'ZICHT [dm]', 'T [oC]', 'SiO2 [umol/L]',
                    'SALNTT [DIMSLS]', 'PO4 [umol/L]', 'pH [DIMSLS]', 'NO3 [umol/L]', 'NO2 [umol/L]',
                      'NH4 [umol/L]', 'E [/m]', 'CHLFa [ug/l]', '    Q', 'PAR [J/m2d]', 'PAR [kJ/m2d]',
                        'kPAR_7d', 'kPAR_14d', 'DIN', 'DIN:SRP', 'DIN:SI', 'IM [Jm2d]']
biotic_columns = list(unique_SPEC)
merged_df = merged_df[abiotic_columns + biotic_columns]
merged_df

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-01-10,15:00:00,135.00,2.0,4.0,20.178571,29.19,1.645161,7.80,...,,,,,,,,,,
1,DANTZGT,1990-02-06,13:40:00,295.00,0.5,6.0,,27.37,,,...,,,,,,,,,,
2,DANTZGT,1990-03-08,13:45:00,103.00,3.0,7.3,19.428571,24.99,0.709677,8.00,...,,,,,,,,,,
3,DANTZGT,1990-04-04,10:00:00,113.00,3.0,8.2,6.285714,28.79,0.806452,8.10,...,,,,,,,,,,
4,DANTZGT,1990-05-09,15:30:00,20.00,11.0,17.4,1.714286,33.28,1.161290,8.30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.00,0.577419,8.00,...,,,,,,,,,,
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275000,34.80,0.190323,7.97,...,,,,,,,,,,
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.90,0.206452,8.11,...,,,,,,,,,,
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.00,0.438710,8.13,...,,,,,,,,,,


### Interpolation for missing values in ABIOTIC DATA

In [108]:
interpolation_columns = ['ZS [mg/l]', 'T [oC]', 'SiO2 [umol/L]',
                    'SALNTT [DIMSLS]', 'PO4 [umol/L]', 'pH [DIMSLS]', 'NO3 [umol/L]', 'NO2 [umol/L]',
                      'NH4 [umol/L]', 'E [/m]', 'CHLFa [ug/l]', 'DIN', 'DIN:SRP', 'DIN:SI']


missing_mask = merged_df[interpolation_columns].isna()
missing_mask

Unnamed: 0,ZS [mg/l],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],NO3 [umol/L],NO2 [umol/L],NH4 [umol/L],E [/m],CHLFa [ug/l],DIN,DIN:SRP,DIN:SI
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,True,True,True,True,True,False,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,False,False,False,False,False,False,False,False,False,False,False,False,False,False
13501,False,False,False,False,False,False,False,False,False,False,False,False,False,False
13502,False,False,False,False,False,False,False,False,False,False,False,False,False,False
13503,False,False,False,False,False,False,False,False,False,False,False,False,False,False


#### interpolate in interpolatable columns

In [109]:
merged_interpolated = merged_df.copy()


for column in interpolation_columns:
    merged_interpolated[column] = merged_interpolated[column].interpolate(method='linear')

merged_interpolated

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Mpe,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat
0,DANTZGT,1990-01-10,15:00:00,135.00,2.0,4.0,20.178571,29.19,1.645161,7.80,...,,,,,,,,,,
1,DANTZGT,1990-02-06,13:40:00,295.00,0.5,6.0,19.803571,27.37,1.177419,7.90,...,,,,,,,,,,
2,DANTZGT,1990-03-08,13:45:00,103.00,3.0,7.3,19.428571,24.99,0.709677,8.00,...,,,,,,,,,,
3,DANTZGT,1990-04-04,10:00:00,113.00,3.0,8.2,6.285714,28.79,0.806452,8.10,...,,,,,,,,,,
4,DANTZGT,1990-05-09,15:30:00,20.00,11.0,17.4,1.714286,33.28,1.161290,8.30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.00,0.577419,8.00,...,,,,,,,,,,
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275000,34.80,0.190323,7.97,...,,,,,,,,,,
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.90,0.206452,8.11,...,,,,,,,,,,
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.00,0.438710,8.13,...,,,,,,,,,,


#### Annotate interpolated values

In [110]:
def list_interpolated_columns(row, missing_mask):
    interpolated_columns = [col for col in merged_df[interpolation_columns].columns if missing_mask.loc[row.name, col]]
    return interpolated_columns 

merged_interpolated['interpolated_columns'] = merged_interpolated.apply(lambda row: list_interpolated_columns(row, missing_mask), axis=1)


In [111]:
merged_interpolated

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat,interpolated_columns
0,DANTZGT,1990-01-10,15:00:00,135.00,2.0,4.0,20.178571,29.19,1.645161,7.80,...,,,,,,,,,,[]
1,DANTZGT,1990-02-06,13:40:00,295.00,0.5,6.0,19.803571,27.37,1.177419,7.90,...,,,,,,,,,,"[SiO2 [umol/L], PO4 [umol/L], pH [DIMSLS], NO3..."
2,DANTZGT,1990-03-08,13:45:00,103.00,3.0,7.3,19.428571,24.99,0.709677,8.00,...,,,,,,,,,,[]
3,DANTZGT,1990-04-04,10:00:00,113.00,3.0,8.2,6.285714,28.79,0.806452,8.10,...,,,,,,,,,,[]
4,DANTZGT,1990-05-09,15:30:00,20.00,11.0,17.4,1.714286,33.28,1.161290,8.30,...,,,,,,,,,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,WALCRN70,2020-08-13,05:04:00,4.02,,18.5,0.632143,35.00,0.577419,8.00,...,,,,,,,,,,[]
13501,WALCRN70,2020-09-16,09:30:00,4.52,,18.5,1.275000,34.80,0.190323,7.97,...,,,,,,,,,,[]
13502,WALCRN70,2020-10-14,08:25:00,4.99,,15.7,1.682143,34.90,0.206452,8.11,...,,,,,,,,,,[]
13503,WALCRN70,2020-11-17,05:58:00,3.92,,13.9,3.607143,35.00,0.438710,8.13,...,,,,,,,,,,[]


### Write merged data to excel file

In [112]:
filename = '../../data/MERGED_DATA_INTERPOLATED.xlsx'
sheetname = 'MERGE'

with pd.ExcelWriter(filename, mode='w') as writer:  
    merged_interpolated.to_excel(writer, sheet_name=sheetname, index=False)

print(f"Data written to sheet '{sheetname}' in '{filename}'")

Data written to sheet 'MERGE' in '../../data/MERGED_DATA_INTERPOLATED.xlsx'
