In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def display_all(df):
    pd.set_option('display.max_rows', None)
    display(df)
    pd.reset_option('display.max_rows')

In [3]:
abiotic_df = pd.read_excel('../data/ABIO.xlsx', sheet_name='ABIO_SURF')
abiotic_df.head()

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
1,GROOTGND,EDG,1-4-2015 14:10:00,20150401,'1410,2015,4,1,2,2,Zwevende stof,WATSGL,ZS,,732.0,732.0,732.0,0,mg/l
2,GROOTGND,EDG,19-4-2016 15:38:00,20160419,'1538,2016,4,19,2,2,Zwevende stof,WATSGL,ZS,,712.0,712.0,712.0,0,mg/l
3,HUIBGOT,EDH,21-10-2004 05:59:00,20041021,'0559,2004,10,21,4,4,Zwevende stof,WATSGL,ZS,,710.0,,,56,mg/l
4,GROOTGND,EDG,13-10-1997 11:37:00,19971013,'1137,1997,10,13,4,4,Zwevende stof,WATSGL,ZS,,677.0,677.0,677.0,50,mg/l


### Get unique variables

In [4]:
unique_PAROMS = list(abiotic_df.PAROMS.unique())
unique_VARS = list(abiotic_df.VAR.unique())
unique_tuple = list(zip(unique_PAROMS, unique_VARS))
unique_tuple


[('Zwevende stof', 'ZS'),
 ('Doorzicht', 'ZICHT'),
 ('Temperatuur', 'T'),
 ('silicaat', 'SiO2'),
 ('Saliniteit', 'SALNTT'),
 ('orthofosfaat', 'PO4'),
 ('Zuurgraad', 'pH'),
 ('nitraat', 'NO3'),
 ('nitriet', 'NO2'),
 ('ammonium', 'NH4'),
 ('Extinctiecoefficient', 'E'),
 ('chlorofyl-a', 'CHLFa')]

In [5]:
locations = list(abiotic_df["LOC_CODE"].unique())
locations_abv = list(abiotic_df["LOC"].unique())
locations_tuple = list(zip(locations, locations_abv))
locations_tuple

[('GROOTGND', 'EDG'),
 ('HUIBGOT', 'EDH'),
 ('SCHAARVODDL', 'WSO'),
 ('DANTZGT', 'WZD'),
 ('VLISSGBISSVH', 'WSV'),
 ('MARSDND', 'WZM'),
 ('HANSWGL', 'WSH'),
 ('GOERE6', 'GOE6'),
 ('WALCRN2', 'WA2'),
 ('NOORDWK20', 'NW20'),
 ('ROTTMPT3', 'RP3'),
 ('SOELKKPDOT', 'VMS'),
 ('NOORDWK2', 'NW2'),
 ('WALCRN70', 'WA70'),
 ('NOORDWK70', 'NW70'),
 ('TERSLG10', 'TS10'),
 ('NOORDWK10', 'NW10'),
 ('WALCRN20', 'WA20'),
 ('TERSLG4', 'TS4'),
 ('ROTTMPT70', 'RP70'),
 ('LODSGT', 'OSL'),
 ('TERSLG135', 'TS135'),
 ('ROTTMPT50', 'RP50'),
 ('TERSLG235', 'TS235'),
 ('DREISR', 'GMD'),
 ('TERSLG100', 'TS100'),
 ('TERSLG175', 'TS175')]

### Testing ways to compute a single row

In [6]:
test_df = abiotic_df.loc[(abiotic_df["DATUM"] == 20170224) & (abiotic_df["LOC"] == 'EDG')]
test_df

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
19469,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Doorzicht,WATSGL,ZICHT,,1.0,1.0,1.0,0,dm
32178,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Temperatuur,WATSGL,T,,4.87,4.87,4.87,0,oC
45511,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,silicaat,WATSGL,SiO2,,2.75,2.75,98.214286,0,umol/L
58798,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Saliniteit,WATSGL,SALNTT,,14.4,14.4,14.4,0,DIMSLS
68666,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,orthofosfaat,WATSGL,PO4,,0.0435,0.0435,1.403226,0,umol/L
79738,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zuurgraad,WATSGL,pH,,8.04,8.04,8.04,0,DIMSLS
106104,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,nitriet,WATSGL,NO2,,0.0234,0.0234,1.671429,0,umol/L
119581,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,ammonium,WATSGL,NH4,,0.388,0.388,27.714286,0,umol/L
122815,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,nitraat,WATSGL,NO3,,2.09,2.09,149.285714,0,umol/L


In [7]:
def from_df_to_row_df(df: pd.DataFrame, location, time, col_list: list = ["LOC_CODE", "DATUMTIJDWAARDE", 'VAR',"VALUE", "KWC", "EHD"]) -> pd.DataFrame:
    """
    Transforms a batch of rows to a single row. Both as pd.DataFrame type.
    """
    row_df = df
    row_df = row_df[col_list]

    measurements = list(zip(row_df["VAR"], row_df["VALUE"], row_df["EHD"], row_df["KWC"]))
    measurements_dict = dict([t[:2] for t in measurements])
    loc, date = location, time

    row = pd.Series(measurements_dict)
    row["LOC_CODE"], row["DATUMTIJDWAARDE"] = loc, date

    final_df = row.to_frame().T

    return final_df

In [8]:
final_df = from_df_to_row_df(test_df, 'GROOTGND', '24-2-2017 15:29:00')
final_df

Unnamed: 0,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO2,NH4,NO3,E,CHLFa,LOC_CODE,DATUMTIJDWAARDE
0,1380.0,1.0,4.87,98.214286,14.4,1.403226,8.04,1.671429,27.714286,149.285714,,10.1,GROOTGND,24-2-2017 15:29:00


### Computing a single row for all data clusters in the set

In [9]:
# Group the DataFrame by 'LOC_CODE' and 'DATUMTIJDWAARDE'
column_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + unique_VARS
finished_df = pd.DataFrame(columns=column_list)

grouped = abiotic_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Iterate over the groups
for (location, time), group_df in grouped:
    # 'location' and 'time' are the current group keys
    # 'group_df' is the DataFrame for the current group
    # Process the group_df as needed
    # print(group_df)
    group = from_df_to_row_df(group_df, location, time)
    finished_df = pd.concat([finished_df, group], axis=0, join='outer')




In [10]:
# d = finished_df[finished_df['LOC_CODE'] == 'WALCRN70']
# d

# f = d.groupby(d["DATUMTIJDWAARDE"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)

# display(f)
finished_df

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa
0,DANTZGT,1-10-2000 09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0
0,DANTZGT,1-10-2002 09:30:00,,10.0,15.17,,31.18,,8.06,,,,,
0,DANTZGT,1-10-2003 07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6
0,DANTZGT,1-10-2009 10:19:00,,,15.0,,31.4,,,,,,,
0,DANTZGT,1-10-2010 08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,9-4-1992 09:19:00,5.0,,7.9,0.5,35.31,0.080645,8.3,3.642857,0.071429,0.642857,,4.64
0,WALCRN70,9-4-2001 23:16:00,6.0,,8.338,1.035714,34.67,0.129032,8.227,3.142857,0.142857,0.142857,,9.2
0,WALCRN70,9-6-1994 03:08:00,2.0,,12.1,0.321429,34.83,0.709677,8.5,0.178571,0.035714,0.285714,,2.6
0,WALCRN70,9-8-1999 19:15:00,2.0,,18.12,1.25,35.08,0.129032,7.989,0.107143,0.214286,0.5,,1.18


In [11]:
# d['datumpd'] = pd.to_datetime(d['DATUMTIJDWAARDE'], errors='coerce', infer_datetime_format=True)

# [d['DATUMTIJDWAARDE', 'datumpd']]

In [12]:
biotic_df = pd.read_excel('../data/PHYTO.xlsx', sheet_name='PHYTO_FLATTENED')

biotic_df

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,Acn,Aco,Ata,Agl,Cfu,Ccu,Cda,Cdeb,...,Rst,Rte,Stu,Tni,Tec,Tle,Tno,Tro,Tor,Dat
0,DANTZGT,1990-04-04 00:00:00,,,,3.271842,,,,,...,,,,4.669596,3.572755,3.572755,,4.475787,,
1,DANTZGT,1990-04-24 00:00:00,,,,4.590418,,,,,...,,,2.448706,3.447933,,,,3.447933,,
2,DANTZGT,1990-05-09 00:00:00,,,,4.669596,,,,,...,,,,,,,,4.012035,1.973128,
3,DANTZGT,1990-05-23 00:00:00,,,,,,,,,...,,,,,,,,3.590842,,
4,DANTZGT,1990-06-07 00:00:00,,,2.267172,5.300487,2.444045,,,,...,,,,3.566909,1.968483,3.442009,,3.496376,1.968483,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11058,WALCRN70,2019-06-13 00:00:00,,2.034717,,2.333737,,,,,...,,,,2.809514,,,,,2.333737,
11059,WALCRN70,2019-07-15 00:00:00,,,,,,,,,...,,,,,,,,,2.961517,
11060,WALCRN70,2019-08-15 11:37:00,,2.008643,,,,,,,...,,,,2.959085,,,,,3.004795,
11061,WALCRN70,2019-09-12 13:29:00,,,,,,2.980947,3.032049,2.980947,...,,,,2.380246,,,,,3.281750,


In [13]:

duplicates = finished_df

duplicates['DATUMTIJDWAARDE'] = pd.to_datetime(duplicates['DATUMTIJDWAARDE'], format='mixed')

duplicates['DATUMTIJDWAARDE'] = duplicates['DATUMTIJDWAARDE'].dt.date

duplicates = duplicates[duplicates.duplicated(subset=['DATUMTIJDWAARDE', 'LOC_CODE'], keep=False)].sort_values('DATUMTIJDWAARDE')

display(duplicates[2:])

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa
0,HANSWGL,1994-01-02,56.0,,,,,,,,,,,
0,HANSWGL,1994-01-02,102.0,,,,,,,,,,,2.64
0,HANSWGL,1994-01-02,90.0,3.0,5.8,145.0,6.41,4.83871,7.8,404.285714,4.642857,55.142857,,2.48
0,HANSWGL,1994-01-02,119.0,,,,,,,,,,,
0,NOORDWK10,1994-02-11,2.0,,,,,,,,,,,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,DANTZGT,2012-11-20,,,6.76,,27.6,,,,,,,
0,DANTZGT,2013-12-11,,,7.91,,26.8,,,,,,,
0,DANTZGT,2013-12-11,42.9,3.0,7.91,28.464286,26.8,0.425806,7.89,15.928571,2.35,19.5,,7.9
0,DANTZGT,2014-11-13,127.0,3.0,8.0,12.857143,27.4,0.625806,8.0,5.521429,2.2,25.571429,,16.7


In [31]:
grouped = duplicates.copy()

def check_conflicts(group):
    conflicting_columns = []
    for column in group.columns[2:]:
        non_nan_count = group[column].notna().sum()
        if non_nan_count > 1:
            conflicting_columns.append(column)
    return ', '.join(conflicting_columns)

conflicts = grouped.groupby(['LOC_CODE', 'DATUMTIJDWAARDE']).apply(check_conflicts).reset_index(name='Conflicts')

grouped = pd.merge(grouped, conflicts, on=['LOC_CODE', 'DATUMTIJDWAARDE'], how='left')

# display_all(grouped[grouped['LOC_CODE'] == 'NOORDWK20'])
display(grouped)

  conflicts = grouped.groupby(['LOC_CODE', 'DATUMTIJDWAARDE']).apply(check_conflicts).reset_index(name='Conflicts')


Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,Conflicts
0,NOORDWK10,1991-11-19,4.0,,9.9,16.5,32.29,2.129032,,37.071429,1.285714,5.428571,,0.78,
1,NOORDWK10,1991-11-19,,,,,,,8.1,,,,,,
2,HANSWGL,1994-01-02,56.0,,,,,,,,,,,,"ZS, CHLFa"
3,HANSWGL,1994-01-02,102.0,,,,,,,,,,,2.64,"ZS, CHLFa"
4,HANSWGL,1994-01-02,90.0,3.0,5.8,145.0,6.41,4.83871,7.8,404.285714,4.642857,55.142857,,2.48,"ZS, CHLFa"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3068,DANTZGT,2012-11-20,,,6.76,,27.6,,,,,,,,"T, SALNTT"
3069,DANTZGT,2013-12-11,,,7.91,,26.8,,,,,,,,"T, SALNTT"
3070,DANTZGT,2013-12-11,42.9,3.0,7.91,28.464286,26.8,0.425806,7.89,15.928571,2.35,19.5,,7.9,"T, SALNTT"
3071,DANTZGT,2014-11-13,127.0,3.0,8.0,12.857143,27.4,0.625806,8.0,5.521429,2.2,25.571429,,16.7,"T, SALNTT"


In [15]:
conflict_counts = grouped.groupby(grouped["Conflicts"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)


display(conflict_counts)

Unnamed: 0,Conflicts,count
0,,2201
1,ZS,316
2,"T, SALNTT, pH",244
3,"ZS, CHLFa",242
4,"ZICHT, T, pH",24
5,E,21
6,ZICHT,12
7,"T, SALNTT",6
8,"ZS, T, SALNTT, pH, CHLFa",5
9,"ZICHT, E",2


In [16]:
grouped[grouped['Conflicts'] == 'T, SALNTT, pH']

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,Conflicts
1732,NOORDWK10,2002-01-05,6.0,,10.7,1.571429,29.8,0.096774,8.084,18.857143,0.357143,1.785714,,13.0,"T, SALNTT, pH"
1733,NOORDWK10,2002-01-05,,,10.66,,29.91,,8.099,,,,1.14,,"T, SALNTT, pH"
1734,GOERE6,2002-02-01,,,5.671,,28.28,,7.93,,,,,,"T, SALNTT, pH"
1735,GOERE6,2002-02-01,13.0,,5.813,22.928571,28.55,1.064516,7.985,45.928571,3.214286,2.428571,,1.48,"T, SALNTT, pH"
1736,GOERE6,2002-02-01,,,,,,,,,,,0.0,,"T, SALNTT, pH"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2897,TERSLG4,2006-08-24,,,19.0,,32.86,,7.988,,,,,,"T, SALNTT, pH"
2898,TERSLG4,2006-08-24,,,18.99,,32.89,,7.914,,,,,,"T, SALNTT, pH"
3038,TERSLG135,2006-12-19,1.8,,,4.714286,,0.483871,,4.857143,0.035714,0.071429,,0.36,"T, SALNTT, pH"
3039,TERSLG135,2006-12-19,,,9.711,,34.69,,7.983,,,,,,"T, SALNTT, pH"


In [17]:
conflicts

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,Conflicts
0,DANTZGT,1994-03-29,"ZS, CHLFa"
1,DANTZGT,1994-08-22,"ZS, CHLFa"
2,DANTZGT,1994-09-06,"ZS, CHLFa"
3,DANTZGT,1994-11-17,"ZS, CHLFa"
4,DANTZGT,1995-02-14,"ZS, CHLFa"
...,...,...,...
1386,WALCRN70,2006-07-20,
1387,WALCRN70,2006-09-14,
1388,WALCRN70,2006-11-10,
1389,WALCRN70,2006-11-15,


In [18]:
# filename = '../data/ABIO.xlsx'
# sheetname = 'ABIO_COMBINED'

# with pd.ExcelWriter(filename, mode='a') as writer:  
#     finished_df.to_excel(writer, sheet_name=sheetname, index=False)

# print(f"Data written to sheet '{sheetname}' in '{filename}'")