In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
abiotic_df = pd.read_excel('../data/ABIO.xlsx', sheet_name='ABIO_SURF')
abiotic_df.head()

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
1,GROOTGND,EDG,1-4-2015 14:10:00,20150401,'1410,2015,4,1,2,2,Zwevende stof,WATSGL,ZS,,732.0,732.0,732.0,0,mg/l
2,GROOTGND,EDG,19-4-2016 15:38:00,20160419,'1538,2016,4,19,2,2,Zwevende stof,WATSGL,ZS,,712.0,712.0,712.0,0,mg/l
3,HUIBGOT,EDH,21-10-2004 05:59:00,20041021,'0559,2004,10,21,4,4,Zwevende stof,WATSGL,ZS,,710.0,,,56,mg/l
4,GROOTGND,EDG,13-10-1997 11:37:00,19971013,'1137,1997,10,13,4,4,Zwevende stof,WATSGL,ZS,,677.0,677.0,677.0,50,mg/l


### Get unique variables

In [4]:
unique_PAROMS = list(abiotic_df.PAROMS.unique())
unique_VARS = list(abiotic_df.VAR.unique())
unique_tuple = list(zip(unique_PAROMS, unique_VARS))
unique_tuple


[('Zwevende stof', 'ZS'),
 ('Doorzicht', 'ZICHT'),
 ('Temperatuur', 'T'),
 ('silicaat', 'SiO2'),
 ('Saliniteit', 'SALNTT'),
 ('orthofosfaat', 'PO4'),
 ('Zuurgraad', 'pH'),
 ('nitraat', 'NO3'),
 ('nitriet', 'NO2'),
 ('ammonium', 'NH4'),
 ('Extinctiecoefficient', 'E'),
 ('chlorofyl-a', 'CHLFa')]

In [5]:
locations = list(abiotic_df["LOC_CODE"].unique())
locations_abv = list(abiotic_df["LOC"].unique())
locations_tuple = list(zip(locations, locations_abv))
locations_tuple

[('GROOTGND', 'EDG'),
 ('HUIBGOT', 'EDH'),
 ('SCHAARVODDL', 'WSO'),
 ('DANTZGT', 'WZD'),
 ('VLISSGBISSVH', 'WSV'),
 ('MARSDND', 'WZM'),
 ('HANSWGL', 'WSH'),
 ('GOERE6', 'GOE6'),
 ('WALCRN2', 'WA2'),
 ('NOORDWK20', 'NW20'),
 ('ROTTMPT3', 'RP3'),
 ('SOELKKPDOT', 'VMS'),
 ('NOORDWK2', 'NW2'),
 ('WALCRN70', 'WA70'),
 ('NOORDWK70', 'NW70'),
 ('TERSLG10', 'TS10'),
 ('NOORDWK10', 'NW10'),
 ('WALCRN20', 'WA20'),
 ('TERSLG4', 'TS4'),
 ('ROTTMPT70', 'RP70'),
 ('LODSGT', 'OSL'),
 ('TERSLG135', 'TS135'),
 ('ROTTMPT50', 'RP50'),
 ('TERSLG235', 'TS235'),
 ('DREISR', 'GMD'),
 ('TERSLG100', 'TS100'),
 ('TERSLG175', 'TS175')]

### Testing ways to compute a single row

In [6]:
test_df = abiotic_df.loc[(abiotic_df["DATUM"] == 20170224) & (abiotic_df["LOC"] == 'EDG')]
test_df

Unnamed: 0,LOC_CODE,LOC,DATUMTIJDWAARDE,DATUM,TIJD,Year,Month,Day,Q_clndr,Q_eco,PAROMS,PLT:REFVLAK,VAR,BGC,Value_original,Value_interm,VALUE,KWC,EHD
0,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zwevende stof,WATSGL,ZS,,1380.0,1380.0,1380.0,0,mg/l
19469,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Doorzicht,WATSGL,ZICHT,,1.0,1.0,1.0,0,dm
32178,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Temperatuur,WATSGL,T,,4.87,4.87,4.87,0,oC
45511,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,silicaat,WATSGL,SiO2,,2.75,2.75,98.214286,0,umol/L
58798,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Saliniteit,WATSGL,SALNTT,,14.4,14.4,14.4,0,DIMSLS
68666,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,orthofosfaat,WATSGL,PO4,,0.0435,0.0435,1.403226,0,umol/L
79738,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,Zuurgraad,WATSGL,pH,,8.04,8.04,8.04,0,DIMSLS
106104,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,nitriet,WATSGL,NO2,,0.0234,0.0234,1.671429,0,umol/L
119581,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,ammonium,WATSGL,NH4,,0.388,0.388,27.714286,0,umol/L
122815,GROOTGND,EDG,24-2-2017 15:29:00,20170224,'1529,2017,2,24,1,1,nitraat,WATSGL,NO3,,2.09,2.09,149.285714,0,umol/L


In [7]:
def from_df_to_row_df(df: pd.DataFrame, location, time, col_list: list = ["LOC_CODE", "DATUMTIJDWAARDE", 'VAR',"VALUE", "KWC", "EHD"]) -> pd.DataFrame:
    """
    Transforms a batch of rows to a single row. Both as pd.DataFrame type.
    """
    row_df = df
    row_df = row_df[col_list]

    measurements = list(zip(row_df["VAR"], row_df["VALUE"], row_df["EHD"], row_df["KWC"]))
    measurements_dict = dict([t[:2] for t in measurements])
    loc, date = location, time

    row = pd.Series(measurements_dict)
    row["LOC_CODE"], row["DATUMTIJDWAARDE"] = loc, date

    final_df = row.to_frame().T

    return final_df

In [8]:
final_df = from_df_to_row_df(test_df, 'GROOTGND', '24-2-2017 15:29:00')
final_df

Unnamed: 0,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO2,NH4,NO3,E,CHLFa,LOC_CODE,DATUMTIJDWAARDE
0,1380.0,1.0,4.87,98.214286,14.4,1.403226,8.04,1.671429,27.714286,149.285714,,10.1,GROOTGND,24-2-2017 15:29:00


### Computing a single row for all data clusters in the set

In [9]:
# Group the DataFrame by 'LOC_CODE' and 'DATUMTIJDWAARDE'
column_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + unique_VARS
finished_df = pd.DataFrame(columns=column_list)

grouped = abiotic_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Iterate over the groups
for (location, time), group_df in grouped:
    # 'location' and 'time' are the current group keys
    # 'group_df' is the DataFrame for the current group
    # Process the group_df as needed
    # print(group_df)
    group = from_df_to_row_df(group_df, location, time)
    finished_df = pd.concat([finished_df, group], axis=0, join='outer')


display(finished_df)

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa
0,DANTZGT,1-10-2000 09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0
0,DANTZGT,1-10-2002 09:30:00,,10.0,15.17,,31.18,,8.06,,,,,
0,DANTZGT,1-10-2003 07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6
0,DANTZGT,1-10-2009 10:19:00,,,15.0,,31.4,,,,,,,
0,DANTZGT,1-10-2010 08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,9-4-1992 09:19:00,5.0,,7.9,0.5,35.31,0.080645,8.3,3.642857,0.071429,0.642857,,4.64
0,WALCRN70,9-4-2001 23:16:00,6.0,,8.338,1.035714,34.67,0.129032,8.227,3.142857,0.142857,0.142857,,9.2
0,WALCRN70,9-6-1994 03:08:00,2.0,,12.1,0.321429,34.83,0.709677,8.5,0.178571,0.035714,0.285714,,2.6
0,WALCRN70,9-8-1999 19:15:00,2.0,,18.12,1.25,35.08,0.129032,7.989,0.107143,0.214286,0.5,,1.18


In [17]:
display(finished_df.DATUMTIJDWAARDE)

finished_df['test'] = pd.to_datetime(finished_df.DATUMTIJDWAARDE, errors='coerce', infer_datetime_format=True)


display(finished_df)

0    1-10-2000 09:30:00
0    1-10-2002 09:30:00
0    1-10-2003 07:11:00
0    1-10-2009 10:19:00
0    1-10-2010 08:02:00
            ...        
0     9-4-1992 09:19:00
0     9-4-2001 23:16:00
0     9-6-1994 03:08:00
0     9-8-1999 19:15:00
0     9-8-1999 19:34:00
Name: DATUMTIJDWAARDE, Length: 15293, dtype: object

  finished_df['test'] = pd.to_datetime(finished_df.DATUMTIJDWAARDE, errors='coerce', infer_datetime_format=True)


Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,test
0,DANTZGT,1-10-2000 09:30:00,26.0,,,12.428571,,1.483871,,1.142857,1.0,9.5,,8.0,2000-01-10 09:30:00
0,DANTZGT,1-10-2002 09:30:00,,10.0,15.17,,31.18,,8.06,,,,,,2002-01-10 09:30:00
0,DANTZGT,1-10-2003 07:11:00,89.0,5.0,13.25,5.821429,33.03,1.354839,8.13,1.214286,0.714286,9.142857,3.23678,20.6,2003-01-10 07:11:00
0,DANTZGT,1-10-2009 10:19:00,,,15.0,,31.4,,,,,,,,2009-01-10 10:19:00
0,DANTZGT,1-10-2010 08:02:00,160.0,3.0,12.4,18.928571,28.9,0.806452,8.0,3.571429,1.642857,15.714286,4.61,16.0,2010-01-10 08:02:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,9-4-1992 09:19:00,5.0,,7.9,0.5,35.31,0.080645,8.3,3.642857,0.071429,0.642857,,4.64,1992-09-04 09:19:00
0,WALCRN70,9-4-2001 23:16:00,6.0,,8.338,1.035714,34.67,0.129032,8.227,3.142857,0.142857,0.142857,,9.2,2001-09-04 23:16:00
0,WALCRN70,9-6-1994 03:08:00,2.0,,12.1,0.321429,34.83,0.709677,8.5,0.178571,0.035714,0.285714,,2.6,1994-09-06 03:08:00
0,WALCRN70,9-8-1999 19:15:00,2.0,,18.12,1.25,35.08,0.129032,7.989,0.107143,0.214286,0.5,,1.18,1999-09-08 19:15:00


In [25]:
pd.set_option('display.max_rows', 200)

In [26]:
# Assuming 'finished_df' is your DataFrame and 'DATUMTIJDWAARDE' is the column with the datetime strings
# This regex pattern matches strings that do not have a time component after the date
pattern = r'^\d{1,2}-\d{1,2}-\d{4}$'

# Create a boolean mask where True indicates rows with only date and no time
mask = finished_df['DATUMTIJDWAARDE'].str.match(pattern)

# Use the mask to filter the DataFrame and find all entries with only date
entries_without_time = finished_df[mask]




# Display the entries without time
display(entries_without_time)

entries_without_time["T"].unique()

Unnamed: 0,LOC_CODE,DATUMTIJDWAARDE,ZS,ZICHT,T,SiO2,SALNTT,PO4,pH,NO3,NO2,NH4,E,CHLFa,test
0,DANTZGT,1-2-1996,,,,,,,,,,,,,NaT
0,DANTZGT,16-12-2013,,,,,,,,,,,,,NaT
0,DANTZGT,16-2-2012,,,,,,,,,,,,,NaT
0,DREISR,8-1-1997,,,,,,,,,,,,,NaT
0,GOERE6,10-3-1997,,,,,,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,WALCRN70,19-6-1990,,,,,,,,,,,,,NaT
0,WALCRN70,5-2-1990,,,,,,,,,,,,,NaT
0,WALCRN70,8-6-1998,,,,,,,,,,,,,NaT
0,WALCRN70,9-2-2004,,,,,,,,,,,,,NaT


array([nan, 16.7, 15.6, 16.76, 14.9, 16.2, 16.0, 14.0, 21.5, 14.6, 14.1,
       12.0, 13.5, 13.9, 14.8], dtype=object)

LOC_CODE            DANTZGT
DATUMTIJDWAARDE    1-2-1996
ZS                      NaN
ZICHT                   NaN
T                       NaN
SiO2                    NaN
SALNTT                  NaN
PO4                     NaN
pH                      NaN
NO3                     NaN
NO2                     NaN
NH4                     NaN
E                       NaN
CHLFa                   NaN
test                    NaT
Name: 0, dtype: object

In [None]:
filename = '../data/ABIO.xlsx'
sheetname = 'ABIO_COMBINED'

with pd.ExcelWriter(filename, mode='a') as writer:  
    finished_df.to_excel(writer, sheet_name=sheetname, index=False)

print(f"Data written to sheet '{sheetname}' in '{filename}'")

Data written to sheet 'ABIO_COMBINED' in '../data/ABIO.xlsx'
