In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def display_all(df):
    pd.set_option('display.max_rows', None)
    display(df)
    pd.reset_option('display.max_rows')

In [79]:
# abiotic_df = pd.read_excel('../data/ABIO.xlsx', sheet_name='ABIO_SURF')
# abiotic_df.head()

In [80]:
biotic_df = pd.read_excel('../data/PHYTO.xlsx', sheet_name='PHYTO_SURF')
biotic_df.head()


Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATE_SMP,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


In [81]:
biotic_df.columns

Index(['LOC_CODE', 'LOC', 'WATERBODY', 'TYPE', 'DATE_SMP', 'YEAR', 'MONTH',
       'DAY', 'Q_clndr', 'Q_eco', 'PROD_CODE', 'SPECIES', 'SPEC', 'GROUP',
       'AMT_MEAS', 'cL', 'LcL'],
      dtype='object')

### Get unique variables

In [82]:
locations = biotic_df["LOC_CODE"].unique()

print(np.sort(locations), '\n')

grouped_locations = biotic_df.groupby(["LOC", "LOC_CODE"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)

# display(grouped_locations)


on_map = {'RT3', 'RT50', 'RT70', 
          'TS4', 'TS10', 'TS100', 'TS135', 'TS175', 'TS235', 
          'NW2', 'NW10', 'NW20', 'NW70',
          'GR6',
          'WC2', 'WC20', 'WC70',
          # waddeneilanden
          'ED30', 'ED250',
          'WZ30', 'WZ420', 'WZ590',
          # zeeland
          'GM40',
          'OS110', 'OS40', 'OS140', 'OS10',
          'VM50',
          'WS100', 'WS160', 'WSSVD'
         }

filtered_locations = grouped_locations[~grouped_locations["LOC"].isin(on_map)]

display(filtered_locations)

['DANTZGT' 'DREISR' 'GOERE6' 'GROOTGND' 'HANSWGL' 'HUIBGOT' 'LODSGT'
 'MARSDND' 'NOORDWK10' 'NOORDWK2' 'NOORDWK20' 'NOORDWK70' 'ROTTMPT3'
 'ROTTMPT50' 'ROTTMPT70' 'SCHAARVODDL' 'SOELKKPDOT' 'TERSLG10' 'TERSLG100'
 'TERSLG135' 'TERSLG175' 'TERSLG235' 'TERSLG4' 'VLISSGBISSVH' 'WALCRN2'
 'WALCRN20' 'WALCRN70'] 



Unnamed: 0,LOC,LOC_CODE,count
1,EDH,HUIBGOT,6954
2,WZM,MARSDND,6927
3,WZD,DANTZGT,6460
4,WSV,VLISSGBISSVH,5926
7,WSH,HANSWGL,4765
10,OSL,LODSGT,3882
11,WA20,WALCRN20,3347
13,EDG,GROOTGND,3236
14,GMD,DREISR,3192
15,WA2,WALCRN2,3035


#### Duplicate species

In [83]:
species = biotic_df.SPECIES.unique()
spec = biotic_df.SPEC.unique()

print(len(species))
print(len(spec))


83
81


In [84]:
# Group by 'species' and aggregate unique 'spec' values
spec_to_species = biotic_df.groupby('SPEC')['SPECIES'].unique()

# Convert the result to a DataFrame
spec_to_species_df = spec_to_species.reset_index()

# Rename the columns for clarity
spec_to_species_df.columns = ['Abrv','Species']

display(spec_to_species_df)


Unnamed: 0,Abrv,Species
0,Acn,[Actinocyclus normanii]
1,Aco,[Actinocyclus octonarius]
2,Agl,"[Asterionella glacialis, Asterionellopsis glac..."
3,Ata,[Alexandrium tamarense]
4,Cau,[Corymbellus aureus]
...,...,...
76,Tle,[Thalassiosira levanderi]
77,Tni,[Thalassionema nitzschioides]
78,Tno,[Thalassiosira nordenskioeldii]
79,Tor,[Torodinium robustum]


In [85]:
# Find species with more than one abbreviation
phyto_duplicates = spec_to_species_df[spec_to_species_df['Species'].apply(len) > 1]

print(phyto_duplicates)



   Abrv                                            Species
2   Agl  [Asterionella glacialis, Asterionellopsis glac...
13  Cha                 [Chattonella, Chattonella antiqua]


In [86]:
# Species counts
species_counts = biotic_df.groupby(biotic_df["SPEC"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)

display(species_counts)

Unnamed: 0,SPEC,count
0,Pha,7735
1,Gsp,4542
2,Tor,4371
3,Kgl,4250
4,Rse,4115
...,...,...
76,Pde,251
77,Hak,227
78,Ata,182
79,Dno,120


#### Waterbodies

In [87]:
waterbodies = biotic_df["WATERBODY"].unique()

print(np.sort(waterbodies), '\n')

grouped_waterbodies = biotic_df.groupby(biotic_df["WATERBODY"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)

display(grouped_waterbodies)

['EEMSDOLLARD' 'GREVELINGENMEER' 'NOORDZEE' 'OOSTERSCHELDE' 'VEERSEMEER'
 'WADDENZEE' 'WESTERSCHELDE'] 



Unnamed: 0,WATERBODY,count
0,NOORDZEE,60099
1,WADDENZEE,13387
2,WESTERSCHELDE,12580
3,EEMSDOLLARD,10190
4,OOSTERSCHELDE,3882
5,GREVELINGENMEER,3192
6,VEERSEMEER,1952


#### By type

In [88]:
types = biotic_df["TYPE"].unique()

print(np.sort(types), '\n')

grouped_types = biotic_df.groupby(biotic_df["TYPE"]).size().reset_index(name='count').sort_values(by='count', ascending=False).reset_index(drop=True)

display(grouped_types)

['COAST' 'ESTUARINE' 'LAKE' 'OPENSEA'] 



Unnamed: 0,TYPE,count
0,ESTUARINE,40039
1,COAST,38570
2,OPENSEA,21529
3,LAKE,5144


#### Sorting by location -> species -> time


In [203]:
# Sort the DataFrame and then reset the index
sorted_biotic_df = biotic_df.sort_values(by=["LOC_CODE", "SPEC", "DATE_SMP"]).reset_index(drop=True)

display(sorted_biotic_df)

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATE_SMP,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,DANTZGT,WZD,WADDENZEE,ESTUARINE,1991-02-13,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,704.000000,
1,DANTZGT,WZD,WADDENZEE,ESTUARINE,1991-04-25,,,,,2,WATSGL,Actinocyclus normanii,Acn,DIAT,,94.000000,
2,DANTZGT,WZD,WADDENZEE,ESTUARINE,1993-10-13,,,,,4,WATSGL,Actinocyclus normanii,Acn,DIAT,,1127.000000,
3,DANTZGT,WZD,WADDENZEE,ESTUARINE,2000-11-22,,,,,4,OW,Actinocyclus normanii,Acn,DIAT,2.0,15384.615385,
4,DANTZGT,WZD,WADDENZEE,ESTUARINE,2000-12-20,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,2.0,1666.666667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105277,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-02-17,,,,,1,OW,Thalassiosira rotula,Tro,DIAT,32.0,2348.079113,
105278,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-03-10,,,,,2,OW,Thalassiosira rotula,Tro,DIAT,4.0,2651.591265,
105279,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-12-15,,,,,1,OW,Thalassiosira rotula,Tro,DIAT,2.0,86.610081,
105280,WALCRN70,WA70,NOORDZEE,OPENSEA,2014-03-18,,,,,2,OW,Thalassiosira rotula,Tro,DIAT,47.0,6419.512195,


## Phyto cleanup

In [204]:
phyto_df = sorted_biotic_df
phyto_df['DATE_SMP'] = pd.to_datetime(phyto_df['DATE_SMP'].dt.date)
phyto_df.rename(columns={'DATE_SMP':'DATUM'}, inplace=True)

display(phyto_df)

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATUM,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,DANTZGT,WZD,WADDENZEE,ESTUARINE,1991-02-13,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,704.000000,
1,DANTZGT,WZD,WADDENZEE,ESTUARINE,1991-04-25,,,,,2,WATSGL,Actinocyclus normanii,Acn,DIAT,,94.000000,
2,DANTZGT,WZD,WADDENZEE,ESTUARINE,1993-10-13,,,,,4,WATSGL,Actinocyclus normanii,Acn,DIAT,,1127.000000,
3,DANTZGT,WZD,WADDENZEE,ESTUARINE,2000-11-22,,,,,4,OW,Actinocyclus normanii,Acn,DIAT,2.0,15384.615385,
4,DANTZGT,WZD,WADDENZEE,ESTUARINE,2000-12-20,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,2.0,1666.666667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105277,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-02-17,,,,,1,OW,Thalassiosira rotula,Tro,DIAT,32.0,2348.079113,
105278,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-03-10,,,,,2,OW,Thalassiosira rotula,Tro,DIAT,4.0,2651.591265,
105279,WALCRN70,WA70,NOORDZEE,OPENSEA,2010-12-15,,,,,1,OW,Thalassiosira rotula,Tro,DIAT,2.0,86.610081,
105280,WALCRN70,WA70,NOORDZEE,OPENSEA,2014-03-18,,,,,2,OW,Thalassiosira rotula,Tro,DIAT,47.0,6419.512195,


### Merging identical species with different names

In [205]:
# Odontella with Phaeocystis, Rhizosolenia delicatula with Guinardia
phyto_df["SPEC"] = phyto_df["SPEC"].replace({"Oau": "Pha", "Rde": "Gde" })

In [206]:
# Check that everything went right
spec_to_species_df = phyto_df.groupby('SPEC')['SPECIES'].unique().reset_index()
spec_to_species_df.columns = ['Abrv','Species']

# display_all(spec_to_species_df)

phyto_duplicates = spec_to_species_df[spec_to_species_df['Species'].apply(len) > 1]
print(phyto_duplicates)

   Abrv                                            Species
2   Agl  [Asterionella glacialis, Asterionellopsis glac...
13  Cha                 [Chattonella, Chattonella antiqua]
31  Gde    [Guinardia delicatula, Rhizosolenia delicatula]
59  Pha                    [Odontella aurita, Phaeocystis]


In [207]:
# conflict between species/group after merge, so need to drop these columns

phyto_df.drop([col for col in phyto_df.columns if col not in ["LOC_CODE", "SPEC", "DATUM", "cL"]], axis=1, inplace=True)

display(phyto_df)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,1991-02-13,Acn,704.000000
1,DANTZGT,1991-04-25,Acn,94.000000
2,DANTZGT,1993-10-13,Acn,1127.000000
3,DANTZGT,2000-11-22,Acn,15384.615385
4,DANTZGT,2000-12-20,Acn,1666.666667
...,...,...,...,...
105277,WALCRN70,2010-02-17,Tro,2348.079113
105278,WALCRN70,2010-03-10,Tro,2651.591265
105279,WALCRN70,2010-12-15,Tro,86.610081
105280,WALCRN70,2014-03-18,Tro,6419.512195


### Aggregating measurements on same day

In [218]:
phyto_duplicates = phyto_df[phyto_df.duplicated(subset=["LOC_CODE", "DATUM", "SPEC"], keep=False)].sort_values(by=["LOC_CODE", "DATUM", "SPEC"]).reset_index(drop=True)
phyto_non_duplicates = phyto_df[~phyto_df.duplicated(subset=["LOC_CODE", "DATUM", "SPEC"], keep=False)]

display(phyto_duplicates)
# display(phyto_non_duplicates)

n_duplicates = len(phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).size())
display(n_duplicates)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,1991-03-14,Pha,1.464660e+05
1,DANTZGT,1991-03-14,Pha,8.246900e+04
2,DANTZGT,1991-05-15,Pha,2.097000e+03
3,DANTZGT,1991-05-15,Pha,2.044940e+07
4,DANTZGT,1992-06-11,Pha,4.016000e+03
...,...,...,...,...
7124,WALCRN70,2019-07-15,Pha,1.493793e+04
7125,WALCRN70,2019-08-15,Rse,1.111111e+03
7126,WALCRN70,2019-08-15,Rse,7.070707e+02
7127,WALCRN70,2019-09-12,Pha,3.954365e+04


3083

In [219]:
# distinct_counts = phyto_duplicates.groupby(['LOC_CODE', 'DATUM', 'SPEC']).agg(lambda x: x.nunique())
# display(distinct_counts)

In [220]:
display(phyto_duplicates['SPEC'].value_counts())

SPEC
Pha    6833
Rse     290
Gsp       2
Kgl       2
Tor       2
Name: count, dtype: int64

In [228]:
def spec_aggregate(group):
    if any(spec in group["SPEC"].values for spec in ["Agl", "Cha", "Pha", "Gde"]):
        return group['cL'].sum()
    else:
        return group['cL'].mean()

phyto_aggregated = phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).apply(spec_aggregate).reset_index(name='cL')

display(phyto_aggregated)

  phyto_aggregated = phyto_duplicates.groupby(["LOC_CODE", "DATUM", "SPEC"]).apply(spec_aggregate).reset_index(name='cL')


Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
0,DANTZGT,1991-03-14,Pha,2.289350e+05
1,DANTZGT,1991-05-15,Pha,2.045150e+07
2,DANTZGT,1992-06-11,Pha,1.586752e+07
3,DANTZGT,1992-06-24,Pha,6.142860e+05
4,DANTZGT,1992-07-28,Pha,2.185410e+05
...,...,...,...,...
3078,WALCRN70,2019-05-16,Pha,1.581818e+07
3079,WALCRN70,2019-06-13,Pha,9.864151e+04
3080,WALCRN70,2019-07-15,Pha,7.842412e+04
3081,WALCRN70,2019-08-15,Rse,9.090909e+02


In [229]:
final_phyto = pd.concat([phyto_non_duplicates, phyto_aggregated]).sort_values(by=["LOC_CODE", "DATUM", "SPEC"])


display(final_phyto)

Unnamed: 0,LOC_CODE,DATUM,SPEC,cL
52,DANTZGT,1990-04-04,Agl,1869.000000
681,DANTZGT,1990-04-04,Dbr,56075.000000
1376,DANTZGT,1990-04-04,Ezo,3738.000000
4913,DANTZGT,1990-04-04,Gde,3738.000000
2872,DANTZGT,1990-04-04,Omo,5607.000000
...,...,...,...,...
96751,WALCRN70,2019-10-30,Kgl,5123.014487
96913,WALCRN70,2019-10-30,Nsi,102.021174
97832,WALCRN70,2019-10-30,Tni,714.148219
98096,WALCRN70,2019-10-30,Tor,8050.451336


### Flatten BIOTIC_DF


In [223]:
biotic_df.head()

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATUMTIJDWAARDE,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


In [224]:
biotic_df.rename(columns={'DATE_SMP':'DATUMTIJDWAARDE'}, inplace=True)
biotic_df.head()

Unnamed: 0,LOC_CODE,LOC,WATERBODY,TYPE,DATUMTIJDWAARDE,YEAR,MONTH,DAY,Q_clndr,Q_eco,PROD_CODE,SPECIES,SPEC,GROUP,AMT_MEAS,cL,LcL
0,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2005-01-21,,,,,1,OW,Actinocyclus normanii,Acn,DIAT,1.0,5000.0,
1,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-01-25,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,2000.0,
2,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,1994-02-24,,,,,1,WATSGL,Actinocyclus normanii,Acn,DIAT,,485.0,
3,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2001-03-22,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,1.0,833.333333,
4,GROOTGND,EDG,EEMSDOLLARD,ESTUARINE,2002-03-13,,,,,2,OW,Actinocyclus normanii,Acn,DIAT,2.0,4000.0,


In [225]:
def from_df_to_row_df(df: pd.DataFrame, location, time, col_list: list = ["LOC_CODE", "DATUMTIJDWAARDE", 'SPEC', "LcL"]) -> pd.DataFrame:
    """
    Transforms a batch of rows to a single row. Both as pd.DataFrame type.
    """
    row_df = df
    row_df = row_df[col_list]

    measurements = list(zip(row_df["SPEC"], row_df["LcL"]))
    measurements_dict = dict([t[:2] for t in measurements])
    loc, date = location, time

    row = pd.Series(measurements_dict)
    row["LOC_CODE"], row["DATUMTIJDWAARDE"] = loc, date

    final_df = row.to_frame().T

    return final_df

In [226]:
list(spec)

['Acn',
 'Aco',
 'Ata',
 'Agl',
 'Cfu',
 'Ccu',
 'Cda',
 'Cdeb',
 'Cden',
 'Cdi',
 'Cei',
 'Csu',
 'Cha',
 'Coc',
 'Cau',
 'Cgr',
 'Cra',
 'Cwa',
 'Dpu',
 'Dsp',
 'Dac',
 'Dno',
 'Dro',
 'Dip',
 'Dle',
 'Dbr',
 'Etr',
 'Ezo',
 'Edu',
 'Fja',
 'Gde',
 'Gfl',
 'Gsp',
 'Gfa',
 'Hta',
 'Hak',
 'Kgl',
 'Lan',
 'Lun',
 'Mhe',
 'Mnu',
 'Mpe',
 'Ndi',
 'Nsi',
 'Nsc',
 'Oro',
 'Oau',
 'Omo',
 'Ore',
 'Orh',
 'Osi',
 'Pha',
 'Pan',
 'Pos',
 'Pba',
 'Pmi',
 'Ptr',
 'Pac',
 'Pbi',
 'Pbr',
 'Pcl',
 'Pco',
 'Pde',
 'Pst',
 'Psu',
 'Pse',
 'Pte',
 'Plo',
 'Ram',
 'Rde',
 'Rse',
 'Rst',
 'Rte',
 'Stu',
 'Tni',
 'Tec',
 'Tle',
 'Tno',
 'Tro',
 'Tor',
 'Dat']

In [227]:
# Group the DataFrame by 'LOC_CODE' and 'DATUMTIJDWAARDE'
column_list = ["LOC_CODE", "DATUMTIJDWAARDE"] + list(spec)
finished_df = pd.DataFrame(columns=column_list)

grouped = biotic_df.groupby(['LOC_CODE', 'DATUMTIJDWAARDE'])

# Iterate over the groups
for (location, time), group_df in grouped:
    # 'location' and 'time' are the current group keys
    # 'group_df' is the DataFrame for the current group
    # Process the group_df as needed
    # print(group_df)
    group = from_df_to_row_df(group_df, location, time)
    finished_df = pd.concat([finished_df, group], axis=0, join='outer')

finished_df.head()

KeyboardInterrupt: 

In [None]:
# filename = '../data/PHYTO.xlsx'
# sheetname = 'PHYTO_FLATTENED'

# with pd.ExcelWriter(filename, mode='a') as writer:  
#     finished_df.to_excel(writer, sheet_name=sheetname, index=False)

# print(f"Data written to sheet '{sheetname}' in '{filename}'")

Data written to sheet 'PHYTO_FLATTENED' in '../data/PHYTO.xlsx'
