In [3]:
import numpy as np
import math
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input, Masking, Bidirectional
from keras import metrics

In [6]:
# Load the merged dataset from the 'MERGE_FINAL' worksheet of the workbook.
df = pd.read_excel(
    io='../../data/MERGED_DATA_180624.xlsx',
    sheet_name='MERGE_FINAL',
)

# Preview the first rows; as the last expression this is the cell's output.
df.head()

Unnamed: 0,LOC_CODE,DATUM,TIJD,ZS [mg/l],ZICHT [dm],T [oC],SiO2 [umol/L],SALNTT [DIMSLS],PO4 [umol/L],pH [DIMSLS],...,Pde,Plo,Dpu,Rte,Fja,Hak,Mhe,Dno,Dat,interpolated_columns
0,DANTZGT,1990-01-10,15:00:00,135.0,2.0,4.0,20.178571,29.19,1.645161,7.8,...,,,,,,,,,,[]
1,DANTZGT,1990-02-06,13:40:00,295.0,0.5,6.0,19.803571,27.37,1.177419,7.9,...,,,,,,,,,,"['SiO2 [umol/L]', 'PO4 [umol/L]', 'pH [DIMSLS]..."
2,DANTZGT,1990-03-08,13:45:00,103.0,3.0,7.3,19.428571,24.99,0.709677,8.0,...,,,,,,,,,,[]
3,DANTZGT,1990-04-04,10:00:00,113.0,3.0,8.2,6.285714,28.79,0.806452,8.1,...,,,,,,,,,,[]
4,DANTZGT,1990-05-09,15:30:00,20.0,11.0,17.4,1.714286,33.28,1.16129,8.3,...,,,,,,,,,,[]


## Defining the Model

In [7]:
class BiLSTMEncoder:
    """Encode a sequence into a flat vector using a single ``nn.LSTM``.

    The wrapped LSTM is built with ``hidden_size == input_size`` and
    ``bidirectional=False``; despite the class name it is therefore a
    unidirectional encoder. No training is performed by this class, so the
    encoding produced depends on PyTorch's random weight initialisation.

    Args:
        input_size: Number of features per time step. Also used as the LSTM
            hidden size.
        masking_value: Accepted for interface compatibility; not used.
        optimizer: Accepted for interface compatibility; not used.
        loss: Accepted for interface compatibility; not used.
        chosen_metrics: Accepted for interface compatibility; not used.
            Defaults to ``None``.
    """

    def __init__(self, input_size, masking_value=-1.0, optimizer="Adam", loss="MSE", chosen_metrics=None):
        # `chosen_metrics` defaults to None instead of a list literal: a list
        # default is created once at definition time and shared by every call,
        # and building it from Keras metric functions would make this
        # PyTorch-only class depend on Keras merely to be defined.
        self.encoder = nn.LSTM(
            input_size=input_size,
            hidden_size=input_size,
            bidirectional=False,
        )

    def encode(self, input):
        """Run ``input`` through the LSTM and return its final cell state, flattened.

        Args:
            input: Tensor accepted by ``nn.LSTM`` whose last dimension equals
                the ``input_size`` given at construction.

        Returns:
            The final cell state reshaped to one dimension. For a 2-D
            ``(seq_len, input_size)`` input this has ``input_size`` elements.
        """
        # nn.LSTM returns (output, (h_n, c_n)); only the cell state c_n is kept.
        _, (_, cell_state) = self.encoder(input)
        return cell_state.reshape(-1)

## Pre-Processing and Masking of the Missing Values

In [8]:
# All column labels of the merged frame.
columns = df.columns.tolist()

# Columns that are excluded when deriving the phytoplankton column set.
non_phyto_columns = [
    'TIJD',
    'ZS [mg/l]',
    'ZICHT [dm]',
    'T [oC]',
    'SiO2 [umol/L]',
    'SALNTT [DIMSLS]',
    'PO4 [umol/L]',
    'pH [DIMSLS]',
    'NO3 [umol/L]',
    'NO2 [umol/L]',
    'NH4 [umol/L]',
    'E [/m]',
    'E_method',
    'CHLFa [ug/l]',
    '    Q',
    'PAR [J/m2d]',
    'PAR [kJ/m2d]',
    'kPAR_7d',
    'kPAR_14d',
    'DIN',
    'DIN:SRP',
    'DIN:SI',
    'SRP:SI',
    'IM [Jm2d]',
    'interpolated_columns',
]

# Location / date identifier columns, kept at the front of the subset.
loc_date_columns = ["LOC_CODE", "DATUM"]

# Whatever remains after removing both lists above, in alphabetical order.
phyto_columns = sorted(
    set(columns).difference(non_phyto_columns, loc_date_columns)
)

# Identifier columns followed by the phytoplankton columns.
phyto_df = df[[*loc_date_columns, *phyto_columns]]

print(phyto_df.columns)

Index(['LOC_CODE', 'DATUM', 'Acn', 'Aco', 'Agl', 'Ata', 'Cau', 'Ccu', 'Cda',
       'Cdeb', 'Cden', 'Cdi', 'Cei', 'Cfu', 'Cgr', 'Cha', 'Coc', 'Cra', 'Csu',
       'Cwa', 'Dac', 'Dat', 'Dbr', 'Dip', 'Dle', 'Dno', 'Dpu', 'Dro', 'Dsp',
       'Edu', 'Etr', 'Ezo', 'Fja', 'Gde', 'Gfa', 'Gfl', 'Gsp', 'Hak', 'Hta',
       'Kgl', 'Lan', 'Lun', 'Mhe', 'Mnu', 'Mpe', 'Ndi', 'Nsc', 'Nsi', 'Oau',
       'Omo', 'Ore', 'Orh', 'Oro', 'Osi', 'Pac', 'Pan', 'Pba', 'Pbi', 'Pbr',
       'Pcl', 'Pco', 'Pde', 'Pha', 'Plo', 'Pmi', 'Pos', 'Pse', 'Pst', 'Psu',
       'Pte', 'Ptr', 'Ram', 'Rse', 'Rst', 'Rte', 'Stu', 'Tec', 'Tle', 'Tni',
       'Tno', 'Tor', 'Tro'],
      dtype='object')


In [9]:
# Preview the phytoplankton subset as it looks before any gap filling.
display(phyto_df.head(n=5))

# Disabled alternative: replace missing values with a sentinel padding value.
# padding_value = -1.0
# phyto_df = phyto_df.replace(np.nan, padding_value)
# phyto_df.head()

Unnamed: 0,LOC_CODE,DATUM,Acn,Aco,Agl,Ata,Cau,Ccu,Cda,Cdeb,...,Rse,Rst,Rte,Stu,Tec,Tle,Tni,Tno,Tor,Tro
0,DANTZGT,1990-01-10,,,,,,,,,...,,,,,,,,,,
1,DANTZGT,1990-02-06,,,,,,,,,...,,,,,,,,,,
2,DANTZGT,1990-03-08,,,,,,,,,...,,,,,,,,,,
3,DANTZGT,1990-04-04,,,3.271842,,,,,,...,3.271842,,,,3.572755,3.572755,4.669596,,,4.475787
4,DANTZGT,1990-05-09,,,4.669596,,,,,,...,5.192467,,,,,,,,1.973128,4.012035


In [10]:
# Share of missing entries per column (in percent), before gap filling.
percent_missing = phyto_df.isna().sum().mul(100).div(len(phyto_df))

# Report the percentage of the worst-affected column.
print(percent_missing.max())

99.63717141799333


In [11]:
# Fill gaps separately for each location: forward-fill first, then back-fill
# the leading gaps that a forward-fill cannot reach.
#
# The built-in group-wise ffill()/bfill() are used rather than
# groupby(...).apply(<lambda>): applying a function to frames that still
# contain the grouping column is deprecated since pandas 2.2 and is slated to
# exclude that column, which would silently remove LOC_CODE from the result.
#
# Rows without a LOC_CODE are dropped (groupby ignores NaN keys by default) and
# the remaining rows are stably sorted by LOC_CODE, so each location forms one
# contiguous run with its original within-location row order preserved.
phyto_df = (
    phyto_df
    .dropna(subset=["LOC_CODE"])
    .sort_values("LOC_CODE", kind="mergesort")
    .reset_index(drop=True)
)

# Every column except the grouping key is filled (this includes DATUM).
fill_columns = [column for column in phyto_df.columns if column != "LOC_CODE"]
phyto_df[fill_columns] = phyto_df.groupby("LOC_CODE")[fill_columns].ffill()
phyto_df[fill_columns] = phyto_df.groupby("LOC_CODE")[fill_columns].bfill()

# Share of missing entries per column (in percent), after gap filling.
# Columns with no observation at all for a location keep their gaps.
percent_missing = phyto_df.isnull().sum() * 100 / len(phyto_df)

print(percent_missing.to_string())

LOC_CODE     0.000000
DATUM        0.000000
Acn         12.039985
Aco          0.000000
Agl          0.000000
Ata         22.895224
Cau         19.081822
Ccu          0.000000
Cda          0.000000
Cdeb         0.000000
Cden        10.366531
Cdi          0.000000
Cei         14.720474
Cfu         10.396150
Cgr          4.309515
Cha         10.396150
Coc         10.396150
Cra          0.000000
Csu          4.324324
Cwa          6.042207
Dac          8.715291
Dat         60.688634
Dbr          0.000000
Dip          0.000000
Dle          6.042207
Dno         61.229174
Dpu          4.353943
Dro          6.042207
Dsp         10.366531
Edu          0.000000
Etr          0.000000
Ezo          0.000000
Fja          0.000000
Gde          0.000000
Gfa         18.793040
Gfl          0.000000
Gsp          0.000000
Hak          7.108478
Hta         14.690855
Kgl          0.000000
Lan          0.000000
Lun         12.350981
Mhe         35.720104
Mnu         15.098112
Mpe          8.781933
Ndi       

In [12]:

# Partition the phytoplankton columns into contiguous chunks.
# NOTE(review): despite its name, `random_split` is NOT random; the columns
# are alphabetically sorted and cut into consecutive blocks. Shuffle
# `phyto_columns` with a seeded generator first if a random grouping is wanted.
# np.array_split is used because np.split raises ValueError when the number of
# columns is not an exact multiple of the number of groups; for an exact
# multiple both return the same chunks.
n_groups = 5
random_split = np.array_split(np.asarray(phyto_columns), n_groups)

print(random_split)

# Build one aggregate column per chunk: the row-wise sum of that chunk's columns.
# NOTE: DataFrame.sum skips NaN by default, so values still missing after the
# gap filling count as 0 in these sums.
group_dict = {}
for i, group_columns in enumerate(random_split):
    group_dict[f"group_{i}"] = phyto_df[group_columns].sum(axis=1)

grouped_phyto_df = pd.DataFrame(group_dict)

display(grouped_phyto_df)


# Quick check that the aggregated columns contain no missing values.
# Because the sums above skip NaN, this is expected to report 0% regardless of
# how many inputs were missing.
percent_missing = grouped_phyto_df.isnull().sum() * 100 / len(grouped_phyto_df)

print(percent_missing.to_string())



[array(['Acn', 'Aco', 'Agl', 'Ata', 'Cau', 'Ccu', 'Cda', 'Cdeb', 'Cden',
       'Cdi', 'Cei', 'Cfu', 'Cgr', 'Cha', 'Coc', 'Cra'], dtype='<U4'), array(['Csu', 'Cwa', 'Dac', 'Dat', 'Dbr', 'Dip', 'Dle', 'Dno', 'Dpu',
       'Dro', 'Dsp', 'Edu', 'Etr', 'Ezo', 'Fja', 'Gde'], dtype='<U4'), array(['Gfa', 'Gfl', 'Gsp', 'Hak', 'Hta', 'Kgl', 'Lan', 'Lun', 'Mhe',
       'Mnu', 'Mpe', 'Ndi', 'Nsc', 'Nsi', 'Oau', 'Omo'], dtype='<U4'), array(['Ore', 'Orh', 'Oro', 'Osi', 'Pac', 'Pan', 'Pba', 'Pbi', 'Pbr',
       'Pcl', 'Pco', 'Pde', 'Pha', 'Plo', 'Pmi', 'Pos'], dtype='<U4'), array(['Pse', 'Pst', 'Psu', 'Pte', 'Ptr', 'Ram', 'Rse', 'Rst', 'Rte',
       'Stu', 'Tec', 'Tle', 'Tni', 'Tno', 'Tor', 'Tro'], dtype='<U4')]


Unnamed: 0,group_0,group_1,group_2,group_3,group_4
0,46.713973,48.221006,44.913413,50.458967,52.410028
1,46.713973,48.221006,44.913413,50.458967,52.410028
2,46.713973,48.221006,44.913413,50.458967,52.410028
3,46.713973,48.221006,44.913413,50.458967,52.410028
4,48.111727,48.318009,43.834850,48.859339,53.866902
...,...,...,...,...,...
13500,40.076212,37.772291,38.465263,37.631076,42.994885
13501,40.076212,37.772291,38.465263,37.631076,42.994885
13502,40.076212,37.772291,38.465263,37.631076,42.994885
13503,40.076212,37.772291,38.465263,37.631076,42.994885


group_0    0.0
group_1    0.0
group_2    0.0
group_3    0.0
group_4    0.0


In [13]:
# One input feature per aggregated group column.
encoder = BiLSTMEncoder(input_size=grouped_phyto_df.shape[1])

# Turn the grouped frame's values into a float32 tensor.
grouped_phyto_tensor = torch.tensor(
    grouped_phyto_df.to_numpy(),
    dtype=torch.float32,
)

# Shape of the encoded feature vector (shown as the cell output).
encoder.encode(input=grouped_phyto_tensor).shape

torch.Size([5])