### This ipynb will import and clean the river height data

In [78]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [79]:
filePath1 = "Data_Eijsden.csv"
filePath2 = "Data_Lobith.csv"
filePath3 = "Temperature_data.csv"
filterSize = 0.1


In [80]:
def data_cleaner(data):
    selectedData = data.iloc[:,[21,22,24]]
    selectedData = selectedData.iloc[::144,:] #144 = 6 * 24 to reduce the amount of rows to 1 row per day. 
    print(selectedData.head())
    print(len(selectedData))

    locationName = data.iloc[1,1]

    selectedData = selectedData[(np.abs(stats.zscore(selectedData["NUMERIEKEWAARDE"])) < filterSize)] #filters out outliers
    selectedData['WAARNEMINGDATUM'] = pd.to_datetime(selectedData['WAARNEMINGDATUM'], format='%d-%m-%Y') #date from YYYYMM to YYYY-MM-DD
    print(selectedData.head())
    print(len(selectedData))

    return selectedData, locationName

def data_cleaner_temperature(data):
    data['Year'] = pd.to_datetime(data['Year'], format='%Y%m') #date from YYYYMM to YYYY-MM-DD
    print(data.head())
    print(len(data))
    return data
    

def data_import(filePath):
    rawData = pd.read_csv(filePath, delimiter=";", encoding='latin1') 
    return rawData

def data_import_temperature(filePath):
    rawData_temperature = pd.read_csv(filePath, delimiter = ',', skiprows=[0,1,2,3])
    return rawData_temperature

def first_visual(data, plotTitle):
    fig = px.line(data,title=plotTitle, x="WAARNEMINGDATUM", y="NUMERIEKEWAARDE")
    fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=10, label="10y", step="year", stepmode="backward"), #de legenda moet nog even gefixt worden. geen idee nog hoe
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=5, label="5y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
    
    fig.show()
    return
    


In [81]:
#activates the data_import function. This takes quite a long time due to the filesize. 
rawDataEijsden =  data_import(filePath=filePath1)
rawDataLobith =  data_import(filePath=filePath2)
rawTemperature = data_import_temperature(filePath=filePath3)


In [82]:
dataEijsden, locationName1 = data_cleaner(data=rawDataEijsden)
dataLobith, locationName2 = data_cleaner(data=rawDataLobith)

dataTemperature = data_cleaner_temperature(data=rawTemperature)


first_visual(data=dataEijsden,plotTitle=locationName1)
first_visual(data=dataLobith,plotTitle=locationName2)

    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        20-05-1987                 00:00:00             4489
144      21-05-1987                 00:00:00             4491
288      22-05-1987                 00:00:00             4481
432      23-05-1987                 00:00:00             4492
576      24-05-1987                 00:00:00             4459
12915
    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        1987-05-20                 00:00:00             4489
144      1987-05-21                 00:00:00             4491
288      1987-05-22                 00:00:00             4481
432      1987-05-23                 00:00:00             4492
576      1987-05-24                 00:00:00             4459
12896
    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        01-01-1987                 00:00:00             1277
144      07-01-1987                 00:00:00             1474
288      13-01-1987                 00:00:00             1

In [83]:
#plot europe temperature anomalies

fig = px.line(dataTemperature, title = 'Europe temperature anomalies', x = 'Year', y = 'Value')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=10, label="10y", step="year", stepmode="backward"), #de legenda moet nog even gefixt worden. geen idee nog hoe
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=5, label="5y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)

fig.show()





In [7]:
import math

#we need the mean of all data, to get the variance of different periods towards the mean
number_of_all_data_Eijsden = len(dataEijsden)
mean_all_data_Eijsden = sum(dataEijsden['NUMERIEKEWAARDE'])/number_of_all_data_Eijsden

def variance_Eijsden(data): 
    #number of data points
    number_data = len(data)
    #square deviation
    deviations = [(p - mean_all_data_Eijsden)**2 for p in data]
    #variance
    variance = sum(deviations) / number_data
    return variance
    
def stddev_Eijsden(data):
    #variance of data
    variance_data = variance_Eijsden(data)
    #standard deviation of the data 
    stddev_data = math.sqrt(variance_data)
    return stddev_data

number_of_all_data_Lobith = len(dataLobith)
mean_all_data_Lobith = sum(dataLobith['NUMERIEKEWAARDE'])/number_of_all_data_Lobith

def variance_Lobith(data): 
    #number of data points
    number_data = len(data)
    #square deviation
    deviations = [(p - mean_all_data_Lobith)**2 for p in data]
    #variance
    variance = sum(deviations) / number_data
    return variance

def stddev_Lobith(data):
    #variance of data
    variance_data = variance_Lobith(data)
    #standard deviation of the data 
    stddev_data = math.sqrt(variance_data)
    return stddev_data

In [28]:
# functions variance specific years 

# function variance Eijsden in a given year
def variance_year_Eijsden(year):
    data_Eijsden_year = dataEijsden[dataEijsden['WAARNEMINGDATUM'].str.endswith(str(year))==True]
    return (variance_Eijsden(data_Eijsden_year['NUMERIEKEWAARDE']))

#function variance Lobith in a given year
def variance_year_Lobith(year):
    data_Lobith_year = dataLobith[dataLobith['WAARNEMINGDATUM'].str.endswith(str(year))==True]
    return (variance_Lobith(data_Lobith_year['NUMERIEKEWAARDE']))


In [29]:
# loop through years
year_number = range(1987, 2023)
variance_list_Eijsden = []
variance_list_Lobith = []

for n in year_number:
    numbers1 = variance_year_Eijsden(n)
    variance_list_Eijsden.append(numbers1)
    
for n in year_number:
    numbers2 = variance_year_Lobith(n)
    variance_list_Lobith.append(numbers2)

AttributeError: Can only use .str accessor with string values!

In [25]:
#plot variance Eijsden
x = year_number
y = variance_list_Eijsden
fig = px.line(dataEijsden, title = 'variance Eijsden', x = year_number , y = variance_list_Eijsden )
fig.show()

In [14]:
#plot variance Lobith
x = year_number
y = variance_list_Lobith

fig = px.line(dataLobith, title = 'variance Lobith', x = year_number, y = variance_list_Lobith)
fig.show()

In [97]:
#Deviation per day Eijsden
deviation_day_Eijsden = []

for p in (dataEijsden['NUMERIEKEWAARDE']):
    deviation_day_Eijsden.append((p - mean_all_data_Eijsden))


fig = px.line(dataEijsden, title = 'Deviation Eijsden', x = 'WAARNEMINGDATUM', y = deviation_day_Eijsden)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=6, label="6m", step="month", stepmode="backward"), #de legenda moet nog even gefixt worden. geen idee nog hoe
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=5, label="5y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

In [98]:
#Deviation per day Lobith
deviation_day_Lobith = []

for p in (dataLobith['NUMERIEKEWAARDE']):
    deviation_day_Lobith.append((p - mean_all_data_Lobith))


fig = px.line(dataLobith, title = 'Deviation Lobith', x = 'WAARNEMINGDATUM', y = deviation_day_Lobith)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=6, label="6m", step="month", stepmode="backward"), #de legenda moet nog even gefixt worden. geen idee nog hoe
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=5, label="5y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

In [17]:
def stdev_year_Eijsden(year):
    data_Eijsden_year = dataEijsden[dataEijsden['WAARNEMINGDATUM'].str.endswith(str(year))==True]
    return (stddev_Eijsden(data_Eijsden_year['NUMERIEKEWAARDE']))

def stdev_year_Lobith(year):
    data_Lobith_year = dataLobith[dataLobith['WAARNEMINGDATUM'].str.endswith(str(year))==True]
    return (stddev_Lobith(data_Lobith_year['NUMERIEKEWAARDE']))


In [19]:
stdev_list_Eijsden = []
stdev_list_Lobith = []

for n in year_number:
    numbers3 = stdev_year_Eijsden(n)
    stdev_list_Eijsden.append(numbers3)
    
for n in year_number:
    numbers4 = stdev_year_Lobith(n)
    stdev_list_Lobith.append(numbers4)

AttributeError: Can only use .str accessor with string values!

In [20]:
#plot standard deviation Eijsden
x = year_number
y = stdev_list_Eijsden
fig = px.scatter(dataEijsden, title = 'Standard Deviation Eijsden', x = year_number, y = stdev_list_Eijsden, trendline = 'ols')
fig.show()

In [21]:
#plot standard deviation Lobith
x = year_number
y = stdev_list_Lobith

fig = px.scatter(dataLobith, title = 'Standard Deviation Lobith', x = year_number, y = stdev_list_Lobith, trendline = 'ols')
fig.show()

In [52]:
#variance and standard deviation per month
#dataEijsden['WAARNEMINGDATUM'].str.endswith(str(year))==True



In [68]:
#try to merge 



In [122]:
dataTemperature_merge = dataTemperature.rename({'Year': 'WAARNEMINGDATUM'}, axis=1)

df_Lobith_temp = pd.merge(dataLobith, dataTemperature_merge, how='outer', on=['WAARNEMINGDATUM'])
print(df_Lobith_temp.head())

df_Eijsden_temp = pd.merge(dataEijsden, dataTemperature_merge, how='outer', on=['WAARNEMINGDATUM'])
print(df_Eijsden_temp.head())



def figure_merge(data_merge):
    trace1 = go.Scatter(x=data_merge['WAARNEMINGDATUM'],
                    y=data_merge['NUMERIEKEWAARDE'],
                    name='Water level',
                    mode='lines+markers',
                    yaxis='y1')
    trace2 = go.Scatter(x=data_merge['WAARNEMINGDATUM'],
                    y=data_merge['Value'],
                    name='Temperature',
                    mode='lines+markers',
                    yaxis='y2')
    data = [trace1, trace2]
    layout = go.Layout(title= 'standard deviation river levels vs Temperature',
                   yaxis=dict(title='Water level'),
                   yaxis2=dict(title='Temperature anomalies in Celsius',
                               overlaying='y',
                               side='right'))

    Figure = go.Figure(data=data, layout=layout)
    return Figure



# #figures merge lobith and temp
figure_merge(df_Lobith_temp)






  WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE  Value
0      1987-01-31                 00:00:00            913.0    NaN
1      1987-09-22                 00:00:00            913.0    NaN
2      1987-10-10                 00:00:00            923.0    NaN
3      1987-11-03                 00:00:00            914.0    NaN
4      1987-11-15                 00:00:00            911.0    NaN
  WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE  Value
0      1987-05-20                 00:00:00           4489.0    NaN
1      1987-05-21                 00:00:00           4491.0    NaN
2      1987-05-22                 00:00:00           4481.0    NaN
3      1987-05-23                 00:00:00           4492.0    NaN
4      1987-05-24                 00:00:00           4459.0    NaN


In [123]:
# figure merge Eijsden and temp

figure_merge(df_Eijsden_temp)
