### This notebook imports and cleans the river height data

In [302]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt


In [303]:
# Semicolon-delimited source files, one per measuring station.
filePath1, filePath2 = "Data Eijsden.csv", "Data_Lobith.csv"
# Outlier cut-off: data_cleaner keeps only rows whose absolute z-score is below this value.
filterSize = 0.1


In [304]:
def data_cleaner(data, step=144, threshold=None, verbose=True):
    """Reduce a raw station export to a thinned, outlier-filtered frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw export. Columns are addressed by position: 21, 22 and 24 are kept
        (date, time and numeric value in the files used by this notebook) and
        the cell at row 1, column 1 is returned as the location name.
    step : int, default 144
        Keep every ``step``-th row. 144 = 6 * 24, i.e. one row per day when the
        input has one row per 10 minutes without gaps.
        NOTE(review): input with a coarser or irregular interval does not end
        up with one row per day at this stride - confirm per file.
    threshold : float, optional
        Rows are kept only when the absolute z-score of ``NUMERIEKEWAARDE`` is
        strictly below this value. Defaults to the notebook-level ``filterSize``.
    verbose : bool, default True
        Print the head and the row count before and after filtering.

    Returns
    -------
    tuple
        ``(selectedData, locationName)``.
    """
    if threshold is None:
        # Fall back to the notebook-level setting so existing one-argument calls behave as before.
        threshold = filterSize

    selectedData = data.iloc[:, [21, 22, 24]].iloc[::step, :]
    if verbose:
        print(selectedData.head())
        print(len(selectedData))

    locationName = data.iloc[1, 1]

    # The z-scores are computed on the thinned rows only, not on the full export.
    absoluteZScores = np.abs(stats.zscore(selectedData["NUMERIEKEWAARDE"]))
    selectedData = selectedData[absoluteZScores < threshold]  # filters out outliers
    if verbose:
        print(selectedData.head())
        print(len(selectedData))

    return selectedData, locationName

def data_import(filePath):
    """Read a semicolon-separated, latin-1 encoded CSV file into a DataFrame.

    ``filePath`` is handed to ``pandas.read_csv`` unchanged; the parsed frame
    is returned without any further processing.
    """
    return pd.read_csv(filePath, sep=";", encoding="latin1")

def first_visual(data, plotTitle):
    """Show a line chart of the measured values against the observation dates.

    ``data`` must provide the columns ``WAARNEMINGDATUM`` (x axis) and
    ``NUMERIEKEWAARDE`` (y axis); ``plotTitle`` becomes the chart title.
    The figure is displayed, nothing is returned.
    """
    px.line(
        data,
        x="WAARNEMINGDATUM",
        y="NUMERIEKEWAARDE",
        title=plotTitle,
    ).show()
    


In [305]:
# Read both raw CSV exports. This is slow because the files are large.
rawDataEijsden = data_import(filePath1)
rawDataLobith = data_import(filePath2)



In [306]:
# Thin and filter each raw export; the returned location name doubles as the plot title.
dataEijsden, locationName1 = data_cleaner(rawDataEijsden)
dataLobith, locationName2 = data_cleaner(rawDataLobith)

# Quick visual check of each cleaned series.
first_visual(dataEijsden, locationName1)
first_visual(dataLobith, locationName2)

    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        20-05-1987                 00:00:00             4489
144      21-05-1987                 00:00:00             4491
288      22-05-1987                 00:00:00             4481
432      23-05-1987                 00:00:00             4492
576      24-05-1987                 00:00:00             4459
12915
    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        20-05-1987                 00:00:00             4489
144      21-05-1987                 00:00:00             4491
288      22-05-1987                 00:00:00             4481
432      23-05-1987                 00:00:00             4492
576      24-05-1987                 00:00:00             4459
12896
    WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0        01-01-1987                 00:00:00             1277
144      07-01-1987                 00:00:00             1474
288      13-01-1987                 00:00:00             1

In [307]:
import math

# The mean over the whole cleaned Eijsden series is the reference level:
# variance_Eijsden measures every sub-period against it.
number_of_all_data_Eijsden = len(dataEijsden)
mean_all_data_Eijsden = sum(dataEijsden['NUMERIEKEWAARDE'])/number_of_all_data_Eijsden

def variance_Eijsden(data, mean=None):
    """Mean squared deviation of ``data`` from a reference mean.

    Parameters
    ----------
    data : sized iterable of numbers
        The values of the period under consideration.
    mean : float, optional
        Reference level. Defaults to ``mean_all_data_Eijsden``, the mean of the
        whole cleaned Eijsden series, so the result measures the spread of the
        period around the overall level rather than around its own mean.

    Returns
    -------
    float
        ``sum((p - mean) ** 2) / len(data)``, or NaN when ``data`` is empty
        (for example a period without observations).
    """
    if mean is None:
        mean = mean_all_data_Eijsden

    number_data = len(data)
    if number_data == 0:
        # Nothing to average; NaN avoids a ZeroDivisionError and keeps a yearly loop going.
        return float("nan")

    return sum((p - mean) ** 2 for p in data) / number_data
    
def stddev_Eijsden(data):
    """Return the square root of ``variance_Eijsden(data)``.

    The underlying variance is taken around the overall Eijsden mean, so this
    is the root-mean-square deviation of ``data`` from that overall level.
    """
    return math.sqrt(variance_Eijsden(data))

# The mean over the whole cleaned Lobith series is the reference level used by variance_Lobith.
number_of_all_data_Lobith = len(dataLobith)
mean_all_data_Lobith = sum(dataLobith['NUMERIEKEWAARDE'])/number_of_all_data_Lobith

def variance_Lobith(data, mean=None):
    """Mean squared deviation of ``data`` from a reference mean.

    Parameters
    ----------
    data : sized iterable of numbers
        The values of the period under consideration.
    mean : float, optional
        Reference level. Defaults to ``mean_all_data_Lobith``, the mean of the
        whole cleaned Lobith series, so the result measures the spread of the
        period around the overall level rather than around its own mean.

    Returns
    -------
    float
        ``sum((p - mean) ** 2) / len(data)``, or NaN when ``data`` is empty
        (for example a period without observations).
    """
    if mean is None:
        mean = mean_all_data_Lobith

    number_data = len(data)
    if number_data == 0:
        # Nothing to average; NaN avoids a ZeroDivisionError and keeps a yearly loop going.
        return float("nan")

    return sum((p - mean) ** 2 for p in data) / number_data

def stddev_Lobith(data):
    """Return the square root of ``variance_Lobith(data)``.

    The underlying variance is taken around the overall Lobith mean, so this
    is the root-mean-square deviation of ``data`` from that overall level.
    """
    return math.sqrt(variance_Lobith(data))

In [308]:
# functions variance specific years 

#function variance Eijsden in a given year
def variance_year_Eijsden(year):
    """Variance (around the overall Eijsden mean) of all rows dated in ``year``.

    A row belongs to the year when its ``WAARNEMINGDATUM`` string ends with
    ``str(year)``; rows without a date never match.
    """
    in_year = dataEijsden['WAARNEMINGDATUM'].str.endswith(str(year), na=False)
    return variance_Eijsden(dataEijsden.loc[in_year, 'NUMERIEKEWAARDE'])

#function variance Lobith in a given year
def variance_year_Lobith(year):
    """Variance (around the overall Lobith mean) of all rows dated in ``year``.

    A row belongs to the year when its ``WAARNEMINGDATUM`` string ends with
    ``str(year)``; rows without a date never match.
    """
    in_year = dataLobith['WAARNEMINGDATUM'].str.endswith(str(year), na=False)
    return variance_Lobith(dataLobith.loc[in_year, 'NUMERIEKEWAARDE'])


In [309]:
#loop through years
year_number = range(1987, 2023)

# One variance per calendar year for each station, in the same order as year_number.
variance_list_Eijsden = [variance_year_Eijsden(year) for year in year_number]
variance_list_Lobith = [variance_year_Lobith(year) for year in year_number]

In [310]:
#plot variance Eijsden
x = year_number
y = variance_list_Eijsden
# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.line(
    x=x,
    y=y,
    title='variance Eijsden',
    labels={'x': 'Year', 'y': 'Variance around overall mean'},
)
fig.show()

In [311]:
#plot variance Lobith
x = year_number
y = variance_list_Lobith

# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.line(
    x=x,
    y=y,
    title='variance Lobith',
    labels={'x': 'Year', 'y': 'Variance around overall mean'},
)
fig.show()

In [312]:
#Deviation per day Eijsden
# Signed distance of every retained value from the overall Eijsden mean.
deviation_day_Eijsden = [
    level - mean_all_data_Eijsden for level in dataEijsden['NUMERIEKEWAARDE']
]

fig = px.line(
    dataEijsden,
    x='WAARNEMINGDATUM',
    y=deviation_day_Eijsden,
    title='Deviation Eijsden',
)
fig.show()

In [313]:
#Deviation per day Lobith
# Signed distance of every retained value from the overall Lobith mean.
deviation_day_Lobith = [
    level - mean_all_data_Lobith for level in dataLobith['NUMERIEKEWAARDE']
]

fig = px.line(
    dataLobith,
    x='WAARNEMINGDATUM',
    y=deviation_day_Lobith,
    title='Deviation Lobith',
)
fig.show()

In [314]:
def stdev_year_Eijsden(year):
    """Standard deviation (around the overall Eijsden mean) of all rows dated in ``year``.

    A row belongs to the year when its ``WAARNEMINGDATUM`` string ends with
    ``str(year)``; rows without a date never match.
    """
    in_year = dataEijsden['WAARNEMINGDATUM'].str.endswith(str(year), na=False)
    return stddev_Eijsden(dataEijsden.loc[in_year, 'NUMERIEKEWAARDE'])

def stdev_year_Lobith(year):
    """Standard deviation (around the overall Lobith mean) of all rows dated in ``year``.

    A row belongs to the year when its ``WAARNEMINGDATUM`` string ends with
    ``str(year)``; rows without a date never match.
    """
    in_year = dataLobith['WAARNEMINGDATUM'].str.endswith(str(year), na=False)
    return stddev_Lobith(dataLobith.loc[in_year, 'NUMERIEKEWAARDE'])


In [315]:
# One standard deviation per calendar year for each station, in the same order as year_number.
stdev_list_Eijsden = [stdev_year_Eijsden(year) for year in year_number]
stdev_list_Lobith = [stdev_year_Lobith(year) for year in year_number]

In [316]:
#plot standard deviation Eijsden
x = year_number
y = stdev_list_Eijsden
# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.scatter(
    x=x,
    y=y,
    title='Standard Deviation Eijsden',
    trendline='ols',
    labels={'x': 'Year', 'y': 'Standard deviation around overall mean'},
)
fig.show()

In [273]:
#plot standard deviation Lobith
x = year_number
y = stdev_list_Lobith

# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.scatter(
    x=x,
    y=y,
    title='Standard Deviation Lobith',
    trendline='ols',
    labels={'x': 'Year', 'y': 'Standard deviation around overall mean'},
)
fig.show()

In [274]:
#variance and standard deviation per month

#dataEijsden['year'] = pd.DatetimeIndex(dataEijsden['WAARNEMINGDATUM']).year
#dataEijsden.head()
 
#dataEijsden['month'] = pd.DatetimeIndex(dataEijsden['WAARNEMINGDATUM']).month
#dataEijsden.head()

#dataEijsden['month_year'] = pd.to_datetime(dataEijsden['WAARNEMINGDATUM']).dt.to_period('M')
#dataEijsden.head()




In [275]:
#loop through months 

#year_number = range(1987, 2023)
#variance_list_Eijsden = []
#variance_list_Lobith = []

#for n in year_number:
 #   numbers1 = variance_year_Eijsden(n)
  #  variance_list_Eijsden.append(numbers1)
    
#for n in year_number:
 #   numbers2 = variance_year_Lobith(n)
  #  variance_list_Lobith.append(numbers2)

#for d in dataEijsden['WAARNEMINGDATUM']:
 #dataLobith['WAARNEMINGDATUM'].dt.strftime('%m/%Y')
    


In [318]:
#variance and standard deviation per month

def stdev_month_Eijsden(month):
    """Standard deviation (around the overall Eijsden mean) of the rows matching ``month``.

    ``str(month)`` is searched for anywhere inside each ``WAARNEMINGDATUM``
    string (``str.contains``, so it is interpreted as a regular expression);
    the notebook passes values such as ``'06-1987'``. Rows without a date
    never match.
    """
    in_month = dataEijsden['WAARNEMINGDATUM'].str.contains(str(month), na=False)
    return stddev_Eijsden(dataEijsden.loc[in_month, 'NUMERIEKEWAARDE'])


def stdev_month_Lobith(month):
    """Standard deviation (around the overall Lobith mean) of the rows matching ``month``.

    ``str(month)`` is searched for anywhere inside each ``WAARNEMINGDATUM``
    string (``str.contains``, so it is interpreted as a regular expression);
    callers in this notebook pass ``'MM-YYYY'`` strings. Rows without a date
    never match.
    """
    in_month = dataLobith['WAARNEMINGDATUM'].str.contains(str(month), na=False)
    return stddev_Lobith(dataLobith.loc[in_month, 'NUMERIEKEWAARDE'])

# Spot check: standard deviation of a single month, followed by a dump of the cleaned frame.
date = '06-1987'
print(stdev_month_Eijsden(date))
stdev_month_Eijsden(date)  # return value is discarded; nothing is displayed for this call

print(dataEijsden)

81.09980804992682
        WAARNEMINGDATUM WAARNEMINGTIJD (MET/CET)  NUMERIEKEWAARDE
0            20-05-1987                 00:00:00             4489
144          21-05-1987                 00:00:00             4491
288          22-05-1987                 00:00:00             4481
432          23-05-1987                 00:00:00             4492
576          24-05-1987                 00:00:00             4459
...                 ...                      ...              ...
1859040      07-10-2022                 12:30:00             4405
1859184      08-10-2022                 12:30:00             4401
1859328      09-10-2022                 12:30:00             4401
1859472      10-10-2022                 12:30:00             4420
1859616      11-10-2022                 12:30:00             4409

[12896 rows x 3 columns]


In [334]:
# Walk the Eijsden dates in order and record one standard deviation per month.
# NOTE(review): a month is only recorded when a row of a different month is
# reached, so the final month in the data never gets an entry - confirm this is intended.
month_Eijsden_stdev = []
current_list = []
current = dataEijsden["WAARNEMINGDATUM"].values[0][3:]  # everything after "DD-" of the first date

for month_year in dataEijsden['WAARNEMINGDATUM']:
    if month_year[3:] == current:
        continue
    # The month changed: store the finished month, then start tracking the new one.
    month_Eijsden_stdev.append(stdev_month_Eijsden(current))
    current_list.append(current)
    current = month_year[3:]
       
# Walk the Lobith dates in order and record one standard deviation per month.
# NOTE(review): a month is only recorded when a row of a different month is
# reached, so the final month in the data never gets an entry - confirm this is intended.
month_Lobith_stdev = []
current = dataLobith["WAARNEMINGDATUM"].values[0][3:]  # everything after "DD-" of the first date

for month_year in dataLobith['WAARNEMINGDATUM']:
    if month_year[3:] == current:
        continue
    # The month changed: store the finished month, then start tracking the new one.
    month_Lobith_stdev.append(stdev_month_Lobith(current))
    current = month_year[3:]

In [339]:
#plot stdev per month Eijsden
# One x position per recorded month. Derived from the list so the two always
# have the same length (a hard-coded count fails as soon as the data changes).
month_number = range(len(month_Eijsden_stdev))

# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.scatter(
    x=month_number,
    y=month_Eijsden_stdev,
    title='Standard Deviation per Month Eijsden',
    trendline='ols',
    labels={'x': 'Month index', 'y': 'Standard deviation around overall mean'},
)
fig.show()

In [341]:
#plot stdev per month Lobith
# One x position per recorded month. Derived from the list so the two always
# have the same length (a hard-coded count fails as soon as the data changes).
month_number = range(len(month_Lobith_stdev))
# Only these two sequences are plotted, so no data frame is passed;
# the labels replace the default "x"/"y" axis titles.
fig = px.scatter(
    x=month_number,
    y=month_Lobith_stdev,
    title='Standard Deviation per Month Lobith',
    trendline='ols',
    labels={'x': 'Month index', 'y': 'Standard deviation around overall mean'},
)
fig.show()