In [8]:
import load_files as lf
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [9]:
file_path = "/Users/blazejmanczak/Desktop/Q1/DataChallenge3/"
flow_data_DRU, level_data_DRU = lf.get_measurements(file_path + "waterschap-aa-en-maas_sewage_2019/sewer_data/data_pump/RG8150")
rain_data = lf.get_rain(file_path +"/waterschap-aa-en-maas_sewage_2019/sewer_data/rain_timeseries")
data = lf.sdf(file_path + "waterschap-aa-en-maas_sewage_2019/sewer_model/aa-en-maas_sewer_shp")

In [10]:
class measurement_analysis:
    def __init__(self, flow_data, level_data, rain_data):
        # CLEAN DATA
        # Check wether data has right format and if not make necessary transformations
        
        # Transform TimeStamp column to datetime
        if flow_data["TimeStamp"].dtype != "<M8[ns]":
            flow_data["TimeStamp"] = pd.to_datetime(flow_data["TimeStamp"])
            
        flow_data = flow_data.loc[flow_data["DataQuality"] != 0]
        flow_data.sort_values("TimeStamp", inplace=True)
        flow_data.reset_index(drop=True, inplace=True)
        
        if level_data["TimeStamp"].dtype != "<M8[ns]":
            level_data["TimeStamp"] = pd.to_datetime(level_data["TimeStamp"])
        
        level_data = level_data.loc[level_data["DataQuality"] != 0]
        level_data.sort_values("TimeStamp", inplace=True)
        level_data.reset_index(drop=True, inplace=True)
        
        if rain_data["Start"].dtype != "<M8[ns]":
            rain_data["Start"] = pd.to_datetime(rain_data["Start"])
            
        rain_data.sort_values("Start", inplace=True)
        rain_data.reset_index(drop=True, inplace=True)
        
        # STORE DATA
        self.flow_data = flow_data
        self.level_data = level_data
        self.rain_data = rain_data
    
    def analyze(self, area_data=None, village_code=None):
        
        if village_code is not None:
            area_data["village_ID"] = area_data["sewer_system"].str.slice(4,7)
            area_data = area_data.loc[area_data["village_ID"] == village_code]
            areas = area_data["area_name"][area_data["area_name"].apply(lambda i: i in self.rain_data.columns)].to_list()
        
            self.rain_data = self.rain_data.loc[:, ["Start", "End"] + areas]
        
        self.rain_data["Date"] = self.rain_data["Start"].apply(lambda i: i.date())
        self.flow_data["Date"] = self.flow_data["TimeStamp"].apply(lambda i: i.date())
        self.level_data["Date"] = self.level_data["TimeStamp"].apply(lambda i: i.date())
        
        self.rain_data["Hour"] = self.rain_data["Start"].apply(lambda i: i.hour)
        self.flow_data["Hour"] = self.flow_data["TimeStamp"].apply(lambda i: i.hour)
        self.level_data["Hour"] = self.level_data["TimeStamp"].apply(lambda i: i.hour)
        
        
        self.flow_data["TimeSpan"] = self.flow_data["TimeStamp"].diff(1).apply(lambda i: i.seconds).fillna(5)
        self.level_data["TimeSpan"] = self.level_data["TimeStamp"].diff(1).apply(lambda i: i.seconds)
        
        self.flow_data["Flow"] = self.flow_data["Value"] * self.flow_data["TimeSpan"] / 3600
        
        flow_summary = self.flow_data.groupby(["Date", "Hour"])["Flow"].sum().reset_index(drop=False)
        #flow_summary["Weekday"] = self.flow_data["Date"].apply(lambda i: int(i.weekday() >= 5))
        flow_summary["nvals"] = self.flow_data.groupby(["Date", "Hour"])["Flow"].count().reset_index(drop=True)
        rain_summary = self.rain_data.iloc[:, 2:].groupby(["Date", "Hour"]).agg("sum").reset_index(drop=False)
        
        self.village_code = village_code
        self.area_data = area_data
        
        #self.flow_data = self.flow_data
        
        return flow_summary, rain_summary

In [11]:
df = measurement_analysis(flow_data_DRU, level_data_DRU, rain_data)

In [12]:
data.area_data

Unnamed: 0,sewer_system,area_name,area_ID,area,geometry
0,CUI-CUI-BEI,Beijerd en 't Riet,316,972233.664525,"POLYGON ((188476.626 415967.071, 188542.047 41..."
1,HEL-HEL-AKI,30. Akkers I,792,373952.057344,"POLYGON ((172555.205 385189.083, 172472.248 38..."
2,HER-ROS-OVE,gebied E; Overlaet E,97,711823.358004,"POLYGON ((153801.956 415030.312, 153801.194 41..."
3,HER-HER-ORP,Geb. 8 Orthenpoort,539,190067.950313,"POLYGON ((149224.757 412105.929, 149224.116 41..."
4,HER-BRA-BRA,de Brand,770,658138.191970,"POLYGON ((153600.725 410510.338, 153375.621 41..."
...,...,...,...,...,...
983,OSS-DEU-DEU,Deursen,14,71025.956432,"POLYGON ((171789.081 423649.889, 171772.122 42..."
984,OSS-DEU-DEN,Dennenburg,15,155874.653226,"POLYGON ((171346.18 423892.787, 171442.457 423..."
985,SOM-SOM-SLI,RG Slievenpark,653,70.684837,"POLYGON ((177569.755 377706.673, 177570.888 37..."
986,SMI-MID-COU,Coudevoort,1448,25398.328770,"POLYGON ((156598.024 409729.325, 156643.581 40..."


In [13]:
x, y = df.analyze(data.area_data, "DRU")

In [14]:
x["Weekday"] = x["Date"].apply(lambda i: int(i.weekday() >= 5))

In [15]:
rain_data

Unnamed: 0,Start,End,Kwaliteit,Lekerstraat,11. Speelweide/Vlinkert,Rips,Handel,14. Koolhof,13. Heiakker,Vlierden,...,Ooivaarsrijt,de Brand,Bitswijk,de Geer-zuid,De Hoeven(Haarsteeg),11+19 Grevekeur,CHV-terrein,Geb. 12 Vughterpoort,Heusdenseweg(Haarsteeg),Bosscheweg
0,2017-12-31 23:00:00,01-01-2018 00:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
1,2018-01-01 00:00:00,01-01-2018 01:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0124,0.0000,0.0000,0.0000,0.0000,0.0
2,2018-01-01 01:00:00,01-01-2018 02:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0244,0.0082,0.0000,0.0789,0.0489,0.0000,0.0000,0.0820,0.0
3,2018-01-01 02:00:00,01-01-2018 03:00:00,HydroNET D2,0.0,0.0223,0.0,0.0,0.0000,0.0000,0.0000,...,0.0186,0.0932,0.0199,0.0689,0.0186,0.0373,0.0351,0.0609,0.0205,0.0
4,2018-01-01 03:00:00,01-01-2018 04:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0477,0.0049,0.0032,...,0.1992,0.0509,0.0264,0.0984,0.0036,0.0269,0.0110,0.0323,0.0176,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128097,2019-12-07 23:35:00,12-07-2019 23:40:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128098,2019-12-07 23:40:00,12-07-2019 23:45:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128099,2019-12-07 23:45:00,12-07-2019 23:50:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128100,2019-12-07 23:50:00,12-07-2019 23:55:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0


In [None]:
haha = pd.merge(x, y, on=["Date", "Hour"])

In [None]:
haha = haha[haha["nvals"] >= 100]

In [None]:
haha.to_csv("heya_lm.csv")

In [None]:
hey.loc[hey["Date"] == pd.to_datetime("2019-1-7").date(), ["Flow"]].plot()

In [None]:
z.iloc[:, 2:].groupby(["Date", "hour"]).agg("sum").reset_index(drop=False)

In [None]:
haha.groupby("hour")["Value"].sum().plot()

In [None]:
haha.groupby("hour")["Flow"].sum().plot()

In [16]:
sebData = pd.read_csv('/Users/blazejmanczak/Desktop/Q1/DataChallenge3/heya_lm.csv')