In [1]:
import load_files as lf
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the Data Challenge 3 material (keeps its trailing slash so
# the plain string concatenations below stay valid).
# NOTE(review): this is an absolute path on one machine; point it at your own
# copy of the data before running the notebook.
file_path = "/Users/blazejmanczak/Desktop/Q1/DataChallenge3/"

# All three inputs live under the same dataset folder. Building the prefix
# once also removes the doubled slash the rain path used to contain
# (`file_path` already ends in "/", and "/waterschap..." was appended to it).
sewage_dir = file_path + "waterschap-aa-en-maas_sewage_2019/"

# Pump measurements for station RG8150, returned as a (flow, level) pair.
flow_data_DRU, level_data_DRU = lf.get_measurements(sewage_dir + "sewer_data/data_pump/RG8150")
# Rain time series.
rain_data = lf.get_rain(sewage_dir + "sewer_data/rain_timeseries")
# Sewer model shapefiles; `data.area_data` is used further down.
data = lf.sdf(sewage_dir + "sewer_model/aa-en-maas_sewer_shp")

In [7]:
class measurement_analysis:
    """Prepare flow, level and rain measurements of one village for analysis.

    The constructor works on copies of ``flow_data`` and ``level_data``; the
    frames passed in by the caller are left untouched, so the defining cell
    can be re-run safely. Both frames must provide a ``TimeStamp`` and a
    ``Value`` column.

    Parameters
    ----------
    flow_data, level_data : pandas.DataFrame
        Raw measurements with ``TimeStamp`` and ``Value`` columns.
    rain_data : pandas.DataFrame
        Either an already summarized frame (containing the columns ``Date``,
        ``Total`` and ``DrySeries``) or raw rain data, in which case it is
        passed to ``summarize_rain_data``.
    min_dry_series, dry_threshold, max_interval
        Stored unchanged on the instance (``dry_threshold`` is also forwarded
        to ``summarize_rain_data``).
    area_data : geopandas.GeoDataFrame, optional
        Frame with ``village_ID`` and ``geometry`` columns. When omitted,
        ``self.area`` is ``None``.
    village_code : optional
        Value matched against ``area_data["village_ID"]``.

    Attributes
    ----------
    flow_data : pandas.DataFrame
        Sorted copy with added columns ``Date``, ``Hour``, ``Month``,
        ``Weekend``, ``TimeSpan``, ``Freq``, ``Flow``, ``max``, ``min``.
    level_data : pandas.DataFrame
        Sorted copy with added columns ``Date``, ``Hour``, ``Month``,
        ``Weekend``, ``TimeSpan``, ``Freq``, ``Delta``, ``max``, ``min``.
    rain_data : pandas.DataFrame
        The summarized rain data.
    area : float or None
        Summed geometry area of the selected village divided by 10**6.
    """

    def __init__(self, flow_data, level_data, rain_data,
                 min_dry_series=3, area_data=None, village_code=None, dry_threshold=2.5, max_interval=None):

        # CLEAN DATA
        # Copy, parse and sort both measurement frames, then add the shared
        # calendar columns and the time span between consecutive samples.
        flow_data = self._with_time_features(flow_data)
        level_data = self._with_time_features(level_data)

        # Check if rain_data is already summarized
        # NOTE(review): `summarize_rain_data` is not defined in this class; it
        # has to exist in the enclosing namespace before raw rain data is
        # passed in (the recorded run failed with a NameError here).
        if not {'Date', 'Total', 'DrySeries'}.issubset(rain_data.columns):
            rain_data = summarize_rain_data(rain_data, area_data, village_code, dry_threshold)

        # FLOW
        # The first sample has no predecessor; its span is filled with 5
        # (presumably the nominal sampling interval in seconds - TODO confirm).
        flow_data["TimeSpan"] = flow_data["TimeSpan"].fillna(5)
        flow_data["Freq"] = 1 / flow_data["TimeSpan"]
        # Value scaled by the span expressed in hours (assumes Value is an
        # hourly rate - TODO confirm units).
        flow_data["Flow"] = flow_data["Value"] * flow_data["TimeSpan"] / 3600
        self._flag_local_extrema(flow_data)

        # LEVEL
        # Unlike the flow frame, the first TimeSpan/Freq/Delta stay NaN here.
        level_data["Freq"] = 1 / level_data["TimeSpan"]
        level_data["Delta"] = level_data["Value"].diff(1)
        self._flag_local_extrema(level_data)

        # Additional Measures
        if area_data is None:
            # No geometry available, so no area can be derived.
            self.area = None
        else:
            village_shapes = area_data.loc[area_data["village_ID"] == village_code, "geometry"]
            # Summed area in the projected CRS, divided by 10**6
            # (m^2 -> km^2, since EPSG:3395 uses metres).
            # NOTE(review): EPSG:3395 is a Mercator projection and is not
            # area-preserving; confirm it is the intended CRS for areas.
            self.area = village_shapes.to_crs(epsg=3395).area.sum() / 10**6

        # STORE DATA
        self.min_dry_series = min_dry_series
        self.area_data = area_data
        self.village_code = village_code
        self.dry_threshold = dry_threshold
        self.max_interval = max_interval

        self.flow_data = flow_data
        self.level_data = level_data
        self.rain_data = rain_data

    @staticmethod
    def _with_time_features(frame):
        """Return a time-sorted copy of ``frame`` with calendar columns and ``TimeSpan``."""
        out = frame.copy()
        # to_datetime leaves an existing datetime column unchanged, so no
        # dtype check is needed first.
        out["TimeStamp"] = pd.to_datetime(out["TimeStamp"])
        out = out.sort_values("TimeStamp").reset_index(drop=True)

        stamps = out["TimeStamp"]
        out["Date"] = stamps.dt.date
        out["Hour"] = stamps.dt.hour
        out["Month"] = stamps.dt.month
        out["Weekend"] = (stamps.dt.weekday >= 5).astype(int)
        # total_seconds() keeps whole days; Timedelta.seconds would wrap a
        # gap of one day or more back into the 0..86399 range.
        out["TimeSpan"] = stamps.diff(1).dt.total_seconds()
        return out

    @staticmethod
    def _flag_local_extrema(frame):
        """Add 0/1 ``max``/``min`` columns marking strict local extrema of ``Value`` (in place)."""
        vs_previous = frame["Value"].diff(1)
        vs_next = frame["Value"].diff(-1)
        frame["max"] = ((vs_previous > 0) & (vs_next > 0)).astype(int)
        frame["min"] = ((vs_previous < 0) & (vs_next < 0)).astype(int)

In [8]:
# Build the analysis object for village "DRU" from the loaded pump and rain
# data. The recorded run of this cell stopped with a NameError because
# `rain_data` is not summarized yet and `summarize_rain_data` was not defined
# in the kernel.
df = measurement_analysis(
    flow_data=flow_data_DRU,
    level_data=level_data_DRU,
    rain_data=rain_data,
    area_data=data.area_data,
    village_code="DRU",
)

NameError: name 'summarize_rain_data' is not defined

In [13]:
# Run the analysis for village "DRU"; the result is unpacked into two objects
# (`x` and `y`) that later cells merge on "Date" and "Hour".
# NOTE(review): `analyze` is not part of the `measurement_analysis` class
# defined in this notebook, so this cell relies on a different class version
# held in the kernel - it will not survive Restart & Run All as written.
x, y = df.analyze(data.area_data, "DRU")

In [14]:
# Flag rows whose date falls on Saturday or Sunday (weekday() >= 5) with 1,
# all other days with 0.
# NOTE(review): the column is called "Weekday" although it marks weekends;
# the class above names the same flag "Weekend" - confirm the intended name.
x["Weekday"] = x["Date"].map(lambda day: 1 if day.weekday() >= 5 else 0)

In [15]:
# Display the raw rain frame returned by lf.get_rain (one column per
# measurement location plus Start/End/Kwaliteit, per the recorded output).
# NOTE(review): in the recorded output "Start" is shown in ISO form
# (2019-12-07 23:35:00) while "End" is a dash-separated string
# (12-07-2019 23:40:00); confirm the day/month order used when the
# timestamps are parsed in load_files.
rain_data

Unnamed: 0,Start,End,Kwaliteit,Lekerstraat,11. Speelweide/Vlinkert,Rips,Handel,14. Koolhof,13. Heiakker,Vlierden,...,Ooivaarsrijt,de Brand,Bitswijk,de Geer-zuid,De Hoeven(Haarsteeg),11+19 Grevekeur,CHV-terrein,Geb. 12 Vughterpoort,Heusdenseweg(Haarsteeg),Bosscheweg
0,2017-12-31 23:00:00,01-01-2018 00:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
1,2018-01-01 00:00:00,01-01-2018 01:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0124,0.0000,0.0000,0.0000,0.0000,0.0
2,2018-01-01 01:00:00,01-01-2018 02:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0244,0.0082,0.0000,0.0789,0.0489,0.0000,0.0000,0.0820,0.0
3,2018-01-01 02:00:00,01-01-2018 03:00:00,HydroNET D2,0.0,0.0223,0.0,0.0,0.0000,0.0000,0.0000,...,0.0186,0.0932,0.0199,0.0689,0.0186,0.0373,0.0351,0.0609,0.0205,0.0
4,2018-01-01 03:00:00,01-01-2018 04:00:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0477,0.0049,0.0032,...,0.1992,0.0509,0.0264,0.0984,0.0036,0.0269,0.0110,0.0323,0.0176,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128097,2019-12-07 23:35:00,12-07-2019 23:40:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128098,2019-12-07 23:40:00,12-07-2019 23:45:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128099,2019-12-07 23:45:00,12-07-2019 23:50:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0
128100,2019-12-07 23:50:00,12-07-2019 23:55:00,HydroNET D2,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0


In [None]:
# Join the two analysis results row-wise on matching (Date, Hour) pairs.
# With the default `how`, only keys present in both frames are kept.
haha = x.merge(y, on=["Date", "Hour"])

In [None]:
# Keep only the rows whose "nvals" is at least 100.
# NOTE(review): 100 is an unexplained cut-off; presumably a minimum number of
# samples per (Date, Hour) bucket - confirm and move it to a named constant.
haha = haha.loc[haha["nvals"].ge(100)]

In [None]:
# Persist the filtered merge result. The file name is relative, so it lands
# in the kernel's working directory; the later read_csv cell loads
# "heya_lm.csv" through an absolute path - presumably the same folder, confirm.
# With to_csv's default index=True the row index is written as an unnamed
# first column.
haha.to_csv("heya_lm.csv")

In [None]:
# Plot the "Flow" column for the rows dated 7 January 2019.
# NOTE(review): `hey` is not defined anywhere in the visible notebook.
hey.loc[
    hey["Date"].eq(pd.to_datetime("2019-1-7").date()),
    ["Flow"],
].plot()

In [None]:
# Drop the first two columns by position, then total every remaining column
# per (Date, hour) pair and turn the group keys back into ordinary columns.
# NOTE(review): `z` is not defined in the visible notebook, and the key is
# "hour" here while other cells use "Hour" - confirm the column name.
(
    z.iloc[:, 2:]
    .groupby(["Date", "hour"])
    .sum()
    .reset_index()
)

In [None]:
# Line plot of the summed "Value" per "hour" group.
# NOTE(review): the merge cell joins on "Hour" (capital H); confirm that a
# lower-case "hour" column exists in `haha`.
(
    haha
    .groupby("hour")["Value"]
    .agg("sum")
    .plot()
)

In [None]:
# Line plot of the summed "Flow" per "hour" group.
# NOTE(review): the merge cell joins on "Hour" (capital H); confirm that a
# lower-case "hour" column exists in `haha`.
(
    haha
    .groupby("hour")["Flow"]
    .agg("sum")
    .plot()
)

In [25]:
# Reload the exported merge result from the project folder. `file_path` is
# the data root defined in the loading cell at the top, so the location is
# spelled out only once.
# The file was written with its row index as the first (unnamed) column;
# index_col=0 restores that index instead of creating an "Unnamed: 0" data
# column that would be written out again, next to a fresh index, by the
# following to_csv call.
sebData = pd.read_csv(file_path + 'heya_lm.csv', index_col=0)

In [26]:
# Turn the numeric hour into a clock string: values below 10 get a leading
# "0", and every value gets the ":00:00" suffix (e.g. 7 -> "07:00:00").
# NOTE(review): not re-runnable - on a second execution the column already
# holds strings and the `< 10` comparison raises a TypeError.
sebData['Hour'] = sebData['Hour'].map(
    lambda hour: ('0' if hour < 10 else '') + str(hour) + ':00:00'
)

In [28]:
# Write the frame with the reformatted "Hour" strings to a second file in the
# kernel's working directory. With to_csv's default index=True the row index
# is written as an unnamed first column.
sebData.to_csv("heya_lm1.csv")