In [6]:
import pandas as pd
import os
import numpy as np
from IPython.display import display, HTML, Markdown
from hydrology_api import HydrologyApi, Measure, process_hydrology_data

In [7]:
api = HydrologyApi(max_threads=5)

## Download Level Data

In [8]:
stations = api.get_stations_on_river('River Wear', Measure.LEVEL)
print(f"Loading data for {len(stations)} stations: {stations['label'].values}")
level_df = api.load(Measure.LEVEL, stations)
level_df = process_hydrology_data(level_df)
level_df.info()

Loading data for 5 stations: ['Chester Le Street' 'Witton Park' 'Sunderland Bridge' 'Stanhope'
 'Durham New Elvet Bridge']
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=e7d8bbb6-5bba-4057-9f49-a299482c3348-level-i-900-m-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=05784319-693a-4d75-b29e-32f01a99ee4f-level-i-900-m-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ddedb4d9-b2be-47c1-998d-acbc0ffb124b-level-i-900-m-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=b29c481a-5012-40f5-bb0c-f9370be34975-level-i-900-m-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ba3f8598-e654-430d-9bb8-e1652e6ff93d-level-i-900-m-qualified

## Download Flow Data

In [9]:
stations = api.get_stations_on_river('River Wear', Measure.FLOW)
print(f"Loading data for {len(stations)} stations: {stations['label'].values}")
flow_df = api.load(Measure.FLOW, stations)
flow_df = process_hydrology_data(flow_df)
flow_df.info()

Loading data for 4 stations: ['Chester Le Street' 'Witton Park' 'Sunderland Bridge' 'Stanhope']
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=e7d8bbb6-5bba-4057-9f49-a299482c3348-flow-i-900-m3s-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=05784319-693a-4d75-b29e-32f01a99ee4f-flow-i-900-m3s-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ddedb4d9-b2be-47c1-998d-acbc0ffb124b-flow-i-900-m3s-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=b29c481a-5012-40f5-bb0c-f9370be34975-flow-i-900-m3s-qualified&mineq-date=2007-01-01
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 588505 entries, 2007-01-01 00:00:00 to 2023-10-14 06:00:00
Freq: 15T
Data columns (total 4 columns):
 #   Column             No

We don't have flow data at New Elvet unfortunatly

## Download Rainfall Data

In [10]:
rainfall_stations = api.get_stations_close_to_with_measure(54.66305556, -1.67611111, 15, Measure.RAINFALL, limit=10)
bad_stations = ['15202aee-c5fd-404d-9de9-7357174ad10c']
rainfall_stations = rainfall_stations[~rainfall_stations['notation'].isin(bad_stations)].head(5)
print(f"Using {len(rainfall_stations)} rainfall stations: {rainfall_stations['label'].values}")

rainfall_df = api.load(Measure.RAINFALL, rainfall_stations)
rainfall_df = process_hydrology_data(rainfall_df)
rainfall_df.info()

Using 5 rainfall stations: ['Evenwood Gate' 'Harpington Hill Farm' 'Copley' 'Tunstall'
 'Darlington Lingfield Way']
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=bf61ce31-b20e-4593-85dc-a083133b12ce-rainfall-t-900-mm-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=1dabd12c-1d2e-4765-ae38-a4d5a121928d-rainfall-t-900-mm-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=bc34e640-d9ae-4362-8804-25d66ca66e4d-rainfall-t-900-mm-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=051f1b2a-6aca-4402-8956-5474ad39b12a-rainfall-t-900-mm-qualified&mineq-date=2007-01-01
Loading from cache: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=a8773476-0fde-40c7-a66b-0901e528e8f2-rainfall-t-900

In [11]:
df = pd.merge(
    level_df.add_prefix('Level '),
    flow_df.add_prefix('Flow '),
    left_index=True,
    right_index=True,
    how='outer',
)
df = pd.merge(
    df,
    rainfall_df.add_prefix('Rainfall '),
    left_index=True,
    right_index=True,
    how='outer',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 591440 entries, 2007-01-01 00:00:00 to 2023-11-13 19:45:00
Freq: 15T
Data columns (total 14 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Level Chester Le Street            591440 non-null  float16
 1   Level Durham New Elvet Bridge      564299 non-null  float16
 2   Level Stanhope                     502088 non-null  float16
 3   Level Sunderland Bridge            589891 non-null  float16
 4   Level Witton Park                  585640 non-null  float16
 5   Flow Chester Le Street             588505 non-null  float16
 6   Flow Stanhope                      499151 non-null  float16
 7   Flow Sunderland Bridge             586956 non-null  float16
 8   Flow Witton Park                   582705 non-null  float16
 9   Rainfall Copley                    576838 non-null  float16
 10  Rainfall Darlington Lingfield Way  543094 non-null  float16


In [12]:
# Fill na on flow and level by linear interpolation
flow_and_level_cols = [col for col in df.columns if col.startswith('Flow') or col.startswith('Level')]
df[flow_and_level_cols] = df[flow_and_level_cols].interpolate('time')

rainfall_cols = [col for col in df.columns if col.startswith('Rainfall')]
df[rainfall_cols] = df[rainfall_cols].fillna(0)

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 591440 entries, 2007-01-01 00:00:00 to 2023-11-13 19:45:00
Freq: 15T
Data columns (total 14 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Level Chester Le Street            591440 non-null  float16
 1   Level Durham New Elvet Bridge      591440 non-null  float16
 2   Level Stanhope                     591440 non-null  float16
 3   Level Sunderland Bridge            591440 non-null  float16
 4   Level Witton Park                  591440 non-null  float16
 5   Flow Chester Le Street             591440 non-null  float16
 6   Flow Stanhope                      591440 non-null  float16
 7   Flow Sunderland Bridge             591440 non-null  float16
 8   Flow Witton Park                   591440 non-null  float16
 9   Rainfall Copley                    591440 non-null  float16
 10  Rainfall Darlington Lingfield Way  591440 non-null  float16


## Add Lag Features

In [13]:
df_rainfall_hourly = df.filter(regex='Rainfall').resample('1h').sum().resample('15min').interpolate('time')
df_rainfall_six_hourly = df.filter(regex='Rainfall').resample('6h').sum().resample('15min').interpolate('time')
df_rainfall_daily = df.filter(regex='Rainfall').resample('1d').sum().resample('15min').interpolate('time')

target_cols = ['Level Durham New Elvet Bridge']
target_shifts = range(15, 12*60 + 15, 15)
level_shifts = [-15, -30, -60, -90, -120, -180, -240, -300, -360, -420, -480, -540, -600, -660, -720, -780, -840, -900]
flow_shifts = level_shifts
rainfall_min_shifts = [-15, -30, -60]
rainfall_hour_shifts = [-2, -3, -4, -5, -6]
rainfall_six_hour_shifts = [-12, -18, -24, -30, -36, -42, -48]
rainfall_day_shifts = [-3, -4, -5, -6, -7]

output_target_cols = []

df_lagged = df.copy()

for shift in target_shifts:
    shifted_df = df[target_cols].shift(shift, freq='min')
    shifted_df = shifted_df.add_suffix(f' {shift:+d}min')
    output_target_cols.extend(shifted_df.columns)
    df_lagged = pd.merge(df_lagged, shifted_df, left_index=True, right_index=True, how='left')

for shift in level_shifts:
    shifted_df = df.filter(regex='Level').shift(shift, freq='min')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}min'), left_index=True, right_index=True, how='left')
    
for shift in flow_shifts:
    shifted_df = df.filter(regex='Flow').shift(shift, freq='min')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}min'), left_index=True, right_index=True, how='left')

for shift in rainfall_min_shifts:
    shifted_df = df.filter(regex='Rainfall').shift(shift, freq='min')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}min'), left_index=True, right_index=True, how='left')
    
for shift in rainfall_hour_shifts:
    shifted_df = df_rainfall_hourly.shift(shift, freq='h')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}h'), left_index=True, right_index=True, how='left')
    
for shift in rainfall_six_hour_shifts:
    shifted_df = df_rainfall_six_hourly.shift(shift, freq='h')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}h'), left_index=True, right_index=True, how='left')
        
for shift in rainfall_day_shifts:
    shifted_df = df_rainfall_daily.shift(shift, freq='d')
    df_lagged = pd.merge(df_lagged, shifted_df.add_suffix(f' {shift:+d}d'), left_index=True, right_index=True, how='left')
    
df_lagged = df_lagged.astype(np.float16)

In [14]:
df_lagged = df_lagged.dropna()
df_lagged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 590641 entries, 2007-01-01 12:00:00 to 2023-11-06 00:00:00
Freq: 15T
Columns: 324 entries, Level Chester Le Street to Rainfall Tunstall -7d
dtypes: float16(324)
memory usage: 369.5 MB


In [15]:
df_lagged.to_feather('data/river_wear_lagged.feather')

  if _pandas_api.is_sparse(col):
