In [13]:
import pandas as pd
import os
import dask.dataframe as dd

In [14]:
level = dd.read_parquet('data/level/river_wear_level_*.parquet')
level = level.set_index('dateTime')

# Dataset is much too large to fit in memory, so we'll have to process it in chunks

for offset in [-15, 15]:
    lagged_copy = level.copy()
    lagged_copy.index = lagged_copy.index + pd.Timedelta(minutes=offset)
    lagged_copy.columns = [f'{col}_lead_{offset}' for col in lagged_copy.columns] if offset > 0 else [f'{col}_lag_{offset}' for col in lagged_copy.columns]
    level = level.merge(lagged_copy, how='left', left_index=True, right_index=True)
  
level = level.dropna()




In [15]:
flow = dd.read_parquet('data/flow/river_wear_flow_*.parquet')

for offset in [-15]:
    # Lag dataset
    lagged_copy = flow.copy()
    lagged_copy.index = lagged_copy.index + pd.Timedelta(minutes=offset)
    lagged_copy.columns = [f'{col}_lead_{offset}' for col in lagged_copy.columns] if offset > 0 else [f'{col}_lag_{offset}' for col in lagged_copy.columns]
    flow = flow.merge(lagged_copy, how='left', left_index=True, right_index=True)
    
flow = flow.dropna()
all_data = level.merge(flow, how='left', left_index=True, right_index=True)

Processing lag -15


In [16]:
all_data.head(5)

Unnamed: 0_level_0,Chester Le Street_x,Durham New Elvet Bridge,Stanhope_x,Sunderland Bridge_x,Witton Park_x,Chester Le Street_lag_-15_x,Durham New Elvet Bridge_lag_-15,Stanhope_lag_-15_x,Sunderland Bridge_lag_-15_x,Witton Park_lag_-15_x,...,Sunderland Bridge_lag_-15_lead_15,Witton Park_lag_-15_lead_15,Chester Le Street_y,Stanhope_y,Sunderland Bridge_y,Witton Park_y,Chester Le Street_lag_-15_y,Stanhope_lag_-15_y,Sunderland Bridge_lag_-15_y,Witton Park_lag_-15_y
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-20 09:30:00,0.376,0.256,0.267,0.306,0.351,0.375,0.256,0.267,0.305,0.351,...,0.306,0.351,5.74,1.09,3.16,2.02,5.7,1.09,3.14,2.02
2005-11-18 08:30:00,0.413,0.306,0.231,0.407,0.478,0.413,0.308,0.231,0.407,0.478,...,0.407,0.478,7.21,1.66,6.15,4.5,7.21,1.66,6.15,4.5
2006-02-18 13:00:00,0.474,0.366,0.288,0.466,0.524,0.472,0.366,0.288,0.465,0.523,...,0.466,0.524,10.1,2.32,8.47,5.71,9.99,2.32,8.42,5.68
2005-12-04 06:45:00,0.71,0.568,0.446,0.643,0.703,0.707,0.568,0.45,0.643,0.703,...,0.643,0.703,27.0,4.49,18.9,12.2,26.700001,4.55,18.9,12.2
2006-06-04 11:30:00,0.334,0.264,0.148,0.319,0.384,0.334,0.264,0.148,0.319,0.383,...,0.319,0.384,4.3,0.849,3.48,2.55,4.3,0.849,3.48,2.53


In [5]:
import re, os
pattern = re.compile(r'rainfall_\d*')
rainfall_dfs = [
  os.path.join('data', f) for f in os.listdir('data') if pattern.match(f)
] 

for df_path in rainfall_dfs:
  df = pd.read_parquet(df_path)
  for offset in [15, 30, 45, 60]:
    df.copy().shift(offset, freq='min').to_parquet(f'data/rainfall_lag_{offset}min.parquet')
  df = df.copy().resample('1H').sum()
  for offset in [2, 3, 4, 5, 6]:
    df.copy().shift(offset, freq='H').to_parquet(f'data/rainfall_lag_{offset}h.parquet')
  df = df.copy().resample('6H').sum()
  for offset in [2, 3, 4]:
    df.copy().shift(offset, freq='6H').to_parquet(f'data/rainfall_lag_{offset*6}h.parquet')
  df = df.copy().resample('1D').sum()
  for offset in [2, 3, 4, 5, 6, 7]:
    df.copy().shift(offset, freq='D').to_parquet(f'data/rainfall_lag_{offset}d.parquet')
del df

In [12]:
# Match all *_lag_* files
flow_files = [f for f in os.listdir('data') if re.match(r'river_wear_flow_lag_\d*.*.parquet', f)]
level_files = [f for f in os.listdir('data') if re.match(r'river_wear_level_lag_\d*.*.parquet', f)]
rainfall_files = [f for f in os.listdir('data') if re.match(r'rainfall_lag_\d*.*.parquet', f)]

def get_lag(path):
  return path.split('_')[-1].split('.')[0]

In [30]:
import dask.dataframe as dd

all_data = dd.read_parquet(os.path.join('data', flow_files[0]))
all_data.columns = [f'{col}_flow_0min' for col in all_data.columns]

for path in flow_files[1:]:
  offset = get_lag(path)
  df = dd.read_parquet(os.path.join('data', path))
  df.columns = [f'{col}_flow_{offset}' for col in df.columns]
  all_data = all_data.merge(df, how='left', left_index=True, right_index=True)
  
for path in level_files:
  offset = get_lag(path)
  df = dd.read_parquet(os.path.join('data', path))
  df.columns = [f'{col}_level_{offset}' for col in df.columns]
  all_data = all_data.merge(df, how='left', left_index=True, right_index=True)
  
for path in rainfall_files:
  offset = get_lag(path)
  df = dd.read_parquet(os.path.join('data', path))
  df.columns = [f'{col}_rainfall_{offset}' for col in df.columns]
  all_data = all_data.merge(df, how='left', left_index=True, right_index=True)
  
all_data['year'] = all_data.index.year
all_data['month'] = all_data.index.month
# Save in batches in parquet format
all_data.to_parquet('data/all_data.parquet', write_index=True, partition_on=['year', 'month'])

: 