In [1]:
from urllib.request import urlopen
from urllib.parse import urlencode
import urllib3
import json
import pandas as pd
from time import sleep
from io import StringIO
import os
from enum import Enum
from concurrent.futures import ThreadPoolExecutor
import numpy as np

## Hydrology API code

In [2]:
class Measure(Enum):
    LEVEL = 'level'
    FLOW = 'flow'
    RAINFALL = 'rainfall'

class HydrologyApi:
    API_BASE_URL = "https://environment.data.gov.uk/hydrology/"
    DATA_DIR = "data"
    
    float_precision = np.float16
    
    units = {
        Measure.LEVEL: 'i-900-m-qualified',
        Measure.FLOW: 'i-900-m3s-qualified',
        Measure.RAINFALL: 't-900-mm-qualified',
    }
    
    def __init__(self, max_threads):
        self.http = urllib3.PoolManager(maxsize=max_threads)
        self.thread_pool = ThreadPoolExecutor(max_workers=max_threads)
    
    def get_stations_on_river(self, river):
        api_url = self.API_BASE_URL + 'id/stations'
        result = urlopen(
            api_url + '?' + urlencode({'riverName': river})).read().decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
        
    def get_stations_close_to_with_measure(self, lat, lon, radius, measure: Measure, limit=100):
        api_url = self.API_BASE_URL + 'id/stations'
        
        result = self.http.request(
            'GET',
            api_url,
            fields={
                'observedProperty': measure.value,
                'lat': lat,
                'long': lon,
                'dist': radius,
                'status.label':'Active',
                '_limit': limit
            }
        ).data.decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
        
    
    def get_measure(self, measure: Measure, station_id: str, start=None):
        api_url = self.API_BASE_URL + f"id/measures/{station_id}-{measure.value}-{HydrologyApi.units[measure]}/readings"
        # result = urlopen(api_url).read().decode('utf-8')
        result = self.http.request(
            'GET',
            api_url,
            fields={}
                | ({
                    'mineq-date': start.strftime('%Y-%m-%d')
                } if start is not None else {}),
        ).data.decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
    
    def _batch_request(self, api_url):
        status = "Pending"

        while status in ("Pending", "InProgress"):
            print(f"Making request to: {api_url}")
            
            request = self.http.request(
                'GET', 
                api_url, 
                headers={
                    'Accept-Encoding': 'gzip'
                }
            )
            content_type = request.headers['Content-Type']

            if content_type == 'text/csv':
                if len(request.data) == 0:
                    print('Got empty CSV')
                    return None
                buffer = StringIO(request.data.decode('utf-8'))
                return pd.read_csv(buffer, low_memory=False)
            
            assert content_type in (
                'application/json',
                'application/json;charset=UTF-8'), f"Unexpected content type: {content_type}"

            data = json.loads(request.data.decode('utf-8'))
            status = data["status"]

            if status == "Pending":
                print(f"Query is pending")
                pos_in_queue = data["positionInQueue"]
                print(f"Position in queue: {pos_in_queue}")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status == "InProgress":
                print(f"Query in progress")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status in ("Complete", "Completed"):
                print(f"Query completed: {data}")
                csv_url = data["dataUrl"] if "dataUrl" in data else data["url"]
                return pd.read_csv(csv_url)

            elif status == "Failed":
                raise Exception(f"Query failed, response: {data}")

            else:
                raise Exception(f"Unknown status: {data['status']}")
    
    def batch_get_measure(self, measure: Measure, station_id):
        try:
            api_url = self.API_BASE_URL + \
                f"data/batch-readings/batch/?measure={station_id}-{measure.value}-{HydrologyApi.units[measure]}"
                
            return self._batch_request(api_url)
        except Exception as e:
            print(f"Failed to get data for station: {station_id}, {e}")
            return None
        
    def batch_get_measure_on_river(self, measure: Measure, river):
        stations = self.get_stations_on_river(river)
        return self.batch_get_measure_from_stations(measure, stations)
        
    def batch_get_measure_from_stations(self, measure: Measure, stations):
        data = pd.DataFrame()
        threads = [
            self.thread_pool.submit(
                self.batch_get_measure, measure, station_id)
            for station_id in stations['notation'].values
        ]
        
        for thread, (station_id, station_name) in zip(threads, stations[['notation', 'label']].values):
            new_data = thread.result()
            if new_data is None:
                print(f"No new data for station: {station_name}")
                continue
            new_data = new_data.drop(columns=['measure', 'date', 'qcode', 'completeness'])
            new_data['station'] = station_name
            new_data['station'] = new_data['station'].astype('category')
            new_data['dateTime'] = pd.to_datetime(new_data['dateTime'])
            new_data['value'] = new_data['value'].astype(float)
            new_data['quality'] = new_data['quality'].astype('category')
            data = pd.concat([data, new_data])
            data.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return data
        
        
    def get_filename(self, measure: Measure, river):
        return f"{river.lower().replace(' ', '_')}_{measure.value}_raw.feather"
        
    def update_dataframe(self, df: pd.DataFrame, measure: Measure, river: str):
        for station_name, station_id in self.get_stations_on_river(river)[['label', 'notation']].values:
            print(f"Updating data for station: {station_name}")
            last = df[df['station'] == station_name]['dateTime'].max() if len(df) > 0 else None
            new_measurements = self.get_measure(measure, station_id, last)[['dateTime', 'value', 'quality']]
            new_measurements['station'] = station_name
            new_measurements['station'] = new_measurements['station'].astype('category')
            new_measurements['dateTime'] = pd.to_datetime(new_measurements['dateTime'])
            new_measurements['value'] = new_measurements['value'].astype(float)
            print(f"Got {len(new_measurements)} new measurements")
            df = pd.concat([df, new_measurements])
        df.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return df
            
    def load(self, measure: Measure, river):
        if not os.path.exists(self.DATA_DIR):
            os.mkdir(self.DATA_DIR)
        
        filename = self.get_filename(measure, river)
        filepath = os.path.join(self.DATA_DIR, filename)
        if os.path.exists(filepath):
            print(f"Loading {filepath}")
            df = pd.read_feather(filepath)
            df['dateTime'] = pd.to_datetime(df['dateTime'])
            df['station'] = df['station'].astype('category')
            df['value'] = df['value'].astype(float)
        else:
            print(f"Downloading {measure.value} data on: {river}")
            df = self.batch_get_measure_on_river(measure, river)
            df.to_feather(filepath)
        try:
            df = self.update_dataframe(df, measure, river)
        except Exception as e:
            print(f"Failed to update data: {e}")
        df.to_feather(filepath)
        return df

def process_hydrology_data(df):
    return df[df['quality'].isin(['Good', 'Unchecked', 'Estimated'])] \
        .pivot(index='dateTime', columns='station', values='value') \
        .resample('15min').interpolate('time', limit_direction='both', limit=2, fill_value='extrapolate') \
        .astype(np.float16)

api = HydrologyApi(max_threads = 2)


## Download Level Data

In [3]:
level_df = api.load(Measure.LEVEL, "River Wear")
level_df = process_hydrology_data(level_df)
level_df.info()

Loading data\river_wear_level_raw.feather
Updating data for station: Chester Le Street
Got 4 new measurements
Updating data for station: Witton Park
Got 4 new measurements
Updating data for station: Sunderland Bridge
Got 4 new measurements
Updating data for station: Stanhope
Got 4 new measurements
Updating data for station: Durham New Elvet Bridge
Got 4 new measurements


  if _pandas_api.is_sparse(col):


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2200800 entries, 1961-01-29 01:00:00 to 2023-11-05 00:45:00
Freq: 15T
Data columns (total 5 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Chester Le Street        float16
 1   Durham New Elvet Bridge  float16
 2   Stanhope                 float16
 3   Sunderland Bridge        float16
 4   Witton Park              float16
dtypes: float16(5)
memory usage: 37.8 MB


## Download Flow Data

In [4]:
flow_df = api.load(Measure.FLOW, "River Wear")
flow_df = process_hydrology_data(flow_df)
flow_df.info()

Loading data\river_wear_flow_raw.feather
Updating data for station: Chester Le Street
Got 25 new measurements
Updating data for station: Witton Park
Got 25 new measurements
Updating data for station: Sunderland Bridge
Got 25 new measurements
Updating data for station: Stanhope
Got 25 new measurements
Updating data for station: Durham New Elvet Bridge
Failed to update data: NaTType does not support strftime


  if _pandas_api.is_sparse(col):


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2198709 entries, 1961-01-29 01:00:00 to 2023-10-14 06:00:00
Freq: 15T
Data columns (total 4 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Chester Le Street  float16
 1   Stanhope           float16
 2   Sunderland Bridge  float16
 3   Witton Park        float16
dtypes: float16(4)
memory usage: 33.5 MB


We don't have flow data at New Elvet unfortunatly

## Download Rainfall Data

In [5]:
rainfall_stations = api.get_stations_close_to_with_measure(54.66305556, -1.67611111, 15, Measure.RAINFALL, limit=10)
bad_stations = ['15202aee-c5fd-404d-9de9-7357174ad10c']
rainfall_stations = rainfall_stations[~rainfall_stations['notation'].isin(bad_stations)].head(5)
print(f"Using {len(rainfall_stations)} rainfall stations")

Using 5 rainfall stations


In [6]:
rainfall_df = api.batch_get_measure_from_stations(Measure.RAINFALL, rainfall_stations)
rainfall_df = process_hydrology_data(rainfall_df)
rainfall_df.info()

Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=1dabd12c-1d2e-4765-ae38-a4d5a121928d-rainfall-t-900-mm-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=bf61ce31-b20e-4593-85dc-a083133b12ce-rainfall-t-900-mm-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=bc34e640-d9ae-4362-8804-25d66ca66e4d-rainfall-t-900-mm-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=051f1b2a-6aca-4402-8956-5474ad39b12a-rainfall-t-900-mm-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=a8773476-0fde-40c7-a66b-0901e528e8f2-rainfall-t-900-mm-qualified
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1467087 entries, 1982-01-01 09:15:00 to 2023-11-04 12:45:00
Freq: 15T
Data columns (total 5 columns):
 #   Column                    Non-Null 

In [7]:
df = pd.merge(
    level_df.add_prefix('Level '),
    flow_df.add_prefix('Flow '),
    left_index=True,
    right_index=True,
    how='outer',
)
df = pd.merge(
    df,
    rainfall_df.add_prefix('Rainfall '),
    left_index=True,
    right_index=True,
    how='outer',
).dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 351726 entries, 1999-12-03 20:30:00 to 2023-10-14 06:00:00
Data columns (total 14 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Level Chester Le Street            351726 non-null  float16
 1   Level Durham New Elvet Bridge      351726 non-null  float16
 2   Level Stanhope                     351726 non-null  float16
 3   Level Sunderland Bridge            351726 non-null  float16
 4   Level Witton Park                  351726 non-null  float16
 5   Flow Chester Le Street             351726 non-null  float16
 6   Flow Stanhope                      351726 non-null  float16
 7   Flow Sunderland Bridge             351726 non-null  float16
 8   Flow Witton Park                   351726 non-null  float16
 9   Rainfall Copley                    351726 non-null  float16
 10  Rainfall Darlington Lingfield Way  351726 non-null  float16
 11  Rainf

## Add Lag Features
for some reason makes loads of nans

In [24]:
df_rainfall_hourly = df.filter(regex='Rainfall').resample('1h').sum()
df_rainfall_six_hourly = df.filter(regex='Rainfall').resample('6h').sum()
df_rainfall_daily = df.filter(regex='Rainfall').resample('1d').sum()

target_cols = ['Level Durham New Elvet Bridge']
target_shifts = [+15, +30, +60, +90, +120]
level_shifts = [-15, -30, -60, -90, -120]
flow_shifts = [-15, -30, -60, -90, -120]
rainfall_min_shifts = [-15, -30, -60]
rainfall_hour_shifts = [-2, -3, -4, -5, -6]
rainfall_six_hour_shifts = [-12, -18, -24, -30, -36, -42, -48]
rainfall_day_shifts = [-3, -4, -5, -6, -7]

output_target_cols = []

df_lagged = df.copy()

for shift in target_shifts:
    shifted_df = df[target_cols].shift(shift, freq='min')
    shifted_df = shifted_df.add_suffix(f' {shift:+d}min')
    output_target_cols.extend(shifted_df.columns)
    df_lagged = pd.concat([df_lagged, shifted_df], axis=1)

for shift in level_shifts:
    shifted_df = df.filter(regex='Level').shift(shift, freq='min')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}min')], axis=1)
    
for shift in flow_shifts:
    shifted_df = df.filter(regex='Flow').shift(shift, freq='min')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}min')], axis=1)

for shift in rainfall_min_shifts:
    shifted_df = df.filter(regex='Rainfall').shift(shift, freq='min')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}min')], axis=1)
    
for shift in rainfall_hour_shifts:
    shifted_df = df_rainfall_hourly.shift(shift, freq='h')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}h')], axis=1)
    
for shift in rainfall_six_hour_shifts:
    shifted_df = df_rainfall_six_hourly.shift(shift, freq='h')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}h')], axis=1)
    
for shift in rainfall_day_shifts:
    shifted_df = df_rainfall_daily.shift(shift, freq='d')
    df_lagged = pd.concat([df_lagged, shifted_df.add_suffix(f' {shift:+d}d')], axis=1)
    
df_lagged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 475082 entries, 1999-11-26 00:00:00 to 2023-10-14 08:00:00
Columns: 164 entries, Level Chester Le Street to Rainfall Tunstall -7d
dtypes: float16(79), float32(85)
memory usage: 229.3 MB


In [25]:
output_target_cols

['Level Durham New Elvet Bridge +15min',
 'Level Durham New Elvet Bridge +30min',
 'Level Durham New Elvet Bridge +60min',
 'Level Durham New Elvet Bridge +90min',
 'Level Durham New Elvet Bridge +120min']

In [30]:
df_lagged.

station,Level Chester Le Street,Level Durham New Elvet Bridge,Level Stanhope,Level Sunderland Bridge,Level Witton Park,Flow Chester Le Street,Flow Stanhope,Flow Sunderland Bridge,Flow Witton Park,Rainfall Copley,...,Rainfall Copley -6d,Rainfall Darlington Lingfield Way -6d,Rainfall Evenwood Gate -6d,Rainfall Harpington Hill Farm -6d,Rainfall Tunstall -6d,Rainfall Copley -7d,Rainfall Darlington Lingfield Way -7d,Rainfall Evenwood Gate -7d,Rainfall Harpington Hill Farm -7d,Rainfall Tunstall -7d
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-11-26 00:00:00,,,,,,,,,,,...,,,,,,0.000000,0.000000,0.000000,0.000000,0.000000
1999-11-27 00:00:00,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.399902,2.799316,3.599121,2.799316,4.600098
1999-11-28 00:00:00,,,,,,,,,,,...,0.399902,2.799316,3.599121,2.799316,4.600098,0.000000,0.000000,0.000000,0.000000,0.000000
1999-11-29 00:00:00,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1999-11-30 00:00:00,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-14 07:00:00,,,,,,,,,,,...,,,,,,,,,,
2023-10-14 07:15:00,,,,,,,,,,,...,,,,,,,,,,
2023-10-14 07:30:00,,,,,,,,,,,...,,,,,,,,,,
2023-10-14 07:45:00,,,,,,,,,,,...,,,,,,,,,,


In [26]:
df_lagged.to_feather('data/river_wear_lagged.feather')

  if _pandas_api.is_sparse(col):
