In [2]:
from urllib.request import urlopen
from urllib.parse import urlencode
import urllib3
import json
import pandas as pd
from time import sleep
from io import StringIO
import os
from enum import Enum
from concurrent.futures import ThreadPoolExecutor
import numpy as np

## Hydrology API code

In [4]:
class Measure(Enum):
    LEVEL = 'level'
    FLOW = 'flow'
    RAINFALL = 'rainfall'

class HydrologyApi:
    API_BASE_URL = "https://environment.data.gov.uk/hydrology/"
    DATA_DIR = "data"
    
    units = {
        Measure.LEVEL: 'i-900-m-qualified',
        Measure.FLOW: 'i-900-m3s-qualified',
        Measure.RAINFALL: 't-900-mm-qualified',
    }
    
    def __init__(self, max_threads):
        self.http = urllib3.PoolManager(maxsize=max_threads)
        self.thread_pool = ThreadPoolExecutor(max_workers=max_threads)
    
    def get_stations_on_river(self, river):
        api_url = self.API_BASE_URL + 'id/stations'
        result = urlopen(
            api_url + '?' + urlencode({'riverName': river})).read().decode('utf-8')
        data = json.loads(result)
        print(data)
        return pd.DataFrame(data['items'])
    
    def get_stations_close_to_with_measure(self, lat, lon, radius, measure: Measure, limit=100):
        api_url = self.API_BASE_URL + 'id/stations'
        
        result = self.http.request(
            'GET',
            api_url,
            fields={
                'observedProperty': measure.value,
                'lat': lat,
                'long': lon,
                'dist': radius,
                'status.label':'Active',
                '_limit': limit
            }
        ).data.decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
        
    
    def get_measure(self, measure: Measure, station_id: str, start=None):
        api_url = self.API_BASE_URL + f"id/measures/{station_id}-{measure.value}-{HydrologyApi.units[measure]}/readings"
        # result = urlopen(api_url).read().decode('utf-8')
        result = self.http.request(
            'GET',
            api_url,
            fields={}
                | ({
                    'mineq-date': start.strftime('%Y-%m-%d')
                } if start is not None else {}),
        ).data.decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
    
    def _batch_request(self, api_url):
        status = "Pending"

        while status in ("Pending", "InProgress"):
            print(f"Making request to: {api_url}")
            
            request = self.http.request(
                'GET', 
                api_url, 
                headers={
                    'Accept-Encoding': 'gzip'
                }
            )
            content_type = request.headers['Content-Type']

            if content_type == 'text/csv':
                if len(request.data) == 0:
                    print('Got empty CSV')
                    return None
                buffer = StringIO(request.data.decode('utf-8'))
                return pd.read_csv(buffer, low_memory=False)
            
            assert content_type in (
                'application/json',
                'application/json;charset=UTF-8'), f"Unexpected content type: {content_type}"

            data = json.loads(request.data.decode('utf-8'))
            status = data["status"]

            if status == "Pending":
                print(f"Query is pending")
                pos_in_queue = data["positionInQueue"]
                print(f"Position in queue: {pos_in_queue}")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status == "InProgress":
                print(f"Query in progress")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status in ("Complete", "Completed"):
                print(f"Query completed: {data}")
                csv_url = data["dataUrl"] if "dataUrl" in data else data["url"]
                return pd.read_csv(csv_url, low_memory=False)

            elif status == "Failed":
                raise Exception(f"Query failed, response: {data}")

            else:
                raise Exception(f"Unknown status: {data['status']}")
    
    def batch_get_measure(self, measure: Measure, station_id):
        try:
            api_url = self.API_BASE_URL + \
                f"data/batch-readings/batch/?measure={station_id}-{measure.value}-{HydrologyApi.units[measure]}"
                
            return self._batch_request(api_url)
        except Exception as e:
            print(f"Failed to get data for station: {station_id}, {e}")
            return None
        
    def batch_get_measure_on_river(self, measure: Measure, river):
        stations = self.get_stations_on_river(river)
        return self.batch_get_measure_from_stations(measure, stations)
        
    def batch_get_measure_from_stations(self, measure: Measure, stations):
        data = pd.DataFrame()
        threads = [
            self.thread_pool.submit(
                self.batch_get_measure, measure, station_id)
            for station_id in stations['notation'].values
        ]
        
        for thread, (station_id, station_name) in zip(threads, stations[['notation', 'label']].values):
            new_data = thread.result()
            if new_data is None:
                print(f"No new data for station: {station_name}")
                continue
            new_data = new_data.drop(columns=['measure', 'date', 'qcode', 'completeness'])
            new_data['station'] = station_name
            new_data['station'] = new_data['station'].astype('category')
            new_data['dateTime'] = pd.to_datetime(new_data['dateTime'])
            new_data['value'] = new_data['value'].astype('float32')
            new_data['quality'] = new_data['quality'].astype('category')
            data = pd.concat([data, new_data])
            data.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return data
        
        
    def get_filename(self, measure: Measure, river):
        return f"{river.lower().replace(' ', '_')}_{measure.value}_raw.parquet"
        
    def update_dataframe(self, df: pd.DataFrame, measure: Measure, river: str):
        for station_name, station_id in self.get_stations_on_river(river)[['label', 'notation']].values:
            print(f"Updating data for station: {station_name}")
            last = df[df['station'] == station_name]['dateTime'].max() if len(df) > 0 else None
            new_measurements = self.get_measure(measure, station_id, last)[['dateTime', 'value', 'quality']]
            new_measurements['station'] = station_name
            new_measurements['station'] = new_measurements['station'].astype('category')
            new_measurements['dateTime'] = pd.to_datetime(new_measurements['dateTime'])
            new_measurements['value'] = new_measurements['value'].astype('float32')
            print(f"Got {len(new_measurements)} new measurements")
            df = pd.concat([df, new_measurements])
        df.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return df
            
    def load(self, measure: Measure, river):
        if not os.path.exists(self.DATA_DIR):
            os.mkdir(self.DATA_DIR)
        
        filename = self.get_filename(measure, river)
        filepath = os.path.join(self.DATA_DIR, filename)
        if os.path.exists(filepath):
            print(f"Loading {filepath}")
            df = pd.read_parquet(filepath)
            df['dateTime'] = pd.to_datetime(df['dateTime'])
            df['station'] = df['station'].astype('category')
            df['value'] = df['value'].astype('float32')
        else:
            print(f"Downloading {measure.value} data on: {river}")
            df = self.batch_get_measure_on_river(measure, river)
            df.to_parquet(filepath)
        try:
            df = self.update_dataframe(df, measure, river)
        except Exception as e:
            print(f"Failed to update data: {e}")
        df.to_parquet(filepath)
        return df

def process_hydrology_data(df):
    return df[df['quality'].isin(['Good', 'Unchecked', 'Estimated'])] \
        .pivot(index='dateTime', columns='station', values='value') \
        .resample('15min').interpolate('time', limit_direction='both', limit=2)
        
def save_partitioned(df, dir, filename):
    if not os.path.exists(dir):
        os.mkdir(dir)
    # partition by year and month
    df['year'] = df.index.year
    df['month'] = df.index.month
    for year in df['year'].unique():
        for month in df[df['year'] == year]['month'].unique():
            partition = df[(df['year'] == year) & (df['month'] == month)]
            if len(partition) == 0:
                continue
            partition = partition.drop(columns=['year', 'month'])
            partition.to_parquet(os.path.join(dir, f"{filename}_{year}_{month}.parquet"))
        

api = HydrologyApi(max_threads = 2)


## Download Level Data

In [23]:
earliest = pd.Timestamp('2023-01-01')
level_df = api.load(Measure.LEVEL, "River Wear")
level_df.tail()

Loading data/river_wear_level_raw.parquet
{'meta': {'@id': 'http://environment.data.gov.uk/hydrology/id/stations?riverName=River+Wear', 'publisher': 'Environment Agency', 'license': 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/', 'licenseName': 'OGL 3', 'comment': 'Hydrology API for sub-daily data', 'version': '2.0.0', 'hasFormat': ['http://environment.data.gov.uk/hydrology/id/stations.geojson?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.json?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.ttl?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.html?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.csv?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.rdf?riverName=River+Wear'], 'limit': 100}, 'items': [{'@id': 'http://environment.data.gov.uk/hydrology/id/stations/e7d8bbb6-5bba-4057-9f49-a299482c3348', 'label': 'Chester

Unnamed: 0,dateTime,value,quality,station
80,2023-11-04 20:00:00,0.775,Unchecked,Durham New Elvet Bridge
81,2023-11-04 20:15:00,0.775,Unchecked,Durham New Elvet Bridge
82,2023-11-04 20:30:00,0.777,Unchecked,Durham New Elvet Bridge
83,2023-11-04 20:45:00,0.778,Unchecked,Durham New Elvet Bridge
84,2023-11-04 21:00:00,0.776,Unchecked,Durham New Elvet Bridge


In [24]:
level_df = process_hydrology_data(level_df).dropna().loc[earliest:]
level_df.tail()

station,Chester Le Street,Durham New Elvet Bridge,Stanhope,Sunderland Bridge,Witton Park
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-04 20:00:00,0.916,0.775,0.596,0.78,0.841
2023-11-04 20:15:00,0.914,0.775,0.591,0.775,0.842
2023-11-04 20:30:00,0.913,0.777,0.591,0.775,0.843
2023-11-04 20:45:00,0.911,0.778,0.591,0.773,0.847
2023-11-04 21:00:00,0.91,0.776,0.596,0.771,0.841


In [26]:
# level_df.to_parquet(os.path.join(HydrologyApi.DATA_DIR, 'river_wear_level.parquet'))
# del level_df # Free
# print('Saved river_wear_level.parquet')
# save_partitioned(level_df, os.path.join(HydrologyApi.DATA_DIR, 'level'), 'river_wear_level')
level_df.to_parquet(os.path.join(HydrologyApi.DATA_DIR, 'level', 'river_wear_level.parquet'))
del level_df # Free

## Download Flow Data

In [29]:
flow_df = api.load(Measure.FLOW, "River Wear")
flow_df = process_hydrology_data(flow_df).dropna().loc[earliest:]
flow_df.tail()

Loading data/river_wear_flow_raw.parquet
{'meta': {'@id': 'http://environment.data.gov.uk/hydrology/id/stations?riverName=River+Wear', 'publisher': 'Environment Agency', 'license': 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/', 'licenseName': 'OGL 3', 'comment': 'Hydrology API for sub-daily data', 'version': '2.0.0', 'hasFormat': ['http://environment.data.gov.uk/hydrology/id/stations.geojson?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.json?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.ttl?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.html?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.csv?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.rdf?riverName=River+Wear'], 'limit': 100}, 'items': [{'@id': 'http://environment.data.gov.uk/hydrology/id/stations/e7d8bbb6-5bba-4057-9f49-a299482c3348', 'label': 'Chester 

station,Chester Le Street,Stanhope,Sunderland Bridge,Witton Park
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-14 05:00:00,7.38,1.81,5.24,3.78
2023-10-14 05:15:00,7.34,1.81,5.21,3.76
2023-10-14 05:30:00,7.43,1.81,5.21,3.76
2023-10-14 05:45:00,7.34,1.81,5.21,3.76
2023-10-14 06:00:00,7.34,1.81,5.21,3.74


In [34]:
# save_partitioned(flow_df, os.path.join(HydrologyApi.DATA_DIR, 'flow'), 'river_wear_flow')
flow_df.columns = flow_df.columns.astype('str')
flow_df.to_parquet(os.path.join(HydrologyApi.DATA_DIR, 'flow', 'river_wear_flow.parquet'))
del flow_df

## Download Rainfall Data

In [6]:
rainfall_stations = api.get_stations_close_to_with_measure(54.66305556, -1.67611111, 10, Measure.RAINFALL, limit=5)

print(f"Got {len(rainfall_stations)} rainfall stations")
rainfall_stations.head()

Got 4 rainfall stations


Unnamed: 0,@id,label,notation,easting,northing,lat,long,type,stationGuid,wiskiID,dateOpened,observedProperty,status,measures
0,http://environment.data.gov.uk/hydrology/id/st...,Evenwood Gate,1dabd12c-1d2e-4765-ae38-a4d5a121928d,416820,523978,54.610671,-1.741089,[{'@id': 'http://environment.data.gov.uk/flood...,1dabd12c-1d2e-4765-ae38-a4d5a121928d,23164,1997-04-16,[{'@id': 'http://environment.data.gov.uk/refer...,{'@id': 'http://environment.data.gov.uk/flood-...,[{'@id': 'http://environment.data.gov.uk/hydro...
1,http://environment.data.gov.uk/hydrology/id/st...,ESH WINNING,15202aee-c5fd-404d-9de9-7357174ad10c,420084,542275,54.774977,-1.689302,[{'@id': 'http://environment.data.gov.uk/flood...,15202aee-c5fd-404d-9de9-7357174ad10c,24121,,[{'@id': 'http://environment.data.gov.uk/refer...,{'@id': 'http://environment.data.gov.uk/flood-...,[{'@id': 'http://environment.data.gov.uk/hydro...
2,http://environment.data.gov.uk/hydrology/id/st...,Harpington Hill Farm,bf61ce31-b20e-4593-85dc-a083133b12ce,433631,526654,54.633882,-1.480513,[{'@id': 'http://environment.data.gov.uk/flood...,bf61ce31-b20e-4593-85dc-a083133b12ce,32822,1987-05-12,[{'@id': 'http://environment.data.gov.uk/refer...,{'@id': 'http://environment.data.gov.uk/flood-...,[{'@id': 'http://environment.data.gov.uk/hydro...
3,http://environment.data.gov.uk/hydrology/id/st...,Copley,bc34e640-d9ae-4362-8804-25d66ca66e4d,408498,525452,54.624124,-1.869893,[{'@id': 'http://environment.data.gov.uk/flood...,bc34e640-d9ae-4362-8804-25d66ca66e4d,22994,1997-04-02,[{'@id': 'http://environment.data.gov.uk/refer...,{'@id': 'http://environment.data.gov.uk/flood-...,[{'@id': 'http://environment.data.gov.uk/hydro...


In [7]:

rainfall_df = api.batch_get_measure_from_stations(Measure.RAINFALL, rainfall_stations)
rainfall_df = process_hydrology_data(rainfall_df).dropna().loc[earliest:]

Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=1dabd12c-1d2e-4765-ae38-a4d5a121928d-rainfall-t-900-mm-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=15202aee-c5fd-404d-9de9-7357174ad10c-rainfall-t-900-mm-qualified


Got empty CSV
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=bf61ce31-b20e-4593-85dc-a083133b12ce-rainfall-t-900-mm-qualified


In [1]:
all_data = pd.concat([
  pd.read_parquet(os.path.join(HydrologyApi.DATA_DIR, 'level', 'river_wear_level.parquet')),
  pd.read_parquet(os.path.join(HydrologyApi.DATA_DIR, 'flow', 'river_wear_flow.parquet')),
  rainfall_df], axis=1)
del rainfall_df


NameError: name 'pd' is not defined