In [8]:
from urllib.request import urlopen
from urllib.parse import urlencode
import urllib3
import json
import pandas as pd
from time import sleep
from io import StringIO
import os
from enum import Enum
from concurrent.futures import ThreadPoolExecutor

In [35]:
class Measure(Enum):
    LEVEL = 'level'
    FLOW = 'flow'

class HydrologyApi:
    API_BASE_URL = "https://environment.data.gov.uk/hydrology/"
    DATA_DIR = "data"
    
    def __init__(self, max_threads):
        self.http = urllib3.PoolManager(maxsize=max_threads)
        self.thread_pool = ThreadPoolExecutor(max_workers=max_threads)
    
    def get_stations_on_river(self, river):
        api_url = self.API_BASE_URL + 'id/stations'
        result = urlopen(
            api_url + '?' + urlencode({'riverName': river})).read().decode('utf-8')
        data = json.loads(result)
        print(data)
        return pd.DataFrame(data['items'])
    
    def get_measure(self, station_id: str, measure: Measure, start=None):
        api_url = self.API_BASE_URL + f"id/measures/{station_id}-{measure.value}-i-900-m-qualified/readings"
        # result = urlopen(api_url).read().decode('utf-8')
        result = self.http.request(
            'GET',
            api_url,
            fields={}
                | ({
                    'mineq-date': start.strftime('%Y-%m-%d')
                } if start is not None else {}),
        ).data.decode('utf-8')
        print({} 
                | ({
                    'mineq-date': start.strftime('%Y-%m-%d')
                } if start is not None else {}))
        data = json.loads(result)
        print(data)
        return pd.DataFrame(data['items'])
    
    def _batch_request(self, api_url):
        status = "Pending"

        while status in ("Pending", "InProgress"):
            print(f"Making request to: {api_url}")
            
            request = self.http.request(
                'GET', 
                api_url, 
                headers={
                    'Accept-Encoding': 'gzip'
                }
            )
            content_type = request.headers['Content-Type']

            if content_type == 'text/csv':
                if len(request.data) == 0:
                    print('Got empty CSV')
                    return None
                buffer = StringIO(request.data.decode('utf-8'))
                return pd.read_csv(buffer, low_memory=False)
            
            assert content_type in (
                'application/json',
                'application/json;charset=UTF-8'), f"Unexpected content type: {content_type}"

            data = json.loads(request.data.decode('utf-8'))
            status = data["status"]

            if status == "Pending":
                print(f"Query is pending")
                pos_in_queue = data["positionInQueue"]
                print(f"Position in queue: {pos_in_queue}")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status == "InProgress":
                print(f"Query in progress")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 1.1)

            elif status in ("Complete", "Completed"):
                print(f"Query completed: {data}")
                csv_url = data["dataUrl"] if "dataUrl" in data else data["url"]
                return pd.read_csv(csv_url, low_memory=False)

            elif status == "Failed":
                raise Exception(f"Query failed, response: {data}")

            else:
                raise Exception(f"Unknown status: {data['status']}")
    
    def batch_get_measure(self, measure: Measure, station_id):
        api_url = self.API_BASE_URL + \
            f"data/batch-readings/batch/?measure={station_id}-{measure.value}-i-900"
            
        return self._batch_request(api_url)
        
    def batch_get_measure_on_river(self, measure: Measure, river):
        data = pd.DataFrame()
    
        stations = self.get_stations_on_river(river)
        
        threads = [
            self.thread_pool.submit(
                self.batch_get_measure, measure, station_id)
            for station_id in stations['notation'].values
        ]
        
        for thread, (station_id, station_name) in zip(threads, stations[['notation', 'label']].values):
            new_data = thread.result()
            if new_data is None:
                print(f"No new data for station: {station_name}")
                continue
            new_data = new_data.drop(columns=['measure', 'date', 'qcode', 'completeness'])
            new_data['station'] = station_name
            new_data['station'] = new_data['station'].astype('category')
            new_data['dateTime'] = pd.to_datetime(new_data['dateTime'])
            new_data['value'] = new_data['value'].astype('float32')
            new_data['quality'] = new_data['quality'].astype('category')
            data = pd.concat([data, new_data])
            data.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return data
        
    def get_filename(self, measure: Measure, river):
        return f"{river.lower().replace(' ', '_')}_{measure.value}_raw.parquet"
        
    def update_dataframe(self, df: pd.DataFrame, measure: Measure, river: str):
        for station_name, station_id in self.get_stations_on_river(river)[['label', 'notation']].values:
            print(f"Updating data for station: {station_name}")
            last = df[df['station'] == station_name]['dateTime'].max() if len(df) > 0 else None
            new_measurements = self.get_measure(station_id, measure, last)[['dateTime', 'value', 'quality']]
            new_measurements['station'] = station_name
            new_measurements['station'] = new_measurements['station'].astype('category')
            new_measurements['dateTime'] = pd.to_datetime(new_measurements['dateTime'])
            new_measurements['value'] = new_measurements['value'].astype('float32')
            print(f"Got {len(new_measurements)} new measurements")
            df = pd.concat([df, new_measurements])
        df.drop_duplicates(subset=['dateTime', 'station'], inplace=True)
        return df
            
    def load(self, measure: Measure, river):
        if not os.path.exists(self.DATA_DIR):
            os.mkdir(self.DATA_DIR)
        
        filename = self.get_filename(measure, river)
        filepath = os.path.join(self.DATA_DIR, filename)
        if os.path.exists(filepath):
            print(f"Loading {filepath}")
            df = pd.read_parquet(filepath)
            df['dateTime'] = pd.to_datetime(df['dateTime'])
            df['station'] = df['station'].astype('category')
            df['value'] = df['value'].astype('float32')
        else:
            print(f"Downloading {measure.value} data on: {river}")
            df = self.batch_get_measure_on_river(measure, river)
        df = self.update_dataframe(df, measure, river)
        df.to_parquet(filepath)
        return df

def process_hydrology_data(df):
    return df[df['quality'].isin(['Good', 'Unchecked', 'Estimated'])] \
        .pivot(index='dateTime', columns='station', values='value') \
        .resample('15min').interpolate('time', limit_direction='both', limit=2)
        

api = HydrologyApi(max_threads = 2)


In [4]:
level_df = api.load(Measure.LEVEL, "River Wear")
level_df.tail()

Downloading level data on: River Wear
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=e7d8bbb6-5bba-4057-9f49-a299482c3348-level-i-900-m-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=05784319-693a-4d75-b29e-32f01a99ee4f-level-i-900-m-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ddedb4d9-b2be-47c1-998d-acbc0ffb124b-level-i-900-m-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=b29c481a-5012-40f5-bb0c-f9370be34975-level-i-900-m-qualified
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ba3f8598-e654-430d-9bb8-e1652e6ff93d-level-i-900-m-qualified
Updating data for station: Chester Le Street
Got 1279 new measurements
Updating data for station: Witton Park
Got 1279 new measurements
Updating data for station: Sunderland Bri

Unnamed: 0,dateTime,value,quality,station
1274,2023-11-02 17:45:00,2.451,Unchecked,Durham New Elvet Bridge
1275,2023-11-02 18:00:00,2.516,Unchecked,Durham New Elvet Bridge
1276,2023-11-02 18:15:00,2.52,Unchecked,Durham New Elvet Bridge
1277,2023-11-02 18:30:00,2.574,Unchecked,Durham New Elvet Bridge
1278,2023-11-02 18:45:00,2.595,Unchecked,Durham New Elvet Bridge


In [5]:
level_df = process_hydrology_data(level_df)
level_df.tail()

station,Chester Le Street,Durham New Elvet Bridge,Stanhope,Sunderland Bridge,Witton Park
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-02 17:45:00,2.333,2.451,1.572,2.1,2.719
2023-11-02 18:00:00,2.377,2.516,1.555,2.155,2.682
2023-11-02 18:15:00,2.415,2.52,1.532,2.192,2.647
2023-11-02 18:30:00,2.455,2.574,1.511,2.228,2.599
2023-11-02 18:45:00,2.491,2.595,1.49,2.256,2.536


In [6]:
level_df.to_parquet(os.path.join(HydrologyApi.DATA_DIR, 'river_wear_level.parquet'))

In [36]:
flow_df = api.load(Measure.FLOW, "River Wear")
flow_df.tail()

Downloading flow data on: River Wear
{'meta': {'@id': 'http://environment.data.gov.uk/hydrology/id/stations?riverName=River+Wear', 'publisher': 'Environment Agency', 'license': 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/', 'licenseName': 'OGL 3', 'comment': 'Hydrology API for sub-daily data', 'version': '2.0.0', 'hasFormat': ['http://environment.data.gov.uk/hydrology/id/stations.geojson?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.json?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.ttl?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.html?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.csv?riverName=River+Wear', 'http://environment.data.gov.uk/hydrology/id/stations.rdf?riverName=River+Wear'], 'limit': 100}, 'items': [{'@id': 'http://environment.data.gov.uk/hydrology/id/stations/e7d8bbb6-5bba-4057-9f49-a299482c3348', 'label': 'Chester Le S

Query in progress
Estimated completion: 59.973
Query in progress
Estimated completion: 59.99
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=e7d8bbb6-5bba-4057-9f49-a299482c3348-flow-i-900
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=05784319-693a-4d75-b29e-32f01a99ee4f-flow-i-900
Got empty CSV
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=ddedb4d9-b2be-47c1-998d-acbc0ffb124b-flow-i-900
No new data for station: Chester Le Street
Got empty CSV
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=b29c481a-5012-40f5-bb0c-f9370be34975-flow-i-900
No new data for station: Witton Park
Query in progressQuery in progress
Estimated completion: 59.982

Estimated completion: 59.937
Making request to: https://environment.data.gov.uk/hydrology/data/batch-readings/batch/?measure=b29c481a-5012-40f5-bb0c-f9370be34

KeyError: "None of [Index(['dateTime', 'value', 'quality'], dtype='object')] are in the [columns]"