In [21]:
from urllib.request import urlopen
from urllib.parse import urlencode
import urllib3
import json
import pandas as pd
from time import sleep
from tqdm import tqdm
from io import StringIO
import os
from typing import Literal
from enum import Enum

In [173]:
class Measure(Enum):
    """Kinds of reading this notebook downloads.

    The member value is used in log messages and in cache file names.
    """
    LEVEL = 'level'
    RAINFALL = 'rainfall'


class HydrologyApi:
    """Small client for the hydrology API rooted at ``API_BASE_URL``.

    Readings are fetched either through the plain readings endpoint
    (``get_levels``) or through the batch endpoint (``batch_get_*``), and
    ``load`` keeps a per-river parquet cache under ``DATA_DIR``.
    """

    API_BASE_URL = "https://environment.data.gov.uk/hydrology/"
    DATA_DIR = "data"

    # Suffix appended to "<station id>-" to form a measure id.
    # NOTE(review): the rainfall run recorded in this notebook only produced
    # empty CSVs, so the rainfall suffix should be confirmed against the API.
    _MEASURE_SUFFIXES = {
        Measure.LEVEL: "level-i-900-m-qualified",
        Measure.RAINFALL: "rainfall-i-900",
    }

    def __init__(self):
        # One pool shared by every urllib3 request this client makes.
        self.http = urllib3.PoolManager()

    @staticmethod
    def _format_date(value):
        """Return ``value`` as a ``YYYY-MM-DD`` string, or None when there is no date.

        None, NaT/NaN and empty strings yield None; objects with ``strftime``
        are formatted; anything else is passed through ``str``.
        """
        if value is None or pd.isna(value):
            return None
        if hasattr(value, 'strftime'):
            return value.strftime('%Y-%m-%d')
        return str(value) or None

    def get_stations_on_river(self, river):
        """Return the stations the API lists for ``river`` as a DataFrame.

        Args:
            river: value sent as the ``riverName`` query parameter.

        Returns:
            DataFrame built from the ``items`` array of the JSON response.
        """
        api_url = self.API_BASE_URL + 'id/stations'
        query = urlencode({'riverName': river})
        # Close the response explicitly rather than leaving it to the GC.
        with urlopen(api_url + '?' + query) as response:
            payload = json.loads(response.read().decode('utf-8'))
        return pd.DataFrame(payload['items'])

    def get_levels(self, station_id, start):
        """Fetch level readings for one station from the readings endpoint.

        Args:
            station_id: station identifier used to build the measure id.
            start: date-like sent as the ``mineq-date`` filter. None or NaT
                (e.g. a station with no stored rows yet) omits the filter.

        Returns:
            DataFrame built from the ``items`` array; empty when there are none.
        """
        suffix = self._MEASURE_SUFFIXES[Measure.LEVEL]
        api_url = self.API_BASE_URL + f"id/measures/{station_id}-{suffix}/readings"

        fields = {}
        start_text = self._format_date(start)
        if start_text:
            fields['mineq-date'] = start_text

        response = self.http.request('GET', api_url, fields=fields)
        payload = json.loads(response.data.decode('utf-8'))
        return pd.DataFrame(payload['items'])

    def _batch_request(self, api_url, query_params):
        """Poll a batch query until it yields data, then return it as a DataFrame.

        Args:
            api_url: batch endpoint URL (may already contain a query string).
            query_params: extra query parameters to append; falsy for none.

        Returns:
            The parsed CSV as a DataFrame, or None when the server answers
            with an empty CSV body.

        Raises:
            AssertionError: on a response that is neither CSV nor JSON.
            Exception: when the query reports "Failed" or an unknown status.
        """
        if query_params:
            # The callers' URLs already carry "?measure=...", so further
            # filters have to be joined with "&" instead of a second "?".
            joiner = '&' if '?' in api_url else '?'
            api_url = api_url + joiner + urlencode(query_params)

        while True:
            response = self.http.request(
                'GET',
                api_url,
                headers={'Accept-Encoding': 'gzip'},
            )
            content_type = response.headers.get('Content-Type', '')
            # Compare only the media type so "; charset=..." variants match.
            media_type = content_type.split(';', 1)[0].strip().lower()

            if media_type == 'text/csv':
                if len(response.data) == 0:
                    print('Got empty CSV')
                    return None
                buffer = StringIO(response.data.decode('utf-8'))
                return pd.read_csv(buffer, low_memory=False)

            assert media_type == 'application/json', \
                f"Unexpected content type: {content_type}"

            data = json.loads(response.data.decode('utf-8'))
            status = data["status"]

            if status in ("Pending", "InProgress"):
                if status == "Pending":
                    print("Query is pending")
                    pos_in_queue = data["positionInQueue"]
                    print(f"Position in queue: {pos_in_queue}")
                else:
                    print("Query in progress")
                # The code treats "eta" as milliseconds; wait a little longer
                # than advertised before polling again.
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(max(eta * 1.1, 0))
                continue

            if status in ("Complete", "Completed"):
                print(f"Query completed: {data}")
                csv_url = data["dataUrl"] if "dataUrl" in data else data["url"]
                return pd.read_csv(csv_url, low_memory=False)

            if status == "Failed":
                raise Exception(f"Query failed, response: {data}")

            raise Exception(f"Unknown status: {data['status']}")

    def _batch_get_readings(self, measure, station_id, start_date=None):
        """Run a batch query for one station's ``measure`` readings."""
        suffix = self._MEASURE_SUFFIXES[measure]
        api_url = self.API_BASE_URL + \
            f"data/batch-readings/batch/?measure={station_id}-{suffix}"
        start_text = self._format_date(start_date)
        return self._batch_request(
            api_url, {'mineq-date': start_text} if start_text else {})

    def batch_get_levels(self, station_id, start_date=None):
        """Batch-download level readings for ``station_id``.

        ``start_date`` (string or date-like) is sent as ``mineq-date`` when given.
        Returns a DataFrame, or None when the server returns an empty CSV.
        """
        return self._batch_get_readings(Measure.LEVEL, station_id, start_date)

    def batch_get_rainfall(self, station_id, start_date=None):
        """Batch-download rainfall readings for ``station_id``.

        ``start_date`` (string or date-like) is sent as ``mineq-date`` when given.
        Returns a DataFrame, or None when the server returns an empty CSV.
        """
        return self._batch_get_readings(Measure.RAINFALL, station_id, start_date)

    def batch_get_measure(self, measure: Measure, station_id, start_date=None):
        """Dispatch to the ``batch_get_*`` method that matches ``measure``."""
        return {
            Measure.LEVEL: self.batch_get_levels,
            Measure.RAINFALL: self.batch_get_rainfall
        }[measure](station_id, start_date)

    def batch_get_measure_on_river(self, measure: Measure, river, start_date=None):
        """Batch-download ``measure`` readings for every station on ``river``.

        Each station's frame loses the 'measure', 'date', 'qcode' and
        'completeness' columns, gains a 'station' column holding the station
        label, and has its dtypes normalised. Stations that return no data are
        skipped.

        Returns:
            One DataFrame with duplicate (dateTime, station) rows removed, or
            an empty DataFrame when no station produced data.
        """
        stations = self.get_stations_on_river(river)
        if stations.empty:
            return pd.DataFrame()

        # Collect per-station frames and concatenate once at the end instead
        # of re-copying the accumulated frame on every iteration.
        frames = []
        for station_id, station_name in stations[['notation', 'label']].values:
            print(f"Downloading {measure.value} data for station: {station_name}")
            new_data = self.batch_get_measure(measure, station_id, start_date)
            if new_data is None:
                print(f"No new data for station: {station_name}")
                continue
            new_data = new_data.drop(columns=['measure', 'date', 'qcode', 'completeness'])
            new_data['station'] = station_name
            new_data['station'] = new_data['station'].astype('category')
            new_data['dateTime'] = pd.to_datetime(new_data['dateTime'])
            new_data['value'] = new_data['value'].astype('float32')
            new_data['quality'] = new_data['quality'].astype('category')
            frames.append(new_data)

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames).drop_duplicates(subset=['dateTime', 'station'])

    def get_filename(self, measure: Measure, river):
        """Return the cache file name for ``river``/``measure`` (lower-case, spaces as underscores)."""
        return f"{river.lower().replace(' ', '_')}_{measure.value}_raw.parquet"

    def update_dataframe(self, df: pd.DataFrame, measure: Measure, river: str):
        """Append readings newer than those already in ``df`` for each station on ``river``.

        For every station, the latest stored ``dateTime`` is sent to
        ``get_levels`` as the start date; a station with no stored rows is
        queried without a start date. Only Measure.LEVEL is supported.

        Returns:
            A new DataFrame with the fetched rows appended and duplicate
            (dateTime, station) rows removed; ``df`` itself is not modified.
        """
        assert measure == Measure.LEVEL, \
            "update_dataframe only supports Measure.LEVEL"

        stations = self.get_stations_on_river(river)
        frames = [df]
        if not stations.empty:
            for station_name, station_id in stations[['label', 'notation']].values:
                print(f"Updating data for station: {station_name}")
                # NaT when the station has no rows yet; get_levels handles that.
                last = df.loc[df['station'] == station_name, 'dateTime'].max()
                readings = self.get_levels(station_id, last)
                if readings.empty:
                    # An empty response has no columns to select from.
                    print("Got 0 new measurements")
                    continue
                new_measurements = readings[['dateTime', 'value', 'quality']].copy()
                new_measurements['station'] = station_name
                new_measurements['station'] = new_measurements['station'].astype('category')
                new_measurements['dateTime'] = pd.to_datetime(new_measurements['dateTime'])
                new_measurements['value'] = new_measurements['value'].astype('float32')
                print(f"Got {len(new_measurements)} new measurements")
                frames.append(new_measurements)

        return pd.concat(frames).drop_duplicates(subset=['dateTime', 'station'])

    def load(self, measure: Measure, river):
        """Return the readings for ``river``, using and refreshing the parquet cache.

        If the cache file exists it is read, its dtypes are normalised and it
        is brought up to date via ``update_dataframe``; otherwise everything is
        downloaded through the batch API. A non-empty result is written back
        to the cache (creating ``DATA_DIR`` if needed).
        """
        filename = self.get_filename(measure, river)
        filepath = os.path.join(self.DATA_DIR, filename)
        if os.path.exists(filepath):
            print(f"Loading {filepath}")
            df = pd.read_parquet(filepath)
            df['dateTime'] = pd.to_datetime(df['dateTime'])
            df['station'] = df['station'].astype('category')
            df['value'] = df['value'].astype('float32')
            df = self.update_dataframe(df, measure, river)
        else:
            print(f"Downloading {measure.value} data on: {river}")
            df = self.batch_get_measure_on_river(measure, river)

        if df.empty:
            # Caching a column-less frame would make the next load() fail on
            # df['dateTime'], so leave the cache untouched.
            print(f"No data to cache for {filepath}")
            return df

        if self.DATA_DIR:
            os.makedirs(self.DATA_DIR, exist_ok=True)
        df.to_parquet(filepath)
        return df

def process_hydrology_data(df):
    """Turn long-format readings into a regular 15-minute table, one column per station.

    Steps:
      1. keep rows whose ``quality`` is 'Good', 'Unchecked' or 'Estimated';
      2. pivot so ``dateTime`` is the index and each ``station`` is a column
         holding ``value``;
      3. resample to 15 minutes and time-interpolate gaps (``limit=2``,
         both directions).

    Args:
        df: frame with at least 'dateTime', 'station', 'value' and 'quality'
            columns; a 'completeness' column is dropped when present.

    Returns:
        The pivoted, resampled and interpolated DataFrame.
    """
    accepted = df[df['quality'].isin(['Good', 'Unchecked', 'Estimated'])]
    # Frames produced by batch_get_measure_on_river have already had
    # 'completeness' removed, so its absence must not raise here.
    accepted = accepted.drop(columns=['completeness'], errors='ignore')
    wide = accepted.pivot(index='dateTime', columns='station', values='value')
    return wide.resample('15min').interpolate('time', limit_direction='both', limit=2)
        

# Shared client instance used by the cells below.
api = HydrologyApi()


In [174]:
# Load the River Wear level readings (from the parquet cache when it exists,
# otherwise via a batch download) and preview the first rows.
level_df = api.load(Measure.LEVEL, "River Wear")
level_df.head()

Loading data/river_wear_level_raw.parquet
Updating data for station: Chester Le Street
Got 1252 new measurements
Updating data for station: Witton Park
Got 1252 new measurements
Updating data for station: Sunderland Bridge
Got 1252 new measurements
Updating data for station: Stanhope
Got 1252 new measurements
Updating data for station: Durham New Elvet Bridge
Got 1252 new measurements


Unnamed: 0,dateTime,value,completeness,quality,station
0,1981-09-29 10:00:00,0.443,,Unchecked,Chester Le Street
1,1981-09-29 04:00:00,0.44,,Unchecked,Chester Le Street
2,1981-09-29 04:15:00,0.44,,Unchecked,Chester Le Street
3,1981-09-29 04:30:00,0.44,,Unchecked,Chester Le Street
4,1981-09-29 04:45:00,0.44,,Unchecked,Chester Le Street


In [176]:
# Order the readings chronologically (in place), then display the frame.
level_df.sort_values(by='dateTime', inplace=True)
level_df

Unnamed: 0,dateTime,value,completeness,quality,station
0,1961-01-29 01:00:00,0.610,,Unchecked,Stanhope
1,1961-01-29 01:15:00,0.610,,Unchecked,Stanhope
2,1961-01-29 01:30:00,0.610,,Unchecked,Stanhope
3,1961-01-29 01:45:00,0.610,,Unchecked,Stanhope
4,1961-01-29 02:00:00,0.610,,Unchecked,Stanhope
...,...,...,...,...,...
1251,2023-11-02 12:00:00,1.416,,,Chester Le Street
1251,2023-11-02 12:00:00,1.308,,,Sunderland Bridge
1251,2023-11-02 12:00:00,1.538,,,Witton Park
1251,2023-11-02 12:00:00,1.276,,,Stanhope


In [None]:
# Re-query each station from its latest stored timestamp and merge the results.
level_df = api.update_dataframe(level_df, Measure.LEVEL, "River Wear")

Updating data for station: Chester Le Street
Got 1252 new measurements
Updating data for station: Witton Park
Got 1252 new measurements
Updating data for station: Sunderland Bridge
Got 1252 new measurements
Updating data for station: Stanhope
Got 1252 new measurements
Updating data for station: Durham New Elvet Bridge
Got 1252 new measurements


In [168]:
# Most recent timestamp currently held in the frame.
level_df.dateTime.max()

Timestamp('2023-10-20 21:30:00')

In [130]:
# Filter by quality flag, pivot to one column per station and resample to 15 minutes.
# NOTE: this rebinds level_df to the pivoted frame, which no longer has a 'quality'
# column, so the cell cannot be re-run without reloading the raw data first.
level_df = process_hydrology_data(level_df)
level_df.head()

station,Chester Le Street,Durham New Elvet Bridge,Stanhope,Sunderland Bridge,Witton Park
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-10-18 13:00:00,0.307,0.248,0.162,0.307,0.372
2005-10-18 13:15:00,0.313,0.248,0.16,0.312,0.372
2005-10-18 13:30:00,0.313,0.248,0.16,0.31,0.371
2005-10-18 13:45:00,0.314,0.248,0.161,0.31,0.371
2005-10-18 14:00:00,0.312,0.248,0.16,0.309,0.371


In [131]:
# Column dtypes, non-null counts and memory usage of the processed frame.
level_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 463280 entries, 2005-10-18 13:00:00 to 2023-08-08 10:45:00
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Chester Le Street        463280 non-null  float32
 1   Durham New Elvet Bridge  463280 non-null  float32
 2   Stanhope                 463280 non-null  float32
 3   Sunderland Bridge        463280 non-null  float32
 4   Witton Park              463280 non-null  float32
dtypes: float32(5)
memory usage: 12.4 MB


In [None]:
# Write the processed level frame to parquet, then drop the reference to it.
level_df.to_parquet('data/river_wear_level.parquet')
del level_df

In [125]:
# Load (or download, when no cache file exists) the rainfall readings for the
# River Wear stations and preview the first rows.
rainfall_df = api.load(Measure.RAINFALL, "River Wear")
rainfall_df.head()

Downloading rainfall data on: River Wear
Downloading rainfall data for station: Chester Le Street
Query in progress
Estimated completion: 59.929
No new data for station: Chester Le Street
Downloading rainfall data for station: Witton Park
Query in progress
Estimated completion: 59.993
No new data for station: Witton Park
Downloading rainfall data for station: Sunderland Bridge
Query in progress
Estimated completion: 59.963
No new data for station: Sunderland Bridge
Downloading rainfall data for station: Stanhope
Query in progress
Estimated completion: 59.916


KeyboardInterrupt: 

In [None]:
# Apply the same quality filter, pivot and 15-minute resample to the rainfall readings.
rainfall_df = process_hydrology_data(rainfall_df)
rainfall_df.head()

In [None]:
# Write the processed rainfall frame to parquet, then drop the reference to it.
rainfall_df.to_parquet('data/river_wear_rainfall.parquet')
del rainfall_df

Most of the data is classed as good, so we can drop the rows that aren't.