In [40]:
from urllib.request import urlopen
import urllib3
import json
import pandas as pd
from time import sleep
from tqdm import tqdm
from io import StringIO
import os

In [24]:
class HydrologyApi:
    API_BASE_URL = "https://environment.data.gov.uk/hydrology/"
    
    def __init__(self):
        self.http = urllib3.PoolManager()
    
    def get_stations_on_river(self, river):
        api_url = self.API_BASE_URL + 'id/stations'
        result = urlopen(
            api_url + '?' + urlencode({'riverName': river})).read().decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
    
    def get_levels(self, station):
        api_url = self.API_BASE_URL + f"id/measures/{station}-level-i-900-m-qualified/readings"
        result = urlopen(api_url).read().decode('utf-8')
        data = json.loads(result)
        return pd.DataFrame(data['items'])
    
    def batch_get_levels(self, station):
        
        api_url = self.API_BASE_URL + \
            f"data/batch-readings/batch/?measure={station}-level-i-900-m-qualified"
            
        status = "Pending"
        
        while status == "Pending":
            request = self.http.request('GET', api_url, headers={'Accept-Encoding': 'gzip'})
            content_type = request.headers['Content-Type']
            
            if content_type == 'text/csv':
                buffer = StringIO(request.data.decode('utf-8'))
                return pd.read_csv(buffer, low_memory=False)
            
            assert content_type == 'application/json'
            
            data = json.loads(request.data.decode('utf-8'))
            status = data["status"]
            
            if status == "Pending":
                print(f"Query is pending")
                pos_in_queue = data["positionInQueue"]
                print(f"Position in queue: {pos_in_queue}")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 0.8)
                
            elif status == "InProgress":
                print(f"Query in progress")
                eta = data["eta"] / 1000
                print(f"Estimated completion: {eta}")
                sleep(eta * 0.8)
                
            elif status == "Complete":
                csv_url = data["dataUrl"]
                return pd.read_csv(csv_url, low_memory=False)
            
            elif status == "Failed":
                raise Exception("Query failed")
            
            else:
                raise Exception(f"Unknown status: {data['status']}")

In [29]:
api = HydrologyApi()
stations = api.get_stations_on_river('River Wear')

print(f"Found {len(stations)} stations: {stations['label'].values}")

Found 5 stations: ['Chester Le Street' 'Witton Park' 'Sunderland Bridge' 'Stanhope'
 'Durham New Elvet Bridge']


In [58]:
if os.path.exists('data/water_level_data_raw.parquet'):
    print(f"Found existing data, skipping download\n")
    water_level_data = pd.read_parquet('data/water_level_data_raw.parquet')
    
else:
  water_level_data = pd.DataFrame()

  for station_name, station_id in stations[['label', 'notation']].values:
    print(f"Loading data for {station_name}")
    try:
      df = api.batch_get_levels(station_id)
      df = df.drop(columns=['measure', 'date', 'qcode', 'completeness'])
      df['station'] = station_name
      
      df['dateTime'] = pd.to_datetime(df['dateTime'])
      df['station'] = df['station'].astype('category')
      df['value'] = df['value'].astype('float32')
      df['quality'] = df['quality'].astype('category')
      
      print(f"Loaded {len(df)} datapoints for {station_name}\n")
      water_level_data = pd.concat([water_level_data, df])
    except Exception as e:
      print(f"Failed to load data for {station_name}: {e}")
      
    
  print(f"Loaded {len(water_level_data)} datapoints in total")
  print(f"Saving to disk")
    
  water_level_data.to_parquet('data/water_level_data_raw.parquet')

water_level_data.info()

Found existing data, skipping download

<class 'pandas.core.frame.DataFrame'>
Index: 6635237 entries, 0 to 631478
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   dateTime      datetime64[ns]
 1   value         float32       
 2   completeness  float64       
 3   quality       object        
 4   station       object        
dtypes: datetime64[ns](1), float32(1), float64(1), object(2)
memory usage: 278.4+ MB


In [51]:
water_level_data.quality.value_counts(normalize=True).to_frame().style.format('{:.1%}')

Unnamed: 0_level_0,proportion
quality,Unnamed: 1_level_1
Good,94.0%
Missing,2.5%
Suspect,2.5%
Unchecked,0.5%
Estimated,0.5%


The majority of the data is classed as good, so we can drop rows that arn't.

In [52]:
water_level_data.drop(water_level_data[water_level_data.quality != 'Good'].index, inplace=True)
water_level_data.drop(columns=['quality'], inplace=True)

In [68]:
water_level_data_wide = water_level_data.pivot(index='dateTime', columns='station', values='value').reset_index().rename_axis(None, axis=1)
water_level_data_wide

Unnamed: 0,dateTime,Chester Le Street,Durham New Elvet Bridge,Stanhope,Sunderland Bridge,Witton Park
0,1961-01-29 01:00:00,,,0.610,,
1,1961-01-29 01:15:00,,,0.610,,
2,1961-01-29 01:30:00,,,0.610,,
3,1961-01-29 01:45:00,,,0.610,,
4,1961-01-29 02:00:00,,,0.610,,
...,...,...,...,...,...,...
1563781,2023-10-20 20:30:00,2.277,2.260,1.489,1.816,2.271
1563782,2023-10-20 20:45:00,2.315,2.291,1.480,1.841,2.278
1563783,2023-10-20 21:00:00,2.343,2.325,1.463,1.859,2.234
1563784,2023-10-20 21:15:00,2.376,2.340,1.456,1.884,2.235


In [69]:
# Where there are missing values, use a linear interpolation to fill them in

water_level_data_wide = water_level_data_wide.interpolate(method='linear', limit_direction='both', limit=3)
water_level_data_wide

Unnamed: 0,dateTime,Chester Le Street,Durham New Elvet Bridge,Stanhope,Sunderland Bridge,Witton Park
0,1961-01-29 01:00:00,,,0.610,,
1,1961-01-29 01:15:00,,,0.610,,
2,1961-01-29 01:30:00,,,0.610,,
3,1961-01-29 01:45:00,,,0.610,,
4,1961-01-29 02:00:00,,,0.610,,
...,...,...,...,...,...,...
1563781,2023-10-20 20:30:00,2.277,2.260,1.489,1.816,2.271
1563782,2023-10-20 20:45:00,2.315,2.291,1.480,1.841,2.278
1563783,2023-10-20 21:00:00,2.343,2.325,1.463,1.859,2.234
1563784,2023-10-20 21:15:00,2.376,2.340,1.456,1.884,2.235
