In [1]:
import urllib.request, json
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import math
import datetime
import pickle
import math
import collections
from tqdm import tqdm
import time

In [4]:
# Download the full WebTRIS site catalogue once; the per-area filtering further
# down works on this in-memory list instead of re-querying the endpoint.
url_text = "https://webtris.highwaysengland.co.uk/api/v1/sites"
with urllib.request.urlopen(url_text) as url:
    sites_payload = url.read().decode()
data = json.loads(sites_payload)

# The individual site records live under the 'sites' key of the response.
sites = data['sites']
# Peek at the first record. Not rendered, because it is not the last
# expression of the cell; it only fails if the catalogue cannot be indexed.
sites[0]

def get_quality_area(sites,
                     max_lat,
                     max_long,
                     min_lat,
                     min_long,
                     start_date='01062021',
                     end_date = '15062022',
                     quality_threshold = 90):
    """Return the sites inside a bounding box whose overall data quality is good enough.

    One HTTP request is made to the WebTRIS ``quality/overall`` endpoint for
    every site that falls inside the box.

    Parameters
    ----------
    sites : list of dict
        Site records (anything ``pd.DataFrame`` accepts). Each record must
        provide ``Id``, ``Longitude`` and ``Latitude``.
    max_lat, max_long, min_lat, min_long : float
        Bounding box. All four bounds are exclusive.
    start_date, end_date : str
        Period inserted verbatim into the quality query; this notebook uses
        ``ddmmyyyy`` strings.
    quality_threshold : number
        Minimum ``data_quality`` value (inclusive) a site must report to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of the site table that lie inside the box and meet the
        threshold, with a fresh 0..n-1 index.
    """
    # Convert sites query into df and filter onto our area
    sites_df = pd.DataFrame(data = sites)
    inside_box = (
        (min_long < sites_df.Longitude) & (sites_df.Longitude < max_long)
        & (min_lat < sites_df.Latitude) & (sites_df.Latitude < max_lat)
    )
    area_sites_df = sites_df.loc[inside_box].reset_index(drop=True)

    # Next filter onto sites with good quality data
    good_quality_ids = []
    for site_id in tqdm(list(area_sites_df.Id)):
        url_text = f"https://webtris.highwaysengland.co.uk/api/v1/quality/overall?sites={site_id}&start_date={start_date}&end_date={end_date}"
        with urllib.request.urlopen(url_text) as url:
            response = json.loads(url.read().decode())

        # A response without a usable quality figure cannot vouch for the site,
        # so treat it as failing instead of raising TypeError on the comparison.
        data_quality = response.get('data_quality')
        if data_quality is not None and data_quality >= quality_threshold:
            # Record the id we asked about rather than the 'sites' value echoed
            # back in the response: the echo may differ in type from the Id
            # column (e.g. str vs int), in which case isin() below would
            # silently match nothing.
            good_quality_ids.append(site_id)

    quality_area_sites_df = area_sites_df.loc[area_sites_df.Id.isin(good_quality_ids)]
    return quality_area_sites_df.reset_index(drop=True)

In [5]:
# Specify dates (ddmmyyyy strings)
start_date='19032019'
end_date = '08042022'

# Area: birmingham -- bounding box of the area we are looking at
max_lat = 52.50
max_long = -1.67
min_lat = 52.42
min_long = -1.75
birmingham_sites_df = get_quality_area(sites,
                                       max_lat,
                                       max_long,
                                       min_lat,
                                       min_long,
                                       start_date,
                                       end_date)

# Area: manc -- bounding box of the area we are looking at
max_lat = 53.51
max_long = -2.31
min_lat = 53.44
min_long = -2.39
manc_sites_df = get_quality_area(sites,
                                 max_lat,
                                 max_long,
                                 min_lat,
                                 min_long,
                                 start_date,
                                 end_date)

# Area: cam -- bounding box of the area we are looking at
max_lat = 52.25
max_long = 0.11
min_lat = 52.19
min_long = 0.02
# Get the quality reports
cam_sites_df = get_quality_area(sites,
                                max_lat,
                                max_long,
                                min_lat,
                                min_long,
                                start_date,
                                end_date)

# Area: thorpe -- bounding box of the area we are looking at
max_lat = 51.43
max_long = -0.50
min_lat = 51.38
min_long = -0.57
# Lower quality bar for this area. It has to be passed explicitly:
# get_quality_area does not read this global and would otherwise fall back
# to its own default of 90.
quality_threshold = 40
# Get the quality reports
thorpe_sites_df = get_quality_area(sites,
                                   max_lat,
                                   max_long,
                                   min_lat,
                                   min_long,
                                   start_date,
                                   end_date,
                                   quality_threshold=quality_threshold)

# Area: epping -- bounding box of the area we are looking at
max_lat = 51.72
max_long = 0.15
min_lat = 51.62
min_long = 0.09
# Same lower quality bar as thorpe; again passed explicitly.
quality_threshold = 40
# Get the quality reports
epping_sites_df = get_quality_area(sites,
                                   max_lat,
                                   max_long,
                                   min_lat,
                                   min_long,
                                   start_date,
                                   end_date,
                                   quality_threshold=quality_threshold)

# Area: bristol -- bounding box of the area we are looking at
max_lat = 51.60
max_long = -2.52
min_lat = 51.52
min_long = -2.59
# Get the quality reports. No threshold is set for this area, so the leftover
# quality_threshold global is not passed and the function default applies.
bristol_sites_df = get_quality_area(sites,
                                    max_lat,
                                    max_long,
                                    min_lat,
                                    min_long,
                                    start_date,
                                    end_date)

100%|██████████| 331/331 [00:23<00:00, 14.25it/s]
100%|██████████| 231/231 [00:48<00:00,  4.78it/s]
100%|██████████| 71/71 [00:05<00:00, 13.97it/s]
100%|██████████| 69/69 [00:04<00:00, 14.11it/s]
100%|██████████| 102/102 [00:07<00:00, 13.91it/s]
100%|██████████| 150/150 [00:10<00:00, 14.12it/s]


In [8]:
# Persist every area's filtered site table as a Feather file.
# Each entry pairs a table with the path it is written to; order is preserved.
site_table_outputs = [
    (birmingham_sites_df, 'high_quality_site_reports/birmingham_sites_df'),
    (manc_sites_df, 'high_quality_site_reports/manc_sites_df'),
    (cam_sites_df, 'high_quality_site_reports/cam_sites_df'),
    (thorpe_sites_df, 'high_quality_site_reports/thorpe_sites_df'),
    (epping_sites_df, 'high_quality_site_reports/epping_sites_df'),
    (bristol_sites_df, 'high_quality_site_reports/bristol_sites_df'),
]
for site_table, feather_path in site_table_outputs:
    site_table.to_feather(feather_path)

In [13]:
def daily_report_query_url(site_id, page_num, start_date = '15062021', end_date = '15062022'):
    """Build the URL for one page of the daily report of a single site.

    ``site_id``, ``start_date``, ``end_date`` and ``page_num`` are inserted
    into the query string as-is; the page size is fixed at 10000 rows.
    """
    endpoint = "https://webtris.highwaysengland.co.uk/api/v1/reports/Daily"
    query_parts = [
        f"sites={site_id}",
        f"start_date={start_date}",
        f"end_date={end_date}",
        f"page={page_num}",
        "page_size=10000",
    ]
    return endpoint + "?" + "&".join(query_parts)


def _fetch_report_page(site_id, page_num, start_date, end_date):
    """Download and JSON-decode a single page of the daily report for one site."""
    report_url = daily_report_query_url(site_id, page_num, start_date, end_date)
    with urllib.request.urlopen(report_url) as url:
        return json.loads(url.read().decode())


def get_site_report(site_id, start_date='15062021', end_date='15062022'):
    """Download the complete daily report for one site, following pagination.

    Parameters
    ----------
    site_id
        Identifier of the site, inserted into the query URL.
    start_date, end_date : str
        Period of the report, inserted into the query URL (this notebook uses
        ``ddmmyyyy`` strings).

    Returns
    -------
    (pandas.DataFrame, dict)
        The rows of all pages in page order, and the ``'Header'`` object of
        the first page.
    """
    # Must match the page_size requested by daily_report_query_url.
    page_size = 10000

    # Page 1 also carries the header that tells us how many rows exist in total.
    first_page = _fetch_report_page(site_id, 1, start_date, end_date)
    header = first_page['Header']
    total_pages = math.ceil(header['row_count'] / page_size)

    # Collect one frame per page and concatenate once at the end; concatenating
    # inside the loop would re-copy all accumulated rows on every page.
    page_frames = [pd.DataFrame(data = first_page['Rows'])]
    for page_num in range(2, total_pages + 1):
        next_page = _fetch_report_page(site_id, page_num, start_date, end_date)
        page_frames.append(pd.DataFrame(data = next_page['Rows']))

    if len(page_frames) == 1:
        report_df = page_frames[0]
    else:
        report_df = pd.concat(page_frames, ignore_index=True)

    return report_df, header

def get_reports_from_sites_df(sites_df, start_date, end_date):
    """Download the daily reports of every site in ``sites_df`` and stack them.

    Parameters
    ----------
    sites_df : pandas.DataFrame
        Site table; only its ``Id`` column is read.
    start_date, end_date : str
        Report period, forwarded unchanged to ``get_site_report``.

    Returns
    -------
    pandas.DataFrame
        All site reports concatenated with a fresh index, each row tagged with
        its ``site_id``. An empty DataFrame when ``sites_df`` has no rows.
    """
    # Keyed by site id, so a repeated id keeps only its latest download.
    reports_by_site = {}
    for site_id in tqdm(sites_df.Id):
        report, _header = get_site_report(site_id, start_date, end_date)
        # Tag the rows so they stay attributable after concatenation.
        report['site_id'] = site_id
        reports_by_site[site_id] = report

    # pd.concat raises ValueError on an empty list; an area without any
    # qualifying site should simply yield an empty report.
    if not reports_by_site:
        return pd.DataFrame()

    # Combine reports into one df
    return pd.concat(list(reports_by_site.values()), ignore_index=True)

In [14]:
# Report period (ddmmyyyy). Per the original note these are the minimum and
# maximum dates of the data plus a month of margin -- TODO confirm which data
# set that refers to.
start_date='19032019'
end_date = '08042022'


def _download_and_store_reports(area_sites_df, feather_path):
    """Fetch the reports for one area's sites, write them to ``feather_path`` and return them."""
    area_report_df = get_reports_from_sites_df(area_sites_df, start_date, end_date)
    # Written straight after the download, so finished areas are already on
    # disk if a later area fails.
    area_report_df.to_feather(feather_path)
    return area_report_df


birmingham_report_df = _download_and_store_reports(
    birmingham_sites_df, 'high_quality_traffic_reports/birmingham_report_df')

manc_report_df = _download_and_store_reports(
    manc_sites_df, 'high_quality_traffic_reports/manc_report_df')

cam_report_df = _download_and_store_reports(
    cam_sites_df, 'high_quality_traffic_reports/cam_report_df')

thorpe_report_df = _download_and_store_reports(
    thorpe_sites_df, 'high_quality_traffic_reports/thorpe_report_df')

epping_report_df = _download_and_store_reports(
    epping_sites_df, 'high_quality_traffic_reports/epping_report_df')

bristol_report_df = _download_and_store_reports(
    bristol_sites_df, 'high_quality_traffic_reports/bristol_report_df')

100%|██████████| 26/26 [10:31<00:00, 24.30s/it]
100%|██████████| 40/40 [16:07<00:00, 24.18s/it]
100%|██████████| 7/7 [02:47<00:00, 23.95s/it]
100%|██████████| 36/36 [14:29<00:00, 24.16s/it]
100%|██████████| 15/15 [06:04<00:00, 24.31s/it]
100%|██████████| 5/5 [01:59<00:00, 24.00s/it]
