In [15]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### Getting Length of Commute to Cannon Street for Each London, Surrey, Sussex and Kent Station

In [2]:
class Destination:
    def __init__(self, name, code, arrive_by):
        self.name = name
        self.code = code
        self.arrive_by = arrive_by
        self.date = "060120"

In [3]:
destinations = [
    Destination("Blackfriars", "BFR", "0845"),
    Destination("Cannon Street", "CST", "0845"),
    Destination("London Bridge", "LBG", "0830"),
]

In [4]:
def format_url(departure_station, destination):
    base_url = "http://ojp.nationalrail.co.uk/service/timesandfares"
    return (
        f"{base_url}/{departure_station}/{destination.code}/"
        f"{destination.date}/{destination.arrive_by}/arr"
    )

In [5]:
def has_mtx(row):
    """return true if bs4 table row element has class mtx"""
    attributes = row.attrs.get("class")
    if attributes is None:
        return False
    return "mtx" in attributes

In [6]:
def parse_breakdown(breakdown):
    match = re.match("{.*}", re.sub(r"[\n\t]*", "", breakdown))
    breakdown_json = json.loads(match.group())["jsonJourneyBreakdown"]
    return {
        "from": breakdown_json["departureStationCRS"],
        "to": breakdown_json["arrivalStationCRS"],
        "changes": breakdown_json["changes"],
        "departure": breakdown_json["departureTime"],
        "arrival": breakdown_json["arrivalTime"],
        "duration": 60 * breakdown_json["durationHours"]
        + breakdown_json["durationMinutes"],
    }

In [13]:
def query_national_rail(departure_station, destination):
    results = []
    url = format_url(departure_station, destination)
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    table = soup.find("table", {"id": "oft"})
    rows = table.find_all("tr")
    row = rows[0]
    journeys = list(filter(lambda row: has_mtx(row), rows))

    for journey in journeys:
        columns = journey.find_all("td")
        breakdown = columns[-1].text
        results.append(parse_breakdown(breakdown))

    return pd.DataFrame(results)

In [22]:
stations = pd.read_csv("stations.csv", index_col=0)

In [27]:
sample_stations = stations["code"].tolist()[:5]

In [32]:
data = []
for departure_station in stations["code"]:
    for destination in destinations:
        try:
            trains = query_national_rail(departure_station, destination)
            data.append(trains)
            print(f"success {departure_station}-{destination.code}")

        except:
            print(f"failure {departure_station}-{destination.code}")

success ABW-BFR
success ABW-CST
success ABW-LBG
success ACC-BFR
success ACC-CST
success ACC-LBG
success AML-BFR
success AML-CST
success AML-LBG
success ASN-BFR
success ASN-CST
success ASN-LBG
success ADM-BFR
success ADM-CST
success ADM-LBG
success AYP-BFR
success AYP-CST
success AYP-LBG
success AAP-BFR
success AAP-CST
success AAP-LBG
success AMY-BFR
success AMY-CST
success AMY-LBG
success AMR-BFR
success AMR-CST
success AMR-LBG
success ANZ-BFR
success ANZ-CST
success ANZ-LBG
success ANG-BFR
success ANG-CST
success ANG-LBG
success APD-BFR
success APD-CST
success APD-LBG
success ARU-BFR
success ARU-CST
success ARU-LBG
success AHV-BFR
success AHV-CST
success AHV-LBG
success ASH-BFR
success ASH-CST
success ASH-LBG
success AFK-BFR
success AFK-CST
success AFK-LBG
success AFS-BFR
success AFS-CST
success AFS-LBG
success AHD-BFR
success AHD-CST
success AHD-LBG
success AYL-BFR
success AYL-CST
success AYL-LBG
success AYH-BFR
success AYH-CST
success AYH-LBG
success BAG-BFR
success BAG-CST
success 

success EGR-BFR
success EGR-CST
success EGR-LBG
success EML-BFR
success EML-CST
success EML-LBG
success EWR-BFR
success EWR-CST
success EWR-LBG
success EBN-BFR
success EBN-CST
success EBN-LBG
success EBD-BFR
success EBD-CST
success EBD-LBG
success EDN-BFR
success EDN-CST
success EDN-LBG
success EBT-BFR
success EBT-CST
success EBT-LBG
success EBR-BFR
success EBR-CST
success EBR-LBG
success EDR-BFR
success EDR-CST
success EDR-LBG
success EFF-BFR
success EFF-CST
success EFF-LBG
success EGH-BFR
success EGH-CST
success EGH-LBG
success EPH-BFR
success EPH-CST
success EPH-LBG
success ELE-BFR
success ELE-CST
success ELE-LBG
success ESD-BFR
success ESD-CST
success ESD-LBG
success ELS-BFR
success ELS-CST
success ELS-LBG
success ELW-BFR
success ELW-CST
success ELW-LBG
success EMP-BFR
success EMP-CST
success EMP-LBG
success ENC-BFR
success ENC-CST
success ENC-LBG
success ENL-BFR
success ENL-CST
success ENL-LBG
success ENF-BFR
success ENF-CST
success ENF-LBG
success EPD-BFR
success EPD-CST
success 

success LBG-BFR
success LBG-CST
failure LBG-LBG
success LOF-BFR
success LOF-CST
success LOF-LBG
success LRD-BFR
success LRD-CST
success LRD-LBG
success LNG-BFR
success LNG-CST
success LNG-LBG
success LGF-BFR
success LGF-CST
success LGF-LBG
success LGJ-BFR
success LGJ-CST
success LGJ-LBG
success LSY-BFR
success LSY-CST
success LSY-LBG
success MDB-BFR
success MDB-CST
success MDB-LBG
success MDE-BFR
success MDE-CST
success MDE-LBG
success MDW-BFR
success MDW-CST
success MDW-LBG
success MAL-BFR
success MAL-CST
success MAL-LBG
success MNP-BFR
success MNP-CST
success MNP-LBG
success MRN-BFR
success MRN-CST
success MRN-LBG
success MAR-BFR
success MAR-CST
success MAR-LBG
success MTM-BFR
success MTM-CST
success MTM-LBG
success MYL-BFR
success MYL-CST
success MYL-LBG
success MYB-BFR
success MYB-CST
success MYB-LBG
success MZH-BFR
success MZH-CST
success MZH-LBG
success MEP-BFR
success MEP-CST
success MEP-LBG
success MRW-BFR
success MRW-CST
success MRW-LBG
success MHM-BFR
success MHM-CST
success 

success SUC-BFR
success SUC-CST
success SUC-LBG
success SAY-BFR
success SAY-CST
success SAY-LBG
success SAY-BFR
success SAY-CST
success SAY-LBG
success SWM-BFR
success SWM-CST
success SWM-LBG
success SYD-BFR
success SYD-CST
success SYD-LBG
success SYH-BFR
success SYH-CST
success SYH-LBG
success SYL-BFR
success SYL-CST
success SYL-LBG
success TAD-BFR
success TAD-CST
success TAD-LBG
success TAD-BFR
success TAD-CST
success TAD-LBG
success TAT-BFR
success TAT-CST
success TAT-LBG
success TAT-BFR
success TAT-CST
success TAT-LBG
success TED-BFR
success TED-CST
success TED-LBG
success THD-BFR
success THD-CST
success THD-LBG
success THD-BFR
success THD-CST
success THD-LBG
success TEO-BFR
success TEO-CST
success TEO-LBG
success TTH-BFR
success TTH-CST
success TTH-LBG
success TBD-BFR
success TBD-CST
success TBD-LBG
success TOK-BFR
success TOK-CST
success TOK-LBG
success TOL-BFR
success TOL-CST
success TOL-LBG
success TON-BFR
success TON-CST
success TON-LBG
success TOO-BFR
success TOO-CST
success 

In [33]:
df = pd.concat(data).sort_values("duration")

In [35]:
df.to_csv("commute_length.csv")