In [1]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Getting a Full List of Stations in London, Surrey, Sussex and Kent from Wikipedia

### London

In [2]:
# Collects one DataFrame per scraped source; later cells append to this list.
frames = list()

In [3]:
# Download the Wikipedia list of London stations and collect the cell text of
# the first table carrying the "wikitable sortable" class.
url = "https://en.wikipedia.org/wiki/List_of_London_railway_stations"
r = requests.get(url, timeout=30)  # bounded wait instead of hanging forever
r.raise_for_status()  # stop here rather than parse an HTTP error page

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find("table", {"class": "wikitable sortable"})
if table is None:
    raise ValueError(f"No 'wikitable sortable' table found at {url}")

# One list of stripped <td> texts per <tr>. A row without any <td> cells
# (e.g. a header row built from <th>) produces an empty list.
rows = [
    [cell.text.strip() for cell in row.find_all("td")]
    for row in table.find_all("tr")
]

In [4]:
# Build the London frame from every scraped row except the first, then give
# the three positional columns used later readable names.
london_column_names = {0: "station", 3: "code", 7: "location"}
df = pd.DataFrame(rows[1:])
df = df.rename(columns=london_column_names)

In [5]:
# Keep only the three named columns, in this order.
df = df.loc[:, ["station", "code", "location"]]

In [6]:
# Pull two decimal numbers out of the raw location text:
#   group 0 - "51." followed by four digits,
#   group 1 - "0." followed by four digits (the greedy ".*" in between means
#             the LAST such occurrence in the text is the one captured).
# The pattern is a raw string so "\d" reaches the regex engine untouched
# (non-raw "\d" is an invalid string escape), and the dots are escaped so they
# only match a literal decimal point rather than any character.
# NOTE(review): no sign or E/W marker is captured here, so a westerly
# longitude comes out as a positive number - confirm how that is handled later.
location = df["location"].str.extract(r"(51\.\d{4}).*(0\.\d{4})")

In [7]:
# Name the two extracted groups: positional column 0 -> "lat", 1 -> "long".
coordinate_names = dict(enumerate(["lat", "long"]))
location = location.rename(columns=coordinate_names)

In [8]:
# Swap the raw "location" text column for the extracted lat/long columns by
# placing the two frames side by side.
df_without_raw_location = df.drop(columns="location")
df = pd.concat([df_without_raw_location, location], axis=1, sort=False)

In [9]:
# Remove bracketed footnote markers such as "[1]" from the station names.
#  * regex=True is passed explicitly: on pandas >= 2.0 the default is a literal
#    replacement, which would leave the markers in place.
#  * The pattern is a raw string so "\[" is not treated as a string escape.
#  * ".*?" is non-greedy so each "[...]" is removed on its own instead of
#    deleting everything between the first "[" and the last "]".
df["station"] = df["station"].str.replace(r"\[.*?\]", "", regex=True)

# Register the London table as the first collected frame.
# NOTE: re-running this cell appends the frame again.
frames.append(df)

### Surrey, Sussex and Kent

In [95]:
# Wikipedia category pages to crawl, one per county; each URL is the common
# category prefix followed by the county name.
wiki_urls = [
    "https://en.wikipedia.org/wiki/Category:Railway_stations_in_" + county
    for county in ("East_Sussex", "Surrey", "West_Sussex", "Kent")
]

In [96]:
# For every county category page: collect the pages it links to, then visit
# each linked page to read its station code and coordinates. One DataFrame
# (columns: station, code, location) is appended to `frames` per category.
# NOTE: this cell is slow (one request plus a one-second pause per station)
# and re-running it appends the same frames again.
for wiki_url in wiki_urls:
    r = requests.get(wiki_url, timeout=30)
    r.raise_for_status()  # an error page would otherwise yield an empty frame
    # Explicit parser: avoids the "no parser specified" warning and keeps the
    # parse independent of which optional parsers are installed.
    soup = BeautifulSoup(r.text, "html.parser")
    by_letter = soup.find_all("div", {"class": "mw-category-group"})

    # get station names and links to individual wiki pages
    result = []
    for letter in by_letter:
        for station in letter.find_all("li"):
            anchor = station.find("a", href=True)
            if anchor is None:
                # List item without a link: nothing to visit, skip it.
                continue
            result.append((station.text, anchor["href"]))

    df = pd.DataFrame(result, columns=["station", "link"])
    # The scraped hrefs are site-relative; make them absolute.
    df["link"] = "https://en.wikipedia.org" + df["link"]

    # get station codes and coordinates from individual pages
    station_codes = []
    locations = []
    for link in df["link"]:
        # Best effort per page: a failed page simply yields None values below.
        r = requests.get(link, timeout=30)
        soup = BeautifulSoup(r.text, "html.parser")

        # The code is read from the first <td> of the table row that contains
        # the exact text "Station code".
        label = soup.find(string="Station code")
        code_row = label.find_parent("tr") if label is not None else None
        code_cell = code_row.find("td") if code_row is not None else None
        if code_cell is not None:
            station_code = code_cell.text.strip()
            print(station_code)
        else:
            station_code = None
            print(f"Could not find code for {link}")
        station_codes.append(station_code)

        # attrs must be a dict; the original passed the set
        # {"class", "geo-default"}, which only worked by accident.
        element = soup.find("span", {"class": "geo-default"})
        if element is not None:
            locations.append(element.text)
        else:
            print(f"Could not find location for {link}")
            locations.append(None)

        # Pause between page requests (once per page, as before).
        time.sleep(1)

    df["code"] = station_codes
    df["location"] = locations
    del df["link"]
    frames.append(df)

BAT
BRK
BEX
BIP
BXD
CLL
COB
CBR
COH
CWU
DLH
EBN
ERI
ETC
FRT
GLY
HMD
HGS
LWS
NVH
NVN
NSB
ORE
PEV
PEB
PMP
PLG
RBR
RYE
SLQ
SEF
SEE
SOG
TOK
UCK
WAD
WLD
WSE
ASN
ASH
AHV
AFS
AHD
BAG
BAD
BTO
BKA
BXW
BKO
BFN
CAM
CAT
CHY
CHL
CHP
CLA
CLG
CSD
DPD
DKG
DKT
DMS
ELD
EFF
EGH
EPD
EPS
ESH
EWE
EWW
FNC
FNH
FML
GOD
GDN
GOM
GLD
HMC
HSL
HER
HYW
HLM
HOR
HSY
HUR
KMP
KND
LHD
LFD
LRD
LNG
MHM
MLF
NCM
NUF
OLY
OXS
OXT
RDH
REI
SAF
SFR
SHP
SNS
SNL
SUU
TAD
TAT
THD
UPH
UWL
VIR
WAL
WAN
WBY
WYB
WHY
WHS
WTY
WOK
WOH
WPL
AMY
ANG
ARU
BAB
BAA
BIG
BOG
BOH
BUG
CCH
CHH
CRW
DUR
EGR
EWR
FGT
FSB
FSG
FOD
GTW
GBS
HSK
HHE
HRH
IFI
LAC
LIT
LVN
NUT
PUL
SSE
SOB
SWK
TBD
WNH
WWO
WVF
WRH
Could not find code for https://en.wikipedia.org/wiki/List_of_railway_stations_in_Dover
Could not find location for https://en.wikipedia.org/wiki/List_of_railway_stations_in_Dover
Could not find code for https://en.wikipedia.org/wiki/List_of_railway_stations_in_Kent
Could not find location for https://en.wikipedia.org/wiki/List_of_railway_stations_in_Kent
AD

In [104]:
# Combine the county frames into one table.
# On a clean top-to-bottom run frames[0] is the London table (which has
# lat/long columns instead of a raw "location" column) and the per-county
# frames follow from index 1. Slicing from index 2 would silently leave out
# the first county in `wiki_urls` (East Sussex), so slice from 1.
df = pd.concat(frames[1:], ignore_index=True)

In [106]:
# Save the combined frame, raw location strings included, to a pickle file.
raw_location_pickle = "stations_with_raw_location"
df.to_pickle(raw_location_pickle)

In [113]:
# Alternative parse for locations written as degrees/minutes/seconds, e.g.
# 51°27′2.29″N 0°16′44.33″E. Seven groups: three latitude parts (the text
# must contain "N"), three longitude parts, and the trailing E/W letter.
dms_pattern = "(.*)°(.*)′(.*)″N (.*)°(.*)′(.*)″([EW])"
alt = df["location"].str.extract(dms_pattern)

In [114]:
# Label the seven extracted groups: deg/min/sec for latitude, the same three
# for longitude, then the hemisphere letter.
alt.columns = [
    f"{axis}_{unit}" for axis in ("lat", "long") for unit in ("deg", "min", "sec")
] + ["alt_EorW"]

In [115]:
# Convert the six degree/minute/second columns from extracted text to floats
# so they can be combined arithmetically.
for dms_column in (
    "lat_deg",
    "lat_min",
    "lat_sec",
    "long_deg",
    "long_min",
    "long_sec",
):
    alt[dms_column] = alt[dms_column].astype(float)

In [116]:
# Degrees/minutes/seconds -> decimal degrees: deg + min/60 + sec/3600,
# computed the same way for latitude and longitude.
# NOTE(review): the E/W letter in "alt_EorW" is not applied here, so
# "alt_long" is always non-negative - confirm where the sign is handled.
for axis in ("lat", "long"):
    alt[f"alt_{axis}"] = (
        alt[f"{axis}_deg"] + alt[f"{axis}_min"] / 60 + alt[f"{axis}_sec"] / 3600
    )

In [146]:
# Decimal-degree parse of the raw location text. Groups: a latitude starting
# with "5" (e.g. 50.769), a longitude starting with "0." or "1.", and the
# E/W letter. Written as a raw string so "\d" and "\." reach the regex engine
# instead of being (invalid) string escapes.
decimal_coords = df["location"].str.extract(r"(5\d\.\d+).*([01]\.\d+).*([EW])")

# Place the station data, the decimal parse and the DMS-derived alternative
# side by side. Missing values become 0, so for each row whichever parse did
# not match contributes 0 (a later cell adds "lat" and "alt_lat" together).
output = (
    pd.concat(
        [
            df,
            decimal_coords,
            alt[["alt_lat", "alt_long", "alt_EorW"]],
        ],
        axis=1,
    )
    .rename(columns={0: "lat", 1: "long", 2: "EorW"})
    .fillna(0)
)

### TODO
 - think of a better way to choose the alternative when the standard regex doesn't match
 - make the longitude negative if it is West
 - use the same system on the London stations as we have done above
 - re-run and output the result to CSV

In [149]:
# Exactly one of "lat" / "alt_lat" is expected to be non-zero per row (the
# other was filled with 0), so their sum selects whichever parse matched.
decimal_lat = output["lat"].astype(float)
dms_lat = output["alt_lat"].astype(float)
latitudes = decimal_lat.add(dms_lat)

In [154]:
# Store the merged latitude as a new column on the output frame.
output.loc[:, "latitude"] = latitudes

In [156]:
# Display the table ordered from the lowest to the highest latitude; rows
# where neither parse matched have latitude 0 and therefore appear first.
output.sort_values(by="latitude", ascending=True)

Unnamed: 0,station,code,location,lat,long,EorW,alt_lat,alt_long,alt_EorW,latitude
0,List of railway stations in Dover,0,0,0,0,0,0.000000,0.000000,0,0.000000
1,List of railway stations in Kent,0,0,0,0,0,0.000000,0.000000,0,0.000000
11,Eastbourne railway station,EBN,50.769°N 0.281°E﻿ / 50.769; 0.281,50.769,0.281,E,0.000000,0.000000,0,50.769000
30,Seaford railway station (England),SEF,50.773°N 0.100°E﻿ / 50.773; 0.100,50.773,0.100,E,0.000000,0.000000,0,50.773000
3,Bishopstone railway station,BIP,50.780°N 0.083°E﻿ / 50.780; 0.083,50.780,0.083,E,0.000000,0.000000,0,50.780000
...,...,...,...,...,...,...,...,...,...,...
23,Dartford railway station,DFD,51.4475°N 0.2193°E﻿ / 51.4475; 0.2193,51.4475,0.2193,E,0.000000,0.000000,0,51.447500
82,Swanscombe railway station,SWM,51.449120°N 0.309867°E﻿ / 51.449120; 0.309867,51.449120,0.309867,E,0.000000,0.000000,0,51.449120
39,Greenhithe railway station,GNH,51°27′2.29″N 0°16′44.33″E,0,0,0,51.450636,0.278981,E,51.450636
79,Stone Crossing railway station,SCG,51°27′5.03″N 0°15′50.13″E,0,0,0,51.451397,0.263925,E,51.451397


### Export to CSV

In [None]:
# Stack every collected frame (London plus the counties) and write one CSV.
stations = pd.concat(frames, ignore_index=True)

# The London frame carries lat/long but no "location" column, while the county
# frames carry "location" but no lat/long. A bare dropna() therefore removes
# every row and writes an empty file. Only require the fields all frames
# share, which still discards entries with no station code.
# NOTE(review): coordinates are still in two different formats across the
# frames; unify them before relying on the exported columns.
stations = (
    stations.dropna(subset=["station", "code"])
    .sort_values("station")
    .reset_index(drop=True)  # same result as reset_index() + dropping "index"
)
stations.to_csv("stations.csv")