In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [2]:
!pip install unidecode
from unidecode import unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


The `TrainCompany` interface, used as a template for the company subclasses.

In [4]:
class TrainCompany:
  """Template for per-company scrapers.

  Every method is a static method: callers use `TrainCompany.create_soup(url)`
  (or the subclass equivalent) without creating an instance. Subclasses are
  expected to override `get_train_data` and `get_station_data`.
  """

  @staticmethod
  def create_soup(URL: str, verify = False, type_ = 'xml', timeout = 30):
    """Download `URL` and parse the response body with BeautifulSoup.

    Args:
      URL: Address to fetch.
      verify: Passed straight to `requests.get`. NOTE(review): the default
        disables TLS certificate verification; pass `True` when the host has
        a valid certificate.
      type_: BeautifulSoup parser name ('xml' by default, e.g. 'html.parser'
        for HTML pages).
      timeout: Seconds to wait for the server before `requests` gives up, so
        a stalled connection cannot hang the notebook forever.

    Returns:
      The parsed `BeautifulSoup` document. The HTTP status is not checked, so
      an error page is parsed like any other response.
    """
    res = requests.get(URL, verify = verify, timeout = timeout)
    return BeautifulSoup(res.text, type_)

  @staticmethod
  def get_train_data(URL: str) -> dict[float, int]:
    """Return train data for the chart at `URL`; must be overridden."""
    raise NotImplementedError('Subclasses must implement get_train_data')

  @staticmethod
  def get_station_data(URL: str) -> dict[float, list[int]]:
    """Return station data for the chart at `URL`; must be overridden."""
    raise NotImplementedError('Subclasses must implement get_station_data')

### Tokyo Metro

In [5]:
class TokyoMetro(TrainCompany):
  """Scrapers for Tokyo Metro's SVG charts (train bars and fare-gate bars).

  Both methods key their result by a counter that starts at 5 and grows by
  0.5 per bar - presumably the hour of day in half-hour slots (TODO confirm).
  """

  # Bar colours of the train chart; a bar's value is its colour's position
  # in this tuple (0-3).
  CONGESTION_COLORS = ('rgb(148, 191, 230)', 'rgb(174, 214, 160)',
                       'rgb(248, 183, 60)', 'rgb(255, 153, 153)')
  # Bars with this fill are ignored by get_train_data.
  IGNORED_FILL = 'rgb(242, 242, 242)'
  # Only <rect> elements of exactly this height are treated as train bars.
  TRAIN_BAR_HEIGHT = '24'
  # <g> groups with one of these fills hold the fare-gate bars.
  GATE_GROUP_FILLS = ('rgb(255, 153, 153)', 'rgb(148, 191, 230)')
  # Bar width is scaled as width / 105 * 2000 - presumably 105 units of width
  # correspond to 2000 people (TODO confirm against the chart's axis).
  GATE_FULL_WIDTH = 105
  GATE_FULL_PEOPLE = 2000
  FIRST_SLOT = 5
  SLOT_STEP = 0.5

  @staticmethod
  def get_train_data(URL):
    """Read the train chart at `URL`.

    Returns:
      dict mapping slot (5, 5.5, 6.0, ...) to the index of the bar's fill in
      `CONGESTION_COLORS`. Empty when the document has no matching bars.

    Raises:
      ValueError: a bar has a fill that is not in `CONGESTION_COLORS`.
    """
    soup = TrainCompany.create_soup(URL)

    timings, slot = {}, TokyoMetro.FIRST_SLOT

    for rect in soup.find_all('rect'):
      # .get() instead of [] so a <rect> without height/fill is skipped
      # rather than raising KeyError.
      if rect.get('height') != TokyoMetro.TRAIN_BAR_HEIGHT:
        continue

      fill = rect.get('fill')
      if fill is None or fill == TokyoMetro.IGNORED_FILL:
        continue

      timings[slot] = TokyoMetro.CONGESTION_COLORS.index(fill)
      slot += TokyoMetro.SLOT_STEP

    return timings

  @staticmethod
  def get_station_data(URL):
    """Read the fare-gate chart at `URL`.

    Returns:
      dict mapping slot (5, 5.5, 6.0, ...) to a list with one head-count per
      matching `<g>` group, in document order. Each count is the bar width
      scaled by `GATE_FULL_PEOPLE / GATE_FULL_WIDTH` and rounded.
    """
    soup = TrainCompany.create_soup(URL)

    timings = {}

    for g in soup.find_all('g'):
      if g.get('fill') not in TokyoMetro.GATE_GROUP_FILLS:
        continue

      # Every matching group restarts at the first slot, so the n-th bar of
      # each group lands in the same list.
      slot = TokyoMetro.FIRST_SLOT

      for rect in g.find_all('rect'):
        people = round((float(rect['width']) / TokyoMetro.GATE_FULL_WIDTH) * TokyoMetro.GATE_FULL_PEOPLE)
        timings.setdefault(slot, []).append(people)
        slot += TokyoMetro.SLOT_STEP

    return timings

In [None]:
# Smoke test against two live Tokyo Metro charts (needs network access):
# one train chart and one fare-gate chart.
print(TokyoMetro.get_train_data('https://tmap-sid.tokyometro.jp/o/img/v1/train_T16_Nishi-kasai_B_W.svg'))
print(TokyoMetro.get_station_data('https://tmap-sid.tokyometro.jp/o/img/v1/gate_Shibuya_W.svg'))



{5: 2, 5.5: 1, 6.0: 1, 6.5: 1, 7.0: 2, 7.5: 3, 8.0: 3, 8.5: 2, 9.0: 1, 9.5: 1, 10.0: 1, 10.5: 1, 11.0: 0, 11.5: 1, 12.0: 1, 12.5: 1, 13.0: 0, 13.5: 0, 14.0: 0, 14.5: 0, 15.0: 0, 15.5: 0, 16.0: 0, 16.5: 0, 17.0: 0, 17.5: 0, 18.0: 0, 18.5: 0, 19.0: 0, 19.5: 0, 20.0: 0, 20.5: 0, 21.0: 0, 21.5: 0, 22.0: 0, 22.5: 0, 23.0: 0, 23.5: 0, 24.0: 0}




{5: [209, 1], 5.5: [255, 143], 6.0: [523, 217], 6.5: [1159, 319], 7.0: [2061, 494], 7.5: [3227, 714], 8.0: [4369, 1171], 8.5: [3917, 1544], 9.0: [4127, 1427], 9.5: [3833, 1757], 10.0: [2351, 1297], 10.5: [1950, 1415], 11.0: [1614, 1254], 11.5: [1644, 1526], 12.0: [1806, 1609], 12.5: [1927, 2078], 13.0: [1670, 1677], 13.5: [1537, 1796], 14.0: [1596, 1641], 14.5: [1577, 1927], 15.0: [1623, 2071], 15.5: [1623, 2282], 16.0: [1703, 2392], 16.5: [1729, 2774], 17.0: [1893, 3055], 17.5: [1920, 3713], 18.0: [1980, 3927], 18.5: [1907, 3994], 19.0: [1821, 3989], 19.5: [1577, 3983], 20.0: [1373, 3578], 20.5: [1250, 2562], 21.0: [1379, 1912], 21.5: [1210, 1634], 22.0: [1155, 1663], 22.5: [907, 1181], 23.0: [744, 1142], 23.5: [585, 905], 24.0: [21, 578], 24.5: [0, 91]}


### Trains: https://tmap-sid.tokyometro.jp/o/img/v1/train_{code}_{sname}_{dir}_{type}.svg
### Fare gates: https://tmap-sid.tokyometro.jp/o/img/v1/gate_{sname}_{type}.svg
- `code`: The sign & number of a train station (e.g. G02)
- `sname`: The name of the station in Romaji, with hyphens for spaces (e.g. Omote-sando)
- `dir`: [`A`, `B`] The direction of the train, `A` = towards end (T23), `B` = towards start (T01)
- `type`: [`W`, `H`] `W` = weekday, `H` = holiday

### Wikipedia

In [6]:
# The Wikipedia page is HTML, so override create_soup's default 'xml' parser.
soup = TrainCompany.create_soup('https://en.wikipedia.org/wiki/Tokyo_Metro', type_ = 'html.parser')



In [7]:
class Station:
  """One station, identified by its code (e.g. 'Mb05') and its name."""

  def __init__(self, station_code, name):
    # Trailing underscores mark the public attributes read by other cells.
    self.station_code_, self.name_ = station_code, name

  def __repr__(self):
    """Show the bare station name, so lists of stations print readably."""
    return self.name_

In [8]:
class Train:
  """One train line: name, symbol, terminals and (once fetched) its stations."""

  def __init__(self, name, symbol, start, end):
    self.name_ = name
    self.symbol_ = symbol
    self.start_ = start
    self.end_ = end
    # Filled in by get_stations().
    self.stations_ = []

  def __repr__(self):
    """Show the bare line name, so lists of lines print readably."""
    return self.name_

  def get_stations(self):
    """Fetch this line's Wikipedia page and rebuild `stations_` from it.

    Reads the first `wikitable` on the page; every row with at least six
    `<td>` cells becomes a `Station(code, name)` built from the first two.
    The list is replaced rather than extended, so calling this again (for
    example when re-running a cell) does not duplicate stations.

    Raises:
      ValueError: the page has no `wikitable`.
    """
    url = f'https://en.wikipedia.org/wiki/Tokyo_Metro_{self.name_}_Line'
    # Wikipedia pages are HTML, so do not use create_soup's default 'xml'
    # parser here.
    soup = TrainCompany.create_soup(url, type_ = 'html.parser')

    table = soup.find('table', class_ = 'wikitable')
    if table is None:
      raise ValueError(f'No station table found at {url}')

    stations = []
    for tr in table.find_all('tr'):
      row = [td.text.replace('\n', '') for td in tr.find_all('td')]
      # Shorter rows (headers, notes) are not station rows.
      if len(row) >= 6:
        stations.append(Station(row[0], row[1]))

    self.stations_ = stations

In [None]:
# Train objects, one per Tokyo Metro line; filled by the table-scraping cell below.
lines = []

In [None]:
# Build one Train per row of the first wikitable on the Tokyo Metro page.
table = soup.find('table', class_ = 'wikitable')
reference_colors = ['red', 'orange', 'silver', 'sky blue', 'green', 'gold', 'purple', 'teal', 'brown']

# Empty the list first so re-running this cell does not append duplicates.
lines.clear()

for tr in table.find_all('tr'):

  row = [x.text.replace('\n', '') for x in tr.find_all('td')]

  # Only rows whose first cell is one of the known line colours describe a
  # line; this also skips header rows and the 'Total' row. Filtering before
  # indexing keeps short non-line rows from raising IndexError.
  if len(row) < 6 or row[0] not in reference_colors:
    continue

  name = row[3].replace(' Line', '')
  # Cell 5 has the form '<start> to <end>'.
  path = row[5].split(' to ')

  train = Train(name, row[1], path[0], path[1])
  lines.append(train)

In [None]:
# Train.__repr__ returns the line name, so this prints the scraped line names.
print(lines)

[Ginza, Marunouchi, Hibiya, Tōzai, Chiyoda, Yūrakuchō, Hanzōmon, Namboku, Fukutoshin]


In [None]:
# Fill stations_ on every line; each call downloads that line's Wikipedia page.
for train in lines:
  train.get_stations()



### Brute Force

In [12]:
def brute_force(URL, type_ = ['train', 'station']):
  """Try to scrape `URL`, returning `None` instead of raising on failure.

  Args:
    URL: Chart address to fetch.
    type_: 'train' to use `TokyoMetro.get_train_data`, 'station' to use
      `TokyoMetro.get_station_data`. The default list only documents the two
      accepted values; it matches neither of them, so calling without
      `type_` returns `None`.

  Returns:
    The scraper's dict, or `None` when `type_` is not recognised or the
    scraper raised an ordinary exception.
  """
  if type_ == 'train':
    fetch = TokyoMetro.get_train_data
  elif type_ == 'station':
    fetch = TokyoMetro.get_station_data
  else:
    return None

  try:
    return fetch(URL)
  # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
  # and SystemExit still stop a long scraping loop.
  except Exception:
    return None

In [17]:
def get_combined_data(station, do_fare_gate = True):
  """Collect one station's train and fare-gate charts into a single table.

  Args:
    station: Object with `station_code_` and `name_` attributes.
    do_fare_gate: When False the four fare-gate columns are left unfilled.

  Returns:
    A 40-row DataFrame: 'Time' (5.0 to 24.5 in 0.5 steps), four train
    columns (padded with '-' where a chart is missing or short) and four
    fare-gate columns. The last column is named 'Exited/Holiday'; it was
    previously mislabelled 'Exited/Weekend' although it is read from the same
    `_H` chart as 'Entered/Holiday'.

  Raises:
    ValueError: raised by pandas when a chart does not yield exactly the
      expected number of rows - in particular when a fare-gate chart could
      not be read at all.
  """
  base = 'https://tmap-sid.tokyometro.jp/o/img/v1'
  n_slots = 40

  code = station.station_code_
  # The capitalize is to make sure the URL can be fetched
  # Unidecode gets rid of the accents that wikipedia puts on the names
  sname = unidecode(station.name_.capitalize())
  # str.capitalize() lowercases everything after the first character, which
  # turns 'Nakano-Shimbashi' into 'Nakano-shimbashi'. The train charts for
  # this station are requested with the capital 'S', so restore it here.
  if sname == 'Nakano-shimbashi': sname = 'Nakano-Shimbashi'

  cols = ['Time', 'Up/Weekday', 'Down/Weekday', 'Up/Holiday', 'Down/Holiday', 'Entered/Weekday',
          'Exited/Weekday', 'Entered/Holiday', 'Exited/Holiday']
  df = pd.DataFrame(columns = cols)

  df['Time'] = [n / 2 for n in range(10, 50)]

  print(sname, 'Train Data')

  # Column order matches the chart order: A/B direction on W, then on H.
  for col, variant in zip(cols[1:5], ['A_W', 'B_W', 'A_H', 'B_H']):
    link = f'{base}/train_{code}_{sname}_{variant}.svg'
    # brute_force returns None on failure; treat that as "no data" so the
    # column is filled with '-' instead of crashing on None.values().
    tdata = list((brute_force(link, type_ = 'train') or {}).values())
    tdata.extend(['-'] * (n_slots - len(tdata)))
    df[col] = tdata

  if do_fare_gate:
    print(sname, 'Fare Gate Data')

    # The fare-gate charts for this station are requested with the lowercase
    # spelling, so undo the special case applied for the train charts.
    if sname == 'Nakano-Shimbashi': sname = 'Nakano-shimbashi'

    gate_cols = [(cols[5], cols[6]), (cols[7], cols[8])]
    for (entered_col, exited_col), day in zip(gate_cols, ['W', 'H']):
      link = f'{base}/gate_{sname}_{day}.svg'
      stdata = list((brute_force(link, type_ = 'station') or {}).values())
      # Each entry is a pair; the first value goes to 'Entered', the second
      # to 'Exited'.
      df[entered_col] = [x[0] for x in stdata]
      df[exited_col] = [x[1] for x in stdata]

  return df

In [18]:
# Scrape one station, save the table as CSV and offer it for download (Colab only).
station = Station('Mb05', 'Nakano-Shimbashi')
df = get_combined_data(station, do_fare_gate = True)

# Build the file name once so the saved and the downloaded file cannot differ.
csv_name = f'{station.station_code_}_{station.name_}.csv'
df.to_csv(csv_name, index = False)

from google.colab import files
files.download(csv_name)

Nakano-Shimbashi Train Data




Nakano-Shimbashi Fare Gate Data




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# for station in lines[7].stations_:
#   try:
#     df = get_combined_data(station)
#   except ValueError:
#     st = Station(station.station_code_, input(">> ").strip())
#     try:
#       df = get_combined_data(st)
#     except:
#       print("Station doesn't have fare gate data probably")
#       continue

#   df.to_csv(f'Namboku/{station.station_code_}_{station.name_}', index = False)

#   time.sleep(4.5 + random.uniform(0, 1))

I don't know if this is the best solution:
1. Each station would have a separate CSV file with information on each train.
2. This CSV file would contain multiple repeats of the station's fare gate data.
3. Some stations, like Otemachi, might have too many columns and too much data in them.
4. If each station has its own CSV file (and each station can have multiple CSV files), then it might get really messy to process the info.
(Try creating folders for each of the trains)

In [None]:
# !zip -r Namboku.zip Namboku/

# from google.colab import files
# files.download("Namboku.zip")