In [None]:
from google.colab import drive
import os

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
shared_folder = '/content/drive/MyDrive/Ariya Narayanasamy/Data'

os.chdir(shared_folder)

### Node attributes
- `station_name` str: Name of station
- `serviced_lines` list[dict{str: str}]: List of the trains that service station as a dict {line_name: code}
- `train_data_up` list[dict{str: list[int, len = 39]}]: List of train crowdedness (0~4) data for each train at every time
- `train_data_down`
- `fare_gate_data_in`
- `fare_gate_data_out`

Note: the train crowdedness value (0~4) is not an edge attribute, but a node attribute. You could use this value of the current and previous stations to estimate the number of people leaving the train at a given station.

Comparing this to the fare gate influx data gives an estimate of how many people are changing trains at that particular stop. Comparing this data for multiple trains could yield an estimate for the number of people transferring from one particular train to another

In [None]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import math
import pickle

In [None]:
# from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
import re

## Nodes stuff

In [None]:
nodes_df = pd.DataFrame(columns = [
  'station_name', 'serviced_lines', 'train_data_up', 'train_data_down',
  'fare_gate_data_in', 'fare_gate_data_out',
  'latitude', 'longitude'
])

Really annoying thing, but instead of `Exited/Holiday` like every other key in the pandas DataFrame, the key is `Exited/Weekend` because I didn't realize I made this mistake.

In [None]:
def scrape_coords(station_name):
  pos_urls = [
    f'https://en.wikipedia.org/wiki/{station_name}_Station',
    f'https://en.wikipedia.org/wiki/{station_name}_Station_(Tokyo)',
    f'https://en.wikipedia.org/wiki/{station_name}_Station_(Tokyo_Metro)', # Only for Waseda station
    f'https://en.wikipedia.org/wiki/{station_name}_Station_(Chiba)',
    f'https://en.wikipedia.org/wiki/{station_name}_Station_(Saitama)',
    f'https://en.wikipedia.org/wiki/{station_name}_Station_(Kanagawa)'
  ]
  for url in pos_urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    element = soup.find('span', class_ ='geo-dms')
    if element:
      return element.get_text()
  return None

In [None]:
def dms_to_decimal(dms):
    # 35° 42′ 22″ N, 139° 39′ 56″ E
    split_coords = re.split(r"[°,′,″]", dms)

    degrees = float(split_coords[0])
    minutes = float(split_coords[1])
    seconds = float(split_coords[2])
    direction = split_coords[3]

    decimal_degrees = degrees + minutes / 60 + seconds / 3600

    if direction in ['S', 'W']:
        decimal_degrees = -decimal_degrees

    return decimal_degrees

In [None]:
def fill_df(nodes_df, filepath):
  for csvfile in sorted(os.listdir(filepath)):

    if csvfile.lower().endswith('csv'):
      df = pd.read_csv(os.path.join(filepath, csvfile))
      div_name = csvfile.split('.')[0].split('_')
      station_name, station_code = div_name[1], div_name[0]
      # print(station_code, station_name)

      # Station already exists in the df
      if any(nodes_df['station_name'] == station_name):
        row_index = nodes_df[nodes_df['station_name'] == station_name].index[0]

        row = nodes_df.at[row_index, 'serviced_lines']
        row[train] = station_code

        row = nodes_df.at[row_index, 'train_data_up']
        row[train] = {}
        row[train]['W'] = df['Up/Weekday'] if df['Up/Weekday'][0] != '-' else None
        row[train]['H'] = df['Up/Holiday'] if df['Up/Holiday'][0] != '-' else None

        row = nodes_df.at[row_index, 'train_data_down']
        row[train] = {}
        row[train]['W'] = df['Down/Weekday'] if df['Down/Weekday'][0] != '-' else None
        row[train]['H'] = df['Down/Holiday'] if df['Down/Holiday'][0] != '-' else None

      # Station is a new entry in the df
      else:
        coords = scrape_coords(station_name) # Web scraping (find coordinates)
        print(station_name, coords)
        if coords:
          latitude = dms_to_decimal(coords.split()[0])
          longitude = dms_to_decimal(coords.split()[1])

        new_row = {
            'station_name': station_name,
            'serviced_lines': {train: station_code},
            'train_data_up': {train: {
                'W': df['Up/Weekday'] if df['Up/Weekday'][0] != '-' else None,
                'H': df['Up/Holiday'] if df['Up/Holiday'][0] != '-' else None
            }},
            'train_data_down': {train: {
                'W': df['Down/Weekday'] if df['Down/Weekday'][0] != '-' else None,
                'H': df['Down/Holiday'] if df['Down/Holiday'][0] != '-' else None
            }},
            'fare_gate_data_in': {
                'W': df['Entered/Weekday'] if not df['Entered/Weekday'].isnull().all() else None,
                'H': df['Entered/Holiday'] if not df['Entered/Holiday'].isnull().all() else None
            },
            'fare_gate_data_out': {
                'W': df['Exited/Weekday'] if not df['Exited/Weekday'].isnull().all() else None,
                'H': df['Exited/Weekend'] if not df['Exited/Weekend'].isnull().all() else None
            },
            'latitude': latitude if coords else None,
            'longitude': longitude if coords else None
        }

        new_row_df = pd.DataFrame([new_row])
        nodes_df = pd.concat([nodes_df, new_row_df], ignore_index = True)

  return nodes_df

In [None]:
for train in ['Chiyoda', 'Fukutoshin', 'Ginza', 'Hanzomon', 'Hibiya', 'Marunouchi', 'Namboku', 'Tozai', 'Yurakucho']:
  parent_folder = 'Tokyo Metro/' + train + '/'
  filepath = os.path.join(shared_folder, parent_folder)

  nodes_df = fill_df(nodes_df, filepath)

Yoyogi-uehara 35°40′8.63″N 139°40′46.84″E
Yoyogi-koen 35°40′8.1984″N 139°41′28.7808″E
Meiji-jingumae 35°40′6.4164″N 139°42′19.2636″E
Omote-sando 35°39′54.5328″N 139°42′45.0756″E
Nogizaka 35°40′00″N 139°43′34″E
Akasaka 35°40′21″N 139°44′12″E
Kokkai-gijidomae 35°40′27″N 139°44′43″E
Kasumigaseki 35°40′26″N 139°45′04″E
Hibiya 35°40′30″N 139°45′35″E
Nijubashimae 35°40′49″N 139°45′42″E
Otemachi 35°41′05″N 139°45′57″E
Shin-ochanomizu 35°41′52″N 139°45′57″E
Yushima 35°42′29″N 139°46′12″E
Nezu 35°43′03″N 139°45′57″E
Sendagi 35°43′31″N 139°45′47″E
Nishi-nippori 35°43′55″N 139°46′00″E
Machiya 35°44′33″N 139°46′54″E
Kita-senju 35°44′58″N 139°48′18″E
Ayase 35°45′43.93″N 139°49′29.63″E
Kita-ayase 35°46′37″N 139°49′56″E
Wakoshi 35°47′18″N 139°36′46″E
Chikatetsu-narimasu 35°46′36″N 139°37′53″E
Chikatetsu-akatsuka 35°46′12″N 139°38′39″E
Heiwadai 35°45′28″N 139°39′14″E
Hikawadai 35°44′59″N 139°39′54″E
Kotake-mukaihara 35°44′38″N 139°40′43″E
Senkawa 35°44′18″N 139°41′21″E
Kanamecho 35°43′59.5164″N 139°41

In [None]:
nodes_df

Unnamed: 0,station_name,serviced_lines,train_data_up,train_data_down,fare_gate_data_in,fare_gate_data_out,latitude,longitude
0,Yoyogi-uehara,{'Chiyoda': 'C01'},"{'Chiyoda': {'W': None, 'H': None}}","{'Chiyoda': {'W': ['0', '0', '0', '0', '1', '2...","{'W': None, 'H': None}","{'W': None, 'H': None}",35.669064,139.679678
1,Yoyogi-koen,{'Chiyoda': 'C02'},"{'Chiyoda': {'W': ['0', '0', '0', '0', '0', '0...","{'Chiyoda': {'W': ['0', '0', '1', '1', '1', '2...","{'W': [156, 220, 356, 732, 1440, 2328, 3504, 3...","{'W': [12, 148, 316, 416, 832, 1176, 2400, 253...",35.668944,139.691328
2,Meiji-jingumae,"{'Chiyoda': 'C03', 'Fukutoshin': 'F15'}","{'Chiyoda': {'W': ['0', '0', '0', '0', '0', '0...","{'Chiyoda': {'W': ['0', '0', '0', '1', '1', '2...","{'W': [64, 90, 134, 200, 332, 566, 804, 1006, ...","{'W': [52, 230, 428, 800, 1412, 2556, 3732, 52...",35.668449,139.705351
3,Omote-sando,"{'Chiyoda': 'C04', 'Ginza': 'G02', 'Hanzomon':...","{'Chiyoda': {'W': ['0', '0', '0', '0', '0', '0...","{'Chiyoda': {'W': ['0', '0', '0', '0', '1', '2...","{'W': [49, 49, 94, 142, 244, 366, 589, 535, 55...","{'W': [35, 296, 391, 772, 1233, 2232, 3684, 62...",35.665148,139.712521
4,Nogizaka,{'Chiyoda': 'C05'},"{'Chiyoda': {'W': ['0', '0', '0', '0', '0', '0...","{'Chiyoda': {'W': ['0', '0', '0', '0', '1', '2...","{'W': [228, 164, 236, 308, 520, 676, 920, 860,...","{'W': [88, 416, 488, 1016, 1592, 2796, 4340, 6...",35.666667,139.726111
...,...,...,...,...,...,...,...,...
139,Shintomicho,{'Yurakucho': 'Y20'},"{'Yurakucho': {'W': ['0', '0', '0', '1', '1', ...","{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'W': [102, 178, 276, 592, 880, 1322, 2140, 15...","{'W': [84, 238, 452, 862, 1534, 2772, 4362, 44...",35.670458,139.773723
140,Tsukishima,{'Yurakucho': 'Y21'},"{'Yurakucho': {'W': ['0', '0', '1', '1', '1', ...","{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'W': [133, 292, 440, 854, 1475, 2387, 3537, 3...","{'W': [48, 118, 182, 494, 760, 1049, 1660, 118...",35.664353,139.784862
141,Toyosu,{'Yurakucho': 'Y22'},"{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'W': [93, 123, 228, 414, 808, 1265, 1644, 117...","{'W': [97, 292, 458, 1002, 1864, 3043, 5381, 6...",35.655000,139.796111
142,Tatsumi,{'Yurakucho': 'Y23'},"{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'Yurakucho': {'W': ['0', '0', '0', '0', '0', ...","{'W': [360, 428, 628, 1276, 2212, 3344, 4240, ...","{'W': [48, 232, 712, 1128, 1980, 2696, 4060, 3...",35.645567,139.810525


### Nodes Checks

Check if all the nodes have latitude and longitude values

In [None]:
print(nodes_df.loc[nodes_df['latitude'].isnull(), 'station_name'])
print(nodes_df.loc[nodes_df['longitude'].isnull(), 'station_name'])

Series([], Name: station_name, dtype: object)
Series([], Name: station_name, dtype: object)


In [None]:
nodes_df.shape

(144, 8)

In [None]:
for _, row in nodes_df.iterrows():
  station_name = row['station_name']
  for line in row['serviced_lines'].keys():

    if row['train_data_up'][line]['W'] is None:
      print(f'{line.upper()} line at {station_name.upper()} has no TRAIN DATA UP')
    if row['train_data_down'][line]['W'] is None:
      print(f'{line.upper()} line at {station_name.upper()} has no TRAIN DATA DOWN')

  if row['fare_gate_data_in']['W'] is None:
    print(f'{line.upper()} line at {station_name.upper()} has no FARE GATE DATA IN')
  if row['fare_gate_data_out']['W'] is None:
    print(f'{line.upper()} line at {station_name.upper()} has no FARE GATE DATA OUT')

CHIYODA line at YOYOGI-UEHARA has no TRAIN DATA UP
CHIYODA line at YOYOGI-UEHARA has no FARE GATE DATA IN
CHIYODA line at YOYOGI-UEHARA has no FARE GATE DATA OUT
HIBIYA line at KITA-SENJU has no TRAIN DATA DOWN
CHIYODA line at KITA-AYASE has no TRAIN DATA DOWN
FUKUTOSHIN line at WAKOSHI has no TRAIN DATA UP
YURAKUCHO line at WAKOSHI has no TRAIN DATA UP
YURAKUCHO line at WAKOSHI has no FARE GATE DATA IN
YURAKUCHO line at WAKOSHI has no FARE GATE DATA OUT
MARUNOUCHI line at IKEBUKURO has no TRAIN DATA DOWN
FUKUTOSHIN line at SHIBUYA has no TRAIN DATA DOWN
GINZA line at SHIBUYA has no TRAIN DATA UP
HANZOMON line at SHIBUYA has no TRAIN DATA UP
GINZA line at ASAKUSA has no TRAIN DATA DOWN
HANZOMON line at OSHIAGE has no TRAIN DATA DOWN
HIBIYA line at NAKA-MEGURO has no TRAIN DATA UP
HIBIYA line at NAKA-MEGURO has no FARE GATE DATA IN
HIBIYA line at NAKA-MEGURO has no FARE GATE DATA OUT
MARUNOUCHI line at OGIKUBO has no TRAIN DATA UP
MARUNOUCHI line at HONANCHO has no TRAIN DATA UP
NAMBOKU

In [None]:
nodes_df.to_pickle('tokyo_metro_nodes.pkl')

## Edges Stuff

In [None]:
edges_df = pd.DataFrame(columns = ['source', 'target', 'line', 'crowdedness'])

In [None]:
def get_name_from_file(filename):
  return filename.split('.')[0].split('_')[1]

In [None]:
def get_edges(edges_df, file, train):
  for i, filename in enumerate(files[1:]):
    prev_file = get_name_from_file(files[i])
    cur_file = get_name_from_file(filename)

    # FIx this later
    if (prev_file, cur_file) not in (('Ikebukuro', 'Honancho'), ('Honancho', 'Ikebukuro')):

      prev_idx = nodes_df[nodes_df['station_name'] == prev_file].index[0]
      cur_idx = nodes_df[nodes_df['station_name'] == cur_file].index[0]

      edges_df = pd.concat([edges_df, pd.DataFrame([{
          'source': prev_file,
          'target': cur_file,
          'line': train,
          'crowdedness': nodes_df.at[prev_idx, 'train_data_down'][train]
      }])], ignore_index = True)

      edges_df = pd.concat([edges_df, pd.DataFrame([{
          'source': cur_file,
          'target': prev_file,
          'line': train,
          'crowdedness': nodes_df.at[cur_idx, 'train_data_up'][train]
      }])], ignore_index = True)

  return edges_df

In [None]:
for train in ['Chiyoda', 'Fukutoshin', 'Ginza', 'Hanzomon', 'Hibiya', 'Marunouchi', 'Namboku', 'Tozai', 'Yurakucho']:
  parent_folder = 'Tokyo Metro/' + train + '/'
  filepath = os.path.join(shared_folder, parent_folder)

  files = sorted([csvfile for csvfile in sorted(os.listdir(filepath)) if csvfile.lower().endswith('csv')])
  edges_df = get_edges(edges_df, files, train)
  print('Completed station of', train, 'line')

Completed station of Chiyoda line
Completed station of Fukutoshin line
Completed station of Ginza line
Completed station of Hanzomon line
Completed station of Hibiya line
Completed station of Marunouchi line
Completed station of Namboku line
Completed station of Tozai line
Completed station of Yurakucho line


In [None]:
prev_file = 'Nakano-shimbashi'
cur_file = 'Nakano-sakaue'

prev_idx = nodes_df[nodes_df['station_name'] == prev_file].index[0]
cur_idx = nodes_df[nodes_df['station_name'] == cur_file].index[0]

edges_df = pd.concat([edges_df, pd.DataFrame([{
    'source': prev_file,
    'target': cur_file,
    'line': train,
    'crowdedness': nodes_df.at[prev_idx, 'train_data_down']['Marunouchi']
}])], ignore_index = True)

edges_df = pd.concat([edges_df, pd.DataFrame([{
    'source': cur_file,
    'target': prev_file,
    'line': train,
    'crowdedness': nodes_df.at[cur_idx, 'train_data_up']['Marunouchi']
}])], ignore_index = True)

In [None]:
edges_df

Unnamed: 0,source,target,line,crowdedness
0,Yoyogi-uehara,Yoyogi-koen,Chiyoda,"{'W': ['0', '0', '0', '0', '1', '2', '2', '1',..."
1,Yoyogi-koen,Yoyogi-uehara,Chiyoda,"{'W': ['0', '0', '0', '0', '0', '0', '0', '0',..."
2,Yoyogi-koen,Meiji-jingumae,Chiyoda,"{'W': ['0', '0', '1', '1', '1', '2', '2', '1',..."
3,Meiji-jingumae,Yoyogi-koen,Chiyoda,"{'W': ['0', '0', '0', '0', '0', '0', '0', '0',..."
4,Meiji-jingumae,Omote-sando,Chiyoda,"{'W': ['0', '0', '0', '1', '1', '2', '2', '1',..."
...,...,...,...,...
347,Tatsumi,Toyosu,Yurakucho,"{'W': ['0', '0', '0', '0', '0', '0', '0', '0',..."
348,Tatsumi,Shin-kiba,Yurakucho,"{'W': ['0', '0', '0', '0', '0', '0', '0', '0',..."
349,Shin-kiba,Tatsumi,Yurakucho,"{'W': ['0', '0', '0', '0', '0', '0', '0', '0',..."
350,Nakano-shimbashi,Nakano-sakaue,Yurakucho,"{'W': ['0', '0', '0', '0', '0', '1', '1', '1',..."


### Edges check

In [None]:
# Edges assertion
source_count = edges_df['source'].value_counts().reset_index(drop=True)
target_count = edges_df['target'].value_counts().reset_index(drop=True)

source_count.equals(target_count)

True

In [None]:
for _, row in edges_df.iterrows():
  if row['crowdedness']['W'] is None:
    print(row['line'])
    print(f'{row["source"].upper()} to {row["target"].upper()} has no WEEKDAY data')
  if row['crowdedness']['H'] is None:
    print(row['line'])
    print(f'{row["source"].upper()} to {row["target"].upper()} has no HOLIDAY data')

In [None]:
edges_df.to_pickle('tokyo_metro_edges.pkl')

## Getting coordinates of stations using `geopy`

In [None]:
# geolocator = Nominatim(user_agent = 'my_geocoder')

# def get_coordinates(station_name, train_line):
#   location_query = f"{station_name} Station, {train_line} Line, Tokyo, Japan"
#   loc = geolocator.geocode(location_query)

#   if loc:
#     return pd.Series({ 'latitude': loc.latitude, 'longitude': loc.longitude })
#   else:
#     location_query = f"{station_name} Station, {train_line} Line, Chiba, Japan" # For the Tozai Line
#     loc = geolocator.geocode(location_query)

#     if loc:
#       print(station_name, 'in Chiba')
#       return pd.Series({ 'latitude': loc.latitude, 'longitude': loc.longitude })
#   return pd.Series({ 'latitude': None, 'longitude': None })

In [None]:
# nodes_df[['latitude', 'longitude']] = nodes_df['station_name'].apply(get_coordinates, args=(train,))

In [None]:
# nodes_df

## Graph

In [None]:
G = nx.DiGraph()

In [None]:
G.add_nodes_from([
    (row['station_name'], {
        col: row[col] for col in nodes_df.columns if col != 'station_name'
    }) for _, row in nodes_df.iterrows()
])

In [None]:
G.add_edges_from(edges_list)

NameError: ignored

In [None]:
nx.draw(G, with_labels=True, node_size=1000, node_color='lightblue', font_size=10)

In [None]:
plt.figure(figsize = (16, 9))

layout = {node: (node_data['longitude'], node_data['latitude']) for node, node_data in G.nodes(data = True)}
nx.draw(G, pos=layout, with_labels=True, node_size=500, node_color='lightblue', font_size=2)
plt.show()

**Haversine formula** - very accurate way of computing distances between two points on the surface of a sphere using the latitude and longitude of the two points

In [None]:
# def haversine(lat1, lon1, lat2, lon2):
#     R = 6371000  # Radius of Earth in meters
#     phi_1 = math.radians(lat1)
#     phi_2 = math.radians(lat2)
#     delta_phi = math.radians(lat2 - lat1)
#     delta_lambda = math.radians(lon2 - lon1)

#     a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2
#     c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

#     distance = R * c
#     return distance

In [None]:
# distances = {(u, v): haversine(G.nodes[u]['latitude'], G.nodes[u]['longitude'],
#                                 G.nodes[v]['latitude'], G.nodes[v]['longitude'])
#              for u, v in G.edges()}

# k = 2.0
# scaled_distances = {(u, v): distance * k for (u, v), distance in distances.items()}

# spring_layout = nx.spring_layout(G)
# layout = {node: (G.nodes[node]['longitude'], G.nodes[node]['latitude']) for node in G.nodes()}

# nx.draw(G, pos=layout, with_labels=True, node_size=500, node_color='lightblue', font_size=8)
# plt.show()