In [1]:
from google.colab import drive
import os
import sys

# Mount Google Drive into the Colab filesystem so the shared data folder is reachable.
drive.mount('/content/drive')

# All relative paths used later in the notebook are resolved against this folder.
shared_folder = '/content/drive/MyDrive/Ariya Narayanasamy/Data'
os.chdir(shared_folder)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import csv
import pandas as pd
import numpy as np
import pickle

In [3]:
!pip install pykakasi



In [4]:
import pykakasi
import string
from datetime import datetime
import multiprocessing

In [5]:
def _get_kakasi():
    """Return a shared ``pykakasi.kakasi`` converter, creating it on first use."""
    converter = getattr(_get_kakasi, '_converter', None)
    if converter is None:
        converter = pykakasi.kakasi()
        _get_kakasi._converter = converter
    return converter


def to_romaji(text, preprocess = "都道府県市町村区"):
    """Romanize Japanese ``text`` into a title-cased, ASCII-letters-only string.

    Parameters
    ----------
    text : str
        Text to convert. ``None`` and float NaN are treated as missing
        and yield ``''`` instead of raising.
    preprocess : str or falsy
        Characters stripped from ``text`` before conversion. The default
        removes the administrative suffix characters; pass a falsy value to
        skip stripping.

    Returns
    -------
    str
        The converted pieces (pykakasi's ``'passport'`` field) joined by
        spaces and title-cased, after the fixed spelling replacements below,
        with every character other than ASCII letters and spaces removed.
        A trailing ``Hon`` token is rewritten to `` Main``. Returns ``''``
        when nothing is left to convert.
    """
    # Missing values would otherwise fail in the character loop below.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Text preprocessing
    if preprocess:
        text = ''.join([c for c in text if c not in preprocess])

    to_replace = [('Kakuekiteisha', 'Local'), ('Kaisoku', 'Rapid'), ('Tokyou', 'Tokyo'), ('Taishi', 'Daishi'),
    ('Keiyou (1)', 'Keiyo'), ('Marunouchi (1)', 'Marunouchi'), ('Tokaidou', 'Tokaido'), ('Yuurakucho', 'Yurakucho')]

    # Reuse one converter: building it on every call is loop-invariant work.
    result = _get_kakasi().convert(text)

    romaji = ' '.join([el['passport'] for el in result]).title().strip()
    for old, new in to_replace:
        romaji = romaji.replace(old, new)

    # Guard against an empty result before looking at the last token.
    words = romaji.split()
    if words and words[-1] == 'Hon': # '本線' check
        romaji = ''.join(words[:-1]) + ' Main'

    allowed = string.ascii_letters + ' '
    romaji = ''.join([c for c in romaji if c in allowed])

    return romaji.strip()

In [6]:
# Lookup table from Japanese railway-operator names to English names.
# Read by translate_station(); names missing from this table translate to None there.
train_companies = {
    "東日本旅客鉄道": "JR East",
    "東京地下鉄": "Tokyo Metro",
    "東武鉄道": "Tobu Railway",
    "西武鉄道": "Seibu Railway",
    "京成電鉄": "Keisei Electric Railway",
    "京浜急行電鉄": "Keikyu Corporation",
    # "東京臨海高速鉄道": "Tokyo Waterfront Area Rapid Transit",
    # "東京モノレール": "Tokyo Monorail",
    "小田急電鉄": "Odakyu Electric Railway",
    "相鉄": "Sagami Railway",
    "東急電鉄": "Tokyu Corporation",
    "京王電鉄": "Keio Corporation",
    "東京都交通局": "Toei Subway"
}

In [7]:
def translate_station(station):
    """Return the English name stored in ``train_companies`` for ``station``.

    Despite the function name, the lookup table is keyed by railway-operator
    names. ``None`` is returned when ``station`` is not a key of the table.
    """
    return train_companies.get(station)

In [8]:
# Column order of the translated trip table; used to create the empty
# accumulator frames that process_chunk() output is concatenated onto.
trip_cols = ['entry',
    'enter_company', 'enter_train_name', 'enter_station_name', 'enter_pref', 'enter_ward', 'enter_time',
    'exit_company', 'exit_train_name', 'exit_station_name', 'exit_pref', 'exit_ward', # Exit time is not a column header
    'time_taken', 'num_people'
]

In [9]:
# Empty accumulator with the final column layout; per-file results are concatenated onto it.
trip_data = pd.DataFrame(columns=trip_cols)

In [10]:
# Row counter incremented by process_chunk() via `global process`.
process = 0

In [11]:
def process_chunk(chunk):
    """Translate one chunk of the raw origin-destination CSV into English columns.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows carrying the Japanese-named columns read below (operator, line,
        station, prefecture and ward for entry and exit, plus entry time band,
        travel time and passenger count).

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns listed in ``trip_cols``, in
        that order, and a fresh 0..n-1 index. The input row's index label is
        kept in the ``entry`` column. An empty chunk yields an empty frame
        with the same columns.

    Notes
    -----
    The module-level ``process`` counter is incremented once per row in the
    process that executes this function. When run inside a
    ``multiprocessing.Pool`` worker, the parent's copy is presumably not
    updated, since workers do not share module globals.
    """
    global process

    print('Chunk process starting')

    # Collect plain dicts and build the frame once; concatenating a one-row
    # DataFrame per iteration copies the accumulated data every time.
    records = []
    for entry, row in chunk.iterrows():
        records.append({
            'entry'               : entry,
            'enter_train_name'    : to_romaji(row['【入場】路線名'], preprocess="線"),
            'enter_company'       : translate_station(row['【入場】事業者名']),
            'enter_station_name'  : to_romaji(row['【入場】駅名'], preprocess=False),
            'enter_pref'          : to_romaji(row['【入場】都道府県']),
            'enter_ward'          : to_romaji(row['【入場】市町村区']),
            'enter_time'          : row['【入場】時間帯'],
            'exit_company'        : translate_station(row['【出場】事業者名']),
            'exit_train_name'     : to_romaji(row['【出場】路線名'], preprocess="線"),
            'exit_station_name'   : to_romaji(row['【出場】駅名'], preprocess=False),
            'exit_pref'           : to_romaji(row['【出場】都道府県']),
            'exit_ward'           : to_romaji(row['【出場】市町村区']),
            'time_taken'          : row['所要時間（５分単位）'],
            'num_people'          : row['人数'],
        })

        process += 1

    # `columns=` fixes the column order and keeps the headers for empty chunks.
    aggregate_trip_data = pd.DataFrame(records, columns=trip_cols)

    print('Chunk process completed')

    return aggregate_trip_data

In [12]:
# Translate every CSV in the input folder, in sorted filename order, and
# append the results to `trip_data`.
filepath = os.path.join(shared_folder, 'Processed Tokyo Metro OD Data')
csv_files = sorted([f for f in os.listdir(filepath) if f.endswith('.csv')])

# Loop-invariant settings: one worker per CPU core, 500 CSV rows per task.
num_cores = multiprocessing.cpu_count()
chunk_size = 500

for csv_file in csv_files:
    csv_file = os.path.join(filepath, csv_file)

    print(f'Processing {csv_file}')

    # Both context managers release their resources (file handle, worker
    # processes) even when the run is interrupted or a worker raises.
    with pd.read_csv(csv_file, chunksize=chunk_size) as chunks:
        # chunks_with_index = [(index, chunk) for index, chunk in enumerate(chunks)]
        with multiprocessing.Pool(processes=num_cores) as pool:
            chunk_trip_data = pool.map(process_chunk, chunks)

    # pool.map returns a list of DataFrames; unpack it so pd.concat receives
    # DataFrames rather than a nested list (which raises TypeError).
    trip_data = pd.concat([trip_data, *chunk_trip_data], ignore_index=True)

Processing /content/drive/MyDrive/Ariya Narayanasamy/Data/Processed Tokyo Metro OD Data/tm_file1.csv
Chunk process starting
Chunk process starting
Chunk process completed
Chunk process starting
Chunk process completed
Chunk process starting
Chunk process completed
Chunk process starting
Chunk process completed
Chunk process starting


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "<ipy

Chunk process starting
Chunk process starting


KeyboardInterrupt: ignored

Estimated 7.25 hours

In [None]:
# Create the target folder first so a missing directory cannot discard the
# result of the long-running translation step above.
os.makedirs('Pickle Saves', exist_ok=True)
trip_data.to_pickle('Pickle Saves/final_tm_trip_data.pkl')

In [None]:
# Same folder as the first cell; repeated so the cells below resolve their
# relative paths against the shared data folder.
shared_folder = '/content/drive/MyDrive/Ariya Narayanasamy/Data'
os.chdir(shared_folder)

In [None]:
def load_and_print(filename):
    """Read the pickled pandas object stored at ``filename`` and return it.

    Nothing is printed, despite the name; the caller decides how to display
    the result.
    """
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    return pd.read_pickle(filename)

In [None]:
# Load the station table; the next cell reads its 'station_name' and 'fare_gate_data_in' columns.
nodes_df = load_and_print('tokyo_metro_nodes.pkl')
nodes_df

In [None]:
# Keep only the stations whose fare-gate record has a non-None 'W' entry.
# A station's position in this list is its row/column index in the matrices
# built further down.
nodes = [
    station
    for station, gate_in in zip(nodes_df['station_name'], nodes_df['fare_gate_data_in'])
    if gate_in['W'] is not None
]
nodes

In [None]:
def convert_to_string(text):
    """Reduce ``text`` to its ASCII letters and capitalize the result.

    Every character that is not in ``string.ascii_letters`` (spaces, digits,
    hyphens, non-ASCII text, ...) is dropped; the remaining letters are joined
    and passed through ``str.capitalize`` (first letter upper, rest lower).
    """
    kept = []
    for ch in text:
        if ch in string.ascii_letters:
            kept.append(ch)
    return ''.join(kept).capitalize()

In [None]:
# Letters-only, capitalized form of each entry in `nodes`, in the same order,
# so an index found here can be used directly on `nodes`.
unformatted_nodes = list(map(convert_to_string, nodes))

## Matrix Creation

In [None]:
# Maps a station name not found in `nodes` to the `nodes` entry it should be read as; filled by try_indexing().
correction_dict = {}

In [None]:
# Station names the user chose to skip (blank answer at the prompt); filled by try_indexing().
skip_station = []

In [None]:
def _prompt_for_station(station_name):
    """Ask for a replacement until it is a known station or blank (blank means skip)."""
    while True:
        corr = input(f'Change {station_name} to >>> ').strip()
        if corr == '' or corr in nodes:
            return corr
        print(f'{corr!r} is not a known station; try again or press Enter to skip.')


def try_indexing(station_name):
    """Return the position of ``station_name`` in ``nodes``, or ``None`` to skip it.

    Resolution order:

    1. names already listed in ``skip_station`` return ``None``;
    2. an exact match in ``nodes``;
    3. a correction previously stored in ``correction_dict``;
    4. a match on the letters-only form (``convert_to_string``) against
       ``unformatted_nodes``;
    5. an interactive prompt. A blank answer records the name in
       ``skip_station`` and returns ``None``.

    A correction found in step 4 or 5 is stored in ``correction_dict`` so the
    same name is not resolved twice. Typed answers are validated against
    ``nodes`` before being stored, so a typo re-prompts instead of raising
    ``ValueError`` and leaving an unusable entry behind.
    """
    if station_name in skip_station:
        return None

    try:
        return nodes.index(station_name)
    except ValueError:
        pass

    # Reuse an earlier correction; ignore a stale one that is not a known station.
    known = correction_dict.get(station_name)
    if known in nodes:
        return nodes.index(known)

    unformat_st_string = convert_to_string(station_name)
    if unformat_st_string in unformatted_nodes:
        corr = nodes[unformatted_nodes.index(unformat_st_string)]
    else:
        corr = _prompt_for_station(station_name)

    if corr == '':
        skip_station.append(station_name)
        return None

    correction_dict[station_name] = corr
    return nodes.index(corr)

In [None]:
# 24 square origin-destination matrices of zeros, one per `enter_time` value.
# NOTE(review): 138 is presumably len(nodes) — confirm, since row/column indices come from `nodes`.
matrix_list = [np.zeros((138, 138)) for _ in range(24)]

In [None]:
# Accumulate passenger counts into the matrix for each trip's entry time.
# NOTE(review): `tm_df` is not defined in any cell visible here — presumably it is
# the trip table saved to 'Pickle Saves/final_tm_trip_data.pkl'; confirm and load
# it explicitly before this cell.
for _, row in tm_df.iterrows():
    enter_time = row['enter_time']
    origin = try_indexing(row['enter_station_name'])
    dest = try_indexing(row['exit_station_name'])

    # Compare against None explicitly: index 0 is a valid station but is falsy,
    # so a plain truthiness test would drop every trip touching that station.
    if origin is not None and dest is not None:
        # assumes enter_time is 1-based (1..24) — TODO confirm
        matrix_list[enter_time - 1][origin, dest] += row['num_people']

In [None]:
# Inspect the name corrections collected while filling the matrices.
correction_dict

In [None]:
# Inspect the station names that were skipped.
skip_station

In [None]:
# Display the filled matrices (24 arrays; the output is large).
matrix_list

In [None]:
# Save all matrices into one .npz archive; positional arrays are stored under the names arr_0, arr_1, ...
np.savez('matrix_list.npz', *matrix_list)