In [1]:
import pandas as pd
from collections import defaultdict
# Relative path (resolved against the notebook's working directory) of the
# MBTA bus arrival/departure CSV that the cells below read with pd.read_csv.
csv_file = "MBTA Bus Arrival Departure Apr-June 2019.csv"

In [None]:
# Work on the first 100,000 rows only; everything in this cell describes that
# sample, not the whole file.
df = pd.read_csv(csv_file, nrows=100000)

# Parse the service date and both event timestamps exactly once each.
df["service_date"] = pd.to_datetime(df["service_date"])
scheduled_raw = pd.to_datetime(df["scheduled"])
actual_raw = pd.to_datetime(df["actual"])

# Midnight of each row's service date: the anchor both timestamps are moved to.
service_day = df["service_date"].dt.normalize()

# Midnight of whatever date each raw timestamp was parsed with.
scheduled_day = scheduled_raw.dt.normalize()
actual_day = actual_raw.dt.normalize()

# Whole-day gap between the raw actual and raw scheduled dates. Keeping it
# stops a bus scheduled at 23:58 and observed at 00:02 from being scored as
# roughly 24 hours early once both values are re-anchored to the same day.
# NOTE(review): this only helps if the source encodes after-midnight times
# with a later date -- confirm against the raw CSV. When both raw values carry
# the same date the gap is zero and the result is "service date + time of day".
# Rows where either timestamp is missing fall back to a zero gap.
day_shift = (actual_day - scheduled_day).fillna(pd.Timedelta(0))

# Re-anchor with datetime arithmetic instead of formatting to strings and
# re-parsing; missing timestamps stay NaT.
df["scheduled"] = service_day + (scheduled_raw - scheduled_day)
df["actual"] = service_day + (actual_raw - actual_day) + day_shift

# We remove earliness as there is no indication of what it means and how it's
# calculated in the source (i.e. being late gives positive earliness scores
# sometimes but usually negative). service_date is now folded into the two
# timestamp columns, so it is dropped as well.
df = df.drop(columns=["service_date", "earliness"])

# Our own lateness score, in seconds: positive = later than scheduled,
# negative = earlier than scheduled.
df["lateness"] = (df["actual"] - df["scheduled"]).dt.total_seconds()
average_lateness = df["lateness"].mean()
print(average_lateness)

df_grouped = df.groupby("route_id")


df.head(20)


170.26486090931266


Unnamed: 0,route_id,direction,half_trip_id,stop_id,time_point_id,time_point_order,point_type,standard_type,scheduled,actual,scheduled_headway,headway,lateness
0,1,Inbound,42976988.0,75,mit,4.0,Midpoint,Schedule,2019-04-01 05:19:00,2019-04-01 05:21:20,,,140.0
1,1,Inbound,42976988.0,79,hynes,5.0,Midpoint,Schedule,2019-04-01 05:23:00,2019-04-01 05:24:17,,,77.0
2,1,Inbound,42976988.0,187,masta,6.0,Midpoint,Schedule,2019-04-01 05:25:00,2019-04-01 05:26:05,,,65.0
3,1,Inbound,42976988.0,59,Wasma,7.0,Midpoint,Schedule,2019-04-01 05:29:00,2019-04-01 05:27:59,,,-61.0
4,1,Inbound,42977170.0,110,hhgat,1.0,Startpoint,Headway,2019-04-01 05:30:00,2019-04-01 05:29:55,1200.0,1230.0,-5.0
5,1,Inbound,42977170.0,67,maput,2.0,Midpoint,Headway,2019-04-01 05:33:00,2019-04-01 05:35:19,1200.0,1278.0,139.0
6,1,Inbound,42976988.0,62,Melwa,8.0,Midpoint,Schedule,2019-04-01 05:34:00,2019-04-01 05:31:28,,,-152.0
7,1,Inbound,42977170.0,72,cntsq,3.0,Midpoint,Headway,2019-04-01 05:36:00,2019-04-01 05:38:57,1200.0,1297.0,177.0
8,1,Inbound,42976988.0,64,Dudly,9.0,Endpoint,Schedule,2019-04-01 05:38:00,2019-04-01 05:32:00,,,-360.0
9,1,Inbound,42977170.0,75,mit,4.0,Midpoint,Headway,2019-04-01 05:39:00,2019-04-01 05:42:00,1200.0,1240.0,180.0


In [None]:
# Reading the whole CSV in one call exhausts memory and crashes the machine,
# so stream it in fixed-size pieces and report the dimensions of each piece.
chunk_size = 500_000

chunk_reader = pd.read_csv(csv_file, chunksize=chunk_size)
for chunk in chunk_reader:
    print(chunk.shape)

(500000, 14)
(500000, 14)
(500000, 14)


KeyboardInterrupt: 