In [147]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats.mstats import gmean

In [61]:
b_h = pd.read_csv("data/barts_hotspots.csv")
b_all = pd.read_csv("data/barts_to_all.csv")
h_all = pd.read_csv("data/hotspots_to_all.csv")
h_one = pd.read_csv("data/hours_q1.csv")
h_two = pd.read_csv("data/hours_q2.csv")

every_hour= pd.read_csv("data/avg_speed_per_path.csv")

In [62]:
# Compare and contrast travel times between different BART stations and 
# Hotspots based on these factors: time of day, day of the week, and the direction of travel.

hotspot_ids = h_all["Origin Movement ID"].unique()
station_ids = b_all["Origin Movement ID"].unique()

hotspot_bart = b_h[b_h["Origin Movement ID"].isin(hotspot_ids) & b_h["Destination Movement ID"].isin(station_ids)]
hotspot_bart_mean = hotspot_bart.set_index(["Origin Movement ID", "Destination Movement ID"])[["Daily Mean Travel Time (Seconds)"]]

hs_to_b = hotspot_bart_mean.groupby(hotspot_bart_mean.index).mean()

bart_hotspot = b_h[b_h["Origin Movement ID"].isin(station_ids) & b_h["Destination Movement ID"].isin(hotspot_ids)]
bart_hotspot_mean = bart_hotspot.set_index(["Origin Movement ID", "Destination Movement ID"])[["Daily Mean Travel Time (Seconds)"]]

b_to_hs = bart_hotspot_mean.swaplevel(0, 1).groupby(hotspot_bart_mean.index).mean()



# bart hotspots Daily Mean Travel Time
b_to_hs = b_to_hs[["Daily Mean Travel Time (Seconds)"]].rename(columns={"Daily Mean Travel Time (Seconds)": "Bart to Hotspot"})
hs_to_b = hs_to_b[["Daily Mean Travel Time (Seconds)"]].rename(columns={"Daily Mean Travel Time (Seconds)": "Hotspot to Bart"})


directions = pd.concat([b_to_hs, hs_to_b], axis = 1)
directions

# ax = sns.barplot(x= directions.index, y="Bart to Hotspot", data=directions)


Unnamed: 0,Bart to Hotspot,Hotspot to Bart
"(3394, 3603)",820.41573,779.541436
"(3394, 3692)",839.696133,1014.845304
"(3394, 3760)",961.376471,1030.812155
"(3396, 3603)",787.133333,1300.190751
"(3396, 3692)",877.104972,1491.711111
"(3396, 3760)",971.838323,1415.519774
"(3792, 3603)",814.594444,538.596685
"(3792, 3692)",836.666667,576.171271
"(3792, 3760)",963.166667,830.905556


Mean ride time stats from Embarcadero Station to Oracle Park (3603 --> 3792) from hours dataset from both quarter 1 and quarter 2

In [215]:
hours = pd.concat([h_one, h_two], axis=0, join='outer', ignore_index=True)
# 3603 --> 3792 Embarcadero Station to Oracle Park
hours = hours.loc[(hours["sourceid"] == 3603) & (hours["dstid"] == 3792)] 

count_rides = hours.groupby("hod").count()[["sourceid"]]

Unnamed: 0,Daily Mean Travel Time (Seconds),AM Mean Travel Time (Seconds),PM Mean Travel Time (Seconds),Midday Mean Travel Time (Seconds),Evening Mean Travel Time (Seconds),Early Morning Mean Travel Time (Seconds)
"(3394, 3603)",779.541436,852.519685,857.752874,754.877095,667.865497,697.307692
"(3394, 3692)",1014.845304,988.259036,1239.226519,1049.0,870.788889,705.333333
"(3394, 3760)",1030.812155,1056.231481,1170.61236,999.672222,945.329609,831.833333
"(3396, 3603)",1300.190751,1360.186916,1399.795918,1304.377358,1188.7,
"(3396, 3692)",1491.711111,1500.708661,1632.340659,1495.503937,1363.133333,1156.727273
"(3396, 3760)",1415.519774,1428.884615,1543.081633,1396.802632,1351.777778,1685.0
"(3792, 3603)",538.596685,665.742515,600.089385,520.222222,435.596591,391.422535
"(3792, 3692)",576.171271,630.02924,672.738889,570.005556,481.744444,386.072727
"(3792, 3760)",830.905556,892.666667,1055.794118,828.36,719.909091,584.0


In [186]:
# 3603 --> 3792
time_stats = count_rides.join(every_hour[["Embarcadero Station to Oracle Park"]]).rename(columns = {"sourceid":"count", "Embarcadero Station to Oracle Park":"mean"})
time_stats["sum"] = time_stats["count"] * time_stats["mean"]


time_stats

Unnamed: 0_level_0,count,mean,sum
hod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,415.79,831.58
1,2,430.115,860.23
2,2,403.75,807.5
3,2,421.99,843.98
4,2,467.84,935.68
5,2,423.295,846.59
6,2,459.74,919.48
7,2,527.445,1054.89
8,2,606.52,1213.04
9,2,628.93,1257.86


Mean ride time stats from Embarcadero Station to Oracle Park (3603 --> 3792) from b_h dataset

In [187]:
# 3603 --> 3792
emb_op = b_h.loc[(b_h["Origin Movement ID"] == 3603) & (b_h["Destination Movement ID"] == 3792 )]
emb_op_mean_time = emb_op.loc[:, emb_op.columns.str.contains('Mean')].apply(np.mean)
len(emb_op) #181

181

In [205]:
category_stats = pd.DataFrame(emb_op_mean_time, columns = ["Categorized"])
total_time = category_stats["Categorized"][0]*24
category_stats["hour_count"] = [24, 12, 12, np.nan, np.nan, np.nan]
category_stats["sum"] = category_stats["Categorized"] * category_stats["hour_count"]
category_stats["portion"] = category_stats["sum"]/total_time
category_stats

Unnamed: 0,Categorized,hour_count,sum,portion
Daily Mean Travel Time (Seconds),579.160221,24.0,13899.845304,1.0
AM Mean Travel Time (Seconds),548.602273,12.0,6583.227273,0.473619
PM Mean Travel Time (Seconds),771.707182,12.0,9260.486188,0.666229
Midday Mean Travel Time (Seconds),606.734807,,,
Evening Mean Travel Time (Seconds),488.677778,,,
Early Morning Mean Travel Time (Seconds),438.892655,,,


In [208]:
np.mean([emb_op_mean_time[1], emb_op_mean_time[2]])
total_hours = np.sum(time_stats["sum"]) #26693.85
np.sum(time_stats["sum"][np.arange(0, 12)]) #11833.53
11833.53/26693.85 #0.44330548047584
np.sum(time_stats["sum"][np.arange(12,25)]) / total_hours

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


0.5566945195241602

Assign hours to time categories (Midday, Evening, Early Morning)

In [225]:
time_of_day = pd.DataFrame(index = np.arange(24), columns = ["Time Category"])

# midday, 11:00-15:00 || evening, 7:00-23:00|| early morning, 0-6:00 || AM, 7:00-10:00 || PM, 15:00-18:00

for i in np.arange(7):
    time_of_day["Time Category"][i] = "Early Morning"
    
for i in np.arange(7, 11):
    time_of_day["Time Category"][i] = "AM"

for i in np.arange(11, 15):
    time_of_day["Time Category"][i] = "Midday"
    
for i in np.arange(15, 19):
    time_of_day["Time Category"][i] = "PM"
    
for i in np.arange(19, 24):
    time_of_day["Time Category"][i] = "Evening"

    
time_of_day.to_csv("data/hour_to_category.csv", index=True)