# HW7 API - NYC MTA API

#### Authors: Ling Lin, Xuanyu Lu, Qingyang Xiao

### 1. Import the data from the online source

To request data from the MTA, you'll need a free API key.
[Register here](https://api.mta.info/) to obtain one.

Reference: https://github.com/nolanbconaway/underground

In [1]:
# import the necessary packages for using this API

import os

# get metadata and SubwayFeed function
from underground import metadata, SubwayFeed

In [2]:
# Read the MTA API key from the environment rather than committing it to the notebook.
# Set it before starting Jupyter, e.g. `export MTA_API_KEY=<your key>`.
API_KEY = os.environ.get('MTA_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the MTA_API_KEY environment variable to your MTA API key.')

ROUTE = 'Q'   # the route whose feed we want to see; route Q is used as the example

# get route feed
feed = SubwayFeed.get(ROUTE, api_key=API_KEY)

ValueError: Invalid feed ID. Must be in `{1, 2, 36, 11, 16, 51, 21, 26, 31}`.

In [3]:
# Extract the stop dictionary from the feed and keep only the entry for ROUTE.
# Per the output displayed below, the result maps stop_id -> list of expected arrival datetimes.

q_train_stops = feed.extract_stop_dict()[ROUTE]

In [4]:
# Display the stop_id -> arrival times mapping for the selected route

q_train_stops

{'Q04N': [datetime.datetime(2020, 4, 8, 19, 1, 50, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 9, 36, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 19, 32, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 30, 26, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 37, 38, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 50, 2, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 20, 1, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 20, 11, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 20, 21, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 20, 31, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00

### 2. Transform online data into python dataframe

References:
1. https://thispointer.com/pandas-convert-dataframe-index-into-column-using-dataframe-reset_index-in-python/
2. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html
3. https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
4. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html

In [5]:
# import the necessary package for turning this online data into a dataframe

import pandas as pd

In [6]:
# Convert the online data from dictionary format into a pandas dataframe.
# orient='index' turns every dictionary key (stop_id) into a row label and
# spreads that stop's arrival times across the columns 0, 1, 2, ...

Schedule = pd.DataFrame.from_dict(q_train_stops, orient='index')

In [7]:
# Display the original dataframe (one row per stop_id, one column per upcoming arrival)

Schedule

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Q04N,2020-04-08 19:01:50-04:00,2020-04-08 19:09:36-04:00,2020-04-08 19:19:32-04:00,2020-04-08 19:30:26-04:00,2020-04-08 19:37:38-04:00,2020-04-08 19:50:02-04:00,2020-04-08 20:01:00-04:00,2020-04-08 20:11:00-04:00,2020-04-08 20:21:00-04:00,2020-04-08 20:31:00-04:00,2020-04-08 20:40:45-04:00,2020-04-08 20:51:15-04:00
Q05N,2020-04-08 19:04:05-04:00,2020-04-08 19:11:51-04:00,2020-04-08 19:21:47-04:00,2020-04-08 19:32:41-04:00,2020-04-08 19:40:08-04:00,2020-04-08 19:52:32-04:00,2020-04-08 20:03:30-04:00,2020-04-08 20:13:30-04:00,2020-04-08 20:23:30-04:00,2020-04-08 20:33:30-04:00,2020-04-08 20:43:30-04:00,2020-04-08 20:53:30-04:00
R14N,2020-04-08 19:02:21-04:00,2020-04-08 19:12:17-04:00,2020-04-08 19:23:11-04:00,2020-04-08 19:30:08-04:00,2020-04-08 19:42:32-04:00,2020-04-08 19:53:30-04:00,2020-04-08 20:03:30-04:00,2020-04-08 20:13:30-04:00,2020-04-08 20:23:30-04:00,2020-04-08 20:33:00-04:00,2020-04-08 20:44:00-04:00,NaT
B08N,2020-04-08 19:04:51-04:00,2020-04-08 19:14:47-04:00,2020-04-08 19:25:41-04:00,2020-04-08 19:32:38-04:00,2020-04-08 19:45:02-04:00,2020-04-08 19:56:00-04:00,2020-04-08 20:06:00-04:00,2020-04-08 20:16:00-04:00,2020-04-08 20:26:00-04:00,2020-04-08 20:35:30-04:00,2020-04-08 20:46:30-04:00,NaT
Q03N,2020-04-08 19:06:51-04:00,2020-04-08 19:16:47-04:00,2020-04-08 19:27:41-04:00,2020-04-08 19:34:38-04:00,2020-04-08 19:47:02-04:00,2020-04-08 19:58:00-04:00,2020-04-08 20:08:00-04:00,2020-04-08 20:18:00-04:00,2020-04-08 20:28:00-04:00,2020-04-08 20:37:30-04:00,2020-04-08 20:48:30-04:00,NaT
D43S,2020-04-08 19:01:55-04:00,2020-04-08 19:15:27-04:00,2020-04-08 19:32:37-04:00,2020-04-08 19:44:12-04:00,2020-04-08 20:00:42-04:00,2020-04-08 20:13:00-04:00,2020-04-08 20:17:00-04:00,2020-04-08 20:26:00-04:00,2020-04-08 20:38:00-04:00,2020-04-08 20:46:00-04:00,2020-04-08 20:56:00-04:00,NaT
R20N,2020-04-08 19:04:47-04:00,2020-04-08 19:15:41-04:00,2020-04-08 19:22:38-04:00,2020-04-08 19:35:02-04:00,2020-04-08 19:46:00-04:00,2020-04-08 19:56:00-04:00,2020-04-08 20:06:00-04:00,2020-04-08 20:16:00-04:00,2020-04-08 20:25:30-04:00,2020-04-08 20:36:30-04:00,NaT,NaT
R17N,2020-04-08 19:07:47-04:00,2020-04-08 19:18:41-04:00,2020-04-08 19:25:38-04:00,2020-04-08 19:38:02-04:00,2020-04-08 19:49:00-04:00,2020-04-08 19:59:00-04:00,2020-04-08 20:09:00-04:00,2020-04-08 20:19:00-04:00,2020-04-08 20:28:30-04:00,2020-04-08 20:39:30-04:00,NaT,NaT
R16N,2020-04-08 19:09:17-04:00,2020-04-08 19:20:11-04:00,2020-04-08 19:27:08-04:00,2020-04-08 19:39:32-04:00,2020-04-08 19:50:30-04:00,2020-04-08 20:00:30-04:00,2020-04-08 20:10:30-04:00,2020-04-08 20:20:30-04:00,2020-04-08 20:30:00-04:00,2020-04-08 20:41:00-04:00,NaT,NaT
D30S,2020-04-08 19:01:38-04:00,2020-04-08 19:09:07-04:00,2020-04-08 19:21:42-04:00,2020-04-08 19:38:12-04:00,2020-04-08 19:49:00-04:00,2020-04-08 19:53:30-04:00,2020-04-08 20:03:30-04:00,2020-04-08 20:13:30-04:00,2020-04-08 20:23:30-04:00,2020-04-08 20:33:30-04:00,NaT,NaT


In [8]:
# Turn the index into a regular column.
# The original index is stop_id, so the new column is renamed to "stop_id".
Schedule1 = Schedule.reset_index()
Schedule2 = Schedule1.rename(columns={'index': 'stop_id'})

# The last character of stop_id identifies the direction: N = North, S = South.
# The vectorised .str accessor replaces the explicit row loop.
Direction = Schedule2['stop_id'].str[-1].tolist()

# Insert the new columns at fixed positions (1 and 2).
# DataFrame.insert raises if the column already exists; because Schedule2 is rebuilt
# from Schedule at the top of this cell, re-running the whole cell is safe.
Schedule2.insert(1, 'direction', Direction)

# The date is taken from the first arrival column (column label 0).
Schedule2.insert(2, 'date', Schedule2[0].dt.date)

# Replace the Timestamp columns (date and time) with the time of day only.
# Series.dt.time converts a whole column at once and leaves missing values as NaT,
# so no cell-by-cell loop or explicit NaT check is required, and no time objects
# are written one at a time into a datetime-typed column.
for arrival_col in Schedule2.columns[3:]:
    Schedule2[arrival_col] = Schedule2[arrival_col].dt.time


In [52]:
# Display the transformed dataframe (stop_id, direction, date, then time-of-day columns)

Schedule2

Unnamed: 0,stop_id,direction,date,0,1,2,3,4,5,6,7,8,9,10,11
0,B08N,N,2020-04-08,18:56:18,19:05:50,19:15:09,19:27:20,19:32:20,19:45:50,19:56:00,20:06:00,20:16:00,20:26:00,20:35:30,20:46:30
1,Q03N,N,2020-04-08,18:58:18,19:07:50,19:17:09,19:29:20,19:34:20,19:47:50,19:58:00,20:08:00,20:18:00,20:28:00,20:37:30,20:48:30
2,Q04N,N,2020-04-08,19:01:03,19:10:35,19:19:54,19:32:05,19:37:20,19:50:50,20:01:00,20:11:00,20:21:00,20:31:00,20:40:45,20:51:15
3,Q05N,N,2020-04-08,19:03:18,19:12:50,19:22:09,19:34:20,19:39:50,19:53:20,20:03:30,20:13:30,20:23:30,20:33:30,20:43:30,20:53:30
4,D39S,S,2020-04-08,18:56:07,19:07:50,19:23:17,19:36:19,19:50:00,19:57:30,20:07:30,20:17:30,20:27:30,20:37:30,20:47:30,NaT
5,D40S,S,2020-04-08,18:58:22,19:10:20,19:25:47,19:38:49,19:52:30,20:00:45,20:10:30,20:20:00,20:31:00,20:40:00,20:50:00,NaT
6,D41S,S,2020-04-08,19:00:37,19:12:50,19:28:17,19:41:19,19:55:00,20:04:00,20:13:30,20:22:30,20:34:30,20:42:30,20:52:30,NaT
7,D42S,S,2020-04-08,19:02:37,19:14:50,19:30:17,19:43:19,19:57:00,20:06:00,20:15:30,20:24:30,20:36:30,20:44:30,20:54:30,NaT
8,D43S,S,2020-04-08,19:04:07,19:16:20,19:31:47,19:44:49,19:58:30,20:07:30,20:17:00,20:26:00,20:38:00,20:46:00,20:56:00,NaT
9,R20N,N,2020-04-08,18:55:50,19:05:09,19:17:20,19:22:20,19:35:50,19:46:00,19:56:00,20:06:00,20:16:00,20:25:30,20:36:30,NaT


### 3. Data Visualization

References:
- https://stackoverflow.com/questions/43757820/how-to-add-a-variable-to-python-plt-title
- https://stackoverflow.com/questions/14432557/matplotlib-scatter-plot-with-different-text-at-each-data-point

In [9]:
from datetime import date
from datetime import datetime

# Use a "magic command" to specify how we want our plots displayed
# %matplotlib notebook show dynamic/interactive plot

%matplotlib notebook
import matplotlib.pyplot as plt


#### Plot 1: This plot illustrates the time lag in minutes between the next two trains at the first 10 stops.

In [10]:
# Refer to the transformed schedule under a shorter name.
# NOTE: this is an alias, not a copy - sch_df and Schedule2 are the same object.
sch_df = Schedule2

# Collect the stop_ids of the first 20 rows of the schedule (the cells below use
# the first 10 of them). head() simply returns fewer rows when the frame is
# shorter than 20, whereas indexing row by row with iloc would raise IndexError.
stop = sch_df['stop_id'].head(20).tolist()

print(stop)

['Q04N', 'Q05N', 'R14N', 'B08N', 'Q03N', 'D43S', 'R20N', 'R17N', 'R16N', 'D30S', 'D37S', 'D38S', 'D39S', 'D40S', 'D41S', 'D42S', 'D24N', 'R30N', 'Q01N', 'D26S']


In [11]:
# !more stops.txt

In [12]:
# Read the text file that contains stops information.
# pd.read_csv with default arguments: the file is parsed as comma-separated with a header row.
# The printed output below shows columns such as stop_id, stop_name, stop_lat and stop_lon.

coord_df = pd.read_csv('stops.txt')
print(coord_df)

     stop_id  stop_code                    stop_name  stop_desc   stop_lat  \
0        101        NaN  Van Cortlandt Park - 242 St        NaN  40.889248   
1       101N        NaN  Van Cortlandt Park - 242 St        NaN  40.889248   
2       101S        NaN  Van Cortlandt Park - 242 St        NaN  40.889248   
3        103        NaN                       238 St        NaN  40.884667   
4       103N        NaN                       238 St        NaN  40.884667   
...      ...        ...                          ...        ...        ...   
1498    S30N        NaN                Tompkinsville        NaN  40.636949   
1499    S30S        NaN                Tompkinsville        NaN  40.636949   
1500     S31        NaN                    St George        NaN  40.643748   
1501    S31N        NaN                    St George        NaN  40.643748   
1502    S31S        NaN                    St George        NaN  40.643748   

       stop_lon  zone_id  stop_url  location_type parent_statio

In [13]:
# Look up the name and coordinates of the first 10 stops in a single pass.
# Indexing the stops table by stop_id replaces three boolean scans per stop, and
# .loc raises a KeyError naming the stop_id if one is missing from stops.txt.
# drop_duplicates keeps the first row per stop_id, matching the original `.values[0]`.
stops_by_id = coord_df.drop_duplicates('stop_id').set_index('stop_id')
selected_stops = stops_by_id.loc[stop[:10]]

# stop names, truncated to 20 characters so the plot labels stay short
stop_name = selected_stops['stop_name'].str[:20].tolist()
print(stop_name)

# latitude of these 10 stops
x_lat = selected_stops['stop_lat'].tolist()
print(x_lat)

# longitude of these 10 stops
x_lon = selected_stops['stop_lon'].tolist()
print(x_lon)

['86 St', '96 St', '57 St - 7 Av', 'Lexington Av/63 St', '72 St', 'Coney Island - Still', '14 St - Union Sq', '34 St - Herald Sq', 'Times Sq - 42 St', 'Cortelyou Rd']
[40.777891, 40.784318, 40.764664, 40.764629, 40.768799, 40.577422, 40.735735999999996, 40.749567, 40.754672, 40.640927000000005]
[-73.951787, -73.947152, -73.98065799999999, -73.966113, -73.958424, -73.98123299999999, -73.99056800000001, -73.98795, -73.986754, -73.963891]


In [14]:
# Waiting time in minutes between the next two trains at each of the first 10 stops.
# Positions 3 and 4 of a row hold the first and second arrival as datetime.time values.
MINUTES_PER_DAY = 24 * 60

# time objects cannot be subtracted directly, so both are anchored to the same date;
# the date is read once so that every row uses an identical anchor.
today = date.today()

lag_min = []
for i in range(10):
    row = sch_df.iloc[i]
    dt1 = datetime.combine(today, row.iloc[3])
    dt2 = datetime.combine(today, row.iloc[4])
    # Modulo one day: if the two arrivals straddle midnight (e.g. 23:55 and 00:05)
    # the raw difference is negative; wrapping yields the real 10-minute gap.
    lag = ((dt2 - dt1).total_seconds() / 60) % MINUTES_PER_DAY
    lag_min.append(lag)

print(lag_min)

[7.766666666666667, 7.766666666666667, 9.933333333333334, 9.933333333333334, 9.933333333333334, 13.533333333333333, 10.9, 10.9, 10.9, 7.483333333333333]


In [None]:
def fiveble(lst):
    """Return a new list with every element of ``lst`` multiplied by 5.

    Used to scale the lag values (minutes) into marker sizes for the scatter plot.
    """
    return [i * 5 for i in lst]


# The function was originally defined as `trible` but called as `fiveble`, which
# raised NameError. The old name is kept as an alias for any existing callers.
trible = fiveble

lag_t = fiveble(lag_min)

In [15]:
# Start a new figure so this plot never draws on top of a figure left active by
# another cell (or by an earlier run of this cell).
plt.figure()
plt.title('Time Lag (minutes) for nearest top 10 stops')

# Colour every marker by its lag relative to the longest lag (values in (0, 1]).
longest_lag = float(max(lag_min))
colors = [lag / longest_lag for lag in lag_min[:10]]

# Marker size is the scaled lag (lag_t); position is (latitude, longitude).
plt.scatter(x_lat, x_lon, s=lag_t, c=colors, alpha=1);

# Label each point with its (truncated) stop name.
for myx, myy, mystop in zip(x_lat, x_lon, stop_name):
    plt.text(myx, myy, mystop, color="red", fontsize=7,
             horizontalalignment='left', verticalalignment='top')

# x and y labels
plt.ylabel('Station Longitude')
plt.xlabel('Station Latitude')

# Save plot as a .png image
plt.savefig('TimeLag.png')

<IPython.core.display.Javascript object>

#### Plot 2: This plot illustrates the time gap between train arrivals for a pre-identified stop.

In [16]:
# Identify the stop_id that we wish to search
# In this case, we use stop 'Q05N' as the pre-identified stop
# NOTE(review): the name `id` shadows Python's built-in id(); it is kept unchanged
# here because the following cells read this variable by that name.

id = 'Q05N'

In [17]:
# Find the row label of the requested stop.
# The live feed changes between runs, so the stop may be absent: fail with a clear
# message instead of an opaque conversion error from int() on an empty array.
matches = Schedule2.index[Schedule2['stop_id'] == id]
if len(matches) == 0:
    raise ValueError('stop_id ' + repr(id) + ' is not in the current schedule')
index = int(matches[0])

# Find the schedule of that stop (label-based lookup, since `index` is a row label)
times = Schedule2.loc[matches[0]]

# Skip the three leading descriptive fields (stop_id, direction, date) first,
# then drop the missing arrivals.
y = times.iloc[3:].dropna()

# Set the x-values: one position per remaining arrival
x = range(0, len(y), 1)

In [18]:
# Register the pandas converters so matplotlib can place datetime.time values on an axis
pd.plotting.register_matplotlib_converters()

# Plot size
plt.figure(figsize=(8, 5))

# Plot title
plt.title('Train Arrival Schedule of Stop: '+str(id)+' on '+str(times['date']))

# x and y labels
plt.ylabel('Time of Arrivals')
plt.xlabel('Arrivals')

# Line plot
plt.plot(x, y, 'go-')

# Label points.
# Iterate over the values themselves rather than indexing y[i]: y is labelled by the
# original column names, so an integer key is a label lookup and would fail (or pick
# the wrong point) whenever those labels are not the contiguous positions 0..n-1.
for xpos, arrival in zip(x, y):
    plt.annotate(str(arrival), (xpos + 0.3, arrival))

# Save plot as a .png image
plt.savefig('MTA_TimeGap.png')


<IPython.core.display.Javascript object>

#### Plot 3: This plot illustrates the average time interval of Q train stops in seconds.
- Showing the busyness of the stops

In [19]:
# Select the rows whose stop_id contains the letter 'Q' and put them into a new
# dataframe q_schedule.
index_list = list(Schedule.index)
index_q = [q for q in index_list if 'Q' in q]

# .copy() makes q_schedule an independent frame, so adding a column to it later
# neither triggers SettingWithCopyWarning nor can affect Schedule.
q_schedule = Schedule.loc[index_q, :].copy()

In [20]:
# Name of the summary column this cell adds. It is excluded from the arrival
# columns below so the cell can be re-run after the column already exists
# (previously a second run raised KeyError on a non-existent arrival column).
AVERAGE_COL = 'Average Time Interval'
arrival_cols = [c for c in q_schedule.columns if c != AVERAGE_COL]

# Time interval between consecutive arrivals, in seconds.
# One inner list per pair of consecutive arrival columns, one entry per stop;
# .dt.total_seconds() converts the whole column and yields NaN where an arrival is missing.
time_interval = [
    list((q_schedule[later] - q_schedule[earlier]).dt.total_seconds())
    for earlier, later in zip(arrival_cols, arrival_cols[1:])
]

# Convert time interval data from lists to dataframe (rows = intervals, columns = stops)
time_interval_df = pd.DataFrame(time_interval)

# Average interval per stop; mean() skips the NaN entries of stops with fewer arrivals
average_time_interval = list(time_interval_df.mean())

# assign() returns a new frame instead of mutating q_schedule in place
q_schedule = q_schedule.assign(**{AVERAGE_COL: average_time_interval})

# Show the new dataframe
q_schedule

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,Average Time Interval
Q04N,2020-04-08 19:01:50-04:00,2020-04-08 19:09:36-04:00,2020-04-08 19:19:32-04:00,2020-04-08 19:30:26-04:00,2020-04-08 19:37:38-04:00,2020-04-08 19:50:02-04:00,2020-04-08 20:01:00-04:00,2020-04-08 20:11:00-04:00,2020-04-08 20:21:00-04:00,2020-04-08 20:31:00-04:00,2020-04-08 20:40:45-04:00,2020-04-08 20:51:15-04:00,596.818182
Q05N,2020-04-08 19:04:05-04:00,2020-04-08 19:11:51-04:00,2020-04-08 19:21:47-04:00,2020-04-08 19:32:41-04:00,2020-04-08 19:40:08-04:00,2020-04-08 19:52:32-04:00,2020-04-08 20:03:30-04:00,2020-04-08 20:13:30-04:00,2020-04-08 20:23:30-04:00,2020-04-08 20:33:30-04:00,2020-04-08 20:43:30-04:00,2020-04-08 20:53:30-04:00,596.818182
Q03N,2020-04-08 19:06:51-04:00,2020-04-08 19:16:47-04:00,2020-04-08 19:27:41-04:00,2020-04-08 19:34:38-04:00,2020-04-08 19:47:02-04:00,2020-04-08 19:58:00-04:00,2020-04-08 20:08:00-04:00,2020-04-08 20:18:00-04:00,2020-04-08 20:28:00-04:00,2020-04-08 20:37:30-04:00,2020-04-08 20:48:30-04:00,NaT,609.9
Q01N,2020-04-08 19:11:11-04:00,2020-04-08 19:18:08-04:00,2020-04-08 19:30:32-04:00,2020-04-08 19:41:30-04:00,2020-04-08 19:51:30-04:00,2020-04-08 20:01:30-04:00,2020-04-08 20:11:30-04:00,2020-04-08 20:21:00-04:00,2020-04-08 20:32:00-04:00,NaT,NaT,NaT,606.125
Q01S,2020-04-08 19:16:42-04:00,2020-04-08 19:27:30-04:00,2020-04-08 19:32:00-04:00,2020-04-08 19:42:00-04:00,2020-04-08 19:52:00-04:00,2020-04-08 20:02:00-04:00,2020-04-08 20:12:00-04:00,NaT,NaT,NaT,NaT,NaT,553.0
Q05S,2020-04-08 19:07:00-04:00,2020-04-08 19:11:30-04:00,2020-04-08 19:21:30-04:00,2020-04-08 19:31:30-04:00,2020-04-08 19:41:30-04:00,2020-04-08 19:51:30-04:00,NaT,NaT,NaT,NaT,NaT,NaT,534.0
Q04S,2020-04-08 19:09:00-04:00,2020-04-08 19:13:30-04:00,2020-04-08 19:23:30-04:00,2020-04-08 19:33:30-04:00,2020-04-08 19:43:30-04:00,2020-04-08 19:53:30-04:00,NaT,NaT,NaT,NaT,NaT,NaT,534.0
Q03S,2020-04-08 19:11:00-04:00,2020-04-08 19:15:30-04:00,2020-04-08 19:25:30-04:00,2020-04-08 19:35:30-04:00,2020-04-08 19:45:30-04:00,2020-04-08 19:55:30-04:00,NaT,NaT,NaT,NaT,NaT,NaT,534.0


In [21]:
# One bar per stop: tick labels are the stop_ids, bar positions are 0..n-1
labels = list(q_schedule.index)
x = range(len(labels))

# Draw the average time interval of every stop on a dedicated figure/axes pair
fig, ax = plt.subplots()
ax.bar(x, average_time_interval)

# Tick positions and their stop_id labels
ax.set_xticks(x)
ax.set_xticklabels(labels)

# Axis labels and title
ax.set_ylabel('Average Time Interval (sec.)')
ax.set_xlabel('Stops')
ax.set_title('Average Time Interval for Q Train Stops');

# Write the figure to disk as a .png image
fig.savefig('MTA_ATI.png')


<IPython.core.display.Javascript object>