# HW7 API - NYC MTA API

#### Authors: Ling Lin, Xuanyu Lu, Qingyang Xiao

### 1. Import the data from the online source

To request data from the MTA, you'll need a free API key;
[register here](https://api.mta.info/).

Reference: https://github.com/nolanbconaway/underground

In [1]:
# import the necessary packages for using this API

import os

# get metadata and SubwayFeed function
from underground import metadata, SubwayFeed

In [2]:
# Read the MTA API key from the environment instead of hardcoding it, so the
# secret is never stored in the notebook file or committed to version control.
# Set it before starting Jupyter, e.g.  export MTA_API_KEY=<your key>
API_KEY = os.environ.get('MTA_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "MTA_API_KEY is not set; register at https://api.mta.info/ "
        "and export the key before running this notebook"
    )

ROUTE = 'Q'   # the route information we want to see, route Q is selected as example

# get route feed
feed = SubwayFeed.get(ROUTE, api_key=API_KEY)

In [3]:
# Ask the feed for its stop dictionary, then keep only the entry for the
# selected route: a mapping of stop_id -> expected arrival times.

stops_by_route = feed.extract_stop_dict()
q_train_stops = stops_by_route[ROUTE]

In [4]:
# Display the raw Q train data: a dict keyed by stop_id whose values are
# lists of timezone-aware arrival datetimes.

q_train_stops

{'D41S': [datetime.datetime(2020, 4, 8, 17, 26, 41, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 17, 28, 36, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 17, 54, 16, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 18, 1, 26, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 18, 10, 36, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 18, 30, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 18, 44, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 18, 54, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 14, 30, tzinfo=<DstTzInfo 'US/Eastern' EDT-1 day, 20:00:00 DST>),
  datetime.datetime(2020, 4, 8, 19, 29, tzinfo=<DstTzInfo 'US/Eastern' EDT-

### 2. Transform online data into python dataframe

References:
1. https://thispointer.com/pandas-convert-dataframe-index-into-column-using-dataframe-reset_index-in-python/
2. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html
3. https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
4. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html

In [5]:
# import the necessary package for turning this online data into a dataframe

import pandas as pd

In [6]:
# Convert online data from dictionary format into python dataframe.
# orient='index' makes each dictionary key (stop_id) a row label and spreads
# that stop's arrival times across integer-labelled columns 0, 1, 2, ...
# Stops with fewer arrivals than the longest list are padded with NaT.

Schedule = pd.DataFrame.from_dict(q_train_stops, orient='index')

In [7]:
# Display the original dataframe (one row per stop_id, one column per arrival)

Schedule

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
D41S,2020-04-08 17:26:41-04:00,2020-04-08 17:28:36-04:00,2020-04-08 17:54:16-04:00,2020-04-08 18:01:26-04:00,2020-04-08 18:10:36-04:00,2020-04-08 18:30:30-04:00,2020-04-08 18:44:30-04:00,2020-04-08 18:54:30-04:00,2020-04-08 19:14:30-04:00,2020-04-08 19:29:00-04:00
D42S,2020-04-08 17:28:41-04:00,2020-04-08 17:30:36-04:00,2020-04-08 17:56:16-04:00,2020-04-08 18:03:26-04:00,2020-04-08 18:12:36-04:00,2020-04-08 18:32:30-04:00,2020-04-08 18:46:30-04:00,2020-04-08 18:56:30-04:00,2020-04-08 19:16:30-04:00,2020-04-08 19:31:00-04:00
D43S,2020-04-08 17:30:11-04:00,2020-04-08 17:32:06-04:00,2020-04-08 17:57:46-04:00,2020-04-08 18:04:56-04:00,2020-04-08 18:14:06-04:00,2020-04-08 18:34:00-04:00,2020-04-08 18:48:00-04:00,2020-04-08 18:58:00-04:00,2020-04-08 19:18:00-04:00,2020-04-08 19:32:30-04:00
Q03N,2020-04-08 17:26:03-04:00,2020-04-08 17:47:55-04:00,2020-04-08 18:01:38-04:00,2020-04-08 18:35:00-04:00,2020-04-08 18:49:00-04:00,2020-04-08 18:19:56-04:00,2020-04-08 18:58:30-04:00,2020-04-08 19:08:30-04:00,2020-04-08 19:18:30-04:00,NaT
Q04N,2020-04-08 17:28:33-04:00,2020-04-08 17:50:25-04:00,2020-04-08 18:04:08-04:00,2020-04-08 18:37:30-04:00,2020-04-08 18:51:30-04:00,2020-04-08 18:22:26-04:00,2020-04-08 19:01:15-04:00,2020-04-08 19:11:15-04:00,2020-04-08 19:21:15-04:00,NaT
Q05N,2020-04-08 17:30:33-04:00,2020-04-08 17:52:25-04:00,2020-04-08 18:06:08-04:00,2020-04-08 18:39:30-04:00,2020-04-08 18:53:30-04:00,2020-04-08 18:24:26-04:00,2020-04-08 19:03:30-04:00,2020-04-08 19:13:30-04:00,2020-04-08 19:23:30-04:00,NaT
D40S,2020-04-08 17:26:21-04:00,2020-04-08 17:51:31-04:00,2020-04-08 17:59:11-04:00,2020-04-08 18:08:21-04:00,2020-04-08 18:28:15-04:00,2020-04-08 18:42:15-04:00,2020-04-08 18:52:15-04:00,2020-04-08 19:12:00-04:00,2020-04-08 19:26:30-04:00,NaT
Q01N,2020-04-08 17:30:55-04:00,2020-04-08 17:44:38-04:00,2020-04-08 18:18:00-04:00,2020-04-08 18:32:30-04:00,2020-04-08 18:02:56-04:00,2020-04-08 18:42:00-04:00,2020-04-08 18:52:00-04:00,2020-04-08 19:02:00-04:00,NaT,NaT
R20N,2020-04-08 17:35:25-04:00,2020-04-08 17:49:08-04:00,2020-04-08 18:22:30-04:00,2020-04-08 18:37:00-04:00,2020-04-08 18:07:26-04:00,2020-04-08 18:46:30-04:00,2020-04-08 18:56:30-04:00,2020-04-08 19:06:30-04:00,NaT,NaT
R17N,2020-04-08 17:38:55-04:00,2020-04-08 17:52:38-04:00,2020-04-08 18:26:00-04:00,2020-04-08 18:40:00-04:00,2020-04-08 18:10:56-04:00,2020-04-08 18:49:30-04:00,2020-04-08 18:59:30-04:00,2020-04-08 19:09:30-04:00,NaT,NaT


In [8]:
# Build the tidy schedule table `Schedule2` from the raw `Schedule` frame.
# Schedule2 is rebuilt from Schedule on every run, so re-running this cell
# is safe; Schedule itself is never modified here.

# The original index is stop_id, so move it into an ordinary column
Schedule1 = Schedule.reset_index()

# Rename the newly added column as "stop_id"
Schedule2 = Schedule1.rename(columns={'index': 'stop_id'})

# Remember which columns hold arrival timestamps before adding anything else
arrival_cols = list(Schedule.columns)

# The last character of stop_id identifies the direction,
# N represents North, S represents South
Direction = Schedule2['stop_id'].str[-1].tolist()

# Insert the new direction column at position 1 (right after stop_id)
Schedule2.insert(1, 'direction', Direction)

# Extract the date from the first arrival column and insert it at position 2
Schedule2.insert(2, 'date', Schedule2[0].dt.date)

# Replace the original Timestamp columns (date and time) with time only.
# Each column is converted and replaced as a whole; missing arrivals stay NaT.
# Replacing whole columns avoids writing datetime.time objects one cell at a
# time into datetime64 columns, which relies on implicit dtype upcasting.
for col in arrival_cols:
    Schedule2[col] = Schedule2[col].dt.time


In [9]:
# Display the transformed dataframe: stop_id, direction, date, then one
# time-of-day column per arrival

Schedule2

Unnamed: 0,stop_id,direction,date,0,1,2,3,4,5,6,7,8,9
0,Q05N,N,2020-04-08,16:36:26,16:50:26,17:06:56,17:23:26,17:33:11,17:54:30,18:04:30,18:24:30,18:34:30,NaT
1,D41S,S,2020-04-08,16:36:31,16:50:21,17:07:56,17:24:34,17:34:26,17:55:30,18:04:30,18:14:30,18:25:30,18:34:30
2,D42S,S,2020-04-08,16:38:31,16:52:21,17:09:56,17:26:34,17:36:26,17:57:30,18:06:30,18:16:30,18:27:30,18:36:30
3,D43S,S,2020-04-08,16:40:01,16:53:51,17:11:26,17:28:04,17:37:56,17:59:00,18:08:00,18:18:00,18:29:00,18:38:00
4,R17N,N,2020-04-08,16:36:56,16:53:26,17:09:56,17:19:41,17:41:00,17:51:00,18:11:00,18:21:00,NaT,NaT
5,R16N,N,2020-04-08,16:38:26,16:54:56,17:11:26,17:21:11,17:42:30,17:52:30,18:12:30,18:22:30,NaT,NaT
6,R14N,N,2020-04-08,16:41:26,16:57:56,17:14:26,17:24:11,17:45:30,17:55:30,18:15:30,18:25:30,NaT,NaT
7,B08N,N,2020-04-08,16:43:56,17:00:26,17:16:56,17:26:41,17:48:00,17:58:00,18:18:00,18:28:00,NaT,NaT
8,Q03N,N,2020-04-08,16:45:56,17:02:26,17:18:56,17:28:41,17:50:00,18:00:00,18:20:00,18:30:00,NaT,NaT
9,Q04N,N,2020-04-08,16:48:26,17:04:56,17:21:26,17:31:11,17:52:30,18:02:30,18:22:30,18:32:30,NaT,NaT


### 3. Data Visualization

References:
- https://stackoverflow.com/questions/43757820/how-to-add-a-variable-to-python-plt-title
- https://stackoverflow.com/questions/14432557/matplotlib-scatter-plot-with-different-text-at-each-data-point

In [9]:
from datetime import date
from datetime import datetime

# Use a "magic command" to specify how we want our plots displayed
# %matplotlib notebook show dynamic/interactive plot

%matplotlib notebook
import matplotlib.pyplot as plt


#### Plot 1: This plot illustrates the time lag (in minutes) between the first two listed arrivals at each of the first 10 stops.

In [10]:
# Refer to the transformed schedule under a shorter name
# (sch_df is the same object as Schedule2, not a copy)
sch_df = Schedule2

# Collect the stop_ids of the first 20 rows, in dataframe order.
# Later cells only use the first 10 of them.
# head() returns fewer rows instead of raising when the feed currently
# lists fewer than 20 stops.
stop = sch_df['stop_id'].head(20).tolist()

print(stop)

['D41S', 'D42S', 'D43S', 'Q03N', 'Q04N', 'Q05N', 'D40S', 'Q01N', 'R20N', 'R17N', 'R16N', 'R14N', 'B08N', 'D26S', 'D27S', 'D28S', 'D29S', 'D30S', 'D31S', 'D32S']


In [None]:
# Preview the first 20 lines of stops.txt.
# Plain Python is used instead of the `more` pager, which writes terminal
# control codes into the cell output and depends on the host shell.
with open('stops.txt') as stops_file:
    for _, line in zip(range(20), stops_file):
        print(line, end='')

[?1h=stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,locatio n_type,parent_station
101,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,1,
101N,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101
101S,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101
103,,238 St,,40.884667,-73.90087,,,1,
103N,,238 St,,40.884667,-73.90087,,,0,103
103S,,238 St,,40.884667,-73.90087,,,0,103
104,,231 St,,40.878856,-73.904834,,,1,
104N,,231 St,,40.878856,-73.904834,,,0,104
104S,,231 St,,40.878856,-73.904834,,,0,104
106,,Marble Hill - 225 St,,40.874561,-73.909831,,,1,
106N,,Marble Hill - 225 St,,40.874561,-73.909831,,,0,106
106S,,Marble Hill - 225 St,,40.874561,-73.909831,,,0,106
107,,215 St,,40.869444,-73.915279,,,1,
107N,,215 St,,40.869444,-73.915279,,,0,107
107S,,215 St,,40.869444,-73.915279,,,0,107
108,,207 St,,40.864621,-73.918822,,,1,
108N,,207 St,,40.864621,-73.918822,,,0,108
108S,,207 St,,40.864621,-73.918822,,,0,108
109,,Dyckman St,,

In [None]:
# Read the text file that contains stops information.
# stop_id is forced to str so that it always compares cleanly with the
# string stop_ids coming from the feed (e.g. 'Q05N').
coord_df = pd.read_csv('stops.txt', dtype={'stop_id': str})

# Show only the first rows; the full table is long
coord_df.head()

In [None]:
# Look up name, latitude and longitude for the first 10 collected stops.

plot_stop_ids = stop[:10]

# Build one lookup table indexed by stop_id instead of re-filtering coord_df
# for every stop and every column. If a stop_id appears more than once, the
# first occurrence is kept.
stops_by_id = coord_df.drop_duplicates('stop_id').set_index('stop_id')

# Fail with a clear message if the feed mentions a stop missing from stops.txt
unknown_ids = [sid for sid in plot_stop_ids if sid not in stops_by_id.index]
if unknown_ids:
    raise KeyError(f"stop_id(s) not found in stops.txt: {unknown_ids}")

stop_info = stops_by_id.loc[plot_stop_ids]

# stop names, truncated to 20 characters so the plot labels stay short
stop_name = stop_info['stop_name'].str[:20].tolist()
print(stop_name)

# latitude of these stops
x_lat = stop_info['stop_lat'].tolist()
print(x_lat)

# longitude of these stops
x_lon = stop_info['stop_lon'].tolist()
print(x_lon)

In [None]:
# For each of the first 10 rows of sch_df, compute the gap in minutes between
# the first two arrival columns (positions 3 and 4, right after stop_id,
# direction and date).
# The values are plain times of day, so both are anchored to today's date
# purely to make them subtractable.
# NOTE(review): a pair of arrivals that straddles midnight, or that is not in
# chronological order, would give a negative lag here -- confirm whether the
# feed guarantees ordering.

def _minutes_between(first_arrival, second_arrival):
    """Return the signed gap in minutes from first_arrival to second_arrival."""
    start = datetime.combine(date.today(), first_arrival)
    end = datetime.combine(date.today(), second_arrival)
    return (end - start).total_seconds() / 60


lag_min = []
for row_pos in range(10):
    first_arrival, second_arrival = sch_df.iloc[row_pos].values[3:5]
    lag_min.append(_minutes_between(first_arrival, second_arrival))

print(lag_min)

In [None]:
# Scatter plot of the 10 stops: position = (latitude, longitude),
# marker size and colour = lag in minutes between the first two arrivals.

# Start a fresh figure so this plot never draws onto a figure left over from
# another cell (the other plot cells also create their own figure).
plt.figure()

plt.title('Time Lag (minutes) for nearest top 10 stops')

# Colour each marker by its lag relative to the largest lag.
# If every lag is 0, fall back to 0.0 instead of dividing by zero.
largest_lag = max(lag_min)
colors = [lag / float(largest_lag) if largest_lag else 0.0 for lag in lag_min]

plt.scatter(x_lat, x_lon, s=lag_min, c=colors, alpha=1)

# Label every marker with its (truncated) stop name
for lat, lon, name in zip(x_lat, x_lon, stop_name):
    plt.text(lat, lon, name, color="red", fontsize=7,
             horizontalalignment='left', verticalalignment='top')

# x and y labels
plt.ylabel('Station Longitude')
plt.xlabel('Station Latitude')

# Save plot as a png. image
plt.savefig('TimeLag.png')

plt.show()

#### Plot 2: This plot illustrates the time gap between train arrivals for a pre-identified stop.

In [None]:
# Identify the stop_id that we wish to search
# In this case, we use stop 'Q05N' as the pre-identified stop
# NOTE(review): the name `id` shadows Python's built-in id() for the rest of
# the kernel session. Later cells read this variable, so renaming it has to
# be done in all of them together.

id = 'Q05N'

In [None]:
# Find the row of Schedule2 that belongs to the selected stop.
# Exactly one match is required; anything else is reported explicitly rather
# than surfacing as a confusing array-to-int conversion error.
matches = Schedule2.index[Schedule2['stop_id'] == id]
if len(matches) != 1:
    raise ValueError(
        f"expected exactly one row for stop_id {id!r}, found {len(matches)}"
    )
index = int(matches[0])

# Find the schedule of that stop
times = Schedule2.iloc[index]

# Skip the three leading descriptive fields (stop_id, direction, date) first,
# then drop the missing arrivals. Slicing before dropna means a missing value
# in a descriptive field can never shift which fields get skipped.
y = times.iloc[3:].dropna()

# Set the x-values: 0, 1, 2, ... one per remaining arrival
x = range(len(y))

In [None]:
# Line plot of the arrival times at the selected stop, in listed order.

# Register pandas' matplotlib converters so the time-of-day values can be
# placed on the y axis
pd.plotting.register_matplotlib_converters()

# Plot size
plt.figure(figsize=(8, 5))

# Plot title
plt.title('Train Arrival Schedule of Stop: '+str(id)+' on '+str(times['date']))

# x and y labels
plt.ylabel('Time of Arrivals')
plt.xlabel('Arrivals')

# Line plot
plt.plot(x, y, 'go-')

# Label points.
# Iterate over (position, value) pairs instead of indexing y[i]: y keeps the
# original integer column labels, so y[i] is a label lookup, not a positional
# one, and breaks if those labels are not exactly 0..n-1.
for pos, arrival in zip(x, y):
    plt.annotate(str(arrival), (pos + 0.3, arrival))

# Save plot as a png. image
plt.savefig('MTA_TimeGap.png')


#### Plot 3: This plot illustrates the average time interval of Q train stops in seconds.
- Showing the busyness of the stops

In [None]:
# Keep only the rows of Schedule whose stop_id contains the letter 'Q'
# and store them in a new dataframe, q_schedule.
index_list = list(Schedule.index)

# A stop is selected when 'Q' appears anywhere in its stop_id
index_q = [stop_label for stop_label in index_list if 'Q' in stop_label]

q_schedule = Schedule.loc[index_q, :]

In [None]:
# Average gap (in seconds) between consecutive arrival columns, per Q stop.

SUMMARY_COL = 'Average Time Interval'

# Work only on the arrival columns. Dropping the summary column first makes
# this cell safe to re-run after q_schedule has already received it.
arrivals = q_schedule.drop(columns=SUMMARY_COL, errors='ignore')

# One list per pair of adjacent arrival columns, holding the gap in seconds
# for every stop. Gaps involving a missing arrival are NaN.
time_interval = [
    (arrivals.iloc[:, k + 1] - arrivals.iloc[:, k]).dt.total_seconds().tolist()
    for k in range(arrivals.shape[1] - 1)
]

# Rows = gap number, columns = stop position
time_interval_df = pd.DataFrame(time_interval)

# Column-wise mean = average gap per stop (NaN gaps are skipped)
average_time_interval = list(time_interval_df.mean())

# Attach the result as a new column, building a new frame rather than
# mutating the slice taken from Schedule
q_schedule = arrivals.assign(**{SUMMARY_COL: average_time_interval})

# Show the new dataframe
q_schedule

In [None]:
# Bar chart of the average time interval for each Q train stop.

# Set labels and x axis of the plot
labels = list(q_schedule.index)
x = range(len(labels))

# Create the figure and its axes explicitly. The axis-label calls below need
# `ax`, which no cell defined before (NameError on a fresh kernel).
fig, ax = plt.subplots(figsize=(6, 4))

# One bar per stop, labelled with its stop_id
ax.bar(x, average_time_interval)
ax.set_xticks(list(x))
ax.set_xticklabels(labels)

# mark title, x and y axis labels
ax.set_ylabel('Average Time Interval (sec.)')
ax.set_xlabel('Stops')
ax.set_title('Average Time Interval for Q Train Stops')

# Save plot as a png. image
fig.savefig('MTA_ATI.png');
