In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import pickle
import seaborn as sns

%matplotlib inline

In [2]:
# Raw MTA turnstile files dated 4, 11, 18 and 25 May 2019.
# May is used because it is the last month of promotion before the Gala starts on June 1st.
url1, url2, url3, url4 = (
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190511.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190518.txt',
    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190525.txt',
)

In [13]:
def get_data(week_nums, url_template="http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"):
    """Download the turnstile files for the given weeks and stack them into one frame.

    Parameters
    ----------
    week_nums : iterable
        Values substituted into ``url_template`` (one file is read per value).
    url_template : str, optional
        Format string with a single ``{}`` placeholder for the week value.
        Defaults to the MTA turnstile URL; a local path template also works.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE_TIME, CA, UNIT, SCP, STATION, LINENAME, DIVISION, DESC,
        ENTRIES, EXITS``. The per-file row index is kept as-is, so index labels
        repeat across files.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``week_nums`` is empty.
    """
    col_names = ['CA', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
                 'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
    dfs = []
    for week_num in week_nums:
        # header=0 together with names= discards the file's own header row and
        # applies col_names instead.
        week_df = pd.read_csv(url_template.format(week_num), sep=",", header=0, names=col_names)
        # Combine DATE and TIME explicitly: the nested-list form of parse_dates
        # that used to do this inside read_csv is deprecated in recent pandas.
        date_time = pd.to_datetime(week_df['DATE'] + ' ' + week_df['TIME'])
        week_df = week_df.drop(columns=['DATE', 'TIME'])
        # Keep DATE_TIME as the first column, as the combined column was before.
        week_df.insert(0, 'DATE_TIME', date_time)
        dfs.append(week_df)
    return pd.concat(dfs)
        
# Date suffixes of the four turnstile files to load (the May 2019 files).
week_nums = [
    190504,
    190511,
    190518,
    190525,
]
df = get_data(week_nums)

In [14]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,DATE_TIME,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS
0,2019-04-27 00:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7035249,2384833
1,2019-04-27 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7035269,2384840
2,2019-04-27 08:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7035292,2384875
3,2019-04-27 12:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7035392,2384951
4,2019-04-27 16:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7035651,2385020


## VERIFYING THAT WE HAVE ONLY 1 ROW PER TURNSTILE (CA, UNIT, SCP, STATION) PER DATE_TIME

In [16]:
# Count rows for each CA/UNIT/SCP/STATION/DATE_TIME key and list the largest
# counts first; any count above 1 means that key appears on more than one row.
(
    df.groupby(["CA", "UNIT", "SCP", "STATION", "DATE_TIME"])["ENTRIES"]
    .count()
    .reset_index()
    .sort_values(by="ENTRIES", ascending=False)
    .head(n=5)
)

Unnamed: 0,CA,UNIT,SCP,STATION,DATE_TIME,ENTRIES
118612,G009,R151,02-00-04,CONEY IS-STILLW,2019-05-16 17:00:00,2
118444,G009,R151,02-00-03,CONEY IS-STILLW,2019-05-16 17:00:00,2
408244,N525,R142,01-00-03,DELANCEY/ESSEX,2019-05-11 05:00:00,2
408415,N525,R142,01-00-04,DELANCEY/ESSEX,2019-05-11 05:00:00,2
0,A002,R051,02-00-00,59 ST,2019-04-27 00:00:00,1


In [17]:
# Keep only the rows whose DESC is 'REGULAR'.
df = df.loc[df["DESC"] == 'REGULAR']

In [18]:
# Repeat the per-key row count on the filtered frame; the top counts should
# now all be 1.
(
    df.groupby(["CA", "UNIT", "SCP", "STATION", "DATE_TIME"])["ENTRIES"]
    .count()
    .reset_index()
    .sort_values(by="ENTRIES", ascending=False)
    .head(n=5)
)

Unnamed: 0,CA,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-04-27 00:00:00,1
546269,R145,R032,00-00-02,TIMES SQ-42 ST,2019-05-05 20:00:00,1
546259,R145,R032,00-00-02,TIMES SQ-42 ST,2019-05-04 04:00:00,1
546260,R145,R032,00-00-02,TIMES SQ-42 ST,2019-05-04 08:00:00,1
546261,R145,R032,00-00-02,TIMES SQ-42 ST,2019-05-04 12:00:00,1


## USING THE DATAFRAME WITH NEEDED COLUMNS ONLY

In [None]:
# Reduce to one row per CA/UNIT/SCP/STATION/DATE_TIME key, keeping the first
# ENTRIES and EXITS value seen for each key.
# The two columns are selected with a list ([[...]]): selecting several
# groupby columns with a bare tuple (["ENTRIES", "EXITS"]) is rejected by
# pandas 2.x.
# NOTE(review): despite the name, the grouping is per DATE_TIME reading, not
# per calendar day.
daily_df = (
    df.groupby(["CA", "UNIT", "SCP", "STATION", "DATE_TIME"])[["ENTRIES", "EXITS"]]
    .first()
    .reset_index()
)

In [1]:
from googlemaps import Client as GoogleMaps

In [2]:
import os

# Build the Google Maps client from an API key supplied through the
# GOOGLE_MAPS_API_KEY environment variable, so that no credential is stored in
# the notebook. A KeyError is raised if the variable is not set.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# compromised and rotate it.
gmaps = GoogleMaps(key=os.environ['GOOGLE_MAPS_API_KEY'])

In [4]:
# Try the geocoder on a single address to see what a response looks like.
geocode_result = gmaps.geocode(address='Botanic Garden, New York, NY')
print(geocode_result)

# The response is a list of matches; take the latitude of the first one.
latitude = geocode_result[0]['geometry']['location']['lat']

[{'address_components': [{'long_name': 'Botanical Garden', 'short_name': 'Botanical Garden', 'types': ['establishment', 'point_of_interest', 'train_station', 'transit_station']}, {'long_name': 'West Bronx', 'short_name': 'West Bronx', 'types': ['neighborhood', 'political']}, {'long_name': 'The Bronx', 'short_name': 'The Bronx', 'types': ['political', 'sublocality', 'sublocality_level_1']}, {'long_name': 'Bronx County', 'short_name': 'Bronx County', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'New York', 'short_name': 'NY', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}, {'long_name': '10458', 'short_name': '10458', 'types': ['postal_code']}], 'formatted_address': 'Botanical Garden, The Bronx, NY 10458, USA', 'geometry': {'location': {'lat': 40.8672174, 'lng': -73.8817164}, 'location_type': 'GEOMETRIC_CENTER', 'viewport': {'northeast': {'lat': 40.8685663802915, 'lng':

In [6]:
# Show which top-level fields the first geocoding match exposes.
geocode_result[0].keys()

dict_keys(['address_components', 'formatted_address', 'geometry', 'place_id', 'plus_code', 'types'])

In [7]:
# Distinct station names from the turnstile frame.
# Requires `df` from the data-loading cells; on a fresh kernel run those cells
# first, otherwise this line raises NameError.
stations = df.STATION.unique()

NameError: name 'df' is not defined

In [None]:
# Append the city and state to each station name to form the geocoder query.
ny_stations = [station_name + ', New York, NY' for station_name in stations]

In [None]:
# Geocode every station query. results[i] is the list of matches returned for
# ny_stations[i] (an empty list when the geocoder finds nothing).
# A comprehension is used so the loop no longer overwrites the module-level
# `geocode_result` from the single-address example.
results = [gmaps.geocode(station_query) for station_query in ny_stations]

# Print a short summary instead of every raw response.
print('{} station queries geocoded, {} with no match'.format(
    len(results), sum(1 for matches in results if not matches)))

In [None]:
import pprint as pp  # not used in this cell; kept in case later cells rely on `pp`

# Station name -> latitude / longitude of its first geocoding match.
lat_dict, lng_dict = {}, {}

# results[i] holds the matches for stations[i], so the two are walked in step.
for station_name, result in zip(stations, results):
    if not result:
        # No geocoding match for this station: leave it out of both dicts
        # instead of failing on result[0].
        continue
    location = result[0]['geometry']['location']
    lat_dict[station_name] = location['lat']
    lng_dict[station_name] = location['lng']

In [None]:
# Station name -> formatted address of its first geocoding match.
address_dict = {}

# results[i] holds the matches for stations[i], so the two are walked in step.
for station_name, result in zip(stations, results):
    if not result:
        # No geocoding match for this station: skip it instead of failing on
        # result[0].
        continue
    address_dict[station_name] = result[0]['formatted_address']