In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import pickle
import datetime
from datetime import datetime as dt
import dateutil.parser
import seaborn as sns

plt.style.use("seaborn")

# import pickled file


def get_data(week_nums):

    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)


filename = 'bensonMTA.pkl'  # make sure this file is in the same directory as your jupyter notebook
try:
    with open(filename, 'rb') as pklfile:
        df = pickle.load(pklfile)
except:
    week_nums = [160604, 160611, 160618, 160625, 160702, 160709, 160716, 160723,
                 160730, 160806, 160813, 160820, 160827, 170603, 170610, 170617,
                 170624, 170701, 170708, 170715, 170722, 170729, 170805, 170812,
                 170819, 170826, 180602, 180609, 180616, 180623, 180630]
    df = get_data(week_nums)
    with open(filename, 'wb') as pklfile:
        df = pickle.dump(df, pklfile)

df["DATE_TIME"] = pd.to_datetime(df.DATE + " " + df.TIME, format="%m/%d/%Y %H:%M:%S")

df['time_hour'] = df.DATE_TIME.dt.hour

df['WEEKDAY'] = df['DATE_TIME'].dt.weekday

df.rename(columns={column: column.strip() for column in df.columns}, inplace=True)

df.sort_values(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], inplace=True, ascending=True)

df['real_entries'] = df['ENTRIES'].diff()
df['real_exits'] = df['EXITS'].diff()

turnstiles_df = df.copy()

df.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE", "TIME"]).head()

turnstiles_df['totals'] = turnstiles_df['real_entries'] + turnstiles_df['real_exits']

(turnstiles_df.groupby(["STATION", "DATE_TIME"])).sum(
).sort_values("totals", ascending=False).head(50)

turnstiles_df = turnstiles_df.groupby(["STATION", "DATE_TIME"]).sum(
).sort_values("totals", ascending=False).reset_index()

turnstiles_df['time_hour'] = turnstiles_df['DATE_TIME'].dt.hour

turnstiles_df['WEEKDAY'] = turnstiles_df['DATE_TIME'].dt.weekday

turnstiles_df['STATION'].head(50).unique()

turnstiles_df.loc[(turnstiles_df['totals'] < 0) | (
    turnstiles_df['totals'] > 100000), 'totals'] = np.nan
turnstiles_df['totals'] = turnstiles_df['totals'].interpolate(method="linear")

eight_pm_df = turnstiles_df[turnstiles_df['time_hour'] == 20]

new_df = turnstiles_df.groupby(['STATION'])['real_entries', 'real_exits'].median()

totals = {}
for index, row in new_df.iterrows():
    totals[index] = row['real_entries'] + row['real_exits']

sorted_totals = sorted(totals.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_totals)

new_df = eight_pm_df.groupby(['STATION'])['real_entries', 'real_exits'].median()

eight_totals = {}
for index, row in new_df.iterrows():
    eight_totals[index] = row['real_entries'] + row['real_exits']

eight_sorted_totals = sorted(eight_totals.items(), key=operator.itemgetter(1), reverse=True)
print(eight_sorted_totals)


[('14 ST-UNION SQ', 24126.5), ('34 ST-HERALD SQ', 23159.0), ('TIMES SQ-42 ST', 20980.0), ('FLUSHING-MAIN', 16435.0), ('59 ST COLUMBUS', 16174.0), ('34 ST-PENN STA', 15058.0), ('59 ST', 14820.0), ('GRD CNTRL-42 ST', 12607.0), ('JKSN HT-ROOSVLT', 12489.0), ('W 4 ST-WASH SQ', 11861.0), ('50 ST', 11076.0), ('42 ST-PORT AUTH', 10350.0), ('ATL AV-BARCLAY', 10158.0), ('145 ST', 9613.5), ('CHAMBERS ST', 9359.0), ('8 AV', 9082.0), ('125 ST', 8990.0), ('23 ST', 8566.0), ('BEDFORD AV', 8346.0), ('47-50 STS ROCK', 8279.0), ('7 AV', 8204.5), ('86 ST', 8069.0), ('103 ST', 7553.0), ('116 ST', 7528.0), ('42 ST-BRYANT PK', 7428.0), ('JAMAICA CENTER', 7392.0), ('DELANCEY/ESSEX', 6915.0), ('1 AV', 6887.5), ("B'WAY-LAFAYETTE", 6729.5), ('GRAND ST', 6611.5), ('72 ST-2 AVE', 6563.0), ('JAY ST-METROTEC', 6549.0), ('86 ST-2 AVE', 6464.0), ('49 ST', 6392.0), ('SUTPHIN-ARCHER', 6012.0), ('CROWN HTS-UTICA', 5761.5), ('BOWLING GREEN', 5622.0), ('82 ST-JACKSON H', 5508.0), ('FOREST HILLS 71', 5499.5), ('NOSTRAND A

In [164]:
locations = []
print(len(eight_sorted_totals),len(sorted_totals))
for key, value in sorted_totals:
    locations.append(key)
locations[0]

247 380


'14 ST-UNION SQ'

In [165]:
from yelpapi import YelpAPI
import io, json, pprint
# read API keys (note please make your own config_secret.json)
#{
# "api_key":"YOUR API KEY HERE"
#}
with io.open('config_secret.json') as cred:
    creds = json.load(cred)
limit = 50

def count_dict(dictionary, keyname):
    dict_count = 0
    for item in dictionary.get(keyname):
        dict_count += 1
    return dict_count

def iter_search(offset,limit,loc):
    # make this more flexible in the future
    # take in json file and allow user to set new
    # parameters on the fly
    args = {
    'location': loc,
    'limit': limit,
    'offset': offset,
    'categories': 'coffee,restaurants',
    'open_at': 1530878400,  
    'radius_filter': '241.40160000000003',
    'price': '4'
    }
    search_results = yelp_api.search_query(**args)
    return search_results
    
# input location    
res_dict = {}
yelp_api = YelpAPI(**creds)

for loc in locations: # query the data base 50 offsets at a time
    dict_count = 0
    for offset in range(0,100):
        query_cnt = count_dict(iter_search(offset*limit, limit, loc),'businesses')
        dict_count += query_cnt
        if query_cnt < 50:
            break
           
          
    res_dict[loc] = dict_count  
    
#sorted_rest = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)

#pprint.pprint(sorted_rest) 


In [166]:
len(res_dict)

380

In [202]:
#for i in range(0,100):
#   print(sorted_totals[i],sorted_rest[i])
new_df = pd.DataFrame()
new_df['locations'] = [x[0] for x in sorted_totals]
new_df['total density'] = [x[1] for x in sorted_totals] 
new_df['proposed number targeted'] = round(0.02*new_df['total density'])
new_df['high priced restraunts'] = [value for key,value in res_dict.items()]
new_df['number people from nearby restraunts'] = ((250/2)*new_df['high priced restraunts'])
#new_df['proposed number targeted'] = round(0.05*new_df["total density"])
#new_df['high priced restraunts'] = [value for key,value in res_dict.items()]
#new_df['number people from nearby restraunts'] = ((250/2)*new_df['high priced restraunts'])
new_df['percentage from restraunts'] = new_df['number people from nearby restraunts']/new_df['total density']
new_df['proposed number donations'] = round(new_df['percentage from restraunts']*new_df['proposed number targeted']*0.20)

# 250 customers in restraunt at least half come from MTA, at least 50% come from MTA, since nobody wants to drive 
new_df

Unnamed: 0,locations,total density,proposed number targeted,high priced restraunts,number people from nearby restraunts,percentage from restraunts,proposed number donations
0,14 ST-UNION SQ,24126.5,483.0,8,1000.0,0.041448,4.0
1,34 ST-HERALD SQ,23159.0,463.0,25,3125.0,0.134937,12.0
2,TIMES SQ-42 ST,20980.0,420.0,14,1750.0,0.083413,7.0
3,FLUSHING-MAIN,16435.0,329.0,34,4250.0,0.258594,17.0
4,59 ST COLUMBUS,16174.0,323.0,15,1875.0,0.115927,7.0
5,34 ST-PENN STA,15058.0,301.0,12,1500.0,0.099615,6.0
6,59 ST,14820.0,296.0,5,625.0,0.042173,2.0
7,GRD CNTRL-42 ST,12607.0,252.0,5,625.0,0.049576,2.0
8,JKSN HT-ROOSVLT,12489.0,250.0,18,2250.0,0.180159,9.0
9,W 4 ST-WASH SQ,11861.0,237.0,7,875.0,0.073771,3.0


In [206]:
test = new_df[((new_df['proposed number donations'] > 12.0) & (new_df['percentage from restraunts'] < 1.0) &  (new_df['proposed number targeted'] > 50))]
#test

In [207]:
test

Unnamed: 0,locations,total density,proposed number targeted,high priced restraunts,number people from nearby restraunts,percentage from restraunts,proposed number donations
3,FLUSHING-MAIN,16435.0,329.0,34,4250.0,0.258594,17.0
20,7 AV,8204.5,164.0,35,4375.0,0.533244,17.0
34,SUTPHIN-ARCHER,6012.0,120.0,30,3750.0,0.623752,15.0
38,FOREST HILLS 71,5499.5,110.0,35,4375.0,0.795527,18.0
43,161/YANKEE STAD,5385.0,108.0,36,4500.0,0.835655,18.0
49,DEKALB AV,5139.5,103.0,27,3375.0,0.656679,14.0
51,167 ST,4903.5,98.0,36,4500.0,0.917712,18.0
54,MYRTLE-WYCKOFF,4781.0,96.0,29,3625.0,0.75821,15.0
57,FORDHAM RD,4668.0,93.0,34,4250.0,0.910454,17.0
63,103 ST-CORONA,4458.0,89.0,35,4375.0,0.981382,17.0


In [None]:
#df.loc[(df['real_exits'] < 0) | (df['real_exits'] > 100000), 'real_exits'] = np.nan

In [156]:
#df.sort_values('real_exits', ascending=False )