In [159]:
from pymongo import MongoClient
import pandas as pd
import time
import re
import folium
from folium import Choropleth, Circle, Marker, Icon, Map
from folium.plugins import HeatMap, MarkerCluster
from dotenv import load_dotenv
import os
import requests
import warnings
warnings.filterwarnings("ignore")

In [160]:
# Read the company/location table from the CSV file in ./dataset.
df = pd.read_csv(filepath_or_buffer="./dataset/dataset.csv")

In [161]:
df

Unnamed: 0,name,city,latitude,longitude,country_code
0,Wetpaint,Seattle,47.603122,-122.333253,USA
1,Wetpaint,New York,40.723731,-73.996431,USA
2,Digg,San Francisco,37.764726,-122.394523,USA
3,Geni,West Hollywood,34.090368,-118.393064,USA
4,StumbleUpon,San Francisco,37.775196,-122.419204,USA
5,Gizmoz,Menlo Park,37.48413,-122.169472,USA
6,Plaxo,Sunnyvale,37.387845,-122.055197,USA
7,Powerset,San Francisco,37.778613,-122.395289,USA
8,Mahalo,Culver City,34.017606,-118.487267,USA
9,Meetup,New York,40.72604,-73.995722,USA


In [162]:
load_dotenv()

True

In [163]:
# Foursquare API key taken from the "token" environment variable;
# the value is None when that variable is not set.
token_fsq = os.environ.get("token")

In [164]:
df.country_code.unique()


array(['USA', 'FRA', 'NLD', 'SGP', 'GBR', 'DEU', 'CHN', 'ESP', 'CAN',
       'FIN', 'IRL'], dtype=object)

In [165]:
# Quick check: select the rows whose country code is exactly "USA" and show them.
usa_df = df.loc[df["country_code"] == "USA"]
usa_df

Unnamed: 0,name,city,latitude,longitude,country_code
0,Wetpaint,Seattle,47.603122,-122.333253,USA
1,Wetpaint,New York,40.723731,-73.996431,USA
2,Digg,San Francisco,37.764726,-122.394523,USA
3,Geni,West Hollywood,34.090368,-118.393064,USA
4,StumbleUpon,San Francisco,37.775196,-122.419204,USA
5,Gizmoz,Menlo Park,37.48413,-122.169472,USA
6,Plaxo,Sunnyvale,37.387845,-122.055197,USA
7,Powerset,San Francisco,37.778613,-122.395289,USA
8,Mahalo,Culver City,34.017606,-118.487267,USA
9,Meetup,New York,40.72604,-73.995722,USA


In [166]:
# Try the API on country_code FRA first, because it has only a few rows.
# NOTE(review): row 80 (FastBooking) is labelled Paris but carries latitude 37.09 /
# longitude -95.71, unlike the other Paris rows (~48.8, ~2.3) - verify the source data.

fra_df = df.loc[df["country_code"] == "FRA"]
fra_df

Unnamed: 0,name,city,latitude,longitude,country_code
13,Netvibes,Paris,48.870806,2.34668,FRA
15,TVtrip,Paris,48.856667,2.350987,FRA
80,FastBooking,Paris,37.09024,-95.712891,FRA
109,fotopedia,Paris,48.856667,2.350987,FRA
172,Zoomorama,Paris,48.854845,2.40352,FRA
327,Uniteam Communication,Paris,48.842278,2.310483,FRA


In [167]:
#The filtering was split into 2 steps.
#For the first filter we used a mongo query to select from the database the design companies that had raised more than 10M$, and
#for the second filter we are going to use Foursquare queries to:
#1- look for Starbucks cafes nearby, because everyone needs coffee
     #setting the radius to 500 and limiting the search to 50 results.

In [168]:
#first trial was with 1000 radius, trying to narrow down more, 
#reducing radius to 500

def getNearbyStarbucks(lat, long, radius=500, timeout=10):
    """Count the Starbucks places Foursquare returns around a coordinate.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "chains": "ab4c54c0-d68a-012e-5619-003048cad9da",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])
     

In [169]:
# Query Foursquare once per row and attach all counts as one integer column,
# rather than writing cell by cell into the frame while iterating over it.
df = df.assign(
    Starbucks=[
        getNearbyStarbucks(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)
   

In [170]:
# How many locations have each Starbucks count (kept for inspection, not printed).
star_count = df.Starbucks.value_counts()

     


In [171]:
# Make sure the Starbucks counts are stored as integers.
# (The former bare `.dtypes` expression was not the cell's last line, so its
# value was never displayed; it has been removed.)
df['Starbucks'] = df['Starbucks'].astype(int)

In [173]:
#pd.set_option('display.max_rows', 500)
#df


Unnamed: 0,name,city,latitude,longitude,country_code,Starbucks
0,Wetpaint,Seattle,47.603122,-122.333253,USA,2
1,Wetpaint,New York,40.723731,-73.996431,USA,3
2,Digg,San Francisco,37.764726,-122.394523,USA,1
3,Geni,West Hollywood,34.090368,-118.393064,USA,0
4,StumbleUpon,San Francisco,37.775196,-122.419204,USA,2
5,Gizmoz,Menlo Park,37.48413,-122.169472,USA,0
6,Plaxo,Sunnyvale,37.387845,-122.055197,USA,0
7,Powerset,San Francisco,37.778613,-122.395289,USA,2
8,Mahalo,Culver City,34.017606,-118.487267,USA,1
9,Meetup,New York,40.72604,-73.995722,USA,4


In [174]:
# Drop the locations with zero Starbucks nearby.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written into a slice of the previous frame.
df = df[df["Starbucks"] != 0].copy()

In [175]:
# Second filter is "Nightlife", because everyone in the company is between 25 and 40 and needs some place to go party.
#setting the radius to 500 and limiting the search to 50 results.

In [176]:
def getNearbyNightlife(lat, long, radius=500, timeout=10):
    """Count the places in Foursquare category 10032 (nightlife) around a coordinate.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "categories": "10032",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])


In [177]:
# Query Foursquare once per remaining row and attach all counts as one integer
# column, rather than writing cell by cell into a filtered frame during iteration.
df = df.assign(
    Nightlife=[
        getNearbyNightlife(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)

In [178]:
#for index, row in df.iterrows():
 #   if getNearbyNightlife(row["latitude"],row["longitude"]) == True:
  #      df.at[index, "Nightlife"] = True
   # else :
    #    df.at[index, "Nightlife"] = False

In [179]:
# Store the nightlife counts as integers.
df = df.astype({"Nightlife": int})

In [181]:
#df

In [182]:
# Distribution of nightlife counts over the remaining locations, printed in full.
nl_count = df.Nightlife.value_counts()
print(nl_count.to_string())

0     45
2     21
1     18
4     17
11    13
5      8
10     7
6      7
9      7
3      6
12     6
29     5
13     4
7      4
8      4
15     4
19     3
17     2
24     2
30     2
14     2
18     1
16     1
32     1
23     1
44     1
35     1
22     1
25     1
28     1


In [183]:
# Drop the locations with no nightlife venue nearby.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written into a slice of the previous frame.
df = df[df["Nightlife"] != 0].copy()

In [184]:
# Remaining locations per country code (the variable is called city_count).
city_count = df.country_code.value_counts()
print(city_count.to_string())

USA    134
GBR      6
FRA      3
SGP      2
ESP      2
CAN      2
CHN      2


In [185]:
# Third filter is Transportation, specifically train stations, metro stations and taxis.
#Assuming that airports are generally located outside cities, by choosing an office near an airport
#we would be discarding other criteria that can't be found near airports, such as schools, pet grooming services...
#setting the radius to 500 and limiting the search to 50 results.

#19047 rail
#19049 taxi
#19046 metro


In [186]:
def getNearbyTransport(lat, long, radius=500, timeout=10):
    """Count the rail, taxi and metro places Foursquare returns around a coordinate.

    The search uses categories 19047, 19049 and 19046.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "categories": "19047,19049,19046",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])

In [187]:
# Query Foursquare once per remaining row and attach all counts as one integer
# column, rather than writing cell by cell into a filtered frame during iteration.
df = df.assign(
    transportation=[
        getNearbyTransport(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)

In [188]:
# Store the transportation counts as integers.
df = df.astype({"transportation": int})

In [189]:
# Drop the locations with no rail/taxi/metro place nearby.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written into a slice of the previous frame.
df = df[df["transportation"] != 0].copy()

In [192]:
# How many locations have each transportation count (kept for inspection, not printed).
transp_count = df.transportation.value_counts()

In [193]:
#4th filter preschool
#30% of the company staff have at least 1 child.
#setting the radius to 1000 in this case


In [194]:
def getNearbypreschool(lat, long, radius=1000, timeout=10):
    """Count the places in Foursquare category 12056 (preschool) around a coordinate.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "categories": "12056",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])
         


In [195]:
# Query Foursquare once per remaining row and attach all counts as one integer
# column, rather than writing cell by cell into a filtered frame during iteration.
df = df.assign(
    preschool=[
        getNearbypreschool(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)

In [196]:
# Store the preschool counts as integers.
df = df.astype({"preschool": int})

In [197]:
# Distribution of preschool counts over the remaining locations, printed in full.
ps_count = df.preschool.value_counts()
print(ps_count.to_string())

4     16
5     15
10    13
6     12
0     12
2     11
3      8
1      7
14     7
16     7
12     6
7      6
9      5
13     3
15     3
11     2
8      1
20     1
17     1


In [198]:
# Drop the locations with no preschool nearby.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written into a slice of the previous frame.
df = df[df["preschool"] != 0].copy()

In [199]:
# Remaining locations per country code after the preschool filter.
city_count = df.country_code.value_counts()
print(city_count.to_string())

USA    113
GBR      5
SGP      2
ESP      2
FRA      1
CAN      1


In [200]:
#5th filter is vegan restaurants

In [201]:
def getNearbyVeganRest(lat, long, radius=500, timeout=10):
    """Count the places in Foursquare category 13377 (vegan restaurants) around a coordinate.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "categories": "13377",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])
      

In [202]:
# Query Foursquare once per remaining row and attach all counts as one integer
# column, rather than writing cell by cell into a filtered frame during iteration.
df = df.assign(
    veganRest=[
        getNearbyVeganRest(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)

In [203]:
# Store the vegan-restaurant counts as integers.
df = df.astype({"veganRest": int})

In [204]:
# How many locations have each vegan-restaurant count (kept for inspection, not printed).
vegan_count = df.veganRest.value_counts()

In [205]:
# Locations per country code just before the vegan-restaurant filter is applied.
city_count = df.country_code.value_counts()
print(city_count.to_string())

USA    113
GBR      5
SGP      2
ESP      2
FRA      1
CAN      1


In [206]:
# Drop the locations with no vegan restaurant nearby.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written into a slice of the previous frame.
df = df[df["veganRest"] != 0].copy()

In [207]:
#The office dog—"Dobby" needs a hairdresser every month. Ensure there's one not too far away.
#the radius is set to 1000, Dobby only needs to go there once a month


In [208]:
def getNearbyPetService(lat, long, radius=1000, timeout=10):
    """Count the places in Foursquare category 11134 (pet services) around a coordinate.

    Args:
        lat: Latitude of the search centre.
        long: Longitude of the search centre.
        radius: Search radius sent to the API (presumably metres - confirm
            against the Foursquare documentation).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Number of entries in the response's "results" list. The request asks
        for ``limit=50``, so the count is capped at 50.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (for example a missing/invalid token or a rate limit).
    """
    response = requests.get(
        "https://api.foursquare.com/v3/places/search",
        # Let requests build and URL-encode the query string.
        params={
            "ll": f"{lat},{long}",
            "radius": radius,
            "categories": "11134",
            "exclude_all_chains": "false",
            "sort": "DISTANCE",
            "limit": 50,
        },
        headers={
            "accept": "application/json",
            "Authorization": token_fsq,
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on "results".
    response.raise_for_status()

    return len(response.json()["results"])
      

In [209]:
# Query Foursquare once per remaining row and attach all counts as one integer
# column, rather than writing cell by cell into a filtered frame during iteration.
df = df.assign(
    petService=[
        getNearbyPetService(lat, lng)
        for lat, lng in zip(df["latitude"], df["longitude"])
    ]
)

In [211]:
#df

In [212]:
# Store the pet-service counts as integers.
df = df.astype({"petService": int})

In [213]:
# Distribution of pet-service counts over the remaining locations, printed in full.
Ps_count = df.petService.value_counts()
print(Ps_count.to_string())

3     15
1     14
2     14
0     11
4     11
28     5
7      5
8      5
15     4
5      4
19     3
27     2
9      2
29     2
11     2
12     2
6      2
13     1
25     1
26     1
21     1
24     1
23     1
14     1


In [214]:
# Drop the locations with no pet service nearby.
# .copy() makes the result an independent frame, so the score columns added in
# later cells are not written into a slice of the previous frame.
df = df[df["petService"] != 0].copy()

In [215]:
# Locations per country code once every amenity filter has been applied.
city_count = df.country_code.value_counts()
print(city_count.to_string())

USA    93
GBR     3
SGP     2
ESP     1


In [216]:
# Same breakdown, this time per city.
city_count = df.city.value_counts()
print(city_count.to_string())

New York         45
San Francisco    35
Seattle           6
London            3
Boston            2
Singapore         2
Bellevue          1
Los Angeles       1
Culver City       1
Denver            1
San Diego         1
Madrid            1


In [218]:
# Show the surviving locations whose country code does not contain "USA".
# NOTE(review): mig33 is listed under Singapore with coordinates 37.58 / -122.34,
# while the other Singapore row sits at 1.31 / 103.83 - its amenity counts were
# therefore computed for a different place; verify the source data.
is_usa = df["country_code"].str.contains("USA")
test1 = df.loc[~is_usa]
test1


Unnamed: 0,name,city,latitude,longitude,country_code,Starbucks,Nightlife,transportation,preschool,veganRest,petService
55,Wonga,London,51.519204,-0.16261,GBR,3,11,14,1,2,1
64,mig33,Singapore,37.580304,-122.343679,SGP,1,1,5,7,1,2
149,Globant,London,51.52051,-0.09522,GBR,2,5,6,5,2,1
287,Nexway,Madrid,40.414299,-3.703307,ESP,4,24,5,4,12,2
288,Nexway,Singapore,1.307505,103.8295,SGP,6,24,1,4,3,1
314,Netbiscuits,London,51.513248,-0.154738,GBR,3,9,6,1,3,1


In [233]:

# Weighted score per location: every amenity count multiplied by its weight,
# added up in the order listed here.
ratio_weights = {
    "Starbucks": 1,
    "Nightlife": 0.8,
    "transportation": 0.9,
    "preschool": 0.7,
    "veganRest": 0.6,
    "petService": 0.6,
}
df["Ratio"] = sum(df[column] * weight for column, weight in ratio_weights.items())

In [236]:
# Unweighted total: plain sum of the six amenity counts for each location.
df["Total"] = df[
    ["Starbucks", "Nightlife", "transportation", "preschool", "veganRest", "petService"]
].sum(axis=1)

In [242]:
# Rank the locations from the highest weighted score to the lowest.
df = df.sort_values("Ratio", ascending=False)

In [243]:
df


Unnamed: 0,name,city,latitude,longitude,country_code,Starbucks,Nightlife,transportation,preschool,veganRest,petService,Total,Ratio
338,Optimum Energy,New York,40.751892,-73.987592,USA,14,28,19,13,7,12,93,74.0
296,Crispy Gamer,New York,40.750597,-73.985273,USA,11,29,13,13,9,19,94,71.8
51,Heavy,New York,40.753385,-73.98962,USA,14,22,26,12,5,8,87,71.2
256,Yipit,New York,40.744618,-73.987764,USA,9,16,12,14,14,28,93,67.6
124,Tripology,New York,40.746883,-73.983969,USA,8,15,13,16,12,26,90,65.7
105,stickK,New York,40.744837,-73.983843,USA,10,11,8,15,17,28,89,63.5
111,Inform Technologies,New York,40.744639,-73.983963,USA,9,11,8,15,17,28,88,62.5
67,Mimeo,New York,40.745216,-73.982807,USA,8,12,10,14,15,27,86,61.6
128,Next New Networks,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8
75,Tumblr,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8


In [273]:
# Map layer for the remaining companies; its label shows how many rows are left.
companies_group = folium.FeatureGroup(name=f"Companies: {len(df)}")

In [274]:
# World map plus a heat layer built from the remaining company coordinates.
# The heat layer goes into companies_group; the group is attached to the map
# in a later cell. (The spelling `wolrd_map_2` is kept because later cells use it.)
wolrd_map_2 = Map(location=[41.382707, 2.169242], zoom_start=2)
company_heat = HeatMap(
    data=df[["latitude", "longitude"]],
    radius=25,
    gradient={0.4: 'yellow', 0.65: 'orange', 1: 'white'},
)
company_heat.add_to(companies_group)

<folium.plugins.heat_map.HeatMap at 0x2545478f760>

In [275]:
companies_group.add_to(wolrd_map_2)

<folium.map.FeatureGroup at 0x25454700f40>

In [276]:
wolrd_map_2

In [264]:
# Select every New York row; the Ratio column is what the top-10 choice is based on.
new_df = df.loc[df["city"] == "New York"]
new_df

Unnamed: 0,name,city,latitude,longitude,country_code,Starbucks,Nightlife,transportation,preschool,veganRest,petService,Total,Ratio
338,Optimum Energy,New York,40.751892,-73.987592,USA,14,28,19,13,7,12,93,74.0
296,Crispy Gamer,New York,40.750597,-73.985273,USA,11,29,13,13,9,19,94,71.8
51,Heavy,New York,40.753385,-73.98962,USA,14,22,26,12,5,8,87,71.2
256,Yipit,New York,40.744618,-73.987764,USA,9,16,12,14,14,28,93,67.6
124,Tripology,New York,40.746883,-73.983969,USA,8,15,13,16,12,26,90,65.7
105,stickK,New York,40.744837,-73.983843,USA,10,11,8,15,17,28,89,63.5
111,Inform Technologies,New York,40.744639,-73.983963,USA,9,11,8,15,17,28,88,62.5
67,Mimeo,New York,40.745216,-73.982807,USA,8,12,10,14,15,27,86,61.6
128,Next New Networks,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8
75,Tumblr,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8


In [269]:
# Map centred on the coordinates of the top-ranked row, with a radius-1000
# circle drawn around that point.
NY_map = Map(location=[40.751892, -73.987592], zoom_start=10)
folium.Circle(location=[40.751892, -73.987592], radius=1000).add_to(NY_map)


<folium.vector_layers.Circle at 0x25454498790>

In [270]:
NY_map

In [57]:
#top 10 locations in NY based on ratio

In [271]:
# Top 10 New York locations by Ratio.
# Restrict to New York explicitly and rank by Ratio here, so the result does not
# depend on the whole frame having been sorted earlier or on New York rows
# happening to occupy the first ten positions.
NY_df = df[df["city"] == "New York"].nlargest(10, "Ratio")
NY_df

Unnamed: 0,name,city,latitude,longitude,country_code,Starbucks,Nightlife,transportation,preschool,veganRest,petService,Total,Ratio
338,Optimum Energy,New York,40.751892,-73.987592,USA,14,28,19,13,7,12,93,74.0
296,Crispy Gamer,New York,40.750597,-73.985273,USA,11,29,13,13,9,19,94,71.8
51,Heavy,New York,40.753385,-73.98962,USA,14,22,26,12,5,8,87,71.2
256,Yipit,New York,40.744618,-73.987764,USA,9,16,12,14,14,28,93,67.6
124,Tripology,New York,40.746883,-73.983969,USA,8,15,13,16,12,26,90,65.7
105,stickK,New York,40.744837,-73.983843,USA,10,11,8,15,17,28,89,63.5
111,Inform Technologies,New York,40.744639,-73.983963,USA,9,11,8,15,17,28,88,62.5
67,Mimeo,New York,40.745216,-73.982807,USA,8,12,10,14,15,27,86,61.6
128,Next New Networks,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8
75,Tumblr,New York,40.743808,-73.983626,USA,8,11,6,16,16,28,85,59.8


In [291]:
# Street-level map (zoom 15) centred on the coordinates of the top-ranked row.
TOP_NY_map = Map(location=[40.751892, -73.987592], zoom_start=15)

In [298]:
# Marker style: blue pin with a black Font Awesome briefcase glyph.
# NOTE(review): `opacity` does not appear among Icon's named parameters in the
# folium documentation - confirm that it has any visible effect.
icon = Icon(
    icon="briefcase",
    prefix="fa",
    color="blue",
    icon_color="black",
    opacity=0.1,
)

In [301]:
# Draw every shortlisted location as a radius-1000 circle plus a marker whose
# tooltip is the company name.
# A new Icon is built for each marker: an Icon element is attached to the marker
# it is given to, and sharing a single instance between several markers is known
# to leave only the last marker styled in some folium versions.
for _, row in NY_df.iterrows():
    position = [row["latitude"], row["longitude"]]
    folium.Circle(position, radius=1000).add_to(TOP_NY_map)
    folium.Marker(
        location=position,
        tooltip=row["name"],
        icon=Icon(
            color="blue",
            opacity=0.1,
            prefix="fa",
            icon="briefcase",
            icon_color="black",
        ),
    ).add_to(TOP_NY_map)
    

In [302]:
TOP_NY_map

In [None]:
#get surroundings using api call for each amenity
#json into data frame
#plotting final data frame of my chosen location with amenities around
#using same code for mapping companies  or the code from lecture 