In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from geopy.geocoders import Nominatim
import folium

In [2]:
req = requests.get('https://pt.wikipedia.org/wiki/Lista_de_bairros_de_Vit%C3%B3ria')

if req.status_code == 200:
    print('Requisição bem sucedida!')
    content = req.content

Requisição bem sucedida!


In [3]:
soup = BeautifulSoup(content, 'html.parser')

text = [hood.get_text() for hood in soup.select("table.sortable td a:not(.image)")]
text.sort()

df_city = pd.DataFrame(columns=['Hood', 'Latitude', 'Longitude'])
df_city['Hood'] = text

df_city.head()

Unnamed: 0,Hood,Latitude,Longitude
0,Aeroporto,,
1,Andorinhas,,
2,Antônio Honório,,
3,Ariovaldo Favalessa,,
4,Barro Vermelho,,


In [4]:
geolocator = Nominatim(user_agent="coursera")

for index, row in df_city.iterrows():
    address= '{} area, Vitória - ES, Brazil'.format( row['Hood'] ) 
    try:
        location = geolocator.geocode( address )
        latitude = location.latitude
        longitude = location.longitude
        print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))
        df_city.loc[index, 'Latitude'] = latitude
        df_city.loc[index, 'Longitude'] = longitude
    except AttributeError:
        print('Cannot find: {}, will drop index: {}'.format(address, index))

        
df_city.loc[16, 'Latitude'] = -20.307140
df_city.loc[16, 'Longitude'] = -40.316003
        
df_city.loc[18, 'Latitude'] = -20.314935
df_city.loc[18, 'Longitude'] = -40.343417

The geograpical coordinates of Aeroporto area, Vitória - ES, Brazil are -20.25726505, -40.28209337454523.
The geograpical coordinates of Andorinhas area, Vitória - ES, Brazil are -20.2876457, -40.3057956.
The geograpical coordinates of Antônio Honório area, Vitória - ES, Brazil are -20.262694, -40.2981413.
The geograpical coordinates of Ariovaldo Favalessa area, Vitória - ES, Brazil are -20.3142531, -40.3564749.
The geograpical coordinates of Barro Vermelho area, Vitória - ES, Brazil are -20.295109, -40.2975362.
The geograpical coordinates of Bela Vista area, Vitória - ES, Brazil are -20.3050413, -40.3480895.
The geograpical coordinates of Bento Ferreira area, Vitória - ES, Brazil are -20.3153408, -40.3069497.
The geograpical coordinates of Boa Vista area, Vitória - ES, Brazil are -20.2717123, -40.2996541.
The geograpical coordinates of Bonfim area, Vitória - ES, Brazil are -20.3001895, -40.3133996.
The geograpical coordinates of Caratoíra area, Vitória - ES, Brazil are -20.3154701, -4

In [6]:
address = 'Vitória, Espírito Santo, Brazil'
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
except AttributeError:
    print('Cannot do: {}, will drop index: {}'.format(address, index))

my_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(df_city['Latitude'], df_city['Longitude'], df_city['Hood']):
    label = folium.Popup(str(label), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(my_map)
    
my_map

The geograpical coordinate of Vitória, Espírito Santo, Brazil are -20.3200917, -40.3376682.


#### Define Foursquare Credentials and Version

In [7]:
CLIENT_ID = 'OQ0IBVUNCQUVOUQPKDMS2ZJRXGXREPN3E1DFYI00YJOROZWL' # your Foursquare ID
CLIENT_SECRET = 'M0JPPYK42NGWRG1VS1YSSTXVGYR2S24D2KIQJCP2NTDY2XEO' # your Foursquare Secret
VERSION = '20180323' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OQ0IBVUNCQUVOUQPKDMS2ZJRXGXREPN3E1DFYI00YJOROZWL
CLIENT_SECRET:M0JPPYK42NGWRG1VS1YSSTXVGYR2S24D2KIQJCP2NTDY2XEO


## 2. Explore Neighborhoods in Toronto
#### Returns a list of recommended venues near the current location. For more robust information about the venues themselves (photos/tips/etc.), please see our venue details endpoint.

User authenticated calls will personalize the ranking based on you and your friends.

In [8]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Hood', 
                  'Hood Latitude', 
                  'Hood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [10]:
city_venues = getNearbyVenues(names=df_city['Hood'],
                                   latitudes=df_city['Latitude'],
                                   longitudes=df_city['Longitude']
                                  )

Aeroporto
Andorinhas
Antônio Honório
Ariovaldo Favalessa
Barro Vermelho
Bela Vista
Bento Ferreira
Boa Vista
Bonfim
Caratoíra
Centro
Comdusa
Conquista
Consolação
Cruzamento
Da Penha
De Lourdes
Do Cabral
Do Moscoso
Do Quadro
Enseada do Suá
Estrelinha
Fonte Grande
Forte São João
Fradinhos
Goiabeiras
Grande Vitória
Gurigica
Horto
Ilha das Caieiras
Ilha de Santa Maria
Ilha do Boi
Ilha do Frade
Ilha do Príncipe
Inhanguetá
Itararé
Jabour
Jardim Camburi
Jardim da Penha
Jesus de Nazareth
Joana d'Arc
Jucutuquara
Maria Ortiz
Maruípe
Mata da Praia
Monte Belo
Morada de Camburi
Mário Cypreste
Nazareth
Nova Palestina
Parque Industrial
Parque Moscoso
Piedade
Pontal de Camburi
Praia do Canto
Praia do Suá
Redenção
República
Resistência
Romão
Santa Cecília
Santa Clara
Santa Helena
Santa Luíza
Santa Lúcia
Santa Martha
Santa Tereza
Santo André
Santo Antônio
Santos Dumont
Santos Reis
Segurança do Lar
Solon Borges
São Benedito
São Cristóvão
São José
São Pedro
Tabuazeiro
Universitário
Vila Rubim


In [11]:
print(city_venues.shape)
city_venues.head()

(2400, 7)


Unnamed: 0,Hood,Hood Latitude,Hood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aeroporto,-20.257265,-40.282093,Carnielli Cafeteria e Delicatessen,-20.265536,-40.282909,Coffee Shop
1,Aeroporto,-20.257265,-40.282093,Aeroporto de Vitória / Eurico de Aguiar Salles...,-20.265284,-40.28276,Airport
2,Aeroporto,-20.257265,-40.282093,Wellness Club,-20.251276,-40.271994,Gym
3,Aeroporto,-20.257265,-40.282093,Cerimonial Le Buffet,-20.264039,-40.270844,Event Space
4,Aeroporto,-20.257265,-40.282093,Empório Auguri,-20.258609,-40.27043,Deli / Bodega


In [12]:
df_copy = city_venues.copy()
df_copy['City'] = pd.Series('Vitoria', index=city_venues.index)
df_copy.to_csv('Dataset/dataVT',index=False)

Let's check how many venues were returned for each neighborhood

In [46]:
city_venues.groupby('Hood').count()

Unnamed: 0_level_0,Hood Latitude,Hood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Hood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aeroporto,30,30,30,30,30,30
Andorinhas,30,30,30,30,30,30
Antônio Honório,30,30,30,30,30,30
Ariovaldo Favalessa,30,30,30,30,30,30
Barro Vermelho,30,30,30,30,30,30
...,...,...,...,...,...,...
São José,30,30,30,30,30,30
São Pedro,30,30,30,30,30,30
Tabuazeiro,30,30,30,30,30,30
Universitário,30,30,30,30,30,30


#### Let's find out how many unique categories can be curated from all the returned venues

In [47]:
print('There are {} uniques categories.'.format(len(city_venues['Venue Category'].unique())))

There are 117 uniques categories.


## 3. Analyze Each Neighborhood

In [48]:
# one hot encoding
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
city_onehot['Hood'] = city_venues['Hood'] 

# move neighborhood column to the first column
fixed_columns = [city_onehot.columns[-1]] + list(city_onehot.columns[:-1])
city_onehot = city_onehot[fixed_columns]

city_onehot.head()

Unnamed: 0,Hood,Acai House,Accessories Store,Airport,Aquarium,Argentinian Restaurant,BBQ Joint,Bakery,Bar,Beach,...,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Tapiocaria,Theater,Vegetarian / Vegan Restaurant,Warehouse Store,Waterfront,Wine Shop
0,Aeroporto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aeroporto,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aeroporto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Aeroporto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Aeroporto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [49]:
city_grouped = city_onehot.groupby('Hood').mean().reset_index()
city_grouped

Unnamed: 0,Hood,Acai House,Accessories Store,Airport,Aquarium,Argentinian Restaurant,BBQ Joint,Bakery,Bar,Beach,...,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Tapiocaria,Theater,Vegetarian / Vegan Restaurant,Warehouse Store,Waterfront,Wine Shop
0,Aeroporto,0.033333,0.0,0.033333,0.0,0.000000,0.033333,0.033333,0.100000,0.033333,...,0.000000,0.0,0.000000,0.033333,0.0,0.000000,0.000000,0.0,0.0,0.0
1,Andorinhas,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.033333,0.000000,...,0.000000,0.0,0.000000,0.033333,0.0,0.033333,0.033333,0.0,0.0,0.0
2,Antônio Honório,0.000000,0.0,0.033333,0.0,0.033333,0.066667,0.033333,0.000000,0.000000,...,0.000000,0.0,0.033333,0.000000,0.0,0.000000,0.033333,0.0,0.0,0.0
3,Ariovaldo Favalessa,0.000000,0.0,0.000000,0.0,0.000000,0.033333,0.033333,0.100000,0.000000,...,0.033333,0.0,0.000000,0.000000,0.0,0.066667,0.033333,0.0,0.0,0.0
4,Barro Vermelho,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.033333,0.033333,0.000000,...,0.000000,0.0,0.000000,0.033333,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,São José,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.033333,0.033333,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.033333,0.033333,0.0,0.0,0.0
76,São Pedro,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.066667,0.033333,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.033333,0.000000,0.0,0.0,0.0
77,Tabuazeiro,0.033333,0.0,0.000000,0.0,0.000000,0.000000,0.033333,0.066667,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
78,Universitário,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.066667,0.133333,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.066667,0.033333,0.0,0.0,0.0


#### Let's confirm the new size

In [50]:
city_grouped.shape

(80, 118)

#### Let's print each neighborhood along with the top 5 most common venues

In [51]:
num_top_venues = 5

for hood in city_grouped['Hood']:
    print("----"+hood+"----")
    temp = city_grouped[city_grouped['Hood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Aeroporto----
            venue  freq
0             Bar  0.10
1  Gymnastics Gym  0.07
2  Ice Cream Shop  0.07
3             Gym  0.07
4    Burger Joint  0.07


----Andorinhas----
                  venue  freq
0                  Park  0.07
1                   Gym  0.07
2  Gym / Fitness Center  0.07
3          Burger Joint  0.07
4              Dive Bar  0.07


----Antônio Honório----
                venue  freq
0           BBQ Joint  0.07
1         Coffee Shop  0.07
2            Dive Bar  0.03
3  Mexican Restaurant  0.03
4                Park  0.03


----Ariovaldo Favalessa----
                  venue  freq
0                   Bar  0.10
1                  Park  0.07
2  Brazilian Restaurant  0.07
3                Church  0.07
4               Theater  0.07


----Barro Vermelho----
               venue  freq
0     Ice Cream Shop  0.13
1        Coffee Shop  0.10
2                Gym  0.07
3  Health Food Store  0.07
4     Chocolate Shop  0.07


----Bela Vista----
                   venue 

                    venue  freq
0  Furniture / Home Store  0.10
1      Italian Restaurant  0.10
2    Brazilian Restaurant  0.07
3                  Bakery  0.07
4              Restaurant  0.07


----São Cristóvão----
               venue  freq
0               Park  0.07
1        Snack Place  0.07
2              Hotel  0.07
3                Gym  0.07
4  Martial Arts Dojo  0.07


----São José----
                venue  freq
0  Seafood Restaurant  0.10
1         Snack Place  0.07
2                 Gym  0.07
3                Park  0.07
4            Dive Bar  0.03


----São Pedro----
                venue  freq
0        Burger Joint  0.10
1  Seafood Restaurant  0.10
2              Bakery  0.07
3                Park  0.07
4                 Gym  0.03


----Tabuazeiro----
                  venue  freq
0                  Park  0.07
1           Snack Place  0.07
2  Gym / Fitness Center  0.07
3                   Gym  0.07
4            Restaurant  0.07


----Universitário----
      venue  freq
0   

#### Let's put that into a *pandas* dataframe
First, let's write a function to sort the venues in descending order.

In [52]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [53]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Hood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Hood'] = city_grouped['Hood']

for ind in np.arange(city_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(city_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Hood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aeroporto,Bar,Gymnastics Gym,Gym,Burger Joint,Ice Cream Shop,Acai House,Hot Dog Joint,Deli / Bodega,Coffee Shop,Cocktail Bar
1,Andorinhas,Dive Bar,Gym / Fitness Center,Park,Burger Joint,Gym,Health Food Store,Hotel,Farmers Market,Event Space,Pizza Place
2,Antônio Honório,BBQ Joint,Coffee Shop,Farmers Market,Mexican Restaurant,Rental Car Location,Hotel,Pizza Place,Café,Japanese Restaurant,Restaurant
3,Ariovaldo Favalessa,Bar,Brazilian Restaurant,Performing Arts Venue,Theater,Park,Church,Scenic Lookout,Plaza,Pedestrian Plaza,Government Building
4,Barro Vermelho,Ice Cream Shop,Coffee Shop,Gym / Fitness Center,Gym,Chocolate Shop,Health Food Store,Event Space,Park,Pizza Place,Pub


#### Let's confirm the new size
Run *k*-means to cluster the neighborhood into 5 clusters.

In [54]:
# import k-means from clustering stage
from sklearn.cluster import AgglomerativeClustering

# set number of clusters
kclusters = 10

city_grouped_clustering = city_grouped.drop('Hood', 1)

# run k-means clustering
kmeans = AgglomerativeClustering(n_clusters=kclusters).fit(city_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

array([6, 8, 4, 7, 1], dtype=int64)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [55]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

city_merged = df_city

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
city_merged = city_merged.join(neighborhoods_venues_sorted.set_index('Hood'), on='Hood', how='inner')

city_merged.head(6) # check the last columns!

Unnamed: 0,Hood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aeroporto,-20.2573,-40.2821,6,Bar,Gymnastics Gym,Gym,Burger Joint,Ice Cream Shop,Acai House,Hot Dog Joint,Deli / Bodega,Coffee Shop,Cocktail Bar
1,Andorinhas,-20.2876,-40.3058,8,Dive Bar,Gym / Fitness Center,Park,Burger Joint,Gym,Health Food Store,Hotel,Farmers Market,Event Space,Pizza Place
2,Antônio Honório,-20.2627,-40.2981,4,BBQ Joint,Coffee Shop,Farmers Market,Mexican Restaurant,Rental Car Location,Hotel,Pizza Place,Café,Japanese Restaurant,Restaurant
3,Ariovaldo Favalessa,-20.3143,-40.3565,7,Bar,Brazilian Restaurant,Performing Arts Venue,Theater,Park,Church,Scenic Lookout,Plaza,Pedestrian Plaza,Government Building
4,Barro Vermelho,-20.2951,-40.2975,1,Ice Cream Shop,Coffee Shop,Gym / Fitness Center,Gym,Chocolate Shop,Health Food Store,Event Space,Park,Pizza Place,Pub
5,Bela Vista,-20.305,-40.3481,7,Bar,Music Venue,Theater,Farmers Market,Park,Performing Arts Venue,Church,Government Building,Scenic Lookout,Museum


In [56]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(city_merged['Latitude'], city_merged['Longitude'], city_merged['Hood'], city_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters