In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from geopy.geocoders import Nominatim
import folium

In [2]:
# Read the four per-city venue tables. The file suffixes (SP, RJ, VT, BH)
# presumably identify the four cities analysed below -- confirm against the
# 'City' column of each file.
venue_files = ['Dataset/dataSP', 'Dataset/dataRJ', 'Dataset/dataVT', 'Dataset/dataBH']
df_sp, df_rj, df_vt, df_bh = (pd.read_csv(path) for path in venue_files)

In [3]:
# Stack the per-city venue tables into one long table (original row labels
# are kept as-is, so the index is not unique across cities).
city_venues = pd.concat([df_sp, df_rj, df_vt, df_bh])

# One row per distinct (City, Hood, latitude, longitude) combination.
df_city = (
    city_venues[['City', 'Hood', 'Hood Latitude', 'Hood Longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_city

Unnamed: 0,City,Hood,Hood Latitude,Hood Longitude
0,Sao Paulo,Grajaú,-23.786248,-46.664993
1,Sao Paulo,Sapopemba,-23.604327,-46.509885
2,Sao Paulo,Jardim Ângela,-23.712528,-46.768720
3,Sao Paulo,Brasilândia,-23.448272,-46.690269
4,Sao Paulo,Capão Redondo,-23.671903,-46.779435
...,...,...,...,...
693,Belo Horizonte,Vista Alegre,-19.955005,-43.999021
694,Belo Horizonte,Vitória,-19.862021,-43.886497
695,Belo Horizonte,Vitória Conquista - Barreiro,-20.014632,-44.023549
696,Belo Horizonte,Washington Pires,-19.989536,-44.059377


## 3. Analyze Each Neighborhood

In [4]:
# One-hot encode the venue category: one indicator column per category,
# named exactly after the category (no prefix).
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the city and neighborhood labels; they are appended on the right.
city_onehot['City'] = city_venues['City']
city_onehot['Hood'] = city_venues['Hood']

# Rotate the right-most column ('Hood', added last) to the front.
# 'City' stays where it was, i.e. it ends up as the last column.
*other_columns, last_column = city_onehot.columns
city_onehot = city_onehot[[last_column] + other_columns]

city_onehot.head()

Unnamed: 0,Hood,Acai House,Accessories Store,Adult Boutique,Airport,Airport Lounge,Airport Service,American Restaurant,Amphitheater,Antique Shop,...,Water Park,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio,Zoo,City
0,Grajaú,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sao Paulo
1,Grajaú,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sao Paulo
2,Grajaú,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sao Paulo
3,Grajaú,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sao Paulo
4,Grajaú,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sao Paulo


#### Next, let's group the rows by neighborhood and take the mean frequency of occurrence of each venue category

In [5]:
# Average every one-hot indicator within each (Hood, City) pair: the result is
# the share of the neighborhood's venues that fall in each category.
hood_keys = ['Hood', 'City']
venues_by_hood = city_onehot.groupby(hood_keys)
city_grouped = venues_by_hood.mean().reset_index()
city_grouped

Unnamed: 0,Hood,City,Acai House,Accessories Store,Adult Boutique,Airport,Airport Lounge,Airport Service,American Restaurant,Amphitheater,...,Warehouse Store,Water Park,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio,Zoo
0,Aarão Reis,Belo Horizonte,0.033333,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abolição,Rio de Janeiro,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acaiaca,Belo Horizonte,0.066667,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Acari,Rio de Janeiro,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Adelaide,Belo Horizonte,0.033333,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693,Água Santa,Rio de Janeiro,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
694,Álvaro Camargos,Belo Horizonte,0.033333,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
695,Átila Paiva - Barreiro,Belo Horizonte,0.066667,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
696,Átila de Paiva,Belo Horizonte,0.066667,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's put that into a *pandas* dataframe
First, let's write a function to sort the venues in descending order.

In [6]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of the grouped frame. The first two entries are skipped
        (in ``city_grouped`` these are the 'Hood' and 'City' labels); the
        remaining entries are ranked by value.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, largest value first, truncated
        to ``num_top_venues`` items (fewer if the row has fewer entries).
    """
    # Positions 0 and 1 are label columns, not frequencies.
    venue_frequencies = row.iloc[2:]
    ranked_categories = venue_frequencies.sort_values(ascending=False).index

    return ranked_categories.values[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [7]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every other rank built below gets 'th'
# (correct for the ranks used here -- ranks such as 21 would need extra rules).
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# the two key columns followed by one column per rank
columns = ['Hood', 'City']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also
    # swallow unrelated errors (typos, KeyboardInterrupt, ...).
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per row of city_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Hood'] = city_grouped['Hood']
neighborhoods_venues_sorted['City'] = city_grouped['City']

# fill the rank columns (everything after 'Hood' and 'City') row by row
for ind in np.arange(city_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(city_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Hood,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aarão Reis,Belo Horizonte,Bar,Ice Cream Shop,Gym / Fitness Center,Pizza Place,Park,Acai House,Food Truck,Gymnastics Gym,Bakery,Gym
1,Abolição,Rio de Janeiro,Food Truck,Bar,Plaza,BBQ Joint,Soccer Stadium,Gym,Bakery,Brazilian Restaurant,Portuguese Restaurant,Restaurant
2,Acaiaca,Belo Horizonte,Pizza Place,Bakery,Gymnastics Gym,Ice Cream Shop,Acai House,Gym / Fitness Center,Bar,BBQ Joint,Snack Place,Athletics & Sports
3,Acari,Rio de Janeiro,Bar,Snack Place,Bakery,Japanese Restaurant,Ice Cream Shop,Bistro,Pizza Place,Restaurant,Pub,Gym
4,Adelaide,Belo Horizonte,Brazilian Restaurant,Bar,Restaurant,Burger Joint,Bakery,Acai House,BBQ Joint,Middle Eastern Restaurant,Gymnastics Gym,Gym / Fitness Center


#### Cluster the neighborhoods
Run agglomerative (hierarchical) clustering to group the neighborhoods into 10 clusters.

In [8]:
# import the clustering estimator (agglomerative clustering, not k-means)
from sklearn.cluster import AgglomerativeClustering

# set number of clusters
kclusters = 10

# Keep only the venue-frequency columns as features.
# `columns=` is used because passing the axis positionally
# (`drop(labels, 1)`) raises TypeError on pandas >= 2.0.
city_grouped_clustering = city_grouped.drop(columns=['Hood', 'City'])

# run agglomerative clustering with the estimator's default settings;
# the variable keeps the name `kmeans` because a later cell reads
# `kmeans.labels_`
kmeans = AgglomerativeClustering(n_clusters=kclusters).fit(city_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:5]

array([1, 0, 1, 0, 0], dtype=int64)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [9]:
# add clustering labels (one label per row, in the row order of city_grouped,
# which neighborhoods_venues_sorted was built from)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    # Cell is being re-run: overwrite the existing column, because
    # DataFrame.insert raises ValueError when the column already exists.
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge the ranked-venue table with df_city to add latitude/longitude for each neighborhood
city_merged = pd.merge(neighborhoods_venues_sorted, df_city, how='inner', on=['Hood', 'City'])

city_merged.head(6) # check the last columns!

Unnamed: 0,Cluster Labels,Hood,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Hood Latitude,Hood Longitude
0,1,Aarão Reis,Belo Horizonte,Bar,Ice Cream Shop,Gym / Fitness Center,Pizza Place,Park,Acai House,Food Truck,Gymnastics Gym,Bakery,Gym,-19.847221,-43.919508
1,0,Abolição,Rio de Janeiro,Food Truck,Bar,Plaza,BBQ Joint,Soccer Stadium,Gym,Bakery,Brazilian Restaurant,Portuguese Restaurant,Restaurant,-22.886161,-43.299846
2,1,Acaiaca,Belo Horizonte,Pizza Place,Bakery,Gymnastics Gym,Ice Cream Shop,Acai House,Gym / Fitness Center,Bar,BBQ Joint,Snack Place,Athletics & Sports,-19.844662,-43.894791
3,0,Acari,Rio de Janeiro,Bar,Snack Place,Bakery,Japanese Restaurant,Ice Cream Shop,Bistro,Pizza Place,Restaurant,Pub,Gym,-22.822153,-43.340674
4,0,Adelaide,Belo Horizonte,Brazilian Restaurant,Bar,Restaurant,Burger Joint,Bakery,Acai House,BBQ Joint,Middle Eastern Restaurant,Gymnastics Gym,Gym / Fitness Center,-19.905134,-43.974987
5,0,Aeroporto,Belo Horizonte,Gym / Fitness Center,Acai House,Pet Store,Bar,Burger Joint,Bakery,Pizza Place,Plaza,Sandwich Place,Clothing Store,-19.851905,-43.947739


In [10]:
# Sanity check: (rows, columns) of the merged table built in the previous cell.
city_merged.shape

(698, 15)

In [11]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map (fixed centre and zoom level chosen by the author)
map_clusters = folium.Map(location=[-23.461110, -46.213023], zoom_start=6)

# set color scheme for the clusters: one hex colour per cluster label.
# The palette size comes from the labels actually present in city_merged
# rather than from the global `kclusters`, which a later cell rebinds to 2;
# re-running this cell after that would otherwise raise IndexError.
n_cluster_colors = int(city_merged['Cluster Labels'].max()) + 1
colors_array = cm.rainbow(np.linspace(0, 1, n_cluster_colors))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(city_merged['Hood Latitude'], city_merged['Hood Longitude'], city_merged['Hood'], city_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly; the previous
    # `cluster-1` sent label 0 to index -1 (the last colour) by accident.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Part 2: Compare the cities by their cluster composition

In [12]:
# Treat the cluster label as a categorical value so it can be one-hot encoded.
df = city_merged.astype({'Cluster Labels': 'category'})

# One indicator column per cluster label (named 'Cluster Labels0',
# 'Cluster Labels1', ...), with the city and neighborhood labels
# re-attached on the right.
cluster_indicators = pd.get_dummies(df[['Cluster Labels']], prefix_sep="")
df_one_hot = cluster_indicators.assign(City=df['City'], Hood=df['Hood'])

df_one_hot

Unnamed: 0,Cluster Labels0,Cluster Labels1,Cluster Labels2,Cluster Labels3,Cluster Labels4,Cluster Labels5,Cluster Labels6,Cluster Labels7,Cluster Labels8,Cluster Labels9,City,Hood
0,0,1,0,0,0,0,0,0,0,0,Belo Horizonte,Aarão Reis
1,1,0,0,0,0,0,0,0,0,0,Rio de Janeiro,Abolição
2,0,1,0,0,0,0,0,0,0,0,Belo Horizonte,Acaiaca
3,1,0,0,0,0,0,0,0,0,0,Rio de Janeiro,Acari
4,1,0,0,0,0,0,0,0,0,0,Belo Horizonte,Adelaide
...,...,...,...,...,...,...,...,...,...,...,...,...
693,1,0,0,0,0,0,0,0,0,0,Rio de Janeiro,Água Santa
694,1,0,0,0,0,0,0,0,0,0,Belo Horizonte,Álvaro Camargos
695,0,1,0,0,0,0,0,0,0,0,Belo Horizonte,Átila Paiva - Barreiro
696,0,1,0,0,0,0,0,0,0,0,Belo Horizonte,Átila de Paiva


In [13]:
# Share of each city's neighborhoods that fall in each cluster.
# 'Hood' is a text column and must be dropped before averaging: pandas >= 2.0
# raises TypeError on non-numeric columns in groupby().mean(), and older
# versions only dropped it silently.
# NOTE: this rebinds `city_grouped`, which earlier held the per-neighborhood
# venue frequencies; re-run the notebook from the top before revisiting the
# neighborhood-level cells.
city_grouped = df_one_hot.drop(columns=['Hood']).groupby(['City']).mean().reset_index()
city_grouped

Unnamed: 0,City,Cluster Labels0,Cluster Labels1,Cluster Labels2,Cluster Labels3,Cluster Labels4,Cluster Labels5,Cluster Labels6,Cluster Labels7,Cluster Labels8,Cluster Labels9
0,Belo Horizonte,0.240997,0.202216,0.188366,0.085873,0.024931,0.0,0.091413,0.119114,0.0,0.047091
1,Rio de Janeiro,0.42236,0.0,0.0,0.192547,0.099379,0.0,0.043478,0.0,0.111801,0.130435
2,Sao Paulo,0.15625,0.03125,0.0,0.21875,0.0625,0.0,0.0,0.010417,0.0,0.520833
3,Vitoria,0.0,0.0,0.0,0.2625,0.425,0.3125,0.0,0.0,0.0,0.0


In [14]:
# import the clustering estimator (agglomerative clustering, not k-means)
from sklearn.cluster import AgglomerativeClustering

# set number of clusters
kclusters = 2

# Keep only the per-cluster share columns as features.
# `columns=` is used because passing the axis positionally
# (`drop(labels, 1)`) raises TypeError on pandas >= 2.0.
city_grouped_clustering = city_grouped.drop(columns=['City'])

# run agglomerative clustering with the estimator's default settings
# (the variable name `kmeans` is kept from the earlier clustering cell)
kmeans = AgglomerativeClustering(n_clusters=kclusters).fit(city_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

array([0, 0, 0, 1], dtype=int64)