# Clasificación de vecindarios por comunas en Chile

## Importamos las librerías

In [154]:
import pandas as pd
import numpy as np
import requests
import os
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 


## Obtenemos los datos de wikipedia haciendo scraping

In [155]:
page = "https://es.wikipedia.org/wiki/Anexo:Comunas_de_Chile"
soup = BeautifulSoup(requests.get(page).text, "lxml")


In [156]:
table = soup.find("table",{"class": "wikitable sortable"})
column_names=['CUT','Comuna','Emblema', "Provincia",'Region','Superficie','Poblacion', "Densidad",'idh1','idh2','Latitud', "Longitud"]
df = pd.DataFrame(columns=column_names)

for tr in table.find_all('tr'):
    row=[]
    for td in tr.find_all('td'):
        row.append(td.text.strip())
    if len(row)==12:
        df.loc[len(df)] = row

## Limpiamos los datos

In [157]:
df = df.drop(columns=["Emblema","idh1","idh2", "Poblacion", "Provincia", "Densidad", "CUT"])
df.head()

Unnamed: 0,Comuna,Region,Superficie,Latitud,Longitud
0,Arica,Arica y Parinacota,4799.4,"-18°27'18""","-70°17'24"""
1,Camarones,Arica y Parinacota,3927.0,"-19°1'1.2""","-69°52'1.2"""
2,Putre,Arica y Parinacota,5902.5,"-18°12'0""","-69°34'58.8"""
3,General Lagos,Arica y Parinacota,2244.4,"-17°39'10.8""","-69°38'6"""
4,Iquique,Tarapacá,2242.1,"-20°14'38.4""","-70°8'20.4"""


In [158]:
df = df[df["Region"] == "Metropolitana de Santiago"]
df.head(15)

Unnamed: 0,Comuna,Region,Superficie,Latitud,Longitud
294,Santiago,Metropolitana de Santiago,23.2,"-33°26'14""","-70°39'26"""
295,Cerrillos,Metropolitana de Santiago,21.0,"-33°30'0""","-70°43'0"""
296,Cerro Navia,Metropolitana de Santiago,11.0,"-33°25'19.2""","-70°44'6"""
297,Conchalí,Metropolitana de Santiago,10.7,"-33°22'48""","-70°40'30"""
298,El Bosque,Metropolitana de Santiago,14.2,"-33°34'1.2""","-70°40'30"""
299,Estación Central,Metropolitana de Santiago,15.0,"-33°27'32.4""","-70°41'56.4"""
300,Huechuraba,Metropolitana de Santiago,44.8,"-33°22'4.8""","-70°38'2.4"""
301,Independencia,Metropolitana de Santiago,7.0,"-33°24'46.8""","-70°39'57.6"""
302,La Cisterna,Metropolitana de Santiago,10.0,"-33°31'44.4""","-70°39'46.8"""
303,La Florida,Metropolitana de Santiago,70.2,"-33°31'30""","-70°32'16.8"""


## Arreglamos el formato de las coordenadas

In [159]:
def dms2dd(degrees, minutes, seconds):
    if float(degrees) < 0:
        dd = float(degrees) - float(minutes)/60 - float(seconds)/(60*60)
    else:
        dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60)
    return dd

def parse_dms(dms):
    parts = dms.replace("°", "'").replace('"', "").split("'")
    lat = dms2dd(parts[0], parts[1], parts[2])

    return (lat)

In [160]:
df["Latitud"] = df["Latitud"].apply(parse_dms)
df["Longitud"] = df["Longitud"].apply(parse_dms)

In [161]:
df.head(30)

Unnamed: 0,Comuna,Region,Superficie,Latitud,Longitud
294,Santiago,Metropolitana de Santiago,23.2,-33.437222,-70.657222
295,Cerrillos,Metropolitana de Santiago,21.0,-33.5,-70.716667
296,Cerro Navia,Metropolitana de Santiago,11.0,-33.422,-70.735
297,Conchalí,Metropolitana de Santiago,10.7,-33.38,-70.675
298,El Bosque,Metropolitana de Santiago,14.2,-33.567,-70.675
299,Estación Central,Metropolitana de Santiago,15.0,-33.459,-70.699
300,Huechuraba,Metropolitana de Santiago,44.8,-33.368,-70.634
301,Independencia,Metropolitana de Santiago,7.0,-33.413,-70.666
302,La Cisterna,Metropolitana de Santiago,10.0,-33.529,-70.663
303,La Florida,Metropolitana de Santiago,70.2,-33.525,-70.538


In [162]:
CLIENT_ID = 'UJX0HF0KI0FCRJNZ10MKPGWOYATKIJH0QLG4KQFGMBHN0LA4'
CLIENT_SECRET = 'JRZQRGXVK0VZS3B24UEBKKY4ZTJWLGE1RPCWRZJMKEKHDIXK'
VERSION = '20180604'

In [163]:
def getNearbyVenues(names, latitudes, longitudes):
    radius=500
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Comuna', 
                  'Comuna Latitude', 
                  'Comuna Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Ocupamos la API enseñada para sacar los lugares cercanos a cada comuna

In [164]:
santiago_venues = getNearbyVenues(names=df['Comuna'],
                                   latitudes=df['Latitud'],
                                   longitudes=df['Longitud']
                                  )

Santiago
Cerrillos
Cerro Navia
Conchalí
El Bosque
Estación Central
Huechuraba
Independencia
La Cisterna
La Florida
La Granja
La Pintana
La Reina
Las Condes
Lo Barnechea
Lo Espejo
Lo Prado
Macul
Maipú
Ñuñoa
Pedro Aguirre Cerda
Peñalolén
Providencia
Pudahuel
Quilicura
Quinta Normal
Recoleta
Renca
San Joaquín
San Miguel
San Ramón
Vitacura
Puente Alto
Pirque
San José de Maipo
Colina
Lampa
Til Til
San Bernardo
Buin
Calera de Tango
Paine
Melipilla
Alhué
Curacaví
María Pinto
San Pedro
Talagante
El Monte
Isla de Maipo
Padre Hurtado
Peñaflor


In [165]:
santiago_venues.head(600)

Unnamed: 0,Comuna,Comuna Latitude,Comuna Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Santiago,-33.437222,-70.657222,Plaza de Bolsillo - Santiago Centro,-33.436778,-70.655481,Plaza
1,Santiago,-33.437222,-70.657222,Starbucks,-33.437938,-70.657007,Coffee Shop
2,Santiago,-33.437222,-70.657222,YMCA,-33.439060,-70.656257,Pool
3,Santiago,-33.437222,-70.657222,Caffe Mauro,-33.437763,-70.655304,Coffee Shop
4,Santiago,-33.437222,-70.657222,Bambudda,-33.438987,-70.655631,Asian Restaurant
...,...,...,...,...,...,...,...
536,Talagante,-33.667000,-70.931000,Farmacia Ahumada Centro,-33.663766,-70.928165,Pharmacy
537,El Monte,-33.684000,-71.017000,La Pepita,-33.685815,-71.015808,Latin American Restaurant
538,El Monte,-33.684000,-71.017000,Restaurant David,-33.685612,-71.015385,Restaurant
539,Isla de Maipo,-33.754000,-70.886000,Procecion de las mujeres,-33.754366,-70.885653,Moving Target


In [166]:
santiago_venues.groupby('Comuna').count()

Unnamed: 0_level_0,Comuna Latitude,Comuna Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Comuna,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alhué,6,6,6,6,6,6
Buin,10,10,10,10,10,10
Calera de Tango,4,4,4,4,4,4
Cerrillos,4,4,4,4,4,4
Cerro Navia,4,4,4,4,4,4
Colina,4,4,4,4,4,4
Conchalí,4,4,4,4,4,4
El Bosque,2,2,2,2,2,2
El Monte,2,2,2,2,2,2
Estación Central,7,7,7,7,7,7


## Arreglamos los datos categóricos

In [167]:
santiago_onehot = pd.get_dummies(santiago_venues[['Venue Category']], prefix="", prefix_sep="")
santiago_onehot.drop(['Comuna'],axis=1,inplace=True) 
santiago_onehot.insert(loc=0, column='Comuna', value=santiago_venues['Comuna'] )
santiago_onehot.shape

KeyError: "['Comuna'] not found in axis"

In [168]:
santiago_grouped = santiago_onehot.groupby('Comuna').mean().reset_index()
santiago_grouped.head()

KeyError: 'Comuna'

In [169]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Calculamos los lugares más concurridos

In [170]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Comuna']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Comuna'] = toronto_grouped['Comuna']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

NameError: name 'toronto_grouped' is not defined

## Hacemos Clusters de los resultados

In [171]:

kclusters = 10
santiago_grouped_clustering = santiago_grouped.drop('Comuna', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(santiago_grouped_clustering)

kmeans.labels_[0:10]

NameError: name 'santiago_grouped' is not defined

In [173]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

santiago_merged = santiago_data

santiago_merged = santiago_merged.join(neighborhoods_venues_sorted.set_index('Comuna'), on='Neighborhood')

santiago_merged.head()

NameError: name 'kmeans' is not defined