# Obtain Data by Web Scraping and Data Cleaning

In [26]:
import pandas as pd
import requests
from bs4 import BeautifulSoup # analizar datos estructurados e interactuar con html

In [35]:
# Extract the required data from the web page and apply the cleaning steps explained in the file Obtain_Data_Webscraping.ipynb
def Obtain_Data_Webscraping():
    """Scrape the Wikipedia "postal codes of Canada: M" table into a DataFrame.

    Every ``<td>`` found under the first ``<tbody>`` of the page is read: the
    text of its ``<b>`` tag is taken as the postal code and the text of its
    ``<span>`` tag as the combined "Borough(Neighborhood ...)" label. Cells
    whose label is exactly ``'Not assigned'`` are discarded. The label is then
    split on parentheses into a borough and a neighborhood string, with ``/``
    separators turned into commas.

    Returns
    -------
    pandas.DataFrame
        Columns ``Postal Code``, ``Borough`` and ``Neighborhood``. The index is
        not reset after the unassigned cells are dropped, so it keeps gaps.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'  # page holding the table
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and raise_for_status() surfaces HTTP errors here instead of as an
    # AttributeError further down when the expected tags are missing.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")  # parsed tree for tag lookups
    tbody = soup.find("tbody")

    PC = []
    Borough = []
    # Walk rows and cells once each; find_all() rescans the subtree on every
    # call, so it must not be repeated inside the loops.
    for row in tbody.find_all('tr'):
        for cell in row.find_all('td'):
            label = cell.find('span').text
            # Unassigned codes become None so dropna() below removes them.
            Borough.append(None if label == 'Not assigned' else label)
            PC.append(cell.find('b').text)

    Toronto_City = pd.DataFrame({'Postal Code': PC, 'Borough and Neighborhood': Borough})
    Toronto_City = Toronto_City.dropna()  # drop the unassigned postal codes

    # Split "Borough(Neighborhood)suffix(extra)" on either parenthesis:
    # column 0 = borough, 1 = neighborhood list, 2 = text after the first ')',
    # 3 = content of a second parenthesised group (missing for most rows).
    parts = Toronto_City["Borough and Neighborhood"].str.split('[()]', expand=True)
    # Column 2 carries text that belongs to the neighborhood, so append it.
    parts[1] = parts[1].str.cat(parts[2], sep=' ')
    # Literal replacement: "/" separates neighborhoods in the page text.
    parts[1] = parts[1].str.replace('/', ',', regex=False)
    # Append the second group; rows without one get the placeholder "-" ...
    parts[1] = parts[1].str.cat(parts[3], sep=',', na_rep='-')
    # ... which is removed again together with its separator.
    parts[1] = parts[1].str.replace(',-', '', regex=False)

    # Keep only the borough and neighborhood columns under readable names.
    Borough_and_Neighborhood = parts[[0, 1]].rename(columns={0: "Borough", 1: "Neighborhood"})
    # Both pieces share Toronto_City's index, so concat aligns them row by row.
    Dataset = pd.concat([Toronto_City['Postal Code'], Borough_and_Neighborhood], axis=1)
    return Dataset

In [37]:
# Run the scraper/cleaner and preview the first rows of the resulting table.
Dataset = Obtain_Data_Webscraping() # Result
Dataset.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


# Join Data 

In [38]:
# Read the geospatial-coordinates CSV into a DataFrame and preview it;
# the preview shows Postal Code, Latitude and Longitude columns.
Geospatial_Cordinates=pd.read_csv('https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv')
Geospatial_Cordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [39]:
# Attach the coordinate columns to Dataset, matching rows on 'Postal Code'.
# join() keeps Dataset's own row index, so the gaps left by the earlier
# dropna() are still visible in the output.
# NOTE: this cell rebinds Dataset; running it a second time raises because the
# coordinate columns would then overlap and no suffixes are given.
Dataset = Dataset.join(Geospatial_Cordinates.set_index('Postal Code'),on='Postal Code')
Dataset

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
6,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509


# Rename Values of a Column

In [40]:
# Number of rows per distinct Borough label, to spot labels that need merging
Dataset.groupby('Borough').size()

Borough
Central Toronto                                                  9
Downtown Toronto                                                17
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
East Toronto                                                     4
East TorontoBusiness reply mail Processing Centre969 Eastern     1
East York                                                        4
East YorkEast Toronto                                            1
Etobicoke                                                       11
EtobicokeNorthwest                                               1
MississaugaCanada Post Gateway Processing Centre                 1
North York                                                      24
Queen's Park                                                     1
Scarborough                                                     17
West Toronto                                                     6
York                                                             5
dtype: int64

In [41]:
# Remove the rows labelled with the Mississauga processing-centre entry
indexdrop = Dataset.loc[
    Dataset['Borough'] == 'MississaugaCanada Post Gateway Processing Centre'
].index
Dataset.drop(index=indexdrop, inplace=True)

# Collapse the remaining Borough labels onto six borough names
Dataset.replace(
    to_replace={
        'Borough': {
            'Central Toronto': 'Downtown Toronto',
            'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto',
            "Queen's Park": "Downtown Toronto",
            'East Toronto': 'East York',
            'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East York',
            'East YorkEast Toronto': 'East York',
            'EtobicokeNorthwest': 'Etobicoke',
            'West Toronto': 'York',
        }
    },
    inplace=True,
)

# Renumber the rows from 0, discarding the old gapped index
Dataset.reset_index(drop=True, inplace=True)

# (rows, columns) of the cleaned frame
Dataset.shape

(102, 5)

In [42]:
# Number of rows per Borough after merging the labels, largest group first
Dataset.groupby('Borough').size().sort_values(ascending=False)

Borough
Downtown Toronto    28
North York          24
Scarborough         17
Etobicoke           12
York                11
East York           10
dtype: int64