# Web Scraping

In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup # analizar datos estructurados e interactuar con html

In [10]:
# Function that extracts the required data from the web page
def ExtraerData(url, timeout=30):
    """Download a page and turn its first table body into a DataFrame.

    Every ``<td>`` cell inside the first ``<tbody>`` of the page is expected
    to hold the postal code in a ``<b>`` tag and the combined
    "Borough(Neighborhood)" text in a ``<span>`` tag.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per table cell with the columns ``'Postal Code'`` and
        ``'Borough and Neighborhood'``. The latter is ``None`` when the cell
        text is exactly ``'Not assigned'`` or when the cell has no ``<span>``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the page contains no ``<tbody>`` element.
    """
    page = requests.get(url, timeout=timeout)  # fetch the raw HTML
    page.raise_for_status()  # do not parse an error page as if it were data
    soup = BeautifulSoup(page.content, "html.parser")  # parsed tree, easier to search

    tbody = soup.find("tbody")
    if tbody is None:
        raise ValueError("No <tbody> element found at {!r}".format(url))

    PC = []
    Borough = []

    # Walk the rows and cells once instead of re-running find_all() on every
    # loop iteration, which re-scanned the whole table each time.
    for row in tbody.find_all("tr"):
        for cell in row.find_all("td"):
            code_tag = cell.find("b")
            if code_tag is None:
                # A cell without a postal code carries nothing to record.
                continue

            span_tag = cell.find("span")
            if span_tag is None or span_tag.text == 'Not assigned':
                borough_text = None  # unassigned codes are dropped later via dropna()
            else:
                borough_text = span_tag.text

            Borough.append(borough_text)
            PC.append(code_tag.text)

    Toronto_City = pd.DataFrame({'Postal Code': PC,'Borough and Neighborhood':Borough})
    return Toronto_City

In [11]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # URL of the web page to scrape
Toronto_City = ExtraerData(url) # DataFrame returned by ExtraerData for that page
Toronto_City.info() # Summary of the dataset (columns, non-null counts, dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Postal Code               180 non-null    object
 1   Borough and Neighborhood  103 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


## Limpieza de datos

In [13]:
# Discard every row that contains a missing value, then inspect what remains.
Toronto_City = Toronto_City.dropna()
Toronto_City.info()
Toronto_City.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Postal Code               103 non-null    object
 1   Borough and Neighborhood  103 non-null    object
dtypes: object(2)
memory usage: 2.4+ KB


Unnamed: 0,Postal Code,Borough and Neighborhood
2,M3A,North York(Parkwoods)
3,M4A,North York(Victoria Village)
4,M5A,Downtown Toronto(Regent Park / Harbourfront)
5,M6A,North York(Lawrence Manor / Lawrence Heights)
6,M7A,Queen's Park(Ontario Provincial Government)


In [14]:
# Split the combined column on either parenthesis character, "(" or ")".
# expand=True returns the pieces as separate DataFrame columns (0, 1, 2, ...).
Borough_and_Neighborhood = Toronto_City["Borough and Neighborhood"].str.split('[()]',expand=True)
Borough_and_Neighborhood.head()

Unnamed: 0,0,1,2,3,4
2,North York,Parkwoods,,,
3,North York,Victoria Village,,,
4,Downtown Toronto,Regent Park / Harbourfront,,,
5,North York,Lawrence Manor / Lawrence Heights,,,
6,Queen's Park,Ontario Provincial Government,,,


In [17]:
# Merge column 2 into column 1 (joined by a single space), because column 2
# holds information that belongs with column 1.
# NOTE(review): where column 2 is an empty string the result presumably ends
# with a trailing space — confirm and strip if needed.
Borough_and_Neighborhood[1] = Borough_and_Neighborhood[1].str.cat(Borough_and_Neighborhood[2],sep =' ')

In [18]:
# Remove columns 2 and 4, then turn every "/" in column 1 into a comma.
Borough_and_Neighborhood = Borough_and_Neighborhood.drop(columns=[2, 4])
Borough_and_Neighborhood[1] = Borough_and_Neighborhood[1].str.replace('/', ',')
Borough_and_Neighborhood

Unnamed: 0,0,1,3
2,North York,Parkwoods,
3,North York,Victoria Village,
4,Downtown Toronto,"Regent Park , Harbourfront",
5,North York,"Lawrence Manor , Lawrence Heights",
6,Queen's Park,Ontario Provincial Government,
...,...,...,...
160,Etobicoke,"The Kingsway , Montgomery Road , Old Mill Nort...",
165,Downtown Toronto,Church and Wellesley,
168,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,
169,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",


In [19]:
# Append column 3 to column 1, separated by a comma; missing values are
# rendered as '-' (na_rep) so the concatenation does not produce NaN.
Borough_and_Neighborhood[1] = Borough_and_Neighborhood[1].str.cat(Borough_and_Neighborhood[3],sep =',',na_rep='-')

In [20]:
# Final clean-up of column 1, then drop column 3:
#   * remove the ',-' placeholder that was appended where column 3 was missing;
#     regex=False makes this a literal match regardless of the pandas default,
#   * strip surrounding whitespace, since joining column 1 with an empty
#     column 2 using a space separator leaves values such as 'Parkwoods '.
Borough_and_Neighborhood[1] = (
    Borough_and_Neighborhood[1]
    .str.replace(',-', '', regex=False)
    .str.strip()
)
Borough_and_Neighborhood = Borough_and_Neighborhood.drop(columns=[3])
Borough_and_Neighborhood.head()

Unnamed: 0,0,1
2,North York,Parkwoods
3,North York,Victoria Village
4,Downtown Toronto,"Regent Park , Harbourfront"
5,North York,"Lawrence Manor , Lawrence Heights"
6,Queen's Park,Ontario Provincial Government


In [21]:
# Give the two remaining positional columns descriptive names.
Borough_and_Neighborhood = Borough_and_Neighborhood.rename(
    columns={0: "Borough", 1: "Neighborhood"}
)
Borough_and_Neighborhood.head()

Unnamed: 0,Borough,Neighborhood
2,North York,Parkwoods
3,North York,Victoria Village
4,Downtown Toronto,"Regent Park , Harbourfront"
5,North York,"Lawrence Manor , Lawrence Heights"
6,Queen's Park,Ontario Provincial Government


In [22]:
# Put the postal codes side by side with the borough/neighborhood columns;
# the pieces are aligned on their shared index.
Dataset = pd.concat(
    [Toronto_City['Postal Code'], Borough_and_Neighborhood],
    axis="columns",
)
Dataset.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


In [23]:
# Summary of the final dataset: column names, non-null counts and dtypes.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 3.2+ KB
