# Web scraping to obtain neighbourhood household-income data (Source: CMHC)

## Vancouver

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the household-income-by-neighbourhood table for Vancouver from the
# CMHC Housing Market Information Portal, tidy it into a DataFrame and save
# it as a CSV file.

# URL of the target webpage (GeographyId=2410 is the value used for Vancouver)
url = 'https://www03.cmhc-schl.gc.ca/hmip-pimh/en/TableMapChart/TableMatchingCriteria?GeographyType=MetropolitanMajorArea&GeographyId=2410&CategoryLevel1=Population%2C%20Households%20and%20Housing%20Stock&CategoryLevel2=Household%20Income&ColumnField=HouseholdIncomeRange&RowField=Neighbourhood&SearchTags%5B0%5D.Key=Households&SearchTags%5B0%5D.Value=Number&SearchTags%5B1%5D.Key=Statistics&SearchTags%5B1%5D.Value=Ranges'

# Destination file; defined once so the saved path and the printed message
# can never drift apart.
output_path = 'vancouver_neighborhood_data_filtered.csv'

# Final column names applied to the filtered table, in positional order.
column_names = [
    'Neighborhood',
    'less than $20,000',
    'Income $20,000 - $39,999',
    'Income $40,000 - $59,999',
    'Income $60,000 - $79,999',
    'Income $80,000 - $99,999',
    'Income $100,000 and Over',
    'Total',
]

# Make a GET request to the webpage.
# - timeout: without it requests waits forever on an unresponsive server.
# - raise_for_status: stop on 4xx/5xx instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find the (first) table in the webpage; fail with a clear message if absent.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found in the response from {url}")

# Extract headers: every <th> text in the table, in document order.
headers = [header.text.strip() for header in table.find_all('th')]
# Force the first header to 'Neighbourhood' and drop any empty labels.
headers = ['Neighbourhood'] + [h for h in headers[1:] if h]

# Extract rows. Every row is normalised to exactly len(headers) - 1 data cells
# (short rows are padded with '', long rows are truncated) so that the
# DataFrame constructor always receives rows of a consistent width.
n_cells = len(headers) - 1
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    # Row label comes from the row's <th>, when it has one.
    label_cell = row.find('th')
    row_label = label_cell.text.strip() if label_cell else ''

    cells = [cell.text.strip() for cell in row.find_all('td')]
    cells += [''] * (n_cells - len(cells))  # no-op when the row is long enough
    rows.append([row_label] + cells[:n_cells])

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Keep everything up to and including ONE column past the 'Total' header.
# NOTE(review): that extra column is the one renamed to 'Total' below, which
# suggests the scraped header labels sit one position to the left of their
# data cells (presumably because headers[1:] discards the first scraped
# label) -- confirm against the live page before relying on the raw headers.
if 'Total' in headers:
    # list.index always yields a plain int (first match), even if a label
    # happens to be duplicated among the scraped headers.
    total_index = headers.index('Total')
    df = df.iloc[:, : total_index + 2]

# Rename positionally; check the width first so a change in the page layout
# produces an explicit error rather than a terse length-mismatch message.
if df.shape[1] != len(column_names):
    raise ValueError(
        f"Expected {len(column_names)} columns after filtering, "
        f"got {df.shape[1]}: {list(df.columns)}"
    )
df.columns = column_names

# Save to CSV
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

# Display the DataFrame for verification
df


Data saved to vancouver_neighborhood_data_filtered.csv


Unnamed: 0,Neighborhood,"less than $20,000","Income $20,000 - $39,999","Income $40,000 - $59,999","Income $60,000 - $79,999","Income $80,000 - $99,999","Income $100,000 and Over",Total
0,Vancouver,59895,135830,135375,130095,115715,466405,1043320
1,Ambleside,335,900,685,585,360,1135,4000
2,Anmore/Belcarra/Metro Vancouver North,30,50,110,150,130,1070,1545
3,Capital Hill/Brentwood,1185,2145,1945,1955,1845,6610,15680
4,Cedar Cottage,670,1800,1735,1735,1415,6200,13565
...,...,...,...,...,...,...,...,...
66,West End/Stanley Park South,235,690,695,505,400,870,3410
67,Westside Heights,745,1835,1890,1830,1440,6220,13960
68,Westside/Kerrisdale Remainder,1640,2540,2355,2120,1845,11395,21890
69,Whalley,1120,3350,3965,4010,3680,12150,28275


## Toronto

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the household-income-by-neighbourhood table for Toronto from the
# CMHC Housing Market Information Portal, tidy it into a DataFrame and save
# it as a CSV file.
# TODO(review): this cell repeats the Vancouver cell with a different
# GeographyId and output file; a shared function would remove the duplication.

# URL of the target webpage (GeographyId=2270 is the value used for Toronto)
url = 'https://www03.cmhc-schl.gc.ca/hmip-pimh/en/TableMapChart/TableMatchingCriteria?GeographyType=MetropolitanMajorArea&GeographyId=2270&CategoryLevel1=Population%2C%20Households%20and%20Housing%20Stock&CategoryLevel2=Household%20Income&ColumnField=HouseholdIncomeRange&RowField=Neighbourhood&SearchTags%5B0%5D.Key=Households&SearchTags%5B0%5D.Value=Number&SearchTags%5B1%5D.Key=Statistics&SearchTags%5B1%5D.Value=Ranges'

# Destination file; defined once so the saved path and the printed message
# always agree (the message previously used a different capitalisation than
# the file that was actually written).
output_path = 'toronto_neighborhood_data_filtered.csv'

# Final column names applied to the filtered table, in positional order.
column_names = [
    'Neighborhood',
    'less than $20,000',
    'Income $20,000 - $39,999',
    'Income $40,000 - $59,999',
    'Income $60,000 - $79,999',
    'Income $80,000 - $99,999',
    'Income $100,000 and Over',
    'Total',
]

# Make a GET request to the webpage.
# - timeout: without it requests waits forever on an unresponsive server.
# - raise_for_status: stop on 4xx/5xx instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find the (first) table in the webpage; fail with a clear message if absent.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found in the response from {url}")

# Extract headers: every <th> text in the table, in document order.
headers = [header.text.strip() for header in table.find_all('th')]
# Force the first header to 'Neighbourhood' and drop any empty labels.
headers = ['Neighbourhood'] + [h for h in headers[1:] if h]

# Extract rows. Every row is normalised to exactly len(headers) - 1 data cells
# (short rows are padded with '', long rows are truncated) so that the
# DataFrame constructor always receives rows of a consistent width.
n_cells = len(headers) - 1
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    # Row label comes from the row's <th>, when it has one.
    label_cell = row.find('th')
    row_label = label_cell.text.strip() if label_cell else ''

    cells = [cell.text.strip() for cell in row.find_all('td')]
    cells += [''] * (n_cells - len(cells))  # no-op when the row is long enough
    rows.append([row_label] + cells[:n_cells])

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Keep everything up to and including ONE column past the 'Total' header.
# NOTE(review): that extra column is the one renamed to 'Total' below, which
# suggests the scraped header labels sit one position to the left of their
# data cells (presumably because headers[1:] discards the first scraped
# label) -- confirm against the live page before relying on the raw headers.
if 'Total' in headers:
    # list.index always yields a plain int (first match), even if a label
    # happens to be duplicated among the scraped headers.
    total_index = headers.index('Total')
    df = df.iloc[:, : total_index + 2]

# Rename positionally; check the width first so a change in the page layout
# produces an explicit error rather than a terse length-mismatch message.
if df.shape[1] != len(column_names):
    raise ValueError(
        f"Expected {len(column_names)} columns after filtering, "
        f"got {df.shape[1]}: {list(df.columns)}"
    )
df.columns = column_names

# Save to CSV
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

# Display the DataFrame for verification
df


Data saved to Toronto_neighborhood_data_filtered.csv


Unnamed: 0,Neighborhood,"less than $20,000","Income $20,000 - $39,999","Income $40,000 - $59,999","Income $60,000 - $79,999","Income $80,000 - $99,999","Income $100,000 and Over",Total
0,Toronto,108855,264185,268775,272180,252230,1096245,2262475
1,Agincourt/Malvern,1340,4625,5225,5595,5050,16355,38190
2,Ajax/Pickering,1440,4975,6500,7925,8105,43965,72910
3,Alderwood,120,525,560,535,475,2565,4790
4,Aurora,655,1630,2015,2340,2140,12730,21505
...,...,...,...,...,...,...,...,...
130,Woodbine-Lumsden,160,475,385,380,355,1615,3380
131,Wychwood,315,965,735,650,640,2460,5770
132,Yonge-Eglinton,350,725,870,720,445,2875,5995
133,Yonge-St. Clair,440,770,935,920,740,3400,7205
