# Toronto Neighbourhoods and Income of Individuals

The Open data porta was used to obtain the data of Neighbourhoods in Vancouver city of BC. 

In [1]:
# 3rd party imports
import os
import json
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pylab as plt
from bs4 import BeautifulSoup
#!pip install pandas openpyxl


# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False


In [2]:
assignment_files = os.listdir()

# # Print the contents of the directory
# print(assignment_files)

### The main csv was obtained and loaded in the file.

In [3]:
# Load the csv. 
Vn_df = pd.read_csv("V_CensusLocalAreaProfiles2016.csv", encoding='Windows-1252')
VnT_df=Vn_df

# View DataFrame
VnT_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5583,5584,5585,5586,5587,5588,5589,5590,5591,5592
The data shown here is provided by Statistics Canada from the 2016 Census as a custom data order for the City of Vancouver using the City's 22 local planning areas,The data may be reproduced provided they are c...,,CENSUS DATA FOR CITY OF VANCOUVER LOCAL AREAS...,ID,1,2,3,4,5,6,...,5484.0,5485,5486,5487,5488,5489,5490,5491,5492,5493
Unnamed: 1,,,,Variable,Total - Age groups and average age of the pop...,0 to 14 years,0 to 4 years,5 to 9 years,10 to 14 years,15 to 64 years,...,,English,French,Non-official language,Aboriginal,Non-Aboriginal,English and French,English and non-official language,French and non-official language,"English, French and non-official language"
Unnamed: 2,,,,Arbutus-Ridge,15295,2015,455,685,880,9805,...,3120.0,135,40,355,0,360,0,10,0,0
Unnamed: 3,,,,Downtown,62030,4000,2080,1105,810,51275,...,17290.0,360,255,1300,0,1300,0,10,15,0
Unnamed: 4,,,,Dunbar-Southlands,21425,3545,675,1225,1650,14215,...,5035.0,100,105,335,10,335,0,0,10,0
Unnamed: 5,,,,Fairview,33620,2580,1240,760,580,25140,...,11900.0,95,255,500,0,505,0,0,20,0
Unnamed: 6,,,,Grandview-Woodland,29175,3210,1320,1025,865,22535,...,9525.0,150,180,320,15,305,0,0,10,0
Unnamed: 7,,,,Hastings-Sunrise,34575,4595,1510,1560,1525,23945,...,8805.0,230,65,750,0,750,0,15,0,0
Unnamed: 8,,,,Kensington-Cedar Cottage,49325,7060,2515,2390,2160,35385,...,13525.0,360,150,1125,0,1125,0,20,20,0
Unnamed: 9,,,,Kerrisdale,13975,1880,430,600,845,9395,...,2900.0,75,25,360,0,360,0,10,0,0


## Filtering and Cleaning the dataframe to obtain the required data set.

The index of the required data was obtained by searching for the keywords.

In [4]:
# Extract the first data row (index 1, which is the second row of the sheet)
first_row = Vn_df.T.iloc[1].astype(str)

# Keywords to search for
keywords = [
    'Income statistics in 2020 for the population aged 15 years and over in private households - 25% sample data',
   'Income', 
    'tax',
    'after-tax income',
    'after-tax income in 2020',
    'Median total income'
]

# Initialize a dictionary to store all indexes for each keyword
all_keyword_indexes = {}

# Find keywords in the first row (case-insensitive matching) and their corresponding indexes
for keyword in keywords:
    # Find all indexes for the keyword
    indexes = first_row[first_row.str.contains(keyword, case=False, na=False)].index.tolist()
    
    # If keyword is found, store the indexes
    if indexes:
        # Store the indexes for the keyword
        all_keyword_indexes[keyword] = indexes
        
        # Calculate and print only the min and max indexes
        min_index = min(indexes)  # Get the minimum index
        max_index = max(indexes)  # Get the maximum index
        print(f"Keyword '{keyword}' found at min index: {min_index} and max index: {max_index}")



Keyword 'Income' found at min index: 1883 and max index: 4196
Keyword 'tax' found at min index: 1887 and max index: 4178
Keyword 'after-tax income' found at min index: 1887 and max index: 4178
Keyword 'Median total income' found at min index: 1886 and max index: 4176


In [6]:
# Access the actual values at various index to confirm the result.

value_at_index_60 = Vn_df.T.iloc[0,92]  
value_at_index_1949 = Vn_df.T.iloc[0, 424]  

# Optionally, get the column names at those indices
column_name_60 = Vn_df.T.columns[92]  
column_name_1949 = Vn_df.T.columns[424]  

# Print the results
print(f"Value at column {column_name_60}: {value_at_index_60}")
print(f"Value at column {column_name_1949} : {value_at_index_1949}")


Value at column 92: 88
Value at column 424 : 404


## Extracting the desired columns

In [9]:
# Step 1: Extract the first column (assuming it contains labels or identifiers)
first_column = Vn_df.T.iloc[:, 3]

# Step 2: Filter the columns based on index range (between 60 and 168)
filtered_columns = Vn_df.T.iloc[:, 1883:2018]  # Select columns from index 60 to 168 (inclusive of 60, exclusive of 169)

# Step 3: Create a new DataFrame with the first column and the filtered columns
filtered_df = pd.concat([ first_column,filtered_columns], axis=1)

# Step 4: Print the resulting filtered DataFrame
(filtered_df)


Unnamed: 0,3,1883,1884,1885,1886,1887,1888,1889,1890,1891,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
The data shown here is provided by Statistics Canada from the 2016 Census as a custom data order for the City of Vancouver using the City's 22 local planning areas,ID,1856,1857,1858,1859,1860,1861,1862,1863,1864,...,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986
Unnamed: 1,Variable,Total - Income statistics in 2015 for the popu...,Number of total income recipients aged 15 ye...,Average total income in 2015 among recipie...,Median total income in 2015 among recipien...,Number of after-tax income recipients aged 1...,Average after-tax income in 2015 among rec...,Median after-tax income in 2015 among reci...,Number of market income recipients aged 15 y...,Average market income in 2015 among recipi...,...,"$30,000 to $39,999","$40,000 to $49,999","$50,000 to $59,999","$60,000 to $69,999","$70,000 to $79,999","$80,000 and over","$80,000 to $89,999","$90,000 to $99,999","$100,000 and over",Percentage with after-tax income
Unnamed: 2,Arbutus-Ridge,13055,12185,62675,30929,12180,48943,27946,11300,62995,...,490,390,325,320,155,1040,140,185,720,92.7
Unnamed: 3,Downtown,54905,53630,63251,41858,53720,49390,36918,47865,67870,...,2515,2700,2440,2075,1620,5750,1320,1050,3375,98.6
Unnamed: 4,Dunbar-Southlands,17785,16685,78117,40463,16715,59021,35459,15640,79592,...,640,585,470,455,300,2285,340,315,1635,93.3
Unnamed: 5,Fairview,30130,29655,61627,46940,29685,49734,40815,28220,61142,...,1470,1575,1430,1105,910,2490,645,365,1480,98.8
Unnamed: 6,Grandview-Woodland,25915,25435,42896,32438,25465,36036,28951,22410,44009,...,1625,1290,950,685,490,1090,365,220,515,98.2
Unnamed: 7,Hastings-Sunrise,29410,28420,38258,27255,28440,32460,25163,24865,38305,...,1755,1390,1080,690,520,985,320,165,495,97.1
Unnamed: 8,Kensington-Cedar Cottage,41795,40440,38411,28356,40485,32767,26205,36025,38437,...,2475,2165,1445,905,755,1535,555,265,715,96.9
Unnamed: 9,Kerrisdale,12060,11435,77248,35064,11450,57131,31259,10570,79395,...,405,395,360,240,185,1235,175,170,885,94.8


In [10]:
## Reseting the index
# Step 1: Reset the row index and add it as a column (this step you already have)
filtered_df_reset = filtered_df.reset_index()

# Step 2: Set the row index column to the first column (this you already did)
filtered_df_reset.set_index(filtered_df_reset.columns[0], inplace=True)

# Step 3: Reset column headers to default (0, 1, 2, ...)
# This step will reset the column headers (the numeric indices you mentioned)
filtered_df_reset.columns = range(filtered_df_reset.shape[1])

# Step 4: Save the DataFrame to CSV
#filtered_df_reset.to_csv('filtered_data_with_reset_columns.csv', index=True)

# Confirmation message
print("Row index has been moved to the first column, and column headers have been reset.")


Row index has been moved to the first column, and column headers have been reset.


In [12]:
# Transposing the dataframe

# Step 1: Store original column names (neighborhood names)
neighborhood_names = filtered_df.columns.tolist()

# Step 2: Reset the row index and add it as a column
filtered_df_reset = filtered_df.reset_index()

# Step 3: Set the row index column to the first column
filtered_df_reset.set_index(filtered_df_reset.columns[0], inplace=True)

# Step 4: Transpose the DataFrame (rows become columns)
filtered_df_transposed = filtered_df_reset.T  # Transpose

# Step 5: Reset the index after transposition (to avoid unwanted row index after transpose)
filtered_df_transposed.reset_index(drop=True, inplace=True)

# Confirmation message
print("Rows have been transformed to columns, the neighborhood names have been preserved")


Rows have been transformed to columns, the neighborhood names have been preserved


In [21]:
#Check
# Set the first row as column headers
filtered_df_transposed.columns = filtered_df_transposed.iloc[0]
filtered_df_transposed = filtered_df_transposed[1:]

# Reset the index
filtered_df_transposed.reset_index(drop=True, inplace=True)

filtered_df_transposed

Unnamed: 0,ID,Variable,Arbutus-Ridge,Downtown,Dunbar-Southlands,Fairview,Grandview-Woodland,Hastings-Sunrise,Kensington-Cedar Cottage,Kerrisdale,...,Riley Park,Shaughnessy,South Cambie,Strathcona,Sunset,Victoria-Fraserview,West End,West Point Grey,Vancouver CSD,Vancouver CMA
0,1856,Total - Income statistics in 2015 for the popu...,13055,54905,17785,30130,25915,29410,41795,12060,...,18960,6735,6510,8805,30685,26445,44755,11050,547845,2064615
1,1857,Number of total income recipients aged 15 ye...,12185,53630,16685,29655,25435,28420,40440,11435,...,18405,6370,6280,8605,29090,25415,43975,10520,530100,1978880
2,1858,Average total income in 2015 among recipie...,62675,63251,78117,61627,42896,38258,38411,77248,...,53060,118668,65459,31534,34212,34298,47253,82042,50317,46821
3,1859,Median total income in 2015 among recipien...,30929,41858,40463,46940,32438,27255,28356,35064,...,37327,44392,42094,17631,25498,24758,36425,40304,32896,32575
4,1860,Number of after-tax income recipients aged 1...,12180,53720,16715,29685,25465,28440,40485,11450,...,18420,6385,6295,8610,29150,25460,44025,10525,530765,1981625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1982,"$80,000 and over",1040,5750,2285,2490,1090,985,1535,1235,...,1440,880,575,265,775,690,2375,1325,33585,119000
131,1983,"$80,000 to $89,999",140,1320,340,645,365,320,555,175,...,365,90,90,65,265,250,750,190,8310,32520
132,1984,"$90,000 to $99,999",185,1050,315,365,220,165,265,170,...,210,90,100,30,205,130,425,145,5715,22180
133,1985,"$100,000 and over",720,3375,1635,1480,515,495,715,885,...,865,700,380,175,310,310,1200,990,19560,64295


## Input the other source of neighbourhood-income data

In [16]:
t_nn = pd.read_csv("vancouver_neighborhood_data_filtered.csv")
t_nn

Unnamed: 0,Neighborhood,"less than $20,000","Income $20,000 - $39,999","Income $40,000 - $59,999","Income $60,000 - $79,999","Income $80,000 - $99,999","Income $100,000 and Over",Total
0,Vancouver,59895,135830,135375,130095,115715,466405,1043320
1,Ambleside,335,900,685,585,360,1135,4000
2,Anmore/Belcarra/Metro Vancouver North,30,50,110,150,130,1070,1545
3,Capital Hill/Brentwood,1185,2145,1945,1955,1845,6610,15680
4,Cedar Cottage,670,1800,1735,1735,1415,6200,13565
...,...,...,...,...,...,...,...,...
66,West End/Stanley Park South,235,690,695,505,400,870,3410
67,Westside Heights,745,1835,1890,1830,1440,6220,13960
68,Westside/Kerrisdale Remainder,1640,2540,2355,2120,1845,11395,21890
69,Whalley,1120,3350,3965,4010,3680,12150,28275


As the Neighbourhood count did not match to that of earlier data set, also when cross-confirmed with the crime data,  26 neighbourhood count was considered and this data set dropped.

In [22]:
from geopy.geocoders import Nominatim
import time

# Initialize the geolocator
geolocator = Nominatim(user_agent="neighbourhood_locator")

def get_postal_code(neighbourhood_name):
    try:
        location = geolocator.geocode(f"{neighbourhood_name}, Vancouver, British Columbia")
        if location:
            # Reverse geocode to get postal code
            postal_location = geolocator.reverse((location.latitude, location.longitude), addressdetails=True)
            postal_code = postal_location.raw['address'].get('postcode', 'Postal code not found')
            return postal_code
    except Exception as e:
        print(f"Error retrieving postal code for {neighbourhood_name}: {e}")
    return None

# Example usage
neighbourhoods = filtered_df_transposed.columns.values


results = []

# Iterate through neighborhoods and get postal codes
for neighbourhood in neighbourhoods:
    postal_code = get_postal_code(neighbourhood)
    print(f"{neighbourhood}: {postal_code}")
    results.append({"Neighbourhood": neighbourhood, "Postal_Code": postal_code})
    time.sleep(1)  # To prevent overloading the server with requests

# Create a DataFrame from the results
df_postal_codes = pd.DataFrame(results)

# Display the DataFrame
print(df_postal_codes)

ID: V5Y 1V4
Variable: None
Arbutus-Ridge : V6L
Downtown : V7Y 1G5
Dunbar-Southlands : V6S 1S3
Fairview : V6H
Grandview-Woodland : V5L
Hastings-Sunrise : V5K
Kensington-Cedar Cottage : V5N
Kerrisdale : V6M
Killarney : V5S
Kitsilano : V6K
Marpole : V6P
Mount Pleasant : V5T
Oakridge : V6M
Renfrew-Collingwood : V5R
Riley Park : V5V
Shaughnessy : V6H
South Cambie : V5Z
Strathcona : V6A
Sunset : V5X
Victoria-Fraserview: V5P
West End : V6E
West Point Grey : V6R
Vancouver CSD : None
Vancouver CMA : None
                Neighbourhood Postal_Code
0                          ID     V5Y 1V4
1                    Variable        None
2              Arbutus-Ridge          V6L
3                   Downtown      V7Y 1G5
4          Dunbar-Southlands      V6S 1S3
5                   Fairview          V6H
6         Grandview-Woodland          V5L
7           Hastings-Sunrise          V5K
8   Kensington-Cedar Cottage          V5N
9                 Kerrisdale          V6M
10                 Killarney         

In [23]:
# Remove rows with None or 'Postal code not found' as postal code
df1_postal_codes = df_postal_codes[df_postal_codes['Postal_Code'].notna() & (df_postal_codes['Postal_Code'] != 'Postal code not found')]

# Create a new column with the part of the postal code before the first space
df1_postal_codes['Postal'] = df1_postal_codes['Postal_Code'].apply(lambda x: x.split()[0] if isinstance(x, str) else '')
df1_postal_codes = df1_postal_codes.reset_index(drop=True)
# Display the final DataFrame
(df1_postal_codes)

Unnamed: 0,Neighbourhood,Postal_Code,Postal
0,ID,V5Y 1V4,V5Y
1,Arbutus-Ridge,V6L,V6L
2,Downtown,V7Y 1G5,V7Y
3,Dunbar-Southlands,V6S 1S3,V6S
4,Fairview,V6H,V6H
5,Grandview-Woodland,V5L,V5L
6,Hastings-Sunrise,V5K,V5K
7,Kensington-Cedar Cottage,V5N,V5N
8,Kerrisdale,V6M,V6M
9,Killarney,V5S,V5S
