# Toronto Neighbourhoods and Income of Individuals

The Open data porta was used to obtain the data of Neighbourhoods in Toronto city of Ontario. 

In [1]:
# 3rd party imports
import os
import json
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pylab as plt
from bs4 import BeautifulSoup
#!pip install pandas openpyxl


# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False


### The main csv was obtained and loaded in the file.

In [3]:
assignment_files = os.listdir()

# Print the contents of the directory
#print(assignment_files)


In [4]:
# Load the csv. 
tn_df = pd.read_csv("Toronto-Neigh.csv", encoding='Windows-1252')
tnT_df=tn_df.T

# View DataFrame
tnT_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2593,2594,2595,2596,2597,2598,2599,2600,2601,2602
Neighbourhood Name,Neighbourhood Number,TSNS 2020 Designation,Total - Age groups of the population - 25% sam...,0 to 14 years,0 to 4 years,5 to 9 years,10 to 14 years,15 to 64 years,15 to 19 years,20 to 24 years,...,Between 9 a.m. and 11:59 a.m.,Between 12 p.m. and 4:59 a.m.,Total - Eligibility for instruction in the min...,Children eligible for instruction in the min...,Children not eligible for instruction in the...,Total - Eligibility and instruction in the min...,Children eligible for instruction in the min...,Eligible children who have been instructed...,Eligible children who have not been instru...,Children not eligible for instruction in the...
West Humber-Clairville,1,Not an NIA or Emerging Neighbourhood,33300,4295,1460,1345,1485,23640,1860,3175,...,1665,2935,5430,410,5020,3875,335,255,75,3540
Mount Olive-Silverstone-Jamestown,2,Neighbourhood Improvement Area,31345,5690,1650,1860,2175,21490,2280,2675,...,1145,2965,7285,510,6780,5540,395,245,145,5145
Thistletown-Beaumond Heights,3,Neighbourhood Improvement Area,9850,1495,505,540,455,6615,570,745,...,395,635,1860,180,1685,1325,120,75,45,1205
Rexdale-Kipling,4,Not an NIA or Emerging Neighbourhood,10375,1575,505,615,455,6950,515,715,...,425,775,1910,135,1770,1370,90,75,25,1275


## Filtering and Cleaning the dataframe to obtain the required data set.

The index of the required data was obtained by searching for the keywords.

In [5]:
# Extract the first data row (index 1, which is the second row of the sheet)
first_row = tn_df.T.iloc[0].astype(str)

# Keywords to search for
keywords = [
    'Income statistics in 2020 for the population aged 15 years and over in private households - 25% sample data',
   # 'Income', 
    'tax',
    'after-tax income',
    'after-tax income in 2020',
    'Median total income'
]

# Initialize a dictionary to store all indexes for each keyword
all_keyword_indexes = {}

# Find keywords in the first row (case-insensitive matching) and their corresponding indexes
for keyword in keywords:
    # Find all indexes for the keyword
    indexes = first_row[first_row.str.contains(keyword, case=False, na=False)].index.tolist()
    
    # If keyword is found, store the indexes
    if indexes:
        # Store the indexes for the keyword
        all_keyword_indexes[keyword] = indexes
        
        # Calculate and print only the min and max indexes
        min_index = min(indexes)  # Get the minimum index
        max_index = max(indexes)  # Get the maximum index
        print(f"Keyword '{keyword}' found at min index: {min_index} and max index: {max_index}")

# At this point, the full list of indexes is saved in the all_keyword_indexes dictionary.

Keyword 'Income statistics in 2020 for the population aged 15 years and over in private households - 25% sample data' found at min index: 60 and max index: 1949
Keyword 'tax' found at min index: 64 and max index: 424
Keyword 'after-tax income' found at min index: 64 and max index: 424
Keyword 'after-tax income in 2020' found at min index: 65 and max index: 166
Keyword 'Median total income' found at min index: 62 and max index: 421


In [6]:
# Access the actual values at various index to confirm the result.

value_at_index_60 = tn_df.T.iloc[0,92]  
value_at_index_1949 = tn_df.T.iloc[0, 424]  

# Optionally, get the column names at those indices
column_name_60 = tn_df.T.columns[92]  
column_name_1949 = tn_df.T.columns[424]  

# Print the results
print(f"Value at column {column_name_60}: {value_at_index_60}")
print(f"Value at column {column_name_1949} : {value_at_index_1949}")


Value at column 92:     Under $10,000 (including loss)
Value at column 424 :   Average after-tax income of one-parent economic family in 2020 ($)


## Extracting the desired columns

In [7]:
# Step 1: Extract the first column (assuming it contains labels or identifiers)
first_column = tn_df.T.iloc[:, 0]

# Step 2: Filter the columns based on index range (between 60 and 168)
filtered_columns = tn_df.T.iloc[:, 60:167]  # Select columns from index 60 to 168 (inclusive of 60, exclusive of 169)

# Step 3: Create a new DataFrame with the first column and the filtered columns
filtered_df = pd.concat([first_column, filtered_columns], axis=1)

# Step 4: Print the resulting filtered DataFrame
(filtered_df)


Unnamed: 0,0,60,61,62,63,64,65,66,67,68,...,157,158,159,160,161,162,163,164,165,166
Neighbourhood Name,Neighbourhood Number,Total - Income statistics in 2020 for the popu...,Number of total income recipients aged 15 ye...,Median total income in 2020 among recipie...,Average total income in 2020 among recipie...,Number of after-tax income recipients aged 1...,Median after-tax income in 2020 among reci...,Average after-tax income in 2020 among rec...,Number of market income recipients aged 15 y...,Median market income in 2020 among recipi...,...,Composition of total income in 2019 of the pop...,Market income (%),Employment income (%),Government transfers (%),Employment insurance benefits (%),Total - Income statistics for persons aged 15 ...,Median total income in 2020 ($),Average total income in 2020 ($),Median after-tax income in 2020 ($),Average after-tax income in 2020 ($)
West Humber-Clairville,1,29000,27555,33600,40560,27570,31600,35800,23565,26400,...,100,83.8,75,16.2,1.4,4990,30600,37040,29200,32840
Mount Olive-Silverstone-Jamestown,2,25660,23990,29600,35000,23995,28400,31760,18275,22200,...,100,77.2,70.8,22.8,1.5,3530,28000,33920,27200,30480
Thistletown-Beaumond Heights,3,8355,7910,32800,41520,7925,30600,36360,6450,25800,...,100,82,71.6,18,1.6,1320,32800,39300,30800,34700
Rexdale-Kipling,4,8800,8445,33600,42040,8470,31800,36880,7010,28400,...,100,82.2,72.2,17.9,1,1715,27400,37040,26800,32960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yonge-Bay Corridor,170,11680,10930,44000,68200,10930,40400,53050,9985,44000,...,100,96.4,81.6,3.7,0.4,5600,40800,61600,37200,48920
Junction-Wallace Emerson,171,20100,19420,41200,54350,19420,37200,45480,16950,39600,...,100,90,83.2,10.1,0.9,5255,36800,49640,33600,42160
Dovercourt Village,172,11015,10600,38000,55850,10600,34800,46200,9525,33600,...,100,90.4,81.2,9.5,0.7,3170,34400,49080,31800,41560
North Toronto,173,14570,13865,46000,58100,13865,41200,48200,12585,44400,...,100,92.8,81.6,7.3,0.6,5935,48400,59700,42800,49160


In [8]:
## Reset the index of the dataframe

# Step 1: Reset the row index and add it as a column 
filtered_df_reset = filtered_df.reset_index()

# Step 2: Set the row index column to the first column (this you already did)
filtered_df_reset.set_index(filtered_df_reset.columns[0], inplace=True)

# Step 3: Reset column headers to default (0, 1, 2, ...)
# This step will reset the column headers (the numeric indices you mentioned)
filtered_df_reset.columns = range(filtered_df_reset.shape[1])

# Confirmation message
print("Row index has been moved to the first column, and column headers have been reset.")


Row index has been moved to the first column, and column headers have been reset.


In [11]:
## transposing the dataframe.

# Step 1: Store original column names (neighborhood names)
neighborhood_names = filtered_df.columns.tolist()

# Step 2: Reset the row index and add it as a column
filtered_df_reset = filtered_df.reset_index()

# Step 3: Set the row index column to the first column
filtered_df_reset.set_index(filtered_df_reset.columns[0], inplace=True)

# Step 4: Transpose the DataFrame (rows become columns)
filtered_df_transposed = filtered_df_reset.T  # Transpose

# Step 5: Reset the index after transposition (to avoid unwanted row index after transpose)
filtered_df_transposed.reset_index(drop=True, inplace=True)

# Confirmation message
print("Rows have been transformed to columns, the neighborhood names have been preserved.")


Rows have been transformed to columns, the neighborhood names have been preserved.


In [12]:
#Check the transposed data frame

filtered_df_transposed

index,Neighbourhood Name,West Humber-Clairville,Mount Olive-Silverstone-Jamestown,Thistletown-Beaumond Heights,Rexdale-Kipling,Elms-Old Rexdale,Kingsview Village-The Westway,Willowridge-Martingrove-Richview,Humber Heights-Westmount,Edenbridge-Humber Valley,...,Harbourfront-CityPlace,St Lawrence-East Bayfront-The Islands,Church-Wellesley,Downtown Yonge East,Bay-Cloverhill,Yonge-Bay Corridor,Junction-Wallace Emerson,Dovercourt Village,North Toronto,South Eglinton-Davisville
0,Neighbourhood Number,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
1,Total - Income statistics in 2020 for the popu...,29000,25660,8355,8800,7745,18090,18945,8635,13125,...,26065,29005,21420,16645,15925,11680,20100,11015,14570,20545
2,Number of total income recipients aged 15 ye...,27555,23990,7910,8445,7325,17240,18140,8305,12650,...,24955,27980,20515,15955,15015,10930,19420,10600,13865,19760
3,Median total income in 2020 among recipie...,33600,29600,32800,33600,34400,34400,40800,40400,48400,...,58800,58000,42800,45200,39600,44000,41200,38000,46000,52400
4,Average total income in 2020 among recipie...,40560,35000,41520,42040,41240,44880,53750,60900,98200,...,73100,79700,56700,66700,66300,68200,54350,55850,58100,69000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,Total - Income statistics for persons aged 15 ...,4990,3530,1320,1715,1000,3080,2905,1525,2290,...,11755,12365,11940,8585,8655,5600,5255,3170,5935,8115
104,Median total income in 2020 ($),30600,28000,32800,27400,33600,34400,40400,40800,50400,...,62400,61600,39600,41200,33600,40800,36800,34400,48400,53600
105,Average total income in 2020 ($),37040,33920,39300,37040,40200,44800,52100,51200,65400,...,73100,79800,53800,60550,52700,61600,49640,49080,59700,67200
106,Median after-tax income in 2020 ($),29200,27200,30800,26800,31200,32400,36800,36400,44000,...,53600,52800,36800,37200,32000,37200,33600,31800,42800,47200


## Input the other source of neighbourhood-income data

In [13]:
t_nn = pd.read_csv("toronto_neighborhood_data_filtered.csv")
t_nn

Unnamed: 0,Neighborhood,"less than $20,000","Income $20,000 - $39,999","Income $40,000 - $59,999","Income $60,000 - $79,999","Income $80,000 - $99,999","Income $100,000 and Over",Total
0,Toronto,108855,264185,268775,272180,252230,1096245,2262475
1,Agincourt/Malvern,1340,4625,5225,5595,5050,16355,38190
2,Ajax/Pickering,1440,4975,6500,7925,8105,43965,72910
3,Alderwood,120,525,560,535,475,2565,4790
4,Aurora,655,1630,2015,2340,2140,12730,21505
...,...,...,...,...,...,...,...,...
130,Woodbine-Lumsden,160,475,385,380,355,1615,3380
131,Wychwood,315,965,735,650,640,2460,5770
132,Yonge-Eglinton,350,725,870,720,445,2875,5995
133,Yonge-St. Clair,440,770,935,920,740,3400,7205


As the Neighbourhood count did not match to that of earlier data set, also when cross-confirmed with the crime data, 158 neighbourhood count was considered and this data set dropped.

In [15]:
from geopy.geocoders import Nominatim
import time

# Initialize the geolocator
geolocator = Nominatim(user_agent="neighbourhood_locator")

def get_postal_code(neighbourhood_name):
    try:
        location = geolocator.geocode(f"{neighbourhood_name}, Toronto, Ontario")
        if location:
            # Reverse geocode to get postal code
            postal_location = geolocator.reverse((location.latitude, location.longitude), addressdetails=True)
            postal_code = postal_location.raw['address'].get('postcode', 'Postal code not found')
            return postal_code
    except Exception as e:
        print(f"Error retrieving postal code for {neighbourhood_name}: {e}")
    return None

# Example usage
neighbourhoods = filtered_df_transposed.columns.values


results = []

# Iterate through neighborhoods and get postal codes
for neighbourhood in neighbourhoods:
    postal_code = get_postal_code(neighbourhood)
    print(f"{neighbourhood}: {postal_code}")
    results.append({"Neighbourhood": neighbourhood, "Postal_Code": postal_code})
    time.sleep(1)  # To prevent overloading the server with requests

# Create a DataFrame from the results
df_postal_codes = pd.DataFrame(results)

# Display the DataFrame
(df_postal_codes)

Neighbourhood Name: None
West Humber-Clairville: M9W 1R9
Mount Olive-Silverstone-Jamestown: None
Thistletown-Beaumond Heights: None
Rexdale-Kipling: M9W 1Y4
Elms-Old Rexdale: None
Kingsview Village-The Westway: None
Willowridge-Martingrove-Richview: None
Humber Heights-Westmount: M9P 3W3
Edenbridge-Humber Valley: M9A 3E1
Princess-Rosethorn: None
Eringate-Centennial-West Deane: None
Markland Wood: M9C 2P7
Etobicoke West Mall: M9C 5S1
Kingsway South: M6S
Stonegate-Queensway: M8Z 1N7
New Toronto: M8V 3B5
Long Branch: M8W 3T4
Alderwood: M8W 3T7
Humber Summit: M9L 2K5
Humbermede: M9M 2A8
Pelmo Park-Humberlea: None
Black Creek: M6M 5E6
Glenfield-Jane Heights: None
York University Heights: M3J 1V6
Rustic: M6L 1N4
Maple Leaf: M6L 1M8
Brookhaven-Amesbury: None
Yorkdale-Glen Park: M6B 3L3
Englemount-Lawrence: M6B 2H5
Clanton Park: M3H 3N4
Bathurst Manor: M3H 2T5
Westminster-Branson: None
Newtonbrook West: None
Willowdale West: M2N 6K9
Lansing-Westgate: None
Bedford Park-Nortown: None
St.Andrew-W

Unnamed: 0,Neighbourhood,Postal_Code
0,Neighbourhood Name,
1,West Humber-Clairville,M9W 1R9
2,Mount Olive-Silverstone-Jamestown,
3,Thistletown-Beaumond Heights,
4,Rexdale-Kipling,M9W 1Y4
...,...,...
154,Yonge-Bay Corridor,
155,Junction-Wallace Emerson,
156,Dovercourt Village,M6H 3L8
157,North Toronto,M5V 3C7


In [16]:
# Remove rows with None or 'Postal code not found' as postal code
df1_postal_codes = df_postal_codes[df_postal_codes['Postal_Code'].notna() & (df_postal_codes['Postal_Code'] != 'Postal code not found')]

# Create a new column with the part of the postal code before the first space
df1_postal_codes['Postal'] = df1_postal_codes['Postal_Code'].apply(lambda x: x.split()[0] if isinstance(x, str) else '')
df1_postal_codes = df1_postal_codes.reset_index(drop=True)
# Display the final DataFrame
(df1_postal_codes)

Unnamed: 0,Neighbourhood,Postal_Code,Postal
0,West Humber-Clairville,M9W 1R9,M9W
1,Rexdale-Kipling,M9W 1Y4,M9W
2,Humber Heights-Westmount,M9P 3W3,M9P
3,Edenbridge-Humber Valley,M9A 3E1,M9A
4,Markland Wood,M9C 2P7,M9C
...,...,...,...
104,Harbourfront-CityPlace,M5V 3A6,M5V
105,Church-Wellesley,M4Y 2C7,M4Y
106,Downtown Yonge East,M5C 2T6,M5C
107,Dovercourt Village,M6H 3L8,M6H
