In [113]:
import pandas as pd
import os
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
import folium
import math

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')


First to import all Toronto facilities' csv files

In [116]:
# Load and adjust hospital and school data
# Rename the 'type' column to 'fclass' to match other datasets, then drop the original 'fclass' column
hospital = pd.read_csv('toronto__Hospitals.csv')
hospital.drop(columns=["fclass"], inplace=True, errors='ignore')
hospital.rename(columns={"type": "fclass"}, inplace=True)

school_university_colleges = pd.read_csv('toronto__Schools_university_colleges.csv')
school_university_colleges.drop(columns=["fclass"], inplace=True, errors='ignore')
school_university_colleges.rename(columns={"type": "fclass"}, inplace=True)

# Load other CSV files directly into DataFrames
pubs_and_restuarent = pd.read_csv('toronto__Pubs and restuarents.csv')
railway_station_and_stop = pd.read_csv('toronto__Railway Stations and  Railway Stops.csv')
supermarket = pd.read_csv('toronto__Supermarkets.csv')
bus_station_and_stop = pd.read_csv('toronto_Bus Stations and Bus Stops.csv')
cafe = pd.read_csv('toronto-cafe.csv')
cinema = pd.read_csv('toronto-cinema.csv')
dentist = pd.read_csv('toronto-dentist.csv')
fast_food = pd.read_csv('toronto-fast_food.csv')
kindergartens = pd.read_csv('toronto-kindergartens.csv')
malls = pd.read_csv('toronto-malls.csv')
play_ground = pd.read_csv('toronto-paly_grounds.csv')
parks = pd.read_csv('toronto-parks.csv')
post_office = pd.read_csv('toronto-post_office.csv')

# Dictionary containing DataFrames directly for iteration
csv_dataframes = {
    "hospital": hospital,
    "pubs_and_restuarent": pubs_and_restuarent,
    "railway_station_and_stop": railway_station_and_stop,
    "school_university_colleges": school_university_colleges,
    "supermarket": supermarket,
    "bus_station_and_stop": bus_station_and_stop,
    "cafe": cafe,
    "cinema": cinema,
    "dentist": dentist,
    "fast_food": fast_food,
    "kindergartens": kindergartens,
    "malls": malls,
    "play_ground": play_ground,
    "parks": parks,
    "post_office": post_office,
}

Then we will filter out the data we don't need and add all other data together

In [119]:
# Selected relevant columns to extract
extract_data = ["fclass", "name", "geometry", "Latitude", "Longitude"]

# Extract relevant columns and concatenate all data into a single DataFrame
dfs = []
for name, df in csv_dataframes.items():
    # Check if required columns exist
    missing_cols = [col for col in extract_data if col not in df.columns]
    if missing_cols:
        print(f"Warning: {name} is missing columns: {missing_cols}")
    else:
        dfs.append(df[extract_data])

# Concatenate all dataframes
facilities = pd.concat(dfs, ignore_index=True)
# Rename 'fclass' to 'type of facilities'
facilities.rename(columns={"fclass": "type of facilities"}, inplace=True)

# Display the first few rows of the concatenated DataFrame
facilities.head()

Unnamed: 0,type of facilities,name,geometry,Latitude,Longitude
0,hospital,,"POLYGON ((-79.3997483 43.6596444, -79.3993673 ...",43.659434,-79.399272
1,hospital,,"POLYGON ((-79.3995177 43.6588946, -79.3990495 ...",43.658698,-79.399028
2,hospital,,"POLYGON ((-79.4516206 43.8710529, -79.4513835 ...",43.870163,-79.450705
3,hospital,,"POLYGON ((-79.3894249 43.6588368, -79.3893467 ...",43.65858,-79.388299
4,hospital,,"POLYGON ((-79.5998244 43.7296222, -79.5996299 ...",43.729534,-79.598591


We will remove any missing value

In [122]:
# Remove missing date values from dataframe

# Drop rows with missing values in geometry, latitude, and longitude columns, missing name is not important
final_facilities = facilities.dropna(subset=['geometry', 'Latitude', 'Longitude'])

# Check if there are any remaining missing values
has_missing_values = final_facilities.isna().any().any()

# Output result
if has_missing_values:
    print("The final DataFrame contains NaN or NaT values in names.")
else:
    print("The final DataFrame does not contain NaN or NaT values.")

final_facilities.head()

The final DataFrame contains NaN or NaT values in names.


Unnamed: 0,type of facilities,name,geometry,Latitude,Longitude
0,hospital,,"POLYGON ((-79.3997483 43.6596444, -79.3993673 ...",43.659434,-79.399272
1,hospital,,"POLYGON ((-79.3995177 43.6588946, -79.3990495 ...",43.658698,-79.399028
2,hospital,,"POLYGON ((-79.4516206 43.8710529, -79.4513835 ...",43.870163,-79.450705
3,hospital,,"POLYGON ((-79.3894249 43.6588368, -79.3893467 ...",43.65858,-79.388299
4,hospital,,"POLYGON ((-79.5998244 43.7296222, -79.5996299 ...",43.729534,-79.598591


In [124]:
# Count the size of the DataFrame
num_rows, num_columns = final_facilities.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 22827
Number of columns: 5


In [126]:
# Ensure `final_facilities` is a GeoDataFrame
final_facilities = gpd.GeoDataFrame(
    final_facilities, 
    geometry=gpd.points_from_xy(final_facilities['Longitude'], final_facilities['Latitude'])
)
final_facilities.set_crs(epsg=4326, inplace=True)
final_facilities.head()

Unnamed: 0,type of facilities,name,geometry,Latitude,Longitude
0,hospital,,POINT (-79.399 43.659),43.659434,-79.399272
1,hospital,,POINT (-79.399 43.659),43.658698,-79.399028
2,hospital,,POINT (-79.451 43.87),43.870163,-79.450705
3,hospital,,POINT (-79.388 43.659),43.65858,-79.388299
4,hospital,,POINT (-79.599 43.73),43.729534,-79.598591


Import Toronto housing data

In [129]:
# Importing the houses CSV
toronto_housing_1 = pd.read_csv('Toronto_House_data_Final_cleaned[1].csv')
toronto_housing_2 = pd.read_csv('Toronto_Rew_houses_Price_Address_HouseInfo[53].csv')

toronto_housing = pd.concat([toronto_housing_1, toronto_housing_2], ignore_index=True)


# Price of 1 dollar, means this house needs to bid. We will drop this special
# Remove the "C$" symbol and commas, then convert to numeric
toronto_housing['Price'] = toronto_housing['Price'].str.replace(r'[^\d.]', '', regex=True).astype(float)
# Filter out rows where the price is exactly 1
toronto_housing = toronto_housing[toronto_housing['Price'] != 1]

# Drop duplicate
toronto_housing.drop_duplicates(subset=['Street address'],inplace =True)
print(len(toronto_housing))

# Create a GeoDataFrame with longitude and latitude as geometry
toronto_housing = gpd.GeoDataFrame(
    toronto_housing,
    geometry=gpd.points_from_xy(toronto_housing['Longitude'], toronto_housing['Latitude'])
)

# Set the coordinate reference system to EPSG 4326 (WGS84)
toronto_housing.set_crs(epsg=4326, inplace=True)

# Display the first few rows
toronto_housing.head()

1138


Unnamed: 0,Street address,City,State,Zip/Postal Code,Country,Latitude,Longitude,Neighbourhood,Region,Price,Bedrooms,Bathrooms,Square Footage,Home Type,geometry
0,1787 Saint Clair Ave W #414,Toronto,ON,M6N1H9,Canada,43.673553,-79.461868,The Junction North,York,525000.0,2 bds,1 ba,,,POINT (-79.462 43.674)
1,509 Beecroft Rd #1209,Toronto,ON,M2N0A3,Canada,43.778652,-79.417947,Willowdale South,North York,848000.0,3 bds,2 ba,,,POINT (-79.418 43.779)
3,186 Macpherson Ave,Toronto,ON,M5R1W8,Canada,43.677945,-79.396728,The Annex,Central Toronto,2218000.0,3 bds,3 ba,,,POINT (-79.397 43.678)
4,195 McCaul St #909,Toronto,ON,M5T0E5,Canada,43.6567,-79.392206,Chinatown,Downtown Toronto,449900.0,Studio,1 ba,,,POINT (-79.392 43.657)
5,30 Roehampton Ave UNIT 1902,Toronto,ON,M4P0B9,Canada,43.708432,-79.397532,Davisville North,Central Toronto,630000.0,2 bds,2 ba,,,POINT (-79.398 43.708)


Buffer 1.5KM around each house adress to see how many facilities around the house

In [132]:
# Create buffer around each house poin
    
def create_circle(point, radius_meters=1500):
    # Convert radius from meters to degrees (approximate)
    radius_degrees = radius_meters / (40008000 / 360)

    # Create a buffer (circle) around the point
    circle_area= point.buffer(radius_degrees)

    return circle_area

toronto_housing_gdf = gpd.GeoDataFrame(toronto_housing, geometry='geometry')
toronto_housing_gdf['circle'] = toronto_housing_gdf['geometry'].apply(create_circle)
toronto_housing_gdf.head()

Unnamed: 0,Street address,City,State,Zip/Postal Code,Country,Latitude,Longitude,Neighbourhood,Region,Price,Bedrooms,Bathrooms,Square Footage,Home Type,geometry,circle
0,1787 Saint Clair Ave W #414,Toronto,ON,M6N1H9,Canada,43.673553,-79.461868,The Junction North,York,525000.0,2 bds,1 ba,,,POINT (-79.462 43.674),"POLYGON ((-79.448 43.674, -79.448 43.672, -79...."
1,509 Beecroft Rd #1209,Toronto,ON,M2N0A3,Canada,43.778652,-79.417947,Willowdale South,North York,848000.0,3 bds,2 ba,,,POINT (-79.418 43.779),"POLYGON ((-79.404 43.779, -79.405 43.777, -79...."
3,186 Macpherson Ave,Toronto,ON,M5R1W8,Canada,43.677945,-79.396728,The Annex,Central Toronto,2218000.0,3 bds,3 ba,,,POINT (-79.397 43.678),"POLYGON ((-79.383 43.678, -79.383 43.677, -79...."
4,195 McCaul St #909,Toronto,ON,M5T0E5,Canada,43.6567,-79.392206,Chinatown,Downtown Toronto,449900.0,Studio,1 ba,,,POINT (-79.392 43.657),"POLYGON ((-79.379 43.657, -79.379 43.655, -79...."
5,30 Roehampton Ave UNIT 1902,Toronto,ON,M4P0B9,Canada,43.708432,-79.397532,Davisville North,Central Toronto,630000.0,2 bds,2 ba,,,POINT (-79.398 43.708),"POLYGON ((-79.384 43.708, -79.384 43.707, -79...."


In [134]:
# Rename columns to have `geometry` as polygons and points as `point`
toronto_housing_gdf = toronto_housing_gdf.rename(columns={'geometry': 'point'})
toronto_housing_gdf = toronto_housing_gdf.rename(columns={'circle': 'geometry'})

toronto_housing_gdf.geometry.head()

0    POLYGON ((-79.448 43.674, -79.448 43.672, -79....
1    POLYGON ((-79.404 43.779, -79.405 43.777, -79....
3    POLYGON ((-79.383 43.678, -79.383 43.677, -79....
4    POLYGON ((-79.379 43.657, -79.379 43.655, -79....
5    POLYGON ((-79.384 43.708, -79.384 43.707, -79....
Name: geometry, dtype: geometry

In [136]:
# Display the first few rows to verify the changes
toronto_housing_gdf.head()


Unnamed: 0,Street address,City,State,Zip/Postal Code,Country,Latitude,Longitude,Neighbourhood,Region,Price,Bedrooms,Bathrooms,Square Footage,Home Type,point,geometry
0,1787 Saint Clair Ave W #414,Toronto,ON,M6N1H9,Canada,43.673553,-79.461868,The Junction North,York,525000.0,2 bds,1 ba,,,POINT (-79.462 43.674),"POLYGON ((-79.448 43.674, -79.448 43.672, -79...."
1,509 Beecroft Rd #1209,Toronto,ON,M2N0A3,Canada,43.778652,-79.417947,Willowdale South,North York,848000.0,3 bds,2 ba,,,POINT (-79.418 43.779),"POLYGON ((-79.404 43.779, -79.405 43.777, -79...."
3,186 Macpherson Ave,Toronto,ON,M5R1W8,Canada,43.677945,-79.396728,The Annex,Central Toronto,2218000.0,3 bds,3 ba,,,POINT (-79.397 43.678),"POLYGON ((-79.383 43.678, -79.383 43.677, -79...."
4,195 McCaul St #909,Toronto,ON,M5T0E5,Canada,43.6567,-79.392206,Chinatown,Downtown Toronto,449900.0,Studio,1 ba,,,POINT (-79.392 43.657),"POLYGON ((-79.379 43.657, -79.379 43.655, -79...."
5,30 Roehampton Ave UNIT 1902,Toronto,ON,M4P0B9,Canada,43.708432,-79.397532,Davisville North,Central Toronto,630000.0,2 bds,2 ba,,,POINT (-79.398 43.708),"POLYGON ((-79.384 43.708, -79.384 43.707, -79...."


At the final step, we will count the number of each type of facility within the 1500-meter buffer around each house

In [139]:

facility_types = {
    "hospital": "Hospital in area",
    "railway_station": "Railway Station in area",
    "school": "School in area",
    "university": "University in area",
    "college": "College in area",
    "supermarket": "Supermarket in area",
    "bus_stop": "Bus Stop in area",
    "bus_station": "Bus Station in area",
    "pub": "Pub in area",
    "restaurant": "Restaurant in area",
    "cafe": "Cafe in area",
    "cinema": "Cinema in area",
    "dentist": "Dentist in area",
    "fast_food": "Fast Food in area",
    "kindergartens": "Kindergartens in area",
    "malls": "Malls in area",
    "play_ground": "Play Ground in area",
    "parks": "Parks in area",
    "post_office": "Post Office in area"
}

# Add columns to toronto_housing_gdf DataFrame for each facility count, initializing them to 0 if they don’t already exist
for column in facility_types.values():
    if column not in toronto_housing_gdf.columns:
        toronto_housing_gdf[column] = 0

# Ensure CRS consistency between the housing and facilities data
if final_facilities.crs != toronto_housing_gdf.crs:
    final_facilities = final_facilities.to_crs(toronto_housing_gdf.crs)

# Combine bus_stop and bus_station into one facility type for analysis
final_facilities['type of facilities'] = final_facilities['type of facilities'].replace({
    'bus_stop': 'bus_stop',
    'bus_station': 'bus_stop'
})

# Combine university, college, and school into one facility type for analysis
final_facilities['type of facilities'] = final_facilities['type of facilities'].replace({
    'university': 'school',
    'college': 'school',
    'school': 'school'
})

# Count facilities within 1500m for each type and update `toronto_housing_gdf`
for facility_key, column_name in facility_types.items():
    # Filter facilities by type
    facilities_subset = final_facilities[final_facilities['type of facilities'] == facility_key]

     # Perform spatial join to find facilities within each buffer
    joined = gpd.sjoin(facilities_subset, toronto_housing_gdf[['geometry']], how='left', predicate='within')
    
    # Count facilities for each housing location and update counts
    counts = joined.groupby('index_right').size()
    toronto_housing_gdf[column_name] = toronto_housing_gdf.index.map(counts).fillna(0).astype(int)


# Display the updated DataFrame
toronto_housing_gdf.head()





Unnamed: 0,Street address,City,State,Zip/Postal Code,Country,Latitude,Longitude,Neighbourhood,Region,Price,...,Restaurant in area,Cafe in area,Cinema in area,Dentist in area,Fast Food in area,Kindergartens in area,Malls in area,Play Ground in area,Parks in area,Post Office in area
0,1787 Saint Clair Ave W #414,Toronto,ON,M6N1H9,Canada,43.673553,-79.461868,The Junction North,York,525000.0,...,66,20,0,0,31,0,0,0,0,2
1,509 Beecroft Rd #1209,Toronto,ON,M2N0A3,Canada,43.778652,-79.417947,Willowdale South,North York,848000.0,...,83,29,1,1,21,0,0,0,0,2
3,186 Macpherson Ave,Toronto,ON,M5R1W8,Canada,43.677945,-79.396728,The Annex,Central Toronto,2218000.0,...,103,55,1,1,58,0,0,0,0,3
4,195 McCaul St #909,Toronto,ON,M5T0E5,Canada,43.6567,-79.392206,Chinatown,Downtown Toronto,449900.0,...,454,229,4,0,362,0,0,0,0,15
5,30 Roehampton Ave UNIT 1902,Toronto,ON,M4P0B9,Canada,43.708432,-79.397532,Davisville North,Central Toronto,630000.0,...,99,43,4,3,69,0,0,0,0,7


In [141]:
toronto_housing_gdf = toronto_housing_gdf[['Street address', 'Zip/Postal Code','Latitude','Longitude', 'Neighbourhood','Region','Price','Bedrooms', 'Bathrooms','Square Footage','Hospital in area', 
                        'Railway Station in area', 'School in area','Supermarket in area','Bus Stop in area','Pub in area','Restaurant in area',  'Cafe in area','Dentist in area',
                        'Fast Food in area','Kindergartens in area', 'Malls in area',  'Play Ground in area',  'Parks in area',  'Post Office in area'   ]]
toronto_housing_gdf.to_csv('Toronto_Houses_Facilities_1.5km.csv')