In [283]:
import geopandas as gpd
import pandas as pd
import numpy as np
import json
import re
from shapely.geometry import Point
from shapely.geometry import shape
from shapely import wkt
from pathlib import Path

In [284]:
# Project folder layout: every data directory hangs off BASE_DIR.
BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
RAW_DIR = BASE_DIR.joinpath("data_raw")
WASTE_BINS_DIR = RAW_DIR.joinpath("waste_bins")

# Report each directory together with whether it exists on disk.
for label, folder in (
    ("BASE_DIR:", BASE_DIR),
    ("RAW_DIR:", RAW_DIR),
    ("WASTE_BINS_DIR:", WASTE_BINS_DIR),
):
    print(label, folder, folder.exists())

# List every file (not directory) below the waste-bins folder,
# shown relative to the project root and in sorted order.
waste_bin_files = [entry for entry in sorted(WASTE_BINS_DIR.rglob("*")) if entry.is_file()]
for entry in waste_bin_files:
    print(entry.relative_to(BASE_DIR))

BASE_DIR: C:\Projects\Toronto_Waste_Analytics True
RAW_DIR: C:\Projects\Toronto_Waste_Analytics\data_raw True
WASTE_BINS_DIR: C:\Projects\Toronto_Waste_Analytics\data_raw\waste_bins True
data_raw\waste_bins\OPTIONAL\Litter Bin Collection Frequency - 4326.csv
data_raw\waste_bins\OPTIONAL\Litter Bin Collection Frequency - 4326.geojson
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2020.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2021.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2022.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2023.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2024.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Collection Schedule\pickup-schedule-2025.csv
data_raw\waste_bins\OPTIONAL\Solid Waste Management Districts - TORONTO\citygcs_swms_districts_metadata_wgs84.pdf
data_raw\waste_bins\OPTIONAL\Solid 

In [285]:
# Load the street litter receptacle layer from its GeoJSON export.
geo_path = WASTE_BINS_DIR.joinpath("Street furniture-Litter receptacle data - 4326.geojson")
gdf_street = gpd.read_file(geo_path)

# Quick profile: frame size, coordinate reference system, geometry-type mix.
street_geom_types = gdf_street.geometry.geom_type.value_counts()
print("Street GeoJSON shape:", gdf_street.shape)
print("CRS:", gdf_street.crs)
print("Geom types:", street_geom_types)
gdf_street.head(3)

Street GeoJSON shape: (10457, 17)
CRS: EPSG:4326
Geom types: MultiPoint    10457
Name: count, dtype: int64


Unnamed: 0,_id,OBJECTID,ID,ADDRESSNUMBERTEXT,ADDRESSSTREET,FRONTINGSTREET,SIDE,FROMSTREET,DIRECTION,SITEID,WARD,BIA,ASSETTYPE,STATUS,BARCODE,SDE_STATE_ID,geometry
0,1,30,LR-07817,532,Annette St,Annette St,North,Runnymede Rd,West,954,4,,WR1,Existing,L1300954,,MULTIPOINT ((-79.48022 43.66008))
1,2,31,LR-07819,660,Annette St,Annette St,North,Windermere Ave,East,948,4,,WR1,Existing,L1300948,,MULTIPOINT ((-79.48371 43.65932))
2,3,32,LR-07822,1873,Bloor St W,Bloor St W,South,Colborne Lodge Dr,East,977,4,,WR2,Existing,L1300977,,MULTIPOINT ((-79.46464 43.6536))


In [286]:
# Missing values per column, largest first, keeping only columns that have any.
nulls = (
    gdf_street.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
nulls.head(20)

SDE_STATE_ID    10457
dtype: int64

In [287]:
# Geometry sanity checks on the street layer: count invalid and empty shapes.
street_geoms = gdf_street.geometry
invalid = (~street_geoms.is_valid).sum()
empty = street_geoms.is_empty.sum()
print("Invalid geometries:", invalid)
print("Empty geometries:", empty)

# Overall bounding box as [minx, miny, maxx, maxy].
print("total_bounds:", gdf_street.total_bounds)

Invalid geometries: 0
Empty geometries: 0
total_bounds: [-79.6388488   43.59046533 -79.12311282  43.84011157]


In [288]:
# Load the CSV export of the street litter receptacle dataset as a plain DataFrame.
csv_path = WASTE_BINS_DIR.joinpath("Street furniture-Litter receptacle data - 4326.csv")
df_street = pd.read_csv(csv_path)

# Show the dimensions and the dtype pandas inferred for each column.
street_csv_dtypes = df_street.dtypes
print("Street CSV shape:", df_street.shape)
print(street_csv_dtypes)
df_street.head(3)

Street CSV shape: (10457, 17)
_id                    int64
OBJECTID               int64
ID                    object
ADDRESSNUMBERTEXT     object
ADDRESSSTREET         object
FRONTINGSTREET        object
SIDE                  object
FROMSTREET            object
DIRECTION             object
SITEID                object
WARD                 float64
BIA                   object
ASSETTYPE             object
STATUS                object
BARCODE               object
SDE_STATE_ID         float64
geometry              object
dtype: object


Unnamed: 0,_id,OBJECTID,ID,ADDRESSNUMBERTEXT,ADDRESSSTREET,FRONTINGSTREET,SIDE,FROMSTREET,DIRECTION,SITEID,WARD,BIA,ASSETTYPE,STATUS,BARCODE,SDE_STATE_ID,geometry
0,1,30,LR-07817,532,Annette St,Annette St,North,Runnymede Rd,West,954,4.0,,WR1,Existing,L1300954,,"{""coordinates"": [[-79.4802204012433, 43.660081..."
1,2,31,LR-07819,660,Annette St,Annette St,North,Windermere Ave,East,948,4.0,,WR1,Existing,L1300948,,"{""coordinates"": [[-79.4837111829841, 43.659317..."
2,3,32,LR-07822,1873,Bloor St W,Bloor St W,South,Colborne Lodge Dr,East,977,4.0,,WR2,Existing,L1300977,,"{""coordinates"": [[-79.4646436092827, 43.653599..."


In [289]:
# Compare the column names of the CSV and GeoJSON exports in BOTH directions.
# A notebook cell only displays its last bare expression, so the first set
# difference used to be computed and silently thrown away; name and print
# both results so neither direction of the comparison is lost.
only_in_csv = set(df_street.columns) - set(gdf_street.columns)
only_in_geojson = set(gdf_street.columns) - set(df_street.columns)
print("Only in CSV:", only_in_csv)
print("Only in GeoJSON:", only_in_geojson)

set()

In [290]:
# Load the in-park waste bin layer from its shapefile.
park_shp = WASTE_BINS_DIR.joinpath("Solid-waste-in-park-assets-wgs84", "SWMS_PARK_BIN_WGS84.shp")
gdf_park = gpd.read_file(park_shp)

# Quick profile: frame size, coordinate reference system, geometry-type mix.
park_geom_types = gdf_park.geometry.geom_type.value_counts()
print("Park SHP shape:", gdf_park.shape)
print("CRS:", gdf_park.crs)
print("Geom types:", park_geom_types)
gdf_park.head(3)

Park SHP shape: (4768, 18)
CRS: EPSG:4326
Geom types: Point    4768
Name: count, dtype: int64


Unnamed: 0,FID,INSPEC_DAT,PARK_NAME,PARK_LCODE,WARD_NAME,WARD_SCODE,LIT_BTYPE,LIT_BCOUNT,REC_BTYPE,REC_BCOUNT,CON_STATUS,X_COORDI,Y_COORDI,LONGITUDE,LATITUDE,OBJECTID,RID,geometry
0,818,2012-11-22,SCARBOROUGH BLUFFS PARK,576,Scarborough Southwest (36),36,Litter Domed Lid Toter,1,Recycling Domed Lid Toter,1,Non Seasonal,325581.243,4840277.279,-79.241957,43.704154,10111,1,POINT (-79.24196 43.70415)
1,820,2012-11-22,SCARBOROUGH BLUFFS PARK,576,Scarborough Southwest (36),36,Litter Domed Lid Toter,1,Recycling Domed Lid Toter,1,Non Seasonal,325656.752,4840501.523,-79.241012,43.70617,10112,2,POINT (-79.24101 43.70617)
2,821,2012-11-22,DUNLOP PARK,1741,Scarborough Southwest (35),35,Litter Domed Lid Toter,1,Recycling Domed Lid Toter,1,Non Seasonal,323580.688,4841266.419,-79.266745,43.713111,10113,3,POINT (-79.26675 43.71311)


In [291]:
# Missing values per column in the park layer, largest first, non-zero only.
nulls = (
    gdf_park.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
if len(nulls) > 0:
    print(nulls.head(20))
else:
    print("No null values found")

# Geometry sanity checks and overall bounding box for the park layer.
park_geoms = gdf_park.geometry
print("Invalid geometries:", (~park_geoms.is_valid).sum())
print("Empty geometries:", park_geoms.is_empty.sum())
print("total_bounds:", gdf_park.total_bounds)

# Print both layers' CRS one under the other so a mismatch is easy to spot.
for label, layer in (("CRS street:", gdf_street), ("CRS park  :", gdf_park)):
    print(label, layer.crs)

REC_BTYPE     68
LIT_BTYPE     26
PARK_LCODE     2
dtype: int64
Invalid geometries: 0
Empty geometries: 0
total_bounds: [-79.62438483  43.58458967 -79.11802922  43.84812976]
CRS street: EPSG:4326
CRS park  : EPSG:4326


In [292]:
# Break multi-part geometries into one row per part, then renumber the index.
street_exploded = gdf_street.explode(index_parts=False)
gdf_street_pts = street_exploded.reset_index(drop=True)

# Before/after comparison of frame shape and geometry types.
for label, layer in (("Antes:", gdf_street), ("Después:", gdf_street_pts)):
    print(label, layer.shape, " | geom:", layer.geometry.geom_type.unique())

Antes: (10457, 17)  | geom: ['MultiPoint']
Después: (10457, 17)  | geom: ['Point']


In [293]:
# Locate the Excel export of the pedestrian network and check it is present.
BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
RAW_DIR = BASE_DIR.joinpath("data_raw")
PED_DIR = RAW_DIR.joinpath("pedestrian_proxy")

xlsx_path = PED_DIR.joinpath("Pedestrian Network Data - 4326.xlsx")
xlsx_found = xlsx_path.exists()
print("File exists?", xlsx_found)
print("Path:", xlsx_path)

File exists? True
Path: C:\Projects\Toronto_Waste_Analytics\data_raw\pedestrian_proxy\Pedestrian Network Data - 4326.xlsx


In [294]:
# Load the pedestrian network from its CSV export as a plain DataFrame.
# The root was previously assigned to a misspelled name ("BBASE_DIR"), which
# created an unused variable and made this cell depend on BASE_DIR surviving
# from an earlier cell; assign BASE_DIR here so the path is built from it.
BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
csv_path = BASE_DIR / "data_raw" / "pedestrian_proxy" / "Pedestrian Network Data - 4326.csv"

df_ped = pd.read_csv(csv_path)

# Dimensions and column names before any geometry parsing.
print("Shape:", df_ped.shape)
print("Columns:", df_ped.columns.tolist())

df_ped.head(3)

Shape: (87105, 11)
Columns: ['_id', 'OBJECTID', 'ROAD_TYPE', 'SIDEWALK_CODE', 'SIDEWALK_DESCRIPTION', 'CROSSWALK', 'CROSSWALK_TYPE', 'PX', 'PX_TYPE', 'LENGTH', 'geometry']


Unnamed: 0,_id,OBJECTID,ROAD_TYPE,SIDEWALK_CODE,SIDEWALK_DESCRIPTION,CROSSWALK,CROSSWALK_TYPE,PX,PX_TYPE,LENGTH,geometry
0,1,1,Local,7.0,Sidewalk on both sides,,,,,93.86768,"{""coordinates"": [[[-79.5639645086874, 43.73782..."
1,2,2,Collector,7.0,Sidewalk on both sides,,,,,32.546284,"{""coordinates"": [[[-79.567921505888, 43.636179..."
2,3,3,,2.0,Sidewalk on north side only,,,,,117.669206,"{""coordinates"": [[[-79.3779147846482, 43.67845..."


In [295]:

BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
csv_path = BASE_DIR.joinpath("data_raw", "pedestrian_proxy", "Pedestrian Network Data - 4326.csv")

df_ped = pd.read_csv(csv_path)

# Step 1: the "geometry" column holds JSON text; decode each value to a dict.
geom_dicts = df_ped["geometry"].map(json.loads)

# Step 2: turn each GeoJSON-style dict into a shapely geometry object.
geom = geom_dicts.map(shape)

# Step 3: build the GeoDataFrame from the attribute columns plus the parsed
# geometries, declaring the coordinates as EPSG:4326.
ped_attributes = df_ped.drop(columns=["geometry"])
gdf_ped = gpd.GeoDataFrame(ped_attributes, geometry=geom, crs="EPSG:4326")

ped_geom_types = gdf_ped.geometry.geom_type.value_counts()
print("Geo shape:", gdf_ped.shape)
print("CRS:", gdf_ped.crs)
print("Geom types:", ped_geom_types)


Geo shape: (87105, 11)
CRS: EPSG:4326
Geom types: MultiLineString    87105
Name: count, dtype: int64


In [296]:
# Reproject the pedestrian network to EPSG:26917 (a projected CRS) so that
# bounds and segment lengths come out in linear units rather than degrees.
gdf_ped_26917 = gdf_ped.to_crs(epsg=26917)

projected_lengths = gdf_ped_26917.geometry.length
print("CRS:", gdf_ped_26917.crs)
print("Bounds 26917:", gdf_ped_26917.total_bounds)
# The first few computed lengths can be compared against the LENGTH column.
print("Length check (first 3):", projected_lengths.head(3).tolist())

CRS: EPSG:26917
Bounds 26917: [ 609550.29954725 4826817.54702803  651405.98617441 4857438.31432199]
Length check (first 3): [93.8571831701813, 32.54194251063621, 117.66745434547892]


In [297]:
# Load the small population-by-age-group table for Toronto.
BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
POP_DIR = BASE_DIR.joinpath("data_raw", "population_context")

csv_path = POP_DIR.joinpath("population_toronto_2024_2025_age_groups.csv")
df_csv = pd.read_csv(csv_path)

# Dimensions and column names of the table.
population_columns = df_csv.columns.tolist()
print("CSV FILE")
print("Shape:", df_csv.shape)
print("Columns:", population_columns)
df_csv.head(3)

CSV FILE
Shape: (2, 5)
Columns: ['YEAR (JULY 1)', '0 to 14', '15 to 64', '65 Plus', 'TOTAL_CHECK']


Unnamed: 0,YEAR (JULY 1),0 to 14,15 to 64,65 Plus,TOTAL_CHECK
0,2024,802560,4694136,1049542,6546238
1,2025,788330,4603092,1076056,6467478


In [298]:
# Path to the population projections workbook inside the population folder.
xlsx_path = POP_DIR.joinpath("49_census_divisions_mof_population_projections_2024-2051.xlsx")

In [299]:
# Read the workbook with pandas' default settings, purely to inspect its raw
# layout (the first spreadsheet row is taken as the header).
df_xls = pd.read_excel(xlsx_path)

excel_columns = df_xls.columns.tolist()
print("EXCEL FILE")
print("Shape:", df_xls.shape)
print("Columns:", excel_columns)

df_xls.head(3)

EXCEL FILE
Shape: (4121, 118)
Columns: ["Population Projections for Ontario's 49 Census Divisions by Age and Gender, 2024-2051", 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 

Unnamed: 0,"Population Projections for Ontario's 49 Census Divisions by Age and Gender, 2024-2051",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114,Unnamed: 115,Unnamed: 116,Unnamed: 117
0,Sources: Statistics Canada for 2024 and Ontari...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,BROAD AGE GROUPS,,,5-YEAR AGE GROUPS,,,,,,,,,,,,,,,,,,,SINGLE-YEAR OF AGE,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [300]:
BASE_DIR = Path(r"C:\Projects\Toronto_Waste_Analytics")
# Location of the TTC stops table inside the transit feed folder.
stops_path = BASE_DIR.joinpath(
    "data_raw",
    "transit_points",
    "TTC Routes and Schedules Data",
    "stops.txt",
)

# Despite the .txt extension the file is read with the default CSV parser.
df_stops = pd.read_csv(stops_path)

stop_columns = df_stops.columns.tolist()
print("Shape:", df_stops.shape)
print("Columns:", stop_columns)

df_stops.head(3)

Shape: (9322, 12)
Columns: ['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat', 'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station', 'stop_timezone', 'wheelchair_boarding']


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,662,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,1
1,929,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,940,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2


In [301]:
# Build point geometries from the stop longitude (x) and latitude (y) columns,
# then attach them to the stops table, declaring the coordinates as EPSG:4326.
stop_points = gpd.points_from_xy(df_stops["stop_lon"], df_stops["stop_lat"])
gdf_stops = gpd.GeoDataFrame(df_stops, geometry=stop_points, crs="EPSG:4326")

stop_geom_types = gdf_stops.geometry.geom_type.value_counts()
print("Geo shape:", gdf_stops.shape)
print("CRS:", gdf_stops.crs)
print("Geom types:", stop_geom_types)

Geo shape: (9322, 13)
CRS: EPSG:4326
Geom types: Point    9322
Name: count, dtype: int64


In [302]:
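# Reviewed all datasets in the data_raw folder.
# Checked file formats, row counts, and columns.
# Verified geometry and CRS for spatial datasets.
# Confirmed the street and park bin layers are stored in EPSG:4326; the pedestrian network
# and transit stops were built from CSV/text files and assigned EPSG:4326.
# Identified the roles of the waste bins, pedestrian network, transit stops, and population data.
# No cleaning was performed at this stage; the only transformations were exploratory checks
# (MultiPoint explode, reprojection to EPSG:26917) kept in separate variables.
# Data preparation will be done in later notebooks.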