## Extracting CTA Bus stops file

In [1]:
# importing required libraries
import zipfile
import os
import geopandas as gpd
from pykml import parser
from lxml import etree
import xml.etree.ElementTree as ET
import folium
from folium.plugins import MarkerCluster
import requests
import pandas as pd
from shapely.ops import unary_union
import random
from shapely.geometry import Point, box
from folium.plugins import FastMarkerCluster

In [5]:
# Extract the KMZ file to get KML file
kmz_file = 'C:/Users/kaur6/Downloads/Urban Analytics/CTA_BusStops.kmz'

with zipfile.ZipFile(kmz_file, 'r') as kmz:
    kmz.extractall('extracted_kmz')

## Visualizing CTA bus stops on Map

In [8]:
# Load the KML file
kml_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_b/CTA_BusStops.kml"
tree = ET.parse(kml_file)
root = tree.getroot()

# Define KML namespace
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Extract all Placemark elements
placemarks = root.findall('.//kml:Placemark', namespaces=namespace)

# Extract coordinates (longitude, latitude)
bus_stops = []
for placemark in placemarks:
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        # KML format: longitude,latitude
        coords = coordinates.text.strip().split(',')
        longitude = float(coords[0])
        latitude = float(coords[1])
        bus_stops.append((latitude, longitude))

# Create a map centered at the first bus stop
if bus_stops:
    first_stop = bus_stops[0]
    m = folium.Map(location=first_stop, zoom_start=12)

    # Use MarkerCluster for efficiency
    marker_cluster = MarkerCluster().add_to(m)

    # Add all bus stops to the cluster
    for lat, lon in bus_stops:
        folium.Marker(
            [lat, lon], 
            popup=f"Bus Stop: {lat}, {lon}"
        ).add_to(marker_cluster)

    # Save and display the map
    output_file = "C:/Users/kaur6/Downloads/Urban Analytics/bus_stops_map.html"
    m.save(output_file)
    print(f"✅ Map saved as '{output_file}'. Open this file in a browser to view.")

else:
    print("❌ No bus stops found in the KML file.")

✅ Map saved as 'C:/Users/kaur6/Downloads/Urban Analytics/bus_stops_map.html'. Open this file in a browser to view.


## Extracting CTA Train Stations

In [7]:
# Extract the KMZ file to get KML file
kmz_r_file = 'C:/Users/kaur6/Downloads/Urban Analytics/CTA_RailStations.kmz'  # Path to your KMZ file
with zipfile.ZipFile(kmz_r_file, 'r') as kmz:
    kmz.extractall('extracted_kmz')

## Visualizing CTA train stations on Map

In [None]:
# Load the KML file
kml_r_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_r/CTA_RailStations.kml"
tree = ET.parse(kml_r_file)
root = tree.getroot()

# Define KML namespace
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Extract all Placemark elements
placemarks = root.findall('.//kml:Placemark', namespaces=namespace)

# Extract coordinates (longitude, latitude)
rail_stations = []
for placemark in placemarks:
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        # KML format: longitude,latitude
        coords = coordinates.text.strip().split(',')
        longitude = float(coords[0])
        latitude = float(coords[1])
        rail_stations.append((latitude, longitude))

# Create a map centered at the first rail station
if rail_stations:
    first_station = rail_stations[0]
    m = folium.Map(location=first_station, zoom_start=12)

    # Add markers for all bus stops
    for lat, lon in rail_stations:
        folium.Marker([lat, lon], popup=f"Rail Station: {lat}, {lon}").add_to(m)

    # Save and display the map
    m.save("rail_stations_map.html")
    print("Map saved as 'rail_stations_map.html'. Open this file in a browser to view.")

else:
    print("No rail stations found in the KML file.")

## Getting Metra Stations in Cook County from Overpass API

In [5]:
# Overpass Turbo API URL
overpass_url = "http://overpass-api.de/api/interpreter"
query = """
[out:json];
area[name="Cook County"]->.searchArea;
node["railway"="station"]["operator"~"Metra"](area.searchArea);
out body;
"""

# Fetch data from Overpass API
response = requests.get(overpass_url, params={"data": query})
data = response.json()

# Extract relevant information (ID, name, latitude, longitude)
stations = []
for element in data["elements"]:
    if "lat" in element and "lon" in element:
        name = element["tags"].get("name", "Unknown Station")
        stations.append({
            "ID": element["id"],
            "Name": name,
            "Latitude": element["lat"],
            "Longitude": element["lon"]
        })

# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(stations, geometry=gpd.points_from_xy(
    [s["Longitude"] for s in stations], [s["Latitude"] for s in stations]
))

# Set CRS for Metra stations (WGS 84 - EPSG:4326)
gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

# Save as GeoJSON and CSV
geojson_path = "metra_stations_cook_county.geojson"
csv_path = "metra_stations_cook_county.csv"
gdf.to_file(geojson_path, driver="GeoJSON")
gdf.drop(columns=["geometry"]).to_csv(csv_path, index=False)

print(f"Saved Metra stations to {geojson_path} and {csv_path}")

Saved Metra stations to metra_stations_cook_county.geojson and metra_stations_cook_county.csv


## Visualizing CTA and Metra Stations on same map

In [6]:
# Define KML namespace
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Load CTA Rail Stations KML file
kml_r_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_r/CTA_RailStations.kml"
tree = ET.parse(kml_r_file)
root = tree.getroot()

# Extract all Placemark elements
placemarks = root.findall('.//kml:Placemark', namespaces=namespace)

# Extract CTA rail station coordinates
cta_stations = []
for placemark in placemarks:
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        coords = coordinates.text.strip().split(',')
        longitude = float(coords[0])
        latitude = float(coords[1])
        cta_stations.append((latitude, longitude))

# Print number of CTA stations
print(f"Number of CTA Rail Stations: {len(cta_stations)}")

# Convert CTA data to a GeoDataFrame
cta_gdf = gpd.GeoDataFrame(cta_stations, columns=["Latitude", "Longitude"], 
                           geometry=[Point(lon, lat) for lat, lon in cta_stations])

# Load Metra Rail Stations from GeoJSON
metra_gdf = gpd.read_file("C:/Users/kaur6/Downloads/Urban Analytics/metra_stations_cook_county.geojson")

# Print number of Metra stations
print(f"Number of Metra Rail Stations: {len(metra_gdf)}")

# Create a Folium map centered at an average location
map_center = [cta_gdf["Latitude"].mean(), cta_gdf["Longitude"].mean()]
m = folium.Map(location=map_center, zoom_start=11)

# Add CTA stations (Blue markers)
for idx, row in cta_gdf.iterrows():
    folium.Marker(
        location=[row["Latitude"], row["Longitude"]],
        popup=f"CTA Rail Station: {row['Latitude']}, {row['Longitude']}",
        icon=folium.Icon(color="blue", icon="train")
    ).add_to(m)

# Add Metra stations (Red markers)
for idx, row in metra_gdf.iterrows():
    folium.Marker(
        location=[row["Latitude"], row["Longitude"]],
        popup=f"Metra Station: {row['Name']}",
        icon=folium.Icon(color="red", icon="train")
    ).add_to(m)

# Save and display the map
m.save("cta_metra_stations_map.html")
print("Map saved as 'cta_metra_stations_map.html'. Open this file in a browser to view.")

Number of CTA Rail Stations: 144
Number of Metra Rail Stations: 132
Map saved as 'cta_metra_stations_map.html'. Open this file in a browser to view.


## Getting valid unique pins from the pins of cook county dataset

In [5]:
# File paths
input_file = "C:/Users/kaur6/Downloads/Urban Analytics/pin_lat_long.csv"
output_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_on_map.html"

# Define chunk size
chunk_size = 100000 

# Initialize the map
map_center = [0, 0]  # Placeholder until we get valid coordinates
m = folium.Map(location=map_center, zoom_start=12)

# Initialize total pin counter
total_pins = 0
chunk_count = 0 

# Process data in chunks
for chunk in pd.read_csv(input_file, chunksize=chunk_size):
    chunk_count += 1
    print(f"\nProcessing Chunk {chunk_count}...")

    # Check for NaN values and remove them
    before_drop = len(chunk)
    chunk = chunk.dropna(subset=['latitude', 'longitude'])
    after_drop = len(chunk)
    
    print(f"Chunk {chunk_count}: {before_drop} rows → {after_drop} valid rows after NaN removal.")

    # If all rows are NaN, skip this chunk
    if chunk.empty:
        print(f"Chunk {chunk_count} is empty after NaN removal. Skipping...")
        continue

    # Convert chunk into a GeoDataFrame
    try:
        gdf = gpd.GeoDataFrame(chunk, 
                               geometry=gpd.points_from_xy(chunk['longitude'], chunk['latitude']),
                               crs="EPSG:4326")
    except Exception as e:
        print(f"Error in Chunk {chunk_count}: {e}")
        continue  

    # Update total pin count
    total_pins += len(gdf)
    print(f"Chunk {chunk_count}: Added {len(gdf)} pins. Total pins so far: {total_pins}")

    # Update map center with first valid chunk
    if total_pins == len(gdf):
        map_center = [gdf["latitude"].mean(), gdf["longitude"].mean()]
        m.location = map_center
        m.zoom_start = 12

    # Add markers for each valid row (Limit to first 5000 to avoid slowing down)
    for idx, row in gdf.iterrows():
        folium.Marker(
            location=[row["latitude"], row["longitude"]],
            popup=f"PIN: {row.get('pin', 'No pin')}",
            icon=folium.Icon(color="blue", icon="info-sign")
        ).add_to(m)
        
        if idx >= 5000:  # Prevent too many markers slowing down the map
            break

    print(f"Chunk {chunk_count} processing complete.\n")

# Save the map after processing all chunks
m.save(output_file)
print(f"\nMap saved as '{output_file}'. Open this file in a browser to view.")
print(f"Total number of valid PINs: {total_pins}")


Processing Chunk 1...
Chunk 1: 100000 rows → 77896 valid rows after NaN removal.
Chunk 1: Added 77896 pins. Total pins so far: 77896
Chunk 1 processing complete.


Processing Chunk 2...
Chunk 2: 100000 rows → 78087 valid rows after NaN removal.
Chunk 2: Added 78087 pins. Total pins so far: 155983
Chunk 2 processing complete.


Processing Chunk 3...
Chunk 3: 100000 rows → 78009 valid rows after NaN removal.
Chunk 3: Added 78009 pins. Total pins so far: 233992
Chunk 3 processing complete.


Processing Chunk 4...
Chunk 4: 100000 rows → 77958 valid rows after NaN removal.
Chunk 4: Added 77958 pins. Total pins so far: 311950
Chunk 4 processing complete.


Processing Chunk 5...
Chunk 5: 100000 rows → 78032 valid rows after NaN removal.
Chunk 5: Added 78032 pins. Total pins so far: 389982
Chunk 5 processing complete.


Processing Chunk 6...
Chunk 6: 100000 rows → 78007 valid rows after NaN removal.
Chunk 6: Added 78007 pins. Total pins so far: 467989
Chunk 6 processing complete.


Processing

## Getting pins which are in distance of 0.5 miles from a train station

In [10]:
# File paths
pin_file = "C:/Users/kaur6/Downloads/Urban Analytics/pin_lat_long.csv"
metra_file = "C:/Users/kaur6/Downloads/Urban Analytics/metra_stations_cook_county.geojson"
cta_kml_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_r/CTA_RailStations.kml"

# Load PIN data
pins_df = pd.read_csv(pin_file)

# Drop rows with missing coordinates
pins_df = pins_df.dropna(subset=['latitude', 'longitude'])

# Convert PINs to a GeoDataFrame
pins_gdf = gpd.GeoDataFrame(
    pins_df, 
    geometry=gpd.points_from_xy(pins_df['longitude'], pins_df['latitude']),
    crs="EPSG:4326"
)

# Convert to metric projection for distance calculation (Meters)
pins_gdf = pins_gdf.to_crs("EPSG:3857")

# Load Metra stations
metra_gdf = gpd.read_file(metra_file)

# Convert Metra stations to metric CRS
metra_gdf = metra_gdf.to_crs("EPSG:3857")

# Load CTA Rail Stations from KML
import xml.etree.ElementTree as ET

# Define KML namespace
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Parse KML file
tree = ET.parse(cta_kml_file)
root = tree.getroot()

# Extract CTA stations
cta_stations = []
for placemark in root.findall('.//kml:Placemark', namespaces=namespace):
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        lon, lat, _ = map(float, coordinates.text.strip().split(','))
        cta_stations.append(Point(lon, lat))

# Convert CTA stations to a GeoDataFrame
cta_gdf = gpd.GeoDataFrame(geometry=cta_stations, crs="EPSG:4326")
cta_gdf = cta_gdf.to_crs("EPSG:3857")  # Convert to meters

# Combine CTA and Metra station geometries using pd.concat()
all_stations_gdf = pd.concat([cta_gdf, metra_gdf], ignore_index=True)

# Create buffer zone of 0.5 miles (≈ 804 meters)
station_buffer = all_stations_gdf.buffer(804)

# Find PINs within the buffer zone
pins_near_stations = pins_gdf[pins_gdf.geometry.within(unary_union(station_buffer))]

# Convert back to Latitude/Longitude for mapping
pins_near_stations = pins_near_stations.to_crs("EPSG:4326")

# Save results
pins_near_stations.to_csv("C:/Users/kaur6/Downloads/Urban Analytics/pins_near_train_stations.csv", index=False)

print(f"✅ Found {len(pins_near_stations)} PINs within 0.5 miles of a train station.")

✅ Found 142402 PINs within 0.5 miles of a train station.


## Visualizing 5000 pins on the map due to memory constraints

In [7]:
# File paths
pins_near_stations_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_train_stations.csv"
metra_file = "C:/Users/kaur6/Downloads/Urban Analytics/metra_stations_cook_county.geojson"
cta_kml_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_r/CTA_RailStations.kml"

# Load the results of PINs near stations
pins_near_stations = pd.read_csv(pins_near_stations_file)

# Convert to a GeoDataFrame
pins_near_stations_gdf = gpd.GeoDataFrame(
    pins_near_stations,
    geometry=gpd.points_from_xy(pins_near_stations['longitude'], pins_near_stations['latitude']),
    crs="EPSG:4326"
)

# Load Metra stations
metra_gdf = gpd.read_file(metra_file)

# Load CTA Rail Stations from KML
import xml.etree.ElementTree as ET

# Define KML namespace
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Parse KML file
tree = ET.parse(cta_kml_file)
root = tree.getroot()

# Extract CTA stations
cta_stations = []
for placemark in root.findall('.//kml:Placemark', namespaces=namespace):
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        lon, lat, _ = map(float, coordinates.text.strip().split(','))
        cta_stations.append((lat, lon))

# Initialize the map centered around an average location
map_center = [pins_near_stations_gdf["latitude"].mean(), pins_near_stations_gdf["longitude"].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# Add Metra station markers
for _, row in metra_gdf.iterrows():
    folium.Marker(
        location=[row.geometry.y, row.geometry.x],
        popup=f"Metra Station: {row.geometry.y}, {row.geometry.x}",
        icon=folium.Icon(color="red", icon="info-sign")
    ).add_to(m)

# Add CTA station markers
for lat, lon in cta_stations:
    folium.Marker(
        location=[lat, lon],
        popup=f"CTA Station: {lat}, {lon}",
        icon=folium.Icon(color="green", icon="info-sign")
    ).add_to(m)

# Sample 1000 pins randomly
sampled_pins = pins_near_stations_gdf.sample(n=5000, random_state=42)

# Use MarkerCluster to group pins efficiently
marker_cluster = MarkerCluster().add_to(m)

# Add sampled PIN markers
for _, row in sampled_pins.iterrows():
    folium.Marker(
        location=[row["latitude"], row["longitude"]],
        popup=f"PIN: {row.get('pin', 'No pin')} is within 0.5 miles of a station",
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(marker_cluster)

# Save the map to an HTML file
output_map_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_train_stations_map.html"
m.save(output_map_file)

print(f"✅ Map saved as '{output_map_file}'. Open this file in a browser to view.")

✅ Map saved as 'C:/Users/kaur6/Downloads/Urban Analytics/pins_near_train_stations_map.html'. Open this file in a browser to view.


## Getting pins which lie in distance of 0.25 miles from CTA bus stops

In [9]:
# File paths
pin_file = "C:/Users/kaur6/Downloads/Urban Analytics/pin_lat_long.csv"
cta_bus_stop_kml_file = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_b/CTA_BusStops.kml"

# Load PIN data
pins_df = pd.read_csv(pin_file)

# Drop rows with missing coordinates
pins_df = pins_df.dropna(subset=['latitude', 'longitude'])

# Convert PINs to a GeoDataFrame
pins_gdf = gpd.GeoDataFrame(
    pins_df, 
    geometry=gpd.points_from_xy(pins_df['longitude'], pins_df['latitude']),
    crs="EPSG:4326"
)

# Convert to metric projection for distance calculation (Meters)
pins_gdf = pins_gdf.to_crs("EPSG:3857")

# Load CTA Bus Stops from KML
namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# Parse KML file
tree = ET.parse(cta_bus_stop_kml_file)
root = tree.getroot()

# Extract CTA bus stops
cta_bus_stops = []
for placemark in root.findall('.//kml:Placemark', namespaces=namespace):
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        lon, lat, _ = map(float, coordinates.text.strip().split(','))
        cta_bus_stops.append(Point(lon, lat))

# Convert CTA bus stops to a GeoDataFrame
cta_bus_stops_gdf = gpd.GeoDataFrame(geometry=cta_bus_stops, crs="EPSG:4326")
cta_bus_stops_gdf = cta_bus_stops_gdf.to_crs("EPSG:3857")  # Convert to meters

# Create buffer zone of 0.25 miles (≈ 402 meters)
bus_stop_buffer = cta_bus_stops_gdf.buffer(402)

# Find PINs within the buffer zone
pins_near_bus_stops = pins_gdf[pins_gdf.geometry.within(unary_union(bus_stop_buffer))]

# Convert back to Latitude/Longitude for mapping
pins_near_bus_stops = pins_near_bus_stops.to_crs("EPSG:4326")

# Save results
pins_near_bus_stops.to_csv("C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stops.csv", index=False)

print(f"✅ Found {len(pins_near_bus_stops)} PINs within 0.25 miles of a CTA bus stop.")

✅ Found 389675 PINs within 0.25 miles of a CTA bus stop.


## Visualizing 5000 pins on the map due to memory constraints

In [12]:
# File paths
pins_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stops.csv"
cta_bus_stops_kml = "C:/Users/kaur6/Downloads/Urban Analytics/extracted_kmz_b/CTA_BusStops.kml"

# Load the data
pins_df = pd.read_csv(pins_file)

# Randomly sample 5000 PINs
sampled_pins = pins_df.sample(n=5000, random_state=42)

# Load CTA Bus Stops from KML
import xml.etree.ElementTree as ET

namespace = {'kml': 'http://www.opengis.net/kml/2.2'}
tree = ET.parse(cta_bus_stops_kml)
root = tree.getroot()

cta_bus_stops = []
for placemark in root.findall('.//kml:Placemark', namespaces=namespace):
    coordinates = placemark.find('.//kml:coordinates', namespaces=namespace)
    if coordinates is not None:
        lon, lat, _ = map(float, coordinates.text.strip().split(','))
        cta_bus_stops.append((lat, lon))

# Create the map centered at the first sampled pin
first_sampled_pin = sampled_pins.iloc[0]
m = folium.Map(location=[first_sampled_pin['latitude'], first_sampled_pin['longitude']], zoom_start=12)

# Create MarkerCluster for PINs
pins_marker_cluster = MarkerCluster().add_to(m)

# Add sampled PINs to the MarkerCluster with a blue icon
for _, row in sampled_pins.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=f"PIN: {row['pin']}",
        icon=folium.Icon(color='blue')  # Set pin icon to blue
    ).add_to(pins_marker_cluster)

# Create MarkerCluster for CTA Bus Stops
bus_stops_marker_cluster = MarkerCluster().add_to(m)

# Add CTA Bus Stops to the map with red icon
for lat, lon in cta_bus_stops:
    folium.Marker(
        location=[lat, lon],
        popup="CTA Bus Stop",
        icon=folium.Icon(color='red', icon='info-sign')  # Red icon for bus stops
    ).add_to(bus_stops_marker_cluster)

# Save the map
m.save("C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stops_map.html")

print("Map with sampled PINs and all CTA Bus Stops saved as 'pins_near_bus_stops_map.html'. Open this file in a browser to view.")

Map with sampled PINs and all CTA Bus Stops saved as 'pins_near_bus_stops_map.html'. Open this file in a browser to view.


## Merging the pins close to stations and close to bus stops to a common csv making sure the pins are unique

In [13]:
# File paths
bus_pins_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stops.csv"
train_pins_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_train_stations.csv"
output_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stop_or_train_station.csv"

# Load the data
bus_pins_df = pd.read_csv(bus_pins_file)
train_pins_df = pd.read_csv(train_pins_file)

# Concatenate both DataFrames
combined_pins_df = pd.concat([bus_pins_df, train_pins_df], ignore_index=True)

# Ensure unique pins based on the 'pin' column
combined_pins_df = combined_pins_df.drop_duplicates(subset=['pin'], keep='first')

# Save the resulting DataFrame to a new CSV file
combined_pins_df.to_csv(output_file, index=False)

# Print the number of unique pins
print(f"Number of unique pins in the merged file: {len(combined_pins_df)}")
print(f"Unique pins merged and saved to: {output_file}")

Number of unique pins in the merged file: 438395
Unique pins merged and saved to: C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stop_or_train_station.csv


## Getting the zoning class for the parcels in the pins_near_bus_stop_or_train_station.csv data

## Firstly using the Chicago Zoning district data

In [3]:
# Load datasets
parcel_data = pd.read_csv('C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stop_or_train_station.csv')
zoning_data = pd.read_csv('C:/Users/kaur6/Downloads/Urban Analytics/Zoning_districts.csv')  

# Convert to GeoDataFrame
parcel_gdf = gpd.GeoDataFrame(parcel_data, geometry=gpd.points_from_xy(parcel_data['longitude'], parcel_data['latitude']))
zoning_gdf = gpd.GeoDataFrame(zoning_data, geometry=gpd.GeoSeries.from_wkt(zoning_data['the_geom']))

# Set CRS
parcel_gdf.set_crs('EPSG:4326', allow_override=True, inplace=True)
zoning_gdf.set_crs('EPSG:4326', allow_override=True, inplace=True)

# Perform spatial join
joined_gdf = gpd.sjoin(parcel_gdf, zoning_gdf, how='left', predicate='within')

# Keep only required columns
final_gdf = joined_gdf[parcel_data.columns.tolist() + ['ZONE_CLASS']].copy()

# Remove duplicates based on unique parcel identifier (e.g., 'pin')
#final_gdf = final_gdf.drop_duplicates(subset=['pin'])

# Fill missing ZONE_CLASS with 'Unknown'
final_gdf.loc[:, 'ZONE_CLASS'] = final_gdf['ZONE_CLASS'].fillna('Unknown')

# Drop geometry column to reduce file size
final_gdf = final_gdf.drop(columns=['geometry'])

# Save final cleaned dataset
final_gdf.to_csv('C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stop_or_train_station_with_zone_class.csv', index=False)

In [5]:
# Count rows with 'Unknown' in 'ZONE_CLASS'
unknown_count = joined_gdf[final_gdf['ZONE_CLASS'] == 'Unknown'].shape[0]

# Count rows where 'ZONE_CLASS' is not 'Unknown'
not_unknown_count = joined_gdf[final_gdf['ZONE_CLASS'] != 'Unknown'].shape[0]

# Print the results
print(f"Rows with 'Unknown' ZONE_CLASS: {unknown_count}")
print(f"Rows with non-'Unknown' ZONE_CLASS: {not_unknown_count}")

Rows with 'Unknown' ZONE_CLASS: 66471
Rows with non-'Unknown' ZONE_CLASS: 371928


## Now, getting Zoning class for the pins whose zone class was Unknown

In [15]:
# Load the GeoJSON file
geojson_path = "C:/Users/kaur6/Downloads/Urban Analytics/Zoning_Cook_County.geojson"  
gdf = gpd.read_file(geojson_path)

# Ensure the 'geometry' column is set as the active geometry column
gdf = gdf.set_geometry('geometry')

# Create a spatial index for faster geometry lookup
gdf.sindex  # This creates the spatial index for faster lookups

# Load the CSV file containing PINs (with Unknown ZONE_CLASS)
csv_path = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bus_stop_or_train_station_with_zone_class.csv" 
df = pd.read_csv(csv_path)

# Function to find the ZoneID for a given latitude and longitude using spatial indexing
def get_zone_id(lat, lon, gdf):
    point = Point(lon, lat)  # Shapely uses (longitude, latitude) format
    # Create a bounding box (minx, miny, maxx, maxy) around the point
    minx, miny, maxx, maxy = point.bounds
    bbox = box(minx, miny, maxx, maxy)  # Create a Shapely geometry for the bounding box
    
    # Use spatial index query to find geometries that intersect the bounding box
    possible_matches_index = list(gdf.sindex.query(bbox, predicate='intersects'))
    
    # Loop through the possible matches to check if the point is contained
    for idx in possible_matches_index:
        if gdf.iloc[idx].geometry.contains(point):
            return gdf.iloc[idx]["ZoneID"]
    
    return "Unknown"  # If no match is found

# Iterate over each row of the DataFrame where ZONE_CLASS is "Unknown" using apply()
def update_zone_class(row):
    if row["ZONE_CLASS"] == "Unknown":
        # Extract the latitude and longitude
        latitude = row["latitude"]
        longitude = row["longitude"]
        
        # Get the ZoneID using the get_zone_id function
        zone_id = get_zone_id(latitude, longitude, gdf)
        
        # Update the ZONE_CLASS column with the found ZoneID
        return zone_id
    return row["ZONE_CLASS"]

# Apply the function to update the 'ZONE_CLASS' column
df["ZONE_CLASS"] = df.apply(update_zone_class, axis=1)

# Save the updated DataFrame to a new CSV file
output_csv_path = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bs_ts_with_zone_class.csv"  # Specify the output file path
df.to_csv(output_csv_path, index=False)

print(f"Updated CSV saved as {output_csv_path}")

Updated CSV saved as C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bs_ts_with_zone_class.csv


In [16]:
unknown_count = df[df["ZONE_CLASS"] == "Unknown"].shape[0]
print(f"Number of rows with ZONE_CLASS as 'Unknown': {unknown_count}")

Number of rows with ZONE_CLASS as 'Unknown': 66166


In [7]:
# Load the CSV file, ensuring PIN is read as a string
csv_path = "C:/Users/kaur6/Downloads/Urban Analytics/pins_near_bs_ts_with_zone_class.csv"
df = pd.read_csv(csv_path)  # Force PIN to be a string

# Define the high-density zoning list
high_density_zones = [
    "R6", "R7", "R8", "C8",
    "RT-4", "RT-3.5", "RM-5", "RM-6.5", "RM-5.5", "RM-6", "RM-4.5",
    "B1-1", "B3-1", "B3-2", "B2-2", "B3-3", "B1-2", "B1-3", "B2-3", "B2-5", "B3-5",
    "C1-2", "C1-1", "C1-3", "C1-5", "C2-2", "C2-1", "C2-3", "C2-5", "C3-2", "C3-3", "C3-5",
    "DX-3", "DX-5", "DX-7", "DX-12", "DX-16",
    "PD 62", "PD 102", "PD 112", "PD 135", "PD 143", "PD 157", "PD 204", "PD 236", "PD 250", "PD 280",
    "PD 314", "PD 355", "PD 362", "PD 368", "PD 384", "PD 395", "PD 412", "PD 416", "PD 420", "PD 421",
    "PD 422", "PD 427", "PD 441", "PD 447", "PD 456", "PD 462", "PD 466", "PD 483", "PD 491", "PD 535",
    "PD 536", "PD 537", "PD 546", "PD 549", "PD 555", "PD 599", "PD 601", "PD 609", "PD 610", "PD 615",
    "PD 622", "PD 623", "PD 631", "PD 632", "PD 636", "PD 637", "PD 645", "PD 650", "PD 651", "PD 661",
    "PD 678", "PD 684", "PD 686", "PD 687", "PD 690", "PD 691", "PD 692", "PD 700", "PD 704", "PD 705",
    "PD 707", "PD 711", "PD 712", "PD 713", "PD 714", "PD 715", "PD 720", "PD 723", "PD 734", "PD 737", 
    "PD 744", "PD 749", "PD 764", "PD 771", "PD 774", "PD 777", "PD 783", "PD 786", "PD 788", "PD 797",
    "PD 803", "PD 817", "PD 826", "PD 827", "PD 828", "PD 831", "PD 832", "PD 833", "PD 836", "PD 838",
    "PD 839", "PD 840", "PD 849", "PD 853", "PD 854", "PD 855", "PD 858", "PD 865", "PD 866", "PD 869",
    "PD 873", "PD 879", "PD 885", "PD 888", "PD 893", "PD 896", "PD 897", "PD 911", "PD 913", "PD 917",
    "PD 918", "PD 921", "PD 928", "PD 929", "PD 930", "PD 937", "PD 939", "PD 945", "PD 948", "PD 955",
    "PD 963", "PD 976", "PD 984", "PD 985", "PD 986", "PD 999", "PD 1001", "PD 1004", "PD 1008", "PD 1009",
    "PD 1013", "PD 1017", "PD 1024", "PD 1027", "PD 1043", "PD 1046", "PD 1064", "PD 1068", "PD 1084", "PD 1095",
    "PD 1101", "PD 1103", "PD 1120", "PD 1131", "PD 1141", "PD 1145", "PD 1169", "PD 1185", "PD 1189", "PD 1197",
    "PD 1206", "PD 1215", "PD 1220", "PD 1237", "PD 1259", "PD 1262", "PD 1270", "PD 1276", "PD 1287", "PD 1289",
    "PD 1292", "PD 1294", "PD 1304", "PD 1305", "PD 1308", "PD 1313", "PD 1319", "PD 1327", "PD 1332", "PD 1335",
    "PD 1340", "PD 1345", "PD 1352", "PD 1357", "PD 1358", "PD 1364", "PD 1370", "PD 1373", "PD 1374", "PD 1379",
    "PD 1399", "PD 1401", "PD 1419", "PD 1423", "PD 1429", "PD 1430", "PD 1439", "PD 1444", "PD 1468", "PD 1473",
    "PD 1481", "PD 1484", "PD 1492", "PD 1501", "PD 1505", "PD 1513", "PD 1514", "PD 1520", "PD 1535", "PD 1536",
    "PD 1540", "PD 1541", "PD 1542", "PD 1543", "PD 1548", "PD 1550", "PD 1552", "PD 1553", "PD 1559", "PD 1568", 
    "PD 1569", "PD 1582", "PD 1584"
]

# Filter the DataFrame for high-density zoning
df_high_density = df[df['ZONE_CLASS'].isin(high_density_zones)]

# Save filtered data, ensuring PIN remains a string
output_path = "C:/Users/kaur6/Downloads/Urban Analytics/filtered_high_density_zoning.csv"
df_high_density.to_csv(output_path, index=False)

# Preview the filtered data
print(df_high_density.head())

               pin   latitude  longitude ZONE_CLASS
3   17333010410000  41.829544 -87.643993       RT-4
5   14201130370000  41.949544 -87.665653     RT-3.5
6   17042080140000  41.910595 -87.630613       RM-5
12  17193150050000  41.854619 -87.682744       RT-4
13  14202100200000  41.951253 -87.656356       RT-4


## Plotting PINs near bus stops and train stations that fall in high density areas

In [6]:
# Load dataset
file_path = "C:/Users/kaur6/Downloads/Urban Analytics/filtered_high_density_zoning.csv"
df = pd.read_csv(file_path)

# Ensure required columns exist
assert {'pin', 'latitude', 'longitude'}.issubset(df.columns), "CSV must contain 'pin', 'latitude', 'longitude' columns"

# Convert PIN to string
df['pin'] = df['pin'].astype(str)

# Create a Folium map centered on the dataset
map_center = [df["latitude"].mean(), df["longitude"].mean()]
m = folium.Map(location=map_center, zoom_start=11, tiles="CartoDB positron")

# Create cluster markers with PIN popups
marker_data = [
    [row["latitude"], row["longitude"], f"PIN: {row['pin']}"] for _, row in df.iterrows()
]

# Attach data to FastMarkerCluster
FastMarkerCluster(
    marker_data,
    callback="""
    function (row) {
        var marker = L.marker(new L.LatLng(row[0], row[1]));
        marker.bindPopup(row[2]); 
        return marker;
    }
    """
).add_to(m)

# Save to an HTML file
map_file = "C:/Users/kaur6/Downloads/Urban Analytics/pins_connected_comm_map.html"
m.save(map_file)
print(f"Map saved to {map_file}")

Map saved to C:/Users/kaur6/Downloads/Urban Analytics/pins_connected_comm_map.html


In [1]:
import folium
from folium.plugins import HeatMap

# Test with a minimal dataset
test_data = [
    [41.829544, -87.643993],
    [41.949544, -87.665653],
    [41.910595, -87.630613],
    [41.854619, -87.682744],
    [41.951253, -87.656356]
]

# Create a base map
m = folium.Map(location=[41.88, -87.63], zoom_start=10, tiles="cartodbpositron")

# Apply HeatMap with test data
HeatMap(test_data, name="Test HeatMap", radius=8).add_to(m)

# Save the map
map_file = "C:/Users/kaur6/Downloads/Urban Analytics/test_heatmap.html"
m.save(map_file)

print(f"Test HeatMap saved successfully at: {map_file}")

Test HeatMap saved successfully at: C:/Users/kaur6/Downloads/Urban Analytics/test_heatmap.html
