<a href="https://colab.research.google.com/github/JamionW/Advanced-Analysis-of-Algorithms/blob/master/Master_Code_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
## This is the master notebook.

In [55]:
!pip install osmnx # install the osmnx module




In [56]:
# IMPORTS

import pandas as pd
import geopandas as gpd
import fiona
import numpy as np
import osmnx as ox
import networkx as nx
from shapely.ops import nearest_points
from shapely.geometry import Point
from geopandas.tools import sjoin_nearest
from scipy.spatial import cKDTree

In [57]:
def find_nearest_linestring_efficient(gdf_points, gdf_lines, max_distance):
    """
    Find the nearest linestring for each point, up to a maximum distance.
    Uses spatial indexing for efficiency.

    :param gdf_points: GeoDataFrame with point geometries (in UTM)
    :param gdf_lines: GeoDataFrame with linestring geometries (in UTM)
    :param max_distance: Maximum distance to consider (in meters)
    :return: GeoDataFrame with points matched to nearest linestrings
    """
    # Use sjoin_nearest to find the nearest linestring for each point
    joined = sjoin_nearest(gdf_points, gdf_lines, max_distance=max_distance, how='left')

    # Calculate the actual distances
    joined['distance'] = joined.apply(lambda row: row['geometry'].distance(gdf_lines.loc[row['index_right'], 'geometry'])
                                      if pd.notnull(row['index_right']) else None, axis=1)

    # Remove matches beyond max_distance (should be unnecessary due to max_distance in sjoin_nearest, but just in case)
    joined = joined[joined['distance'] <= max_distance]

    # Drop unnecessary columns
    result = joined.drop(columns=['index_right', 'distance'])

    print(f"Matched {result.notna().any(axis=1).sum()} out of {len(result)} points")

    return result


In [58]:
# ADDRESSES

# Read in addresses
# this takes about 20 minutes for the State of Tennessee
# less than a minute for Chattanooga

# Read the GeoJSON file into a GeoDataFrame
#address_df = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/data/tennessee.geojson')

# Chattanooga, for testing
address_df = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/data/chattanooga.geojson')


In [59]:
# Change the CRS of the address dataframe to North America UTM

address_df = address_df.to_crs(epsg=32618)

In [60]:
### This is to engineer the feature for address density

# Extract coordinates from the geometry column
coords = np.array(list(address_df.geometry.apply(lambda x: (x.x, x.y))))

# Build the KD-tree
tree = cKDTree(coords)

# Set the buffer distance (e.g., 1000 meters)
buffer_distance = 1000

# Query the tree for all points within the buffer distance
indices = tree.query_ball_point(coords, r=buffer_distance)

# Count the number of neighbors, excluding the point itself
address_df['address_density'] = [len(idx) - 1 for idx in indices]

# Optionally, normalize the density
max_density = address_df['address_density'].max()
address_df['normalized_density'] = address_df['address_density'] / max_density

# Print some statistics
print(address_df['address_density'].describe())

count    102761.000000
mean       1235.861251
std         643.406327
min           0.000000
25%         729.000000
50%        1196.000000
75%        1663.000000
max        3676.000000
Name: address_density, dtype: float64


In [61]:
# SVI

# Path to .gdb file
gdb_file = "/content/drive/MyDrive/Colab Notebooks/data/SVI2022_TENNESSEE_tract.gdb"

# List all the layers in the .gdb file
layers = fiona.listlayers(gdb_file)
print("Layers in the geodatabase:", layers)

# Read the desired layer (replace 'SVI_layer_name' with the actual layer name)
svi_df = gpd.read_file(gdb_file, layer='SVI2022_TENNESSEE_tract')

#print(svi_df.head())

Layers in the geodatabase: ['SVI2022_TENNESSEE_tract']


  and should_run_async(code)


In [62]:
# Remove all columns from the svi_df dataframe except "geometry","STATE","ST_ABBR","COUNTY","FIPS","LOCATION","AREA_SQMI", and "RPL_THEME4".

svi_df = svi_df[["geometry","STATE","ST_ABBR","COUNTY","FIPS","LOCATION","AREA_SQMI", "RPL_THEME4"]]


In [63]:
#Use to_crs to set the svi df to North America UTM

svi_df = svi_df.to_crs(address_df.crs)
print(svi_df.crs)

EPSG:32618


  and should_run_async(code)


In [64]:
#Join on geometry attributes

joined_svi_address_df = gpd.overlay(address_df, svi_df, how='intersection')

In [65]:
# ROADS

# Import shapefiles
# https://www.census.gov/cgi-bin/geo/shapefiles/index.php

# documentation here: https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2023/TGRSHP2023_TechDoc.pdf

# Open the shapefile as a Fiona collection
with fiona.open('/content/drive/MyDrive/Colab Notebooks/data/tl_2023_47065_roads.shp') as collection:
    # Create a GeoDataFrame from the collection
    roads_df = gpd.GeoDataFrame.from_features(collection)

# Set geometry on the roads dataset (using UTM from the start)
roads_df = roads_df.set_crs(epsg=4269)

# Print the attributes of the GeoDataFrame
#print(roads_df.head())

  and should_run_async(code)


In [66]:
# AMENITIES

city = "Chattanooga, Tennessee, USA"
tags = {'amenity': ['school', 'hospital', 'library'],
        'shop': 'supermarket'}

amenities = ox.features_from_place(city, tags=tags)

In [67]:
# Transform Geographies

#4269 is Geographic
#32618 is North American UTM (metric, one unit equals one meter)

roads_df = roads_df.to_crs(epsg=32618)
joined_svi_address_df = joined_svi_address_df.to_crs(epsg=32618)
amenities = amenities.to_crs(epsg=32618)

  and should_run_async(code)


In [68]:
# ensure all dataframes have similar geometries

print("roads_df has the following CRS: ")
print(roads_df.crs)
print("joined_svi_address has the following CRS: ")
print(joined_svi_address_df.crs)
print("amenities has the following CRS: ")
print(amenities.crs)

roads_df has the following CRS: 
EPSG:32618
joined_svi_address has the following CRS: 
EPSG:32618
amenities has the following CRS: 
EPSG:32618


In [89]:
# This calls the function from the beginning which joins a shapefile dataset to a linestring set
joined_svi_df = find_nearest_linestring_efficient(joined_svi_address_df, roads_df, 200)


Matched 111676 out of 111676 points


In [90]:
# This calls the function from the beginning which joins a shapefile dataset to a linestring set
joined_amenities_df = find_nearest_linestring_efficient(amenities, roads_df, 200)


Matched 220 out of 220 points


In [91]:

print(f"Number of addresses: {len(joined_svi_df)}")
print(f"Number of unique address hashes: {joined_svi_df['hash'].nunique()}")
print(f"Number of amenities before cleaning: {len(joined_amenities_df)}")


Number of addresses: 111676
Number of unique address hashes: 102536
Number of amenities before cleaning: 220


In [92]:
# Remove rows where the 'amenity' column is null
joined_amenities_df = joined_amenities_df.dropna(subset=['amenity'])
print(f"Number of amenities after removing null 'amenity' values: {len(joined_amenities_df)}")

Number of amenities after removing null 'amenity' values: 175


In [93]:
# prompt: Can you give me the distribution of valid values in the amenity column of joined_amenities_df?

print(joined_amenities_df['amenity'].value_counts())


amenity
school        144
hospital       25
library         5
restaurant      1
Name: count, dtype: int64


In [94]:
# Filter amenities
amenity_types = ['school', 'library', 'hospital']
filtered_amenities_df = joined_amenities_df[joined_amenities_df['amenity'].isin(amenity_types)]

print(f"Number of filtered amenities: {len(filtered_amenities_df)}")


Number of filtered amenities: 174


In [95]:
# Ensure both GeoDataFrames have the same CRS
filtered_amenities_df = filtered_amenities_df.to_crs(joined_svi_df.crs)

In [96]:
# Create a KD-tree for efficient nearest neighbor search
amenity_coords = [(p.x, p.y) for p in filtered_amenities_df.geometry.centroid]
tree = cKDTree(amenity_coords)

In [97]:
# Find nearest amenities for each address
k = min(len(filtered_amenities_df), 5)  # Find up to 5 nearest amenities or all if less than 5
address_amenities = {}

for idx, address in joined_svi_df.iterrows():
    distances, indices = tree.query(address.geometry.centroid.coords[0], k=k)
    nearest_amenities = filtered_amenities_df.iloc[indices]

    amenity_dict = {}
    for dist, (_, amenity) in zip(distances, nearest_amenities.iterrows()):
        amenity_type = amenity['amenity']
        if amenity_type not in amenity_dict:
            amenity_dict[amenity_type] = (f"{amenity['name']}_{amenity_type}", dist)

    address_amenities[address['hash']] = amenity_dict

In [None]:
#need to work on the below. counts aren't right.

In [98]:
# Convert results to a DataFrame
results = pd.DataFrame.from_dict(address_amenities, orient='index')

# Rename columns and split amenity name and distance
for amenity_type in amenity_types:
    if amenity_type in results.columns:
        results[f'{amenity_type}_name'] = results[amenity_type].apply(lambda x: x[0] if pd.notnull(x) else None)
        results[f'{amenity_type}_distance'] = results[amenity_type].apply(lambda x: x[1] if pd.notnull(x) else None)
        results = results.drop(columns=[amenity_type])

results = results.reset_index().rename(columns={'index': 'address_hash'})

# Merge with original address data
final_results = pd.merge(joined_svi_df, results, left_on='hash', right_on='address_hash', how='left')

print(f"Number of rows in final results: {len(final_results)}")
for amenity_type in amenity_types:
    print(f"Number of non-null values in {amenity_type}_name: {final_results[f'{amenity_type}_name'].notnull().sum()}")
    print(f"Number of non-null values in {amenity_type}_distance: {final_results[f'{amenity_type}_distance'].notnull().sum()}")


Number of rows in final results: 111676
Number of non-null values in school_name: 100815
Number of non-null values in school_distance: 100815
Number of non-null values in library_name: 21947
Number of non-null values in library_distance: 21947
Number of non-null values in hospital_name: 43029
Number of non-null values in hospital_distance: 43029


In [None]:

selected_records = final_results[(final_results['street'] == 'TUCKER ST')] # & (final_results['number'] == '335')]


In [88]:
# prompt: Can I export the selected_record dataframe above to a csv file?

selected_records.to_csv('selected_records.csv', index=False)
