<a href="https://colab.research.google.com/github/JamionW/Advanced-Analysis-of-Algorithms/blob/master/Master_Code_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## This is the master notebook.

In [2]:
!pip install osmnx # install the osmnx module




In [3]:
# IMPORTS

import pandas as pd
import geopandas as gpd
import fiona
import numpy as np
import osmnx as ox
import networkx as nx
from shapely.ops import nearest_points, linemerge, transform
from shapely.geometry import Point, LineString
from geopandas.tools import sjoin_nearest
from scipy.spatial import cKDTree
from collections import defaultdict
from pyproj import CRS, Transformer
from functools import partial

In [4]:
def find_nearest_linestring_efficient(gdf_points, gdf_lines, max_distance):
    """
    Find the nearest linestring for each point, up to a maximum distance.
    Uses spatial indexing for efficiency.

    :param gdf_points: GeoDataFrame with point geometries (in UTM)
    :param gdf_lines: GeoDataFrame with linestring geometries (in UTM)
    :param max_distance: Maximum distance to consider (in meters)
    :return: GeoDataFrame with points matched to nearest linestrings
    """
    # Use sjoin_nearest to find the nearest linestring for each point
    joined = sjoin_nearest(gdf_points, gdf_lines, max_distance=max_distance, how='left')

    # Calculate the actual distances
    joined['distance'] = joined.apply(lambda row: row['geometry'].distance(gdf_lines.loc[row['index_right'], 'geometry'])
                                      if pd.notnull(row['index_right']) else None, axis=1)

    # Remove matches beyond max_distance (should be unnecessary due to max_distance in sjoin_nearest, but just in case)
    joined = joined[joined['distance'] <= max_distance]

    # Drop unnecessary columns
    result = joined.drop(columns=['index_right', 'distance'])

    print(f"Matched {result.notna().any(axis=1).sum()} out of {len(result)} points")

    return result


# Dataset imports

In [5]:
# ADDRESSES

# Read in addresses
# this takes about 20 minutes for the State of Tennessee
# less than a minute for Chattanooga

# Read the GeoJSON file into a GeoDataFrame
#address_df = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/data/tennessee.geojson')

# Chattanooga, for testing
address_df = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/data/chattanooga.geojson')


In [6]:
# SVI

# Path to .gdb file
gdb_file = "/content/drive/MyDrive/Colab Notebooks/data/SVI2022_TENNESSEE_tract.gdb"

# List all the layers in the .gdb file
layers = fiona.listlayers(gdb_file)
print("Layers in the geodatabase:", layers)

# Read the desired layer
svi_df = gpd.read_file(gdb_file, layer='SVI2022_TENNESSEE_tract')


Layers in the geodatabase: ['SVI2022_TENNESSEE_tract']


In [7]:
# ROADS

# Import shapefiles
# https://www.census.gov/cgi-bin/geo/shapefiles/index.php

# documentation here: https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2023/TGRSHP2023_TechDoc.pdf

# Open the shapefile as a Fiona collection
with fiona.open('/content/drive/MyDrive/Colab Notebooks/data/tl_2023_47065_roads.shp') as collection:
    # Create a GeoDataFrame from the collection
    roads_df = gpd.GeoDataFrame.from_features(collection)


In [8]:
# AMENITIES

city = "Chattanooga, Tennessee, USA"
tags = {'amenity': ['school', 'hospital', 'library'],
        'shop': 'supermarket'}

amenities = ox.features_from_place(city, tags=tags)

### Define Coordinate Reference Systems

In [9]:
# Check the original CRS
print("Original CRS:", roads_df.crs)

# If the CRS is None, set it to WGS84 (assuming that's what it should be)
if roads_df.crs is None:
    roads_df.set_crs(epsg=4326, inplace=True)

# Define the target CRS (UTM zone 18N)
target_crs = CRS("EPSG:32618")

# Perform the transformation
roads_df_transformed = roads_df.to_crs(target_crs)

# Check the new CRS
print("New CRS:", roads_df_transformed.crs)

# Print a sample of the transformed geometries
print("Sample of transformed geometries:")
print(roads_df_transformed['geometry'].head())

  and should_run_async(code)


Original CRS: None
New CRS: EPSG:32618
Sample of transformed geometries:
0    LINESTRING (-438047.238 3925110.410, -438015.6...
1    LINESTRING (-438047.238 3925110.410, -437997.8...
2    LINESTRING (-438379.567 3925013.438, -438362.5...
3    LINESTRING (-434083.556 3936175.511, -434066.9...
4    LINESTRING (-436868.331 3941879.564, -436868.2...
Name: geometry, dtype: geometry


In [10]:
print("Bounding box of the data:")
print(roads_df.total_bounds)

Bounding box of the data:
[-85.469528  34.982924 -84.94233   35.459232]


In [11]:
# Define the coordinate reference systems
latlong_crs = CRS("EPSG:4326")  # WGS84 lat/long
utm_crs = CRS("EPSG:32618")  # UTM zone 18N

address_df = address_df.to_crs(utm_crs)
svi_df = svi_df.to_crs(utm_crs)
amenities = amenities.to_crs(utm_crs)

print(address_df.crs)
print(svi_df.crs)
print(roads_df_transformed.crs)
print(amenities.crs)

  and should_run_async(code)


EPSG:32618
EPSG:32618
EPSG:32618
EPSG:32618


### Data Engineering: SVI Filtering

In [12]:
# Remove all columns from the svi_df dataframe except "geometry","STATE","ST_ABBR","COUNTY","FIPS","LOCATION","AREA_SQMI", and "RPL_THEME4".

svi_df = svi_df[["geometry","STATE","ST_ABBR","COUNTY","FIPS","LOCATION","AREA_SQMI", "RPL_THEME4"]]


### Data Engineering: Amenities cleanup

In [13]:
# Remove rows where the 'amenity' column is null
joined_amenities_df = amenities.dropna(subset=['amenity'])
amenities = amenities.dropna(subset=['amenity'])
print(f"Number of amenities after removing null 'amenity' values: {len(amenities)}")

Number of amenities after removing null 'amenity' values: 101


  and should_run_async(code)


In [14]:
# Distribution of valid values in the amenity column of joined_amenities_df

print(amenities['amenity'].value_counts())


amenity
school        85
hospital      10
library        5
restaurant     1
Name: count, dtype: int64


  and should_run_async(code)


In [15]:
# Filter amenities
amenity_types = ['school', 'library', 'hospital']
filtered_amenities_df = amenities[amenities['amenity'].isin(amenity_types)]

print(f"Number of filtered amenities: {len(filtered_amenities_df)}")


Number of filtered amenities: 100


  and should_run_async(code)


### Feature Engineering: Address Density

In [16]:
### This is to engineer the feature for address density

# Extract coordinates from the geometry column
coords = np.array(list(address_df.geometry.apply(lambda x: (x.x, x.y))))

# Build the KD-tree
tree = cKDTree(coords)

# Set the buffer distance (e.g., 1000 meters)
buffer_distance = 1000

# Query the tree for all points within the buffer distance
indices = tree.query_ball_point(coords, r=buffer_distance)

# Count the number of neighbors, excluding the point itself
address_df['address_density'] = [len(idx) - 1 for idx in indices]

# Optionally, normalize the density
max_density = address_df['address_density'].max()
address_df['normalized_density'] = address_df['address_density'] / max_density

# Print some statistics
print(address_df['address_density'].describe())

  and should_run_async(code)


count    102761.000000
mean       1235.861251
std         643.406327
min           0.000000
25%         729.000000
50%        1196.000000
75%        1663.000000
max        3676.000000
Name: address_density, dtype: float64


In [17]:
#Join on geometry attributes

joined_svi_address_df = gpd.overlay(address_df, svi_df, how='intersection')

  and should_run_async(code)


## Place Addresses and Amenities on a graph

In [22]:
# Function to create a graph from a GeoDataFrame of roads
def create_graph_from_roads(roads_gdf):
    G = nx.Graph()
    for idx, row in roads_gdf.iterrows():
        if row.geometry.geom_type == 'LineString':
            start = row.geometry.coords[0]
            end = row.geometry.coords[-1]
            G.add_edge(start, end, geometry=row.geometry, length=row.geometry.length)
        elif row.geometry.geom_type == 'MultiLineString':
            merged = linemerge(row.geometry)
            if merged.geom_type == 'LineString':
                start = merged.coords[0]
                end = merged.coords[-1]
                G.add_edge(start, end, geometry=merged, length=merged.length)
            else:
                for line in merged.geoms:
                    start = line.coords[0]
                    end = line.coords[-1]
                    G.add_edge(start, end, geometry=line, length=line.length)
    return G

# Create the graph
G = create_graph_from_roads(roads_df_transformed)

  and should_run_async(code)


In [None]:
def add_points_to_graph(G, gdf_points):
    for idx, row in gdf_points.iterrows():
        try:
            #print(f"Processing point {idx}")
            point = row.geometry
            #print(f"Point type: {type(point)}")
            #print(f"Point value: {point}")

            if not isinstance(point, Point):
                print(f"Warning: geometry for point {idx} is not a Point object")
                continue

            point_coords = (point.x, point.y)
            #print(f"Point coordinates: {point_coords}")

            # Check if the graph has any nodes
            if len(G.nodes) == 0:
                print("Warning: Graph has no nodes")
                continue

            # Convert nodes to a list of coordinate tuples
            nodes = [n for n in G.nodes if isinstance(n, tuple) and len(n) == 2]

            if not nodes:
                print("Warning: No valid nodes found in the graph")
                continue

            # Use numpy for efficient distance calculation
            nodes_array = np.array(nodes)
            point_array = np.array(point_coords)
            distances = np.sqrt(np.sum((nodes_array - point_array)**2, axis=1))

            nearest_index = np.argmin(distances)
            nearest_node = tuple(nodes[nearest_index])

            #print(f"Nearest node: {nearest_node}")

            point_node = f"point_{idx}"
            G.add_node(point_node, geometry=point, point_data=row.to_dict())

            distance = distances[nearest_index]
            G.add_edge(point_node, nearest_node, length=distance)

        except Exception as e:
            print(f"Error processing point {idx}: {str(e)}")
            print(f"Row data: {row}")
            continue

    return G

# Before calling the function, let's check the graph
print(f"Number of nodes in graph before adding points: {len(G.nodes)}")
print(f"Number of edges in graph before adding points: {len(G.edges)}")
print(f"Sample of node types in graph: {[type(n) for n in list(G.nodes)[:5]]}")

# Now call the function
G = add_points_to_graph(G, joined_svi_address_df)

# After calling the function, check the graph again
print(f"Number of nodes in graph after adding points: {len(G.nodes)}")
print(f"Number of edges in graph after adding points: {len(G.edges)}")

Number of nodes in graph before adding points: 21154
Number of edges in graph before adding points: 13225
Sample of node types in graph: [<class 'tuple'>, <class 'tuple'>, <class 'tuple'>, <class 'tuple'>, <class 'tuple'>]


In [18]:
# This calls the function from the beginning which joins a shapefile dataset to a linestring set
joined_svi_df = find_nearest_linestring_efficient(joined_svi_address_df, roads_df_transformed, 200)


Matched 0 out of 0 points


  print(f"Matched {result.notna().any(axis=1).sum()} out of {len(result)} points")


In [14]:
# This calls the function from the beginning which joins a shapefile dataset to a linestring set
joined_amenities_df = find_nearest_linestring_efficient(amenities, roads_df_transformed, 200)


Matched 0 out of 0 points


  print(f"Matched {result.notna().any(axis=1).sum()} out of {len(result)} points")


# Testing only: filter and export

In [35]:
# Filter to my address (for testing)

selected_records = final_results[(final_results['street'] == 'TUCKER ST')] # & (final_results['number'] == '335')]


In [36]:
# Export the selected_record dataframe above to a csv file

selected_records.to_csv('selected_records.csv', index=False)


