In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import os

#Guide
#https://www.geeksforgeeks.org/get-the-city-state-and-country-names-from-latitude-and-longitude-using-python/
#
# Read the neighborhood boundary polygons into a GeoDataFrame
print("Loading GeoJSON file...")
gdf = gpd.read_file('pittsburgh_neighborhoods.geojson')
print(f"GeoJSON file loaded. Found {gdf.shape[0]} neighborhoods.")

# Reproject to WGS84 (EPSG:4326) so the polygons share a coordinate system
# with the plain latitude/longitude values they are tested against below
gdf = gdf.to_crs(crs="EPSG:4326")
print("GeoJSON CRS converted to WGS84 (EPSG:4326).")

# Function to find the neighborhood for a given lat/lon point
def find_neighborhood(lat, lon):
    """Return the name of the neighborhood whose polygon contains a point.

    The lookup runs against the module-level ``gdf`` GeoDataFrame and reads
    the neighborhood name from its ``'hood'`` column.

    Args:
        lat: Latitude of the point, in the same CRS as ``gdf``.
        lon: Longitude of the point, in the same CRS as ``gdf``.

    Returns:
        The ``'hood'`` value of the first polygon (in ``gdf`` row order) that
        contains the point, or ``None`` when either coordinate is missing or
        no polygon contains the point.
    """
    # A missing coordinate can never fall inside a polygon; skip the scan.
    if pd.isna(lat) or pd.isna(lon):
        return None

    point = Point(lon, lat)  # shapely points are (x, y), i.e. (lon, lat)

    # One vectorised containment test over every polygon, instead of
    # materialising a row Series per polygon with iterrows() on each call.
    inside = gdf.contains(point)
    matches = gdf.loc[inside, 'hood']
    if matches.empty:
        return None  # No neighborhood contains the point
    return matches.iloc[0]

# Load the CSV file containing the 'latitude' and 'longitude' columns
input_file = 'Sales-transaction-2024-geocoded.csv'
print("Loading CSV file...")
df = pd.read_csv(input_file)
print(f"CSV file loaded with {len(df)} rows.")

# Output path: "<input name>_with_neighborhood<input extension>"
file_name, file_extension = os.path.splitext(input_file)
output_file = f"{file_name}_with_neighborhood{file_extension}"

# Every written row carries the input columns plus 'neighborhood', so the
# header must list that column too; otherwise each data row would be one
# field longer than the header and the CSV would be misaligned.
output_columns = list(df.columns)
if 'neighborhood' not in output_columns:
    output_columns.append('neighborhood')

# Open the output once and stream matching rows into it as they are found,
# rather than reopening the file in append mode for every row.
with open(output_file, 'w', newline='') as f:
    # Write the header line (an empty frame emits only the column names)
    pd.DataFrame(columns=output_columns).to_csv(f, header=True, index=False)

    for index, row in df.iterrows():
        if index % 100 == 0:
            print(f"Processing row {index + 1} of {len(df)}...")

        # Find the neighborhood for this row
        neighborhood = find_neighborhood(row['latitude'], row['longitude'])

        if not neighborhood:
            # Rows outside every neighborhood polygon are left out of the output
            print(f"Row {index}: Latitude: {row['latitude']}, Longitude: {row['longitude']} -> Neighborhood not found.")
            continue

        row['neighborhood'] = neighborhood  # Add the neighborhood to the row
        print(f"Row {index}: Latitude: {row['latitude']}, Longitude: {row['longitude']} -> Neighborhood: {neighborhood}")

        # Write this single row (no header) and flush so that results reach
        # disk incrementally even if the run is interrupted.
        row.to_frame().T.to_csv(f, header=False, index=False)
        f.flush()

print(f"Neighborhood lookup complete. File saved to: {output_file}")


Loading GeoJSON file...
GeoJSON file loaded. Found 90 neighborhoods.
GeoJSON CRS converted to WGS84 (EPSG:4326).
Loading CSV file...
CSV file loaded with 30861 rows.
Processing row 1 of 30861...
Row 0: Latitude: 40.428643, Longitude: -79.949116 -> Neighborhood: Greenfield
Row 1: Latitude: 40.510815, Longitude: -79.975114 -> Neighborhood not found.
Row 2: Latitude: 40.40829, Longitude: -80.03478 -> Neighborhood: Banksville
Row 3: Latitude: 40.327924, Longitude: -80.13384 -> Neighborhood not found.
Row 4: Latitude: 40.436931, Longitude: -79.982537 -> Neighborhood: Bluff
Row 5: Latitude: 40.57227, Longitude: -80.02788 -> Neighborhood not found.
Row 6: Latitude: 40.489757, Longitude: -79.776242 -> Neighborhood not found.
Row 7: Latitude: 40.57231, Longitude: -80.02593 -> Neighborhood not found.
Row 8: Latitude: 40.33836, Longitude: -79.84358 -> Neighborhood not found.
Row 9: Latitude: 40.470742, Longitude: -80.036216 -> Neighborhood: Marshall-Shadeland
Row 10: Latitude: 40.63526, Longitude