In [2]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import zipfile

# Place Data Processing

In [4]:
# Base data directory, relative to the notebook; the trailing slash is relied on
# by the f-strings that build the input file paths.
path = "../data/"

In [6]:
# Raw input files, all under `{path}raw/`.
# NOTE(review): demand_path is not used in the cells visible here - confirm it is needed.
demand_path = f"{path}raw/yellow_tripdata_2025-01.parquet"
# Zip archive expected to contain the taxi-zone shapefile (.shp).
zones_path = f"{path}raw/taxi_zones.zip"
# CSV of places with a WKT `geometry` column.
places_path = f"{path}raw/nyc_places.csv" 

## Adding zone IDs to OpenStreetMap places

In [10]:
def add_zone_id_to_places(places_file, taxi_zones_zip):
    """Tag every place with the ``LocationID`` of the taxi zone it intersects.

    Parameters
    ----------
    places_file : str or path-like
        CSV file whose ``geometry`` column holds WKT strings. The geometries
        are interpreted as EPSG:4326.
    taxi_zones_zip : str or path-like
        Zip archive containing a ``.shp`` shapefile with a ``LocationID``
        column. The zones are reprojected to the CRS of the places.

    Returns
    -------
    geopandas.GeoDataFrame or None
        The places with an added ``zone_id`` column (missing where no zone
        intersects the place). ``None`` is returned, after printing an error
        message, when either input cannot be loaded.
    """
    try:
        places_df = pd.read_csv(places_file)
    except FileNotFoundError:
        print(f"Error: Places file not found at {places_file}")
        return None
    try:
        places_df['geometry'] = places_df['geometry'].apply(wkt.loads)
        places_gdf = gpd.GeoDataFrame(places_df, geometry='geometry', crs="EPSG:4326")
    except Exception as e:
        print(f"Error converting geometry in places file: {e}")
        return None

    try:
        with zipfile.ZipFile(taxi_zones_zip, 'r') as zf:
            # [0] raises IndexError when the archive holds no shapefile.
            shp_filename = [name for name in zf.namelist() if name.endswith('.shp')][0]
        zones_gdf = gpd.read_file(f"zip://{taxi_zones_zip}!{shp_filename}")
        zones_gdf = zones_gdf.to_crs(places_gdf.crs)  # Ensure same CRS
    except FileNotFoundError:
        print(f"Error: Taxi zones zip file not found at {taxi_zones_zip}")
        return None
    except IndexError:
        print(f"Error: No .shp file found inside {taxi_zones_zip}")
        return None
    except Exception as e:
        print(f"Error reading taxi zones: {e}")
        return None

    # Build the lookup structures once instead of scanning every zone per place.
    zone_ids = zones_gdf['LocationID'].to_numpy()
    zone_index = zones_gdf.sindex

    def find_zone_id(place):
        """Return the LocationID of the first zone (in row order) intersecting the place."""
        geom = place.geometry
        if geom is None or geom.is_empty:
            return None
        # `intersects` is symmetric and already covers `within` in either
        # direction, so a single predicate is sufficient.
        hits = zone_index.query(geom, predicate="intersects")
        if len(hits) == 0:
            return None
        # The query returns positional indices in no guaranteed order; the
        # smallest one is the first matching zone in the zones table.
        return zone_ids[hits.min()]

    places_gdf['zone_id'] = places_gdf.apply(find_zone_id, axis=1)

    return places_gdf


In [12]:
# Load the places and tag each one with its taxi-zone ID.
# add_zone_id_to_places returns None (after printing an error) if an input could not be loaded.
places_with_zones = add_zone_id_to_places(places_path, zones_path)

In [13]:
# Preview the first rows to check that zone_id was filled in.
places_with_zones.head()

Unnamed: 0,place_type,place_subtype,geometry,opening_hours,zone_id
0,transport,bus stop,POINT (-73.94962 40.81410),,42.0
1,transport,bus stop,POINT (-73.94984 40.76168),,202.0
2,transport,bus stop,POINT (-73.98250 40.58210),,55.0
3,transport,bus stop,POINT (-73.95397 40.58801),,210.0
4,transport,station station,POINT (-73.90387 40.85841),,94.0


In [14]:
## Mapping place subtypes to 11 functional categories
### ["food", "shop", "company", "government", "entertainment", "health", "educational", "transportation", "residential", "community", "sport"]

In [15]:
# Lookup table from the raw `place_subtype` value to one of the functional
# categories. Subtypes mapped to None, and subtypes absent from the table, end up
# as missing values in `place_category`.
subtype_mapping = {
    'bus stop': 'transportation', 
    'station station': 'transportation',
    'parking': 'transportation',
    'attraction': 'entertainment',
    'apartments': 'residential',
    'fast_food': 'food',
    'school': 'educational',
    'park': 'entertainment',
    'social_facility': 'community', 
    'hospital': 'health',
    'university': 'educational',
    'place_of_worship': 'community', 
    'bicycle_parking': 'transportation',
    'college': 'educational',
    'post_office': 'government',
    'clinic': 'health',
    'restaurant': 'food',
    'artwork': 'entertainment', 
    'theatre': 'entertainment',
    'doctors': 'health',
    'convenience': 'shop',
    'playground': 'entertainment',
    'sports_centre': 'sport',
    'clothes': 'shop',
    'doityourself': 'shop',
    'arts_centre': 'entertainment',
    'cinema': 'entertainment',
    'bar': 'food',
    'community_centre': 'community', 
    'fire_station': 'government',
    'library': 'educational', 
    'kindergarten': 'educational',
    'police': 'government',
    'museum': 'entertainment',
    'books': 'shop',
    'mall': 'shop',
    'residential area': 'residential',
    'hostel': 'entertainment',
    'pub': 'food',
    'viewpoint': 'entertainment',
    'cafe': 'food',
    'supermarket': 'shop',
    'butcher': 'food',
    'greengrocer': 'food',
    'veterinary': 'health',
    'childcare': 'educational',  
    'hardware': 'shop',
    'atm': 'company',  
    'bakery': 'food',
    'laundry': 'shop',
    'fountain': 'entertainment',  
    'fuel': 'company',
    'bank': 'company',
    'marketplace': 'shop',
    'pharmacy': 'health',
    'dentist': 'health',
    'jewelry': 'shop',
    'furniture': 'shop',
    'electronics': 'shop',
    'shoes': 'shop',
    'picnic_site': 'entertainment',
    'bench': 'entertainment',
    'taxi': 'transportation',
    'florist': 'shop',
    'garden_centre': 'shop',
    'estate_agent': 'company',
    'bus_station': 'transportation',
    'nightclub': 'entertainment',
    'kiosk': 'shop',
    'car_wash': 'company',
    'language_school': 'educational',
    'residential': 'residential',
    'internet_cafe': 'shop', 
    'camp_site': 'entertainment',
    'clock': 'entertainment', 
    'shelter': 'community',  
    'house': 'residential',
    'fortune_teller': 'entertainment', 
    'dojo': 'sport',
    'planetarium': 'entertainment',
    'boat_rental': 'entertainment',
    'driving_school': 'educational',
    'swimming_pool': 'sport',
    'cafe;bar': 'food',
    'money_transfer': 'company',
    'hairdresser': 'shop',
    'detached': 'residential',
    'stadium': 'sport', 
    'international_organization': 'government',
    'semidetached_house': 'residential',
    'dormitory': 'residential',  
    'events_venue': 'entertainment',
    'gallery': 'entertainment',
    'nursing_home': 'health',
    'bungalow': 'residential',
    'collector': 'government',  
    'funeral_directors': 'company',
    'deli': 'food',
    'storage_rental': 'company',
    'Medical Center': 'health',
    'diplomatic': 'government',
    'stock_exchange': 'company', 
    'toilets': 'community', 
    'gift': 'shop',
    'taxi stand': 'transportation',
    # Uninformative subtypes: deliberately left without a category here so they
    # show up as missing and can be handled separately.
    'yes': None, 
    'vacant': None, 
    None: None
}
# Adds the `place_category` column to places_with_zones in place.
places_with_zones['place_category'] = places_with_zones['place_subtype'].map(subtype_mapping)

In [16]:
### Checking and completing rows that have not been categorized

In [17]:
# Show the rows that did not receive a category from subtype_mapping.
places_with_zones[places_with_zones.place_category.isna()]

Unnamed: 0,place_type,place_subtype,geometry,opening_hours,zone_id,place_category
60663,office,yes,"POLYGON ((-73.98650 40.74849, -73.98516 40.747...",Mo-Su 08:00-02:00,164.0,
65953,shop,vacant,"POLYGON ((-73.99017 40.72739, -73.99021 40.727...",,79.0,
169227,tourism,yes,"POLYGON ((-73.97509 40.77767, -73.97556 40.777...",,239.0,
174374,,,"POLYGON ((-73.86062 40.76066, -73.86072 40.760...",,70.0,


In [18]:
# Fill the categories that the subtype mapping left empty by falling back on the
# coarser `place_type`. Deriving the value per row (instead of assigning a
# fixed-length list) keeps the cell correct when the number or order of
# uncategorized rows changes, and makes it a no-op when the cell is re-run.
type_fallback = {'office': 'company', 'shop': 'shop', 'tourism': 'entertainment'}
nan_mask = places_with_zones['place_category'].isna()
fallback_category = (
    places_with_zones.loc[nan_mask, 'place_type']
    .map(type_fallback)
    .fillna('company')  # rows without a usable place_type default to 'company'
)
places_with_zones.loc[nan_mask, 'place_category'] = fallback_category

In [19]:
### Counting and normalizing the number of places per category in each NYC zone

In [38]:
# Number of places per (zone, category). `.size()` counts every row; counting the
# `place_type` column would silently skip places whose place_type is missing.
zone_places = (
    places_with_zones
    .groupby(["zone_id", "place_category"])
    .size()
    .reset_index(name="number_places")
)


def normalize_group(group):
    """Return a copy of ``group`` with ``category_score = number_places / max(number_places)``.

    The score is 0 for every row when the maximum is not positive. The input
    frame is not modified. The pipeline below uses the vectorized equivalent;
    this helper is retained for normalizing a single category frame.
    """
    max_val = group['number_places'].max()
    score = group['number_places'] / max_val if max_val > 0 else 0
    return group.assign(category_score=score)


# Scale each count by the largest count within its category. `transform` keeps
# the row alignment and avoids groupby.apply operating on the grouping column,
# which pandas deprecates with a warning.
category_max = zone_places.groupby('place_category')['number_places'].transform('max')
zone_places['category_score'] = (
    (zone_places['number_places'] / category_max).where(category_max > 0, 0)
)

# Order rows by category, then zone, with a fresh 0..n-1 index.
zone_places = (
    zone_places
    .sort_values(['place_category', 'zone_id'], kind='stable')
    .reset_index(drop=True)
)


  zone_places = zone_places.groupby('place_category').apply(normalize_group).reset_index(drop=True)


In [40]:
# Preview the per-zone, per-category counts and their normalized scores.
zone_places.head()

Unnamed: 0,zone_id,place_category,number_places,category_score
0,3.0,community,4,0.068966
1,4.0,community,7,0.12069
2,5.0,community,2,0.034483
3,6.0,community,6,0.103448
4,7.0,community,23,0.396552


In [44]:
# Rows with category_score == 1, i.e. the zone(s) holding the highest place count
# within each category.
zone_places[zone_places.category_score == 1]

Unnamed: 0,zone_id,place_category,number_places,category_score
12,17.0,community,58,1.0
246,14.0,company,59,1.0
640,181.0,educational,83,1.0
758,43.0,entertainment,102,1.0
1048,79.0,food,532,1.0
1244,36.0,government,9,1.0
1593,181.0,health,122,1.0
1761,109.0,residential,9741,1.0
2105,228.0,shop,205,1.0
2235,109.0,sport,3128,1.0


In [46]:
# Persist the scores. `path` already ends with a slash, so no extra separator is
# added (consistent with how the raw input paths are built).
zone_places.to_csv(f"{path}processed/zone_places_type.csv")