# Enhanced Geospatial EDA for NYC Taxi Data
## Using Dask, GeoPandas, H3, and Advanced Visualization

This notebook performs comprehensive exploratory data analysis on NYC taxi data with focus on geospatial patterns, temporal trends, and advanced visualizations.

In [8]:
# Import all necessary libraries
import warnings
warnings.filterwarnings('ignore')

# Core data processing
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import pandas as pd
import numpy as np

# Geospatial libraries
import geopandas as gpd
import h3
from shapely.geometry import Point, Polygon
from shapely import wkt
import contextily as ctx

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap, MarkerCluster, FastMarkerCluster
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Statistical analysis
from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Utilities
import logging
from datetime import datetime, timedelta
import os
import json

# Configuration
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pio.templates.default = "plotly_white"

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

print("‚úÖ All libraries imported successfully")

‚úÖ All libraries imported successfully


In [9]:
# Configuration parameters
class Config:
    # Data paths
    RAW_TAXI_DATA_PATTERN = "../data/yellow_tripdata_*.csv"
    TAXI_ZONES_SHAPEFILE = "../data/taxi_zones/taxi_zones.shp"
    NYC_BOROUGHS_SHAPEFILE = "../data/boroughs/boroughs.shp"
    
    # Column names
    PICKUP_DATETIME_COL = 'tpep_pickup_datetime'
    DROPOFF_DATETIME_COL = 'tpep_dropoff_datetime'
    PICKUP_LAT_COL = 'pickup_latitude'
    PICKUP_LON_COL = 'pickup_longitude'
    DROPOFF_LAT_COL = 'dropoff_latitude'
    DROPOFF_LON_COL = 'dropoff_longitude'
    
    # H3 configuration
    H3_RESOLUTIONS = [7, 8, 9]  # Multiple resolutions for different analyses
    H3_MAIN_RESOLUTION = 8
    
    # NYC boundaries (approximate)
    NYC_BOUNDS = {
        'lat_min': 40.4774, 'lat_max': 40.9176,
        'lon_min': -74.2591, 'lon_max': -73.7004
    }
    
    # Dask configuration
    DASK_WORKERS = 4
    DASK_THREADS_PER_WORKER = 2
    DASK_MEMORY_LIMIT = '4GB'
    
    # Sampling rates for different analyses
    SAMPLE_RATES = {
        'visualization': 0.01,
        'clustering': 0.005,
        'heatmap': 0.001
    }

config = Config()
print("‚úÖ Configuration loaded")

‚úÖ Configuration loaded


In [10]:
# Setup Dask Client with optimized configuration
def setup_dask_client():
    """Initialize Dask client with proper cleanup."""
    try:
        # Clean up existing clients
        if 'client' in globals() and client:
            client.close()
        if 'cluster' in globals() and cluster:
            cluster.close()
    except (NameError, Exception) as e:
        logger.debug(f"No existing client to close: {e}")
    
    try:
        cluster = LocalCluster(
            n_workers=config.DASK_WORKERS,
            threads_per_worker=config.DASK_THREADS_PER_WORKER,
            memory_limit=config.DASK_MEMORY_LIMIT,
            dashboard_address=':8787'
        )
        client = Client(cluster)
        logger.info(f"üöÄ Dask Client initialized: {client.dashboard_link}")
        return client, cluster
    except Exception as e:
        logger.error(f"‚ùå Failed to initialize Dask client: {e}")
        return None, None

client, cluster = setup_dask_client()
print(f"Dask Dashboard: {client.dashboard_link if client else 'Not available'}")

2025-06-09 15:39:41,256 - INFO - üöÄ Dask Client initialized: http://127.0.0.1:62124/status


Dask Dashboard: http://127.0.0.1:62124/status


In [11]:
# Utility functions for data processing
class DataProcessor:
    @staticmethod
    def get_taxi_dtypes():
        """Define data types for taxi data to ensure proper loading."""
        return {
            'VendorID': 'float64',
            'passenger_count': 'float64',
            'trip_distance': 'float64',
            'RatecodeID': 'float64',
            'store_and_fwd_flag': 'object',
            config.PICKUP_LON_COL: 'float64',
            config.PICKUP_LAT_COL: 'float64',
            config.DROPOFF_LON_COL: 'float64',
            config.DROPOFF_LAT_COL: 'float64',
            'payment_type': 'float64',
            'fare_amount': 'float64',
            'extra': 'float64',
            'mta_tax': 'float64',
            'tip_amount': 'float64',
            'tolls_amount': 'float64',
            'improvement_surcharge': 'float64',
            'total_amount': 'float64',
            'congestion_surcharge': 'float64',
            'airport_fee': 'float64'
        }
    
    @staticmethod
    def filter_nyc_bounds(df, lat_col, lon_col):
        """Filter data to NYC boundaries."""
        return df[
            (df[lat_col].between(config.NYC_BOUNDS['lat_min'], config.NYC_BOUNDS['lat_max'])) &
            (df[lon_col].between(config.NYC_BOUNDS['lon_min'], config.NYC_BOUNDS['lon_max']))
        ]
    
    @staticmethod
    def safe_h3_convert(lat, lon, resolution):
        """Safely convert lat/lon to H3 hex."""
        try:
            if pd.isna(lat) or pd.isna(lon):
                return None
            return h3.geo_to_h3(float(lat), float(lon), int(resolution))
        except Exception:
            return None
    
    @staticmethod
    def apply_h3_to_partition(df_partition, lat_col, lon_col, resolution, h3_col_name):
        """Apply H3 conversion to a Dask partition."""
        if lat_col not in df_partition.columns or lon_col not in df_partition.columns:
            df_partition[h3_col_name] = None
            return df_partition
        
        df_partition[h3_col_name] = df_partition.apply(
            lambda row: DataProcessor.safe_h3_convert(row[lat_col], row[lon_col], resolution),
            axis=1
        )
        return df_partition

processor = DataProcessor()
print("‚úÖ Data processing utilities loaded")

‚úÖ Data processing utilities loaded


In [12]:
# Load and preprocess taxi data
def load_taxi_data():
    """Load taxi data with proper preprocessing."""
    logger.info("üìä Loading taxi data...")
    
    try:
        # Load data with specified dtypes
        dtypes = processor.get_taxi_dtypes()
        
        ddf = dd.read_csv(
            config.RAW_TAXI_DATA_PATTERN,
            blocksize='128MB',
            dtype=dtypes,
            assume_missing=True,
            on_bad_lines='skip'  # replaces `error_bad_lines=False` in pandas ‚â•1.3.0
        )

            
        # Convert datetime columns
        date_cols = [config.PICKUP_DATETIME_COL, config.DROPOFF_DATETIME_COL]
        for col in date_cols:
            if col in ddf.columns:
                ddf[col] = dd.to_datetime(ddf[col], errors='coerce')
        
        # Ensure lat/lon columns are numeric
        coord_cols = [config.PICKUP_LAT_COL, config.PICKUP_LON_COL, 
                     config.DROPOFF_LAT_COL, config.DROPOFF_LON_COL]
        
        for col in coord_cols:
            if col in ddf.columns:
                if not pd.api.types.is_numeric_dtype(ddf[col].dtype):
                    ddf[col] = dd.to_numeric(ddf[col], errors='coerce')
        
        logger.info(f"‚úÖ Loaded {ddf.npartitions} partitions")
        logger.info(f"Columns: {list(ddf.columns)}")
        
        return ddf
        
    except Exception as e:
        logger.error(f"‚ùå Failed to load taxi data: {e}")
        raise

# Load the data
raw_taxi_ddf = load_taxi_data()
print(f"Data shape: {raw_taxi_ddf.map_partitions(len).sum().compute()} rows")
print(f"Sample data:")
display(raw_taxi_ddf.head())

2025-06-09 15:39:41,331 - INFO - üìä Loading taxi data...
2025-06-09 15:39:41,472 - INFO - ‚úÖ Loaded 55 partitions
2025-06-09 15:39:41,472 - INFO - Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']


Data shape: 47248845 rows
Sample data:


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2.0,2015-01-15 19:05:39,2015-01-15 19:23:42,1.0,1.59,-73.993896,40.750111,1.0,N,-73.974785,40.750618,1.0,12.0,1.0,0.5,3.25,0.0,0.3,17.05
1,1.0,2015-01-10 20:33:38,2015-01-10 20:53:28,1.0,3.3,-74.001648,40.724243,1.0,N,-73.994415,40.759109,1.0,14.5,0.5,0.5,2.0,0.0,0.3,17.8
2,1.0,2015-01-10 20:33:38,2015-01-10 20:43:41,1.0,1.8,-73.963341,40.802788,1.0,N,-73.95182,40.824413,2.0,9.5,0.5,0.5,0.0,0.0,0.3,10.8
3,1.0,2015-01-10 20:33:39,2015-01-10 20:35:31,1.0,0.5,-74.009087,40.713818,1.0,N,-74.004326,40.719986,2.0,3.5,0.5,0.5,0.0,0.0,0.3,4.8
4,1.0,2015-01-10 20:33:39,2015-01-10 20:52:58,1.0,3.0,-73.971176,40.762428,1.0,N,-74.004181,40.742653,2.0,15.0,0.5,0.5,0.0,0.0,0.3,16.3


In [13]:
# Add H3 hexagon zones for different resolutions
def add_h3_zones(ddf):
    """Add H3 hexagon zones at multiple resolutions."""
    logger.info("üî∑ Adding H3 zones...")
    
    processed_ddf = ddf.copy()
    
    # Check if coordinate columns exist
    if not all(col in ddf.columns for col in [config.PICKUP_LAT_COL, config.PICKUP_LON_COL]):
        logger.warning("‚ùå Coordinate columns not found")
        return processed_ddf
    
    # Add H3 zones for each resolution
    for resolution in config.H3_RESOLUTIONS:
        h3_col = f'pickup_h3_r{resolution}'
        logger.info(f"Adding H3 resolution {resolution}...")
        
        # Create metadata for new column
        meta = processed_ddf._meta.copy()
        meta[h3_col] = 'object'
        
        # Apply H3 conversion
        processed_ddf = processed_ddf.map_partitions(
            processor.apply_h3_to_partition,
            lat_col=config.PICKUP_LAT_COL,
            lon_col=config.PICKUP_LON_COL,
            resolution=resolution,
            h3_col_name=h3_col,
            meta=meta
        )
    
    # Add temporal features
    if config.PICKUP_DATETIME_COL in processed_ddf.columns:
        processed_ddf['pickup_hour'] = processed_ddf[config.PICKUP_DATETIME_COL].dt.hour
        processed_ddf['pickup_day_of_week'] = processed_ddf[config.PICKUP_DATETIME_COL].dt.dayofweek
        processed_ddf['pickup_month'] = processed_ddf[config.PICKUP_DATETIME_COL].dt.month
        processed_ddf['pickup_date'] = processed_ddf[config.PICKUP_DATETIME_COL].dt.date
    
    logger.info("‚úÖ H3 zones and temporal features added")
    return processed_ddf

# Process the data
processed_ddf = add_h3_zones(raw_taxi_ddf)

# Show sample with H3 zones
h3_sample = processed_ddf[[
    config.PICKUP_LAT_COL, config.PICKUP_LON_COL,
    f'pickup_h3_r{config.H3_MAIN_RESOLUTION}',
    'pickup_hour', 'pickup_day_of_week'
]].head()

print("Sample with H3 zones:")
display(h3_sample)

2025-06-09 15:41:11,792 - INFO - üî∑ Adding H3 zones...
2025-06-09 15:41:11,805 - INFO - Adding H3 resolution 7...
2025-06-09 15:41:11,840 - INFO - Adding H3 resolution 8...
2025-06-09 15:41:11,846 - INFO - Adding H3 resolution 9...
2025-06-09 15:41:11,892 - INFO - ‚úÖ H3 zones and temporal features added


Sample with H3 zones:


Unnamed: 0,pickup_latitude,pickup_longitude,pickup_h3_r8,pickup_hour,pickup_day_of_week
0,40.750111,-73.993896,,19,3
1,40.724243,-74.001648,,20,5
2,40.802788,-73.963341,,20,5
3,40.713818,-74.009087,,20,5
4,40.762428,-73.971176,,20,5


In [14]:
# Load geospatial reference data
def load_geospatial_data():
    """Load NYC geospatial reference data."""
    geodata = {}
    
    # Try to load taxi zones
    try:
        if os.path.exists(config.TAXI_ZONES_SHAPEFILE):
            geodata['taxi_zones'] = gpd.read_file(config.TAXI_ZONES_SHAPEFILE)
            logger.info(f"‚úÖ Loaded {len(geodata['taxi_zones'])} taxi zones")
        else:
            logger.warning("‚ö†Ô∏è Taxi zones shapefile not found")
    except Exception as e:
        logger.error(f"‚ùå Failed to load taxi zones: {e}")
    
    # Try to load boroughs
    try:
        if os.path.exists(config.NYC_BOROUGHS_SHAPEFILE):
            geodata['boroughs'] = gpd.read_file(config.NYC_BOROUGHS_SHAPEFILE)
            logger.info(f"‚úÖ Loaded {len(geodata['boroughs'])} boroughs")
        else:
            logger.warning("‚ö†Ô∏è Boroughs shapefile not found")
    except Exception as e:
        logger.error(f"‚ùå Failed to load boroughs: {e}")
    
    # Create NYC boundary if no shapefiles available
    if not geodata:
        logger.info("üìç Creating NYC boundary polygon")
        bounds = config.NYC_BOUNDS
        nyc_polygon = Polygon([
            (bounds['lon_min'], bounds['lat_min']),
            (bounds['lon_max'], bounds['lat_min']),
            (bounds['lon_max'], bounds['lat_max']),
            (bounds['lon_min'], bounds['lat_max'])
        ])
        geodata['nyc_boundary'] = gpd.GeoDataFrame(
            {'name': ['NYC']}, 
            geometry=[nyc_polygon], 
            crs='EPSG:4326'
        )
    
    return geodata

# Load geospatial data
geo_data = load_geospatial_data()

# Display available geospatial data
for key, gdf in geo_data.items():
    print(f"{key}: {len(gdf)} features")
    if len(gdf) > 0:
        print(f"  Columns: {list(gdf.columns)}")
        print(f"  CRS: {gdf.crs}")
        print()

2025-06-09 15:41:51,420 - INFO - ‚úÖ Loaded 263 taxi zones


taxi_zones: 263 features
  Columns: ['OBJECTID', 'Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough', 'geometry']
  CRS: EPSG:2263



In [15]:
# Data Quality Assessment
def assess_data_quality(ddf):
    """Comprehensive data quality assessment."""
    logger.info("üîç Assessing data quality...")
    
    # Basic statistics
    total_rows = len(ddf)
    logger.info(f"Total rows: {total_rows:,}")
    
    # Missing values analysis
    null_counts = ddf.isnull().sum().compute()
    null_percentages = (null_counts / total_rows * 100).round(2)
    
    quality_df = pd.DataFrame({
        'Column': null_counts.index,
        'Null_Count': null_counts.values,
        'Null_Percentage': null_percentages.values
    })
    quality_df = quality_df[quality_df['Null_Count'] > 0].sort_values('Null_Percentage', ascending=False)
    
    # Coordinate validity check
    coord_stats = {}
    coord_cols = [config.PICKUP_LAT_COL, config.PICKUP_LON_COL]
    
    for col in coord_cols:
        if col in ddf.columns:
            valid_coords = ddf[col].between(
                config.NYC_BOUNDS['lat_min'] if 'lat' in col else config.NYC_BOUNDS['lon_min'],
                config.NYC_BOUNDS['lat_max'] if 'lat' in col else config.NYC_BOUNDS['lon_max']
            ).sum().compute()
            coord_stats[col] = {
                'valid_count': valid_coords,
                'valid_percentage': (valid_coords / total_rows * 100).round(2)
            }
    
    return {
        'total_rows': total_rows,
        'quality_df': quality_df,
        'coord_stats': coord_stats
    }

# Assess data quality
quality_assessment = assess_data_quality(processed_ddf)

print(f"üìä Data Quality Assessment")
print(f"Total rows: {quality_assessment['total_rows']:,}")
print("\nüîç Missing Values:")
display(quality_assessment['quality_df'])

print("\nüìç Coordinate Validity:")
for col, stats in quality_assessment['coord_stats'].items():
    print(f"{col}: {stats['valid_count']:,} valid ({stats['valid_percentage']:.1f}%)")

2025-06-09 15:41:51,442 - INFO - üîç Assessing data quality...


ValueError: The columns in the computed data do not match the columns in the provided metadata.
  Extra:   ['RatecodeID']
  Missing: ['RateCodeID']