<a href="https://colab.research.google.com/github/MaryamAliAljallaf/Activity-Air-Quality-Analysis-in-NYC/blob/main/EDA_Chicago_Space_Time_Clustering_of_Mobility_Patterns_and_Air_Quality_Hotspots_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Project Title: Space-Time Clustering of Mobility Patterns and Air Quality Hotspots for Chicago**

Temporal Resolution:
* Round timestamps to 1 hour for alignment.

Temporal Aggregation:
*    For each grid cell (geohash) & time slice (1 hour), calculate:

      *    Mobility: Count of total trips.

      *    Air Quality: Mean PM25, CO, Temperature, and Humidity.

In [None]:
####### AQ_data: hourly aggregation per geohash #######
# Floor each reading timestamp to the start of its hour; this hourly bin is the
# time key used for the aggregation below.
AQ_data['time_bin'] = AQ_data['ReadingDateTimeUTC'].dt.floor('1h')

# One row per (geohash, hour): the mean of every environmental variable
# observed in that cell during that hour.
AQ_aggregated = (
    AQ_data
    .groupby(['geohash', 'time_bin'])
    .agg(
        Temperature=('Temperature', 'mean'),
        Humidity=('Humidity', 'mean'),
        PM25=('PM25', 'mean'),
        CO=('CO', 'mean'),
    )
    .reset_index()
)

print("AQ_aggregated", AQ_aggregated.shape)
AQ_aggregated.head(5)


AQ_aggregated (198635, 6)


Unnamed: 0,geohash,time_bin,Temperature,Humidity,PM25,CO
0,dp3t59,2021-06-20 00:00:00,30.055297,46.810979,6.946481,0.147665
1,dp3t59,2021-06-20 01:00:00,26.354882,54.283714,6.39121,0.165499
2,dp3t59,2021-06-20 02:00:00,24.111119,60.046387,8.341706,0.143812
3,dp3t59,2021-06-20 03:00:00,22.109796,68.313267,12.796489,0.143388
4,dp3t59,2021-06-20 04:00:00,20.651819,72.248907,8.5597,0.136335


In [None]:
####### Taxi_trips_data_filtered: hourly pickup counts per geohash #######
# Add hourly bins for the trip start and end timestamps. `assign` returns a new
# frame, so the bins are written to a copy rather than to the incoming object.
Taxi_trips_data_filtered = Taxi_trips_data_filtered.assign(**{
    'Trip Start time_bin': Taxi_trips_data_filtered['Trip Start Timestamp'].dt.floor('1h'),
    'Trip End time_bin': Taxi_trips_data_filtered['Trip End Timestamp'].dt.floor('1h'),
})

# Pickups per (pickup geohash, start hour). The count is taken over the
# non-null 'Pickup Centroid Latitude' values in each group.
Taxi_Pickup_trips_data_aggregated = (
    Taxi_trips_data_filtered
    .groupby(['Pickup_geohash', 'Trip Start time_bin'])['Pickup Centroid Latitude']
    .count()
    .reset_index(name='pickup_trip_count')
)

Taxi_Pickup_trips_data_aggregated.head(5)

Unnamed: 0,Pickup_geohash,Trip Start time_bin,pickup_trip_count
0,dp3qz6,2021-06-20 00:00:00,20
1,dp3qz6,2021-06-20 01:00:00,13
2,dp3qz6,2021-06-20 02:00:00,13
3,dp3qz6,2021-06-20 03:00:00,6
4,dp3qz6,2021-06-20 04:00:00,2


In [None]:
# Dropoffs per (dropoff geohash, end hour). The count is taken over the
# non-null 'Dropoff Centroid Latitude' values in each group.
Taxi_Dropoff_trips_data_aggregated = (
    Taxi_trips_data_filtered
    .groupby(['Dropoff_geohash', 'Trip End time_bin'])['Dropoff Centroid Latitude']
    .count()
    .reset_index(name='dropoff_trip_count')
)

Taxi_Dropoff_trips_data_aggregated.head(5)

Unnamed: 0,Dropoff_geohash,Trip End time_bin,dropoff_trip_count
0,dp3qz6,2021-06-20 00:00:00,5
1,dp3qz6,2021-06-20 01:00:00,2
2,dp3qz6,2021-06-20 02:00:00,6
3,dp3qz6,2021-06-20 03:00:00,3
4,dp3qz6,2021-06-20 04:00:00,12


In [None]:
# Give the pickup and dropoff tables the same key column names so they can be
# merged on (geohash, time_bin).
Taxi_Pickup_trips_data_aggregated = Taxi_Pickup_trips_data_aggregated.rename(
    columns={'Pickup_geohash': 'geohash', 'Trip Start time_bin': 'time_bin'})

Taxi_Dropoff_trips_data_aggregated = Taxi_Dropoff_trips_data_aggregated.rename(
    columns={'Dropoff_geohash': 'geohash', 'Trip End time_bin': 'time_bin'})

# Full outer join: keep every (geohash, hour) that has pickups, dropoffs, or both.
Taxi_trips_joined = Taxi_Pickup_trips_data_aggregated.merge(
    Taxi_Dropoff_trips_data_aggregated,
    how='outer',
    on=['geohash', 'time_bin'],
)

# A key present on only one side leaves NaN in the other side's count;
# treat that as zero trips and restore an integer dtype.
Taxi_trips_joined = (
    Taxi_trips_joined
    .fillna({'pickup_trip_count': 0, 'dropoff_trip_count': 0})
    .astype({'pickup_trip_count': int, 'dropoff_trip_count': int})
)

# Total activity in the cell = pickups + dropoffs.
Taxi_trips_joined['total_trip_count'] = Taxi_trips_joined['pickup_trip_count'].add(
    Taxi_trips_joined['dropoff_trip_count'])

# Order by cell and hour, with a fresh 0..n-1 index, for easier inspection.
Taxi_trips_joined = Taxi_trips_joined.sort_values(
    by=['geohash', 'time_bin'], ignore_index=True)

# Quick sanity check on the key space.
print("Unique geohashes:", Taxi_trips_joined['geohash'].nunique())
print("Total time bins:", Taxi_trips_joined['time_bin'].nunique())


Taxi_trips_joined.head(5)



Unique geohashes: 376
Total time bins: 2688


Unnamed: 0,geohash,time_bin,pickup_trip_count,dropoff_trip_count,total_trip_count
0,dp3qz6,2021-06-20 00:00:00,20,5,25
1,dp3qz6,2021-06-20 01:00:00,13,2,15
2,dp3qz6,2021-06-20 02:00:00,13,6,19
3,dp3qz6,2021-06-20 03:00:00,6,3,9
4,dp3qz6,2021-06-20 04:00:00,2,12,14


Note: `pd.merge(..., how='outer')` performs a full outer join.

What it does:
It keeps all rows from both tables and fills in NaN wherever a join key has no match on the other side.

In other words, everything is kept, even if it exists on only one side (here: a geohash-hour with only pickups or only dropoffs).

Now we will join the two datasets:

Taxi_trips_joined (hourly trip counts)
AQ_aggregated (hourly air quality from AQ_data)

by merging them on ['geohash', 'time_bin'] with an inner join, so only geohash-hour cells present in both datasets are kept.






In [None]:
# Combine the hourly trip counts with the hourly air-quality means.
# An inner join keeps only the (geohash, hour) cells that appear in both tables.
# NOTE(review): the AQ bins are derived from 'ReadingDateTimeUTC'; confirm the
# taxi timestamps are also expressed in UTC, otherwise the hours are misaligned.
final_merged = Taxi_trips_joined.merge(
    AQ_aggregated,
    how='inner',
    on=['geohash', 'time_bin'],
)

print("final_merged:", final_merged.shape)
final_merged.head(10)


final_merged: (17272, 9)


Unnamed: 0,geohash,time_bin,pickup_trip_count,dropoff_trip_count,total_trip_count,Temperature,Humidity,PM25,CO
0,dp3tq3,2021-08-24 12:00:00,4,4,8,25.037874,77.819824,5.331973,0.399419
1,dp3trw,2021-07-01 14:00:00,0,3,3,24.530296,67.234039,8.821083,6.843058
2,dp3trw,2021-07-01 15:00:00,0,1,1,24.336943,69.757774,9.193285,0.854944
3,dp3trw,2021-07-01 16:00:00,2,1,3,24.12864,69.674047,9.983337,0.522668
4,dp3trw,2021-07-01 17:00:00,2,2,4,24.469547,66.735331,7.59266,0.408085
5,dp3trw,2021-07-01 18:00:00,2,3,5,24.705686,66.530124,7.895345,0.378521
6,dp3trw,2021-07-01 20:00:00,1,1,2,25.577899,62.026007,5.277436,0.308398
7,dp3trw,2021-07-01 21:00:00,1,1,2,26.078396,56.881968,2.689768,0.373332
8,dp3trw,2021-07-01 22:00:00,0,1,1,25.937174,55.002525,3.279789,0.235443
9,dp3trw,2021-07-02 00:00:00,0,1,1,22.75856,61.459351,2.276758,0.165204


Spatial join:
Each geohash is decoded to its center point, and that point is assigned to the Chicago administrative polygon that contains it,
using a point-in-polygon spatial join (`gpd.sjoin` with `predicate="within"`).


In [None]:

# Attach a Chicago polygon to every (geohash, hour) row by locating the
# decoded geohash point inside the administrative polygons.

# Step 1: Decode each geohash to a (latitude, longitude) pair.
# Assumes gh.decode returns the pair in (lat, lon) order -- matches the column
# assignment below; verify against the geohash library in use.
final_merged[['latitude', 'longitude']] = final_merged['geohash'].apply(
    lambda g: pd.Series(gh.decode(g)))

# Step 2: Convert to a GeoDataFrame with point geometry (x = longitude, y = latitude)
final_merged_gdf = gpd.GeoDataFrame(
    final_merged,
    geometry=gpd.points_from_xy(final_merged['longitude'], final_merged['latitude']),
    crs="EPSG:4326")

# Step 3: Ensure both GeoDataFrames are in the same CRS
chicago_polygon = chicago_polygon.to_crs("EPSG:4326")

# Step 4: Spatial join: match each geohash point with the polygon that contains it.
# how="left" keeps points that fall in no polygon; their polygon columns are NaN.
joined = gpd.sjoin(final_merged_gdf, chicago_polygon, how="left", predicate="within")


print('joined shape:',joined.shape)
print('joined missing values:')
# Report missing values on the join result itself (not on final_merged), so that
# points left unmatched by the left join show up in the polygon columns.
print(joined.isna().sum())
joined.head(10)


joined shape: (17272, 14)
joined missing values:
geohash               0
time_bin              0
pickup_trip_count     0
dropoff_trip_count    0
total_trip_count      0
Temperature           0
Humidity              0
PM25                  0
CO                    0
latitude              0
longitude             0
dtype: int64


Unnamed: 0,geohash,time_bin,pickup_trip_count,dropoff_trip_count,total_trip_count,Temperature,Humidity,PM25,CO,latitude,longitude,geometry,index_right,community
0,dp3tq3,2021-08-24 12:00:00,4,4,8,25.037874,77.819824,5.331973,0.399419,41.712341,-87.610474,POINT (-87.61047 41.71234),46,ROSELAND
1,dp3trw,2021-07-01 14:00:00,0,3,3,24.530296,67.234039,8.821083,6.843058,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
2,dp3trw,2021-07-01 15:00:00,0,1,1,24.336943,69.757774,9.193285,0.854944,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
3,dp3trw,2021-07-01 16:00:00,2,1,3,24.12864,69.674047,9.983337,0.522668,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
4,dp3trw,2021-07-01 17:00:00,2,2,4,24.469547,66.735331,7.59266,0.408085,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
5,dp3trw,2021-07-01 18:00:00,2,3,5,24.705686,66.530124,7.895345,0.378521,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
6,dp3trw,2021-07-01 20:00:00,1,1,2,25.577899,62.026007,5.277436,0.308398,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
7,dp3trw,2021-07-01 21:00:00,1,1,2,26.078396,56.881968,2.689768,0.373332,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
8,dp3trw,2021-07-01 22:00:00,0,1,1,25.937174,55.002525,3.279789,0.235443,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
9,dp3trw,2021-07-02 00:00:00,0,1,1,22.75856,61.459351,2.276758,0.165204,41.739807,-87.555542,POINT (-87.55554 41.73981),41,SOUTH CHICAGO
