In [303]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import math
import datetime as dt
import pytz
import holidays
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
import ast
import requests
from category_encoders import TargetEncoder
import pickle

In [304]:
# read in zone id coordinate df
coords_df = pd.read_csv('../Resources/zone_coords_ids.csv')
coords_df['coords'] = coords_df['coords'].apply(ast.literal_eval)
coords_df[['latitude', 'longitude']] = pd.DataFrame(coords_df['coords'].tolist(), index=coords_df.index)
coords_df = coords_df[['LocationID', 'Zone', 'Borough', 'latitude', 'longitude']]
coords_df.head()

Unnamed: 0,LocationID,Zone,Borough,latitude,longitude
0,1,Newark Airport,EWR,40.689064,-74.177255
1,2,Jamaica Bay,Queens,40.603994,-73.835412
2,3,Allerton/Pelham Gardens,Bronx,40.86543,-73.867365
3,4,Alphabet City,Manhattan,40.725102,-73.979583
4,5,Arden Heights,Staten Island,40.5637,-74.191603


In [305]:
# read in model df with fixed distances and lat long coords
df = pd.read_parquet('Resources/fixed_model_df.parquet')
#df = pd.read_parquet('../Resources/model_coords_df.parquet')

df.columns = df.columns.str.replace("'", "").str.replace("(", "").str.replace(")", "").str.replace(",", "")

#df = df.loc[(df['tolls'] == 0)]

#df = df.drop(['fare', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'durationsec'], axis=1)

df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,PUy,PUx,DOLocationID,...,prcp,temp,distance,durationsec,fare,tolls,airport,congestion,total,class
0,5,1,False,1,0,0,132,40.642948,-73.779373,41,...,0.0,42.80,14.48,1599.0,70.00,6.94,1.75,0.0,95.88,0
1,9,1,False,1,0,0,3,40.865430,-73.867365,51,...,0.0,42.80,2.07,785.0,14.59,0.00,0.00,0.0,14.59,1
2,33,1,False,1,0,0,145,40.741509,-73.956975,112,...,0.0,42.80,1.27,474.0,20.62,0.00,0.00,0.0,20.62,2
3,38,1,False,1,0,0,16,40.763120,-73.770745,73,...,0.0,42.80,0.81,459.0,23.20,0.00,0.00,0.0,23.20,2
4,42,1,False,1,0,0,164,40.749842,-73.984251,161,...,0.0,42.80,1.05,335.0,7.90,0.00,0.00,2.5,15.48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,58,40.839167,-73.819722,213,...,0.0,48.02,2.67,534.0,14.38,0.00,0.00,0.0,14.38,2
5971770,85864,366,False,0,0,0,189,40.677871,-73.968473,62,...,0.0,48.02,1.49,488.0,15.01,0.00,0.00,0.0,15.01,2
5971771,85870,366,False,0,0,0,79,40.729269,-73.987361,79,...,0.0,48.02,0.00,256.0,5.80,0.00,0.00,2.5,10.80,0
5971772,86042,366,False,0,0,0,180,40.676770,-73.843746,63,...,0.0,48.02,2.50,266.0,8.92,0.00,0.00,0.0,8.92,1


In [306]:
# check NaNs
df.isna().sum()/len(df)

second_of_day    0.0
day_of_year      0.0
weekend          0.0
holiday          0.0
morning_rush     0.0
evening rush     0.0
PULocationID     0.0
PUy              0.0
PUx              0.0
DOLocationID     0.0
DOy              0.0
DOx              0.0
prcp             0.0
temp             0.0
distance         0.0
durationsec      0.0
fare             0.0
tolls            0.0
airport          0.0
congestion       0.0
total            0.0
class            0.0
dtype: float64

In [307]:
# double check proper dtypes
df.dtypes

second_of_day      int32
day_of_year        int32
weekend             bool
holiday            int64
morning_rush       int32
evening rush       int32
PULocationID       int32
PUy              float64
PUx              float64
DOLocationID       int32
DOy              float64
DOx              float64
prcp             float64
temp             float64
distance         float64
durationsec      float64
fare             float64
tolls            float64
airport          float64
congestion       float64
total            float64
class              int64
dtype: object

In [308]:
'''# if the test input is calculating distance through lat and long, the model should be trained the same
pickup_lat_col = 'PUy'  
pickup_lon_col = 'PUx'  
dropoff_lat_col = 'DOy' 
dropoff_lon_col = 'DOx' 

def calculate_geodesic_distance_vectorized(row):
    pickup = (row[pickup_lat_col], row[pickup_lon_col])
    dropoff = (row[dropoff_lat_col], row[dropoff_lon_col])
    original_distance = row.get('distance')

    try:
        calculated_distance = geodesic(pickup, dropoff).miles
        if pd.isna(calculated_distance):
            return original_distance
        else:
            return round(calculated_distance, 2)
    except (ValueError, TypeError): 
        return original_distance

df['distance'] = df.apply(calculate_geodesic_distance_vectorized, axis=1)



df'''

"# if the test input is calculating distance through lat and long, the model should be trained the same\npickup_lat_col = 'PUy'  \npickup_lon_col = 'PUx'  \ndropoff_lat_col = 'DOy' \ndropoff_lon_col = 'DOx' \n\ndef calculate_geodesic_distance_vectorized(row):\n    pickup = (row[pickup_lat_col], row[pickup_lon_col])\n    dropoff = (row[dropoff_lat_col], row[dropoff_lon_col])\n    original_distance = row.get('distance')\n\n    try:\n        calculated_distance = geodesic(pickup, dropoff).miles\n        if pd.isna(calculated_distance):\n            return original_distance\n        else:\n            return round(calculated_distance, 2)\n    except (ValueError, TypeError): \n        return original_distance\n\ndf['distance'] = df.apply(calculate_geodesic_distance_vectorized, axis=1)\n\n\n\ndf"

In [309]:
#for column in df:
    #print(df[column].value_counts())

In [310]:
#save fixed df for training
#df.to_parquet('Resources/fixed_model_df.parquet')

In [311]:
# convert airport fee to most consistent values for congestion
df.loc[df['congestion'] > 0, 'congestion'] = 2.50
df.loc[df['congestion'] > 2.50, 'congestion'] = 2.50
df['congestion'].value_counts()

congestion
2.5    3417747
0.0    2554027
Name: count, dtype: int64

In [312]:
df.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'PULocationID', 'PUy', 'PUx', 'DOLocationID', 'DOy',
       'DOx', 'prcp', 'temp', 'distance', 'durationsec', 'fare', 'tolls',
       'airport', 'congestion', 'total', 'class'],
      dtype='object')

In [313]:
# dummy encode class feature
df = pd.get_dummies(df, columns=['class'], prefix='class')
df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,PUy,PUx,DOLocationID,...,distance,durationsec,fare,tolls,airport,congestion,total,class_0,class_1,class_2
0,5,1,False,1,0,0,132,40.642948,-73.779373,41,...,14.48,1599.0,70.00,6.94,1.75,0.0,95.88,True,False,False
1,9,1,False,1,0,0,3,40.865430,-73.867365,51,...,2.07,785.0,14.59,0.00,0.00,0.0,14.59,False,True,False
2,33,1,False,1,0,0,145,40.741509,-73.956975,112,...,1.27,474.0,20.62,0.00,0.00,0.0,20.62,False,False,True
3,38,1,False,1,0,0,16,40.763120,-73.770745,73,...,0.81,459.0,23.20,0.00,0.00,0.0,23.20,False,False,True
4,42,1,False,1,0,0,164,40.749842,-73.984251,161,...,1.05,335.0,7.90,0.00,0.00,2.5,15.48,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,58,40.839167,-73.819722,213,...,2.67,534.0,14.38,0.00,0.00,0.0,14.38,False,False,True
5971770,85864,366,False,0,0,0,189,40.677871,-73.968473,62,...,1.49,488.0,15.01,0.00,0.00,0.0,15.01,False,False,True
5971771,85870,366,False,0,0,0,79,40.729269,-73.987361,79,...,0.00,256.0,5.80,0.00,0.00,2.5,10.80,True,False,False
5971772,86042,366,False,0,0,0,180,40.676770,-73.843746,63,...,2.50,266.0,8.92,0.00,0.00,0.0,8.92,False,True,False


In [314]:
df = df[df['durationsec'] < 7200]
df['durationsec'].max()

7198.0

In [317]:
df = df[df['durationsec'] > 300]
df['durationsec'].min()

301.0

In [319]:
df = df[df['total'] > 5]
df['total'].min()

5.01

In [320]:
df = df[df['total'] < 300]
df['total'].max()

299.45

In [321]:
# prep df for duration model
df_2 = df.copy()

df_2 = df_2.drop(['fare', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'total'],axis=1)

df_2

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,DOLocationID,prcp,temp,distance,durationsec,airport,congestion,class_0,class_1,class_2
0,5,1,False,1,0,0,132,41,0.0,42.80,14.48,1599.0,1.75,0.0,True,False,False
1,9,1,False,1,0,0,3,51,0.0,42.80,2.07,785.0,0.00,0.0,False,True,False
2,33,1,False,1,0,0,145,112,0.0,42.80,1.27,474.0,0.00,0.0,False,False,True
3,38,1,False,1,0,0,16,73,0.0,42.80,0.81,459.0,0.00,0.0,False,False,True
4,42,1,False,1,0,0,164,161,0.0,42.80,1.05,335.0,0.00,2.5,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971765,85824,366,False,0,0,0,56,196,0.0,48.02,1.67,332.0,0.00,0.0,False,True,False
5971766,85830,366,False,0,0,0,263,75,0.0,48.02,1.19,335.0,0.00,2.5,True,False,False
5971768,85841,366,False,0,0,0,77,76,0.0,48.02,0.95,444.0,0.00,0.0,False,False,True
5971769,85846,366,False,0,0,0,58,213,0.0,48.02,2.67,534.0,0.00,0.0,False,False,True


In [322]:
# prep df for price model
#df = df.loc[(df['tolls'] == 0)]

df = df.drop(['fare', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'durationsec'], axis=1)

df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,DOLocationID,prcp,temp,distance,airport,congestion,total,class_0,class_1,class_2
0,5,1,False,1,0,0,132,41,0.0,42.80,14.48,1.75,0.0,95.88,True,False,False
1,9,1,False,1,0,0,3,51,0.0,42.80,2.07,0.00,0.0,14.59,False,True,False
2,33,1,False,1,0,0,145,112,0.0,42.80,1.27,0.00,0.0,20.62,False,False,True
3,38,1,False,1,0,0,16,73,0.0,42.80,0.81,0.00,0.0,23.20,False,False,True
4,42,1,False,1,0,0,164,161,0.0,42.80,1.05,0.00,2.5,15.48,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971765,85824,366,False,0,0,0,56,196,0.0,48.02,1.67,0.00,0.0,8.41,False,True,False
5971766,85830,366,False,0,0,0,263,75,0.0,48.02,1.19,0.00,2.5,16.32,True,False,False
5971768,85841,366,False,0,0,0,77,76,0.0,48.02,0.95,0.00,0.0,9.91,False,False,True
5971769,85846,366,False,0,0,0,58,213,0.0,48.02,2.67,0.00,0.0,14.38,False,False,True


In [323]:
df_2.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'PULocationID', 'DOLocationID', 'prcp', 'temp',
       'distance', 'durationsec', 'airport', 'congestion', 'class_0',
       'class_1', 'class_2'],
      dtype='object')

In [324]:
# price model target is total
X = df.copy()
#X = X.drop('total', axis=1)

y = df['total'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)


In [325]:
# duration model target is durationsec
X_2 = df_2.copy()
#X_2 = X_2.drop('durationsec', axis=1)

y_2 = df_2['durationsec'].values.reshape(-1,1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, random_state=29)


In [326]:
# Target encode PU and DO Ids
encoder_pu_price = TargetEncoder(cols=['PULocationID'])
X_train['PULocationID_price_encoded'] = encoder_pu_price.fit_transform(X_train['PULocationID'], X_train['total'])
X_test['PULocationID_price_encoded'] = encoder_pu_price.fit_transform(X_test['PULocationID'], X_test['total'])

encoder_do_price = TargetEncoder(cols=['DOLocationID'])
X_train['DOLocationID_price_encoded'] = encoder_do_price.fit_transform(X_train['DOLocationID'], X_train['total'])
X_test['DOLocationID_price_encoded'] = encoder_do_price.fit_transform(X_test['DOLocationID'], X_test['total'])

encoder_pu_duration = TargetEncoder(cols=['PULocationID'])
X_train2['PULocationID_duration_encoded'] = encoder_pu_duration.fit_transform(X_train2['PULocationID'], X_train2['durationsec'])
X_test2['PULocationID_duration_encoded'] = encoder_pu_duration.fit_transform(X_test2['PULocationID'], X_test2['durationsec'])

encoder_do_duration = TargetEncoder(cols=['DOLocationID'])
X_train2['DOLocationID_duration_encoded'] = encoder_do_duration.fit_transform(X_train2['DOLocationID'], X_train2['durationsec'])
X_test2['DOLocationID_duration_encoded'] = encoder_do_duration.fit_transform(X_test2['DOLocationID'], X_test2['durationsec'])

X_train = X_train.drop(columns=['PULocationID', 'DOLocationID', 'total'], axis=1)
X_test = X_test.drop(columns=['PULocationID', 'DOLocationID', 'total'], axis=1)

X_train2 = X_train2.drop(columns=['PULocationID', 'DOLocationID', 'durationsec'], axis=1)
X_test2 = X_test2.drop(columns=['PULocationID', 'DOLocationID', 'durationsec'], axis=1)

display(X_train)
display(X_test)
display(X_train2)

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,airport,congestion,class_0,class_1,class_2,PULocationID_price_encoded,DOLocationID_price_encoded
4827037,72441,299,False,0,0,0,0.0,66.92,3.31,0.0,0.0,False,True,False,21.608081,29.048839
5619649,82568,345,False,0,0,0,0.0,65.00,2.29,0.0,2.5,False,False,True,29.147500,34.609682
4510655,41584,280,True,0,0,0,0.0,60.98,2.95,0.0,2.5,False,True,False,22.639995,28.337301
4475505,43836,278,False,0,0,0,0.0,64.94,2.82,0.0,0.0,False,True,False,20.500205,23.480792
4589562,49621,285,False,0,0,0,0.0,57.02,2.95,0.0,2.5,True,False,False,24.664146,25.943373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5821092,64058,356,True,0,0,0,0.0,35.60,9.64,0.0,2.5,False,True,False,28.169514,27.875701
224514,76667,15,False,1,0,0,0.0,28.94,1.64,0.0,2.5,False,True,False,30.395421,27.992343
4569188,35348,284,False,0,1,0,0.0,53.06,6.19,0.0,0.0,False,False,True,26.946170,22.105210
3674645,62135,226,False,0,0,1,0.0,80.06,2.25,0.0,2.5,True,False,False,28.914805,26.007048


Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,airport,congestion,class_0,class_1,class_2,PULocationID_price_encoded,DOLocationID_price_encoded
1212458,52122,69,True,0,0,0,0.0,42.08,4.51,0.0,0.0,False,False,True,18.760125,25.513067
843100,40195,54,False,0,0,0,0.4,42.08,12.44,2.5,2.5,False,True,False,26.581910,69.986051
4959086,1155,307,True,0,0,0,0.0,66.92,3.30,0.0,2.5,False,True,False,25.581795,25.274741
282363,50471,19,False,0,0,0,0.0,32.00,0.00,0.0,0.0,False,False,True,21.306588,22.268996
3946983,3971,245,True,0,0,0,0.0,73.94,2.69,0.0,2.5,True,False,False,28.264206,30.241238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5643853,35285,347,False,0,1,0,0.0,37.94,1.20,0.0,2.5,False,True,False,33.683756,25.440024
3162688,57991,193,False,0,0,1,0.0,86.00,3.61,0.0,2.5,True,False,False,27.519757,27.110118
223781,72532,15,False,1,0,0,0.0,28.94,11.59,2.5,0.0,False,True,False,75.251548,26.537906
5926898,85572,363,True,0,0,0,0.1,41.36,3.74,0.0,0.0,False,True,False,20.039875,24.844005


Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,airport,congestion,class_0,class_1,class_2,PULocationID_duration_encoded,DOLocationID_duration_encoded
4827037,72441,299,False,0,0,0,0.0,66.92,3.31,0.0,0.0,False,True,False,1153.153319,1316.809590
5619649,82568,345,False,0,0,0,0.0,65.00,2.29,0.0,2.5,False,False,True,1123.520210,1320.603089
4510655,41584,280,True,0,0,0,0.0,60.98,2.95,0.0,2.5,False,True,False,1212.788782,1174.139604
4475505,43836,278,False,0,0,0,0.0,64.94,2.82,0.0,0.0,False,True,False,1034.725023,1101.268356
4589562,49621,285,False,0,0,0,0.0,57.02,2.95,0.0,2.5,True,False,False,925.267783,1020.422453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5821092,64058,356,True,0,0,0,0.0,35.60,9.64,0.0,2.5,False,True,False,1137.708966,1217.980317
224514,76667,15,False,1,0,0,0.0,28.94,1.64,0.0,2.5,False,True,False,1230.844749,1127.186577
4569188,35348,284,False,0,1,0,0.0,53.06,6.19,0.0,0.0,False,False,True,1143.808000,1048.439399
3674645,62135,226,False,0,0,1,0.0,80.06,2.25,0.0,2.5,True,False,False,1093.545103,1023.026018


In [327]:
has_negative_duration = (X_train2['PULocationID_duration_encoded'] < 0).any()

if has_negative_duration:
    print("The 'PULocationID_duration_encoded' column contains negative values.")
else:
    print("The 'PULocationID_duration_encoded' column does not contain any negative values.")

The 'PULocationID_duration_encoded' column does not contain any negative values.


In [328]:
has_negative_duration = (X_train2['DOLocationID_duration_encoded'] < 0).any()

if has_negative_duration:
    print("The 'DOLocationID_duration_encoded' column contains negative values.")
else:
    print("The 'DOLocationID_duration_encoded' column does not contain any negative values.")

The 'DOLocationID_duration_encoded' column does not contain any negative values.


In [329]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
from scipy.stats import uniform, randint

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9, 11, 13],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4]
}

param_distributions = {
    'n_estimators': randint(100, 501),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 14),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.4, 0.5),
    'gamma': uniform(0, 0.4)
}

In [330]:
X_train_reset = X_train.reset_index(drop=True)

X_train_sampled = X_train_reset.sample(frac=0.01, random_state=29)

y_train_sampled = y_train[X_train_sampled.index]

print(len(X_train_sampled))
print(len(y_train_sampled))

41717
41717


In [331]:
X_train2_reset = X_train2.reset_index(drop=True)

X_train2_sampled = X_train2_reset.sample(frac=0.01, random_state=29)

y_train2_sampled = y_train2[X_train2_sampled.index]

print(len(X_train2_sampled))
print(len(y_train2_sampled))

41717
41717


In [332]:
randomized_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', seed=29),
                                   param_distributions=param_distributions,
                                   n_iter=100,  # Number of parameter settings that are sampled
                                   scoring='neg_mean_absolute_error',  # Or 'r2', etc.
                                   cv=3,  # Number of cross-validation folds
                                   verbose=2,
                                   n_jobs=-1,
                                   random_state=29) # For reproducibility
randomized_search.fit(X_train_sampled, y_train_sampled)  # Replace X_train and y_train with your training data

print("Best parameters found by RandomizedSearchCV: ", randomized_search.best_params_)
print("Best score found by RandomizedSearchCV: ", randomized_search.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found by RandomizedSearchCV:  {'colsample_bytree': 0.669777056655587, 'gamma': 0.06350860448309326, 'learning_rate': 0.05062984705540893, 'max_depth': 10, 'n_estimators': 428, 'subsample': 0.8170775801318325}
Best score found by RandomizedSearchCV:  -5.65822982364273


In [333]:
randomized_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', seed=29),
                                   param_distributions=param_distributions,
                                   n_iter=100,  # Number of parameter settings that are sampled
                                   scoring='neg_mean_absolute_error',  # Or 'r2', etc.
                                   cv=3,  # Number of cross-validation folds
                                   verbose=2,
                                   n_jobs=-1,
                                   random_state=29) # For reproducibility
randomized_search.fit(X_train2_sampled, y_train2_sampled)  # Replace X_train and y_train with your training data

print("Best parameters found by RandomizedSearchCV: ", randomized_search.best_params_)
print("Best score found by RandomizedSearchCV: ", randomized_search.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found by RandomizedSearchCV:  {'colsample_bytree': 0.6745840150739562, 'gamma': 0.3768021943125054, 'learning_rate': 0.07450491289225031, 'max_depth': 8, 'n_estimators': 278, 'subsample': 0.8411565132446718}
Best score found by RandomizedSearchCV:  -297.04048872857373


In [334]:
# price model
xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                        subsample= 0.8170775801318325,
                        n_estimators= 428,
                        max_depth = 10,
                        learning_rate = 0.05062984705540893,
                        colsample_bytree= 0.669777056655587,
                        gamma=0.06350860448309326,
                        seed=29)

In [338]:
# duration model
xgbr2 = xgb.XGBRegressor(objective='reg:squarederror',
                        subsample= 0.8411565132446718,
                        n_estimators= 278,
                        max_depth = 8,
                        learning_rate = 0.07450491289225031,
                        gamma= 0.3768021943125054,
                        colsample_bytree= 0.6745840150739562,
                        seed=29)

In [336]:
xgbr.fit(X_train, y_train)

In [339]:
xgbr2.fit(X_train2, y_train2)

In [340]:
# price model train score
xgbr.score(X_train, y_train)

0.86819533466797

In [341]:
# price model test score
xgbr.score(X_test, y_test)

0.8005024638392615

In [342]:
# duration model train score
xgbr2.score(X_train2, y_train2)

0.7656981327395944

In [343]:
# duration model test score
xgbr2.score(X_test2, y_test2)

0.7017068488836152

In [344]:
# price model mse, rmse
y_pred = xgbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print(mse)
print(rmse)

84.14649100630487
9.173139648250476


In [345]:
# duration model mse, rmse
y_pred_2 = xgbr2.predict(X_test2)
mse2 = mean_squared_error(y_test2, y_pred_2)
rmse2 = math.sqrt(mse2)
print(mse2)
print(rmse2)

186862.52646226404
432.27598413775434


In [346]:
# price model MAE ($)
mean_absolute_error(y_test, y_pred)

5.782268960321373

In [347]:
# duration model MAE (seconds)
mean_absolute_error(y_test2, y_pred_2)

300.5816362557402

In [348]:
df.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'PULocationID', 'DOLocationID', 'prcp', 'temp',
       'distance', 'airport', 'congestion', 'total', 'class_0', 'class_1',
       'class_2'],
      dtype='object')

In [349]:
# features for test input price model
features = [
            'second_of_day',
            'day_of_year',
            'weekend',
            'holiday',
            'morning_rush',
            'evening rush',
            'prcp',
            'temp',
            'distance',
            'airport',
            'congestion',
            'class_0',  
            'class_1',
            'class_2',
            'PULocationID_price_encoded',
            'DOLocationID_price_encoded']


In [350]:
# features for test input duration model
duration_features = [
            'second_of_day',
            'day_of_year',
            'weekend',
            'holiday',
            'morning_rush',
            'evening rush',
            'prcp',
            'temp',
            'distance',
            'airport',
            'congestion',
            'class_0', 
            'class_1',
            'class_2',
            'PULocationID_duration_encoded',
            'DOLocationID_duration_encoded'  
]

In [351]:
# setup datetime for NY timezone
timezone = pytz.timezone('America/New_York')
now = dt.datetime.now(timezone)
us_holidays = holidays.US(state='NY')

coords = coords_df


### Location ID Keys for Pickup and Dropoff inputs
#
![Alternative text](../Resources/zone_ids_legend.png)

In [355]:
## Prototype App ##
# test data for models to predict from ## pulls in real time info to fill in variables unknown to users at time of requests
test_input_dict = {
    'second_of_day': [(now.hour * 3600) + (now.minute * 60) + now.second],
    'day_of_year': [now.timetuple().tm_yday],
    'weekend': [now.strftime('%A') in ['Saturday', 'Sunday']],
    'holiday': [1 if now.date() in us_holidays else 0],
    'morning_rush': [1 if now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 7 <= now.hour <= 9 else 0],
    'evening rush': [1 if now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 16 <= now.hour <= 18 else 0],
    # Input Pickup LocationID Key ***
    'PULocationID': [63],
    # Input Dropoff LocationID Key ***
    'DOLocationID': [138],
    'prcp': [0], # do not change
    'temp': [0], # do not change
    'distance': [0.0], # do not change
    'airport': [0.0], # do not change
    'congestion': [2.50 if (now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 7 <= now.hour <= 9) or (now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 16 <= now.hour <= 18) else 0.0], # do not change
    'class': [0] # do not change
    }

################################################################################

test_input = pd.DataFrame(test_input_dict)

# create a duration test input df for the durationsec model
test_input_df_duration = pd.DataFrame(test_input_dict)

# assigns an airport fee based on location and cab and fhv parameters
airport_do_ids = [1, 32, 138]
fhv_classes = [1, 2]
fhv_airport_check = (test_input['DOLocationID'].isin(airport_do_ids)) & (test_input['class'].isin(fhv_classes))
cab_airport_check = (test_input['PULocationID'] == 132) | (test_input['PULocationID'] == 138) & (test_input['class'] == 0)

test_input.loc[fhv_airport_check, 'airport'] = 2.50
test_input.loc[cab_airport_check, 'airport'] = 1.75


# repeat encoding steps for test input locations
test_input['PULocationID_price_encoded'] = encoder_pu_price.transform(test_input['PULocationID'])
test_input['DOLocationID_price_encoded'] = encoder_do_price.transform(test_input['DOLocationID'])
test_input_df_duration['PULocationID_duration_encoded'] = encoder_pu_duration.transform(test_input_df_duration['PULocationID'])
test_input_df_duration['DOLocationID_duration_encoded'] = encoder_do_duration.transform(test_input_df_duration['DOLocationID'])

# repeat encoding steps for test input duration classes
test_input_df_duration = pd.get_dummies(test_input_df_duration, columns=['class'], prefix='class', drop_first=False)
for i in range(3):
    col_name = f'class_{i}'
    if col_name not in test_input_df_duration.columns:
        test_input_df_duration[col_name] = 0

# repeat encoding steps for test input classes
test_input = pd.get_dummies(test_input, columns=['class'], prefix='class', drop_first=False)
for i in range(3):
    col_name = f'class_{i}'
    if col_name not in test_input.columns:
        test_input[col_name] = 0


# use second duration model on duration test input
predicted_duration_yellow = xgbr2.predict(test_input_df_duration[duration_features])[0]
test_input['durationsec'] = predicted_duration_yellow

# copy the test input features and values to a df for price predictions
test_df = test_input[features].copy()

# geolocator function to calculate distances between each lat and long point of entered PU and DO locations
def calc_distance(coord1, coord2):
    return geodesic(coord1, coord2).miles

# grabs PU and DO values
pickup_location_id = test_input['PULocationID'][0]
dropoff_location_id = test_input['DOLocationID'][0]

# create PU and DO coord variables to reassign values
pickup_coords = None
dropoff_coords = None


# grab PU lat and long
try:
    pickup_row = coords_df[coords_df['LocationID'] == pickup_location_id].iloc[0]
    pickup_coords = (pickup_row['latitude'], pickup_row['longitude'])
    pickup_zone = pickup_row['Zone']
except IndexError:
    print(f"Warning: PULocationID {pickup_location_id} not found in coordinates data.")

# grab DO lat and long
try:
    dropoff_row = coords_df[coords_df['LocationID'] == dropoff_location_id].iloc[0]
    dropoff_coords = (dropoff_row['latitude'], dropoff_row['longitude'])
    dropoff_zone = dropoff_row['Zone']
except IndexError:
    print(f"Warning: DOLocationID {dropoff_location_id} not found in coordinates data.")

# use calc distance function to fill in the distance input
calculated_distance = None
if pickup_coords and dropoff_coords:
    calculated_distance = round(calc_distance(pickup_coords, dropoff_coords), 2)
    test_input['distance'] = [calculated_distance]
else:
    print("Could not determine both pickup and dropoff coordinates.")


# use the lat long values to ping the nearest weather station and return
# current Temperature and Precipitation Data
latitude = pickup_row['latitude']
longitude = pickup_row['longitude']

api_url = f"https://api.weather.gov/points/{latitude},{longitude}"
response = requests.get(api_url)
if response.status_code == 200:
    point_data = response.json()
    forecast_url = point_data['properties']['forecastHourly']
    observation_stations_url = point_data['properties']['observationStations']

    # forecast response holds most recent temp
    forecast_response = requests.get(forecast_url)
    if forecast_response.status_code == 200:
        forecast_data = forecast_response.json()
        periods = forecast_data['properties']['periods']
        if periods:
            current_temperature = periods[0]['temperature']
            test_input['temp'] = [current_temperature]

    #observation response holds most recent prcp
    observation_response = requests.get(observation_stations_url)
    if observation_response.status_code == 200:
        stations_data = observation_response.json()
        if stations_data['features']:
            first_station_url = stations_data['features'][0]['id']

            station_observation_response = requests.get(first_station_url + '/observations/latest')
            if station_observation_response.status_code == 200:
                observation_data = station_observation_response.json()
                precipitation_last_hour_mm = observation_data['properties'].get('precipLastHour', {}).get('value')
                if precipitation_last_hour_mm is not None:
                    test_input['prcp'] = [precipitation_last_hour_mm]
                else:
                    test_input['prcp'] = [0]

###############################################################################
# Predicted Outputs ###
print('*' * 80)

print('Predicted Price Per Service ($5 Variance)')
print('~' * 80)
# print price prediction for class 0 (Yellow Cab)
results_pred = xgbr.predict(test_df)
print(f'Total predicted Cab Price: ${results_pred[0]:.2f}')

# print price prediction for class 1 (Uber)
test_df_2 = test_input.copy()
test_df_2[['class_0', 'class_1', 'class_2']] = [0, 1, 0]
results_pred_2 = xgbr.predict(test_df_2[features])
print(f'Total predicted Uber Price: ${results_pred_2[0]:.2f}')

# print price prediction for class 2 (Lyft)
test_df_3 = test_input.copy()
test_df_3[['class_0', 'class_1', 'class_2']] = [0, 0, 1]
results_pred_3 = xgbr.predict(test_df_3[features])
print(f'Total predicted Lyft Price: ${results_pred_3[0]:.2f}')

print('~' * 80)
# Create a df with the results to compare min and max services
lowest_results = {'Yellow Cab': results_pred[0], 'Uber': results_pred_2[0], 'Lyft': results_pred_3[0]}
lowest_results_df = pd.DataFrame(lowest_results, index=[0])
min_value = lowest_results_df.loc[0].min()
min_service = lowest_results_df.loc[0].idxmin()
max_value = lowest_results_df.loc[0].max()
max_service = lowest_results_df.loc[0].idxmax()

# Print Service Suggestion based on lowest Price
print(f'For Pickup in {pickup_zone} and Dropoff in {dropoff_zone},\n(An avg distance of {calculated_distance} miles),\nWith {test_input["prcp"][0]} mm of Precipitation and a current Temperature of {current_temperature}°F,\n{min_service} has the lowest predicted total price of ${min_value:.2f}.\nWith {max_service} having the highest predicted total price of ${max_value:.2f}.')

# create a copy of test_input_duration for Uber
test_input_uber = test_input_df_duration.copy()
test_input_uber['class_0'] = 0
test_input_uber['class_1'] = 1
test_input_uber['class_2'] = 0

# uber duration predictions
test_input_uber_duration = test_input_uber[duration_features]
predicted_duration_uber = xgbr2.predict(test_input_uber_duration)[0]

# create a copy of test_input_duration for Lyft
test_input_lyft = test_input_df_duration.copy()
test_input_lyft['class_0'] = 0
test_input_lyft['class_1'] = 0
test_input_lyft['class_2'] = 1
# uber duration predictions
test_input_lyft_duration = test_input_lyft[duration_features]
predicted_duration_lyft = xgbr2.predict(test_input_lyft_duration)[0]

# create df with duration results / 60 for predictions in minutes
# compare min and max values
duration_predictions = {'Yellow Cab': round(predicted_duration_yellow / 60, 2),
                        'Uber': round(predicted_duration_uber / 60, 2),
                        'Lyft': round(predicted_duration_lyft / 60, 2)}

duration_df = pd.DataFrame(duration_predictions, index=[0])
shortest_duration = duration_df.loc[0].min()
shortest_service_duration = duration_df.loc[0].idxmin()
longest_duration = duration_df.loc[0].max()
longest_service_duration = duration_df.loc[0].idxmax()

# print Service Suggestion based on Lowest Predicted Duration
print('*' * 80)
print('*' * 80)
print("Predicted Average Trip Durations (~5 minute Variance):")
print('~' * 80)
print(f"  Yellow Cab: {duration_predictions['Yellow Cab']:.2f} minutes")
print(f"  Uber:       {duration_predictions['Uber']:.2f} minutes")
print(f"  Lyft:       {duration_predictions['Lyft']:.2f} minutes")
print(f"\n{shortest_service_duration} is predicted to have the shortest average trip duration of {shortest_duration:.2f} minutes.")
print(f"{longest_service_duration} is predicted to have the longest average trip duration of {longest_duration:.2f} minutes.")
print('*' * 80)

********************************************************************************
Predicted Price Per Service ($5 Variance)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Total predicted Cab Price: $52.38
Total predicted Uber Price: $57.36
Total predicted Lyft Price: $54.38
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For Pickup in Cypress Hills and Dropoff in LaGuardia Airport,
(An avg distance of 6.87 miles),
With 0 mm of Precipitation and a current Temperature of 52°F,
Yellow Cab has the lowest predicted total price of $52.38.
With Uber having the highest predicted total price of $57.36.
********************************************************************************
********************************************************************************
Predicted Average Trip Durations (~5 minute Variance):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Yellow Cab: 21.45 minutes
  Uber:

In [356]:
# show what values were filled for test input price
print(test_input)

   second_of_day  day_of_year  weekend  holiday  morning_rush  evening rush  \
0          61309          106    False        0             0             1   

   PULocationID  DOLocationID  prcp  temp  distance  airport  congestion  \
0            63           138     0    52      6.87      0.0         2.5   

   PULocationID_price_encoded  DOLocationID_price_encoded  class_0  class_1  \
0                   20.887807                    55.42235     True        0   

   class_2  durationsec  
0        0  1287.157593  


In [357]:
# show what values were filled for test input duration
print(test_input_df_duration)

   second_of_day  day_of_year  weekend  holiday  morning_rush  evening rush  \
0          61309          106    False        0             0             1   

   PULocationID  DOLocationID  prcp  temp  distance  airport  congestion  \
0            63           138     0     0       0.0      0.0         2.5   

   PULocationID_duration_encoded  DOLocationID_duration_encoded  class_0  \
0                     1126.54675                    1699.842008     True   

   class_1  class_2  
0        0        0  


In [358]:
#saving models and encoders as pickles for streamlit app

with open('../../streamlit_app/ny_ride_service_predictor/xgbr_price.pkl', 'wb') as f:
    pickle.dump(xgbr, f)

with open('../../streamlit_app/ny_ride_service_predictor/xgbr_duration.pkl', 'wb') as f:
    pickle.dump(xgbr2, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_pu_price.pkl', 'wb') as f:
    pickle.dump(encoder_pu_price, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_do_price.pkl', 'wb') as f:
    pickle.dump(encoder_do_price, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_pu_duration.pkl', 'wb') as f:
    pickle.dump(encoder_pu_duration, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_do_duration.pkl', 'wb') as f:
    pickle.dump(encoder_do_duration, f)

with open('../../streamlit_app/ny_ride_service_predictor/coords_df.pkl', 'wb') as f:
    pickle.dump(coords_df, f)

In [103]:
# check what observation data was pulled
observation_data

{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld',
  {'@version': '1.1',
   'wx': 'https://api.weather.gov/ontology#',
   's': 'https://schema.org/',
   'geo': 'http://www.opengis.net/ont/geosparql#',
   'unit': 'http://codes.wmo.int/common/unit/',
   '@vocab': 'https://api.weather.gov/ontology#',
   'geometry': {'@id': 's:GeoCoordinates', '@type': 'geo:wktLiteral'},
   'city': 's:addressLocality',
   'state': 's:addressRegion',
   'distance': {'@id': 's:Distance', '@type': 's:QuantitativeValue'},
   'bearing': {'@type': 's:QuantitativeValue'},
   'value': {'@id': 's:value'},
   'unitCode': {'@id': 's:unitCode', '@type': '@id'},
   'forecastOffice': {'@type': '@id'},
   'forecastGridData': {'@type': '@id'},
   'publicZone': {'@type': '@id'},
   'county': {'@type': '@id'}}],
 'id': 'https://api.weather.gov/stations/KLGA/observations/2025-03-21T00:51:00+00:00',
 'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-73.9, 40.77]},
 'properties': {'@id': 'h

In [104]:
# check what farecast data was pulled
forecast_data

{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld',
  {'@version': '1.1',
   'wx': 'https://api.weather.gov/ontology#',
   'geo': 'http://www.opengis.net/ont/geosparql#',
   'unit': 'http://codes.wmo.int/common/unit/',
   '@vocab': 'https://api.weather.gov/ontology#'}],
 'type': 'Feature',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-73.8866, 40.6884],
    [-73.88210000000001, 40.710100000000004],
    [-73.9107, 40.7135],
    [-73.9152, 40.6918],
    [-73.8866, 40.6884]]]},
 'properties': {'units': 'us',
  'forecastGenerator': 'HourlyForecastGenerator',
  'generatedAt': '2025-03-21T02:02:01+00:00',
  'updateTime': '2025-03-21T00:43:09+00:00',
  'validTimes': '2025-03-20T18:00:00+00:00/P7DT18H',
  'elevation': {'unitCode': 'wmoUnit:m', 'value': 29.8704},
  'periods': [{'number': 1,
    'name': '',
    'startTime': '2025-03-20T22:00:00-04:00',
    'endTime': '2025-03-20T23:00:00-04:00',
    'isDaytime': False,
    'temperature': 47,
    'temperatureUnit': 'F',

### Below was used to geolocate lat and long coords for each location ID
#

In [105]:
'''taxi_zone_df = pd.read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv')
taxi_zone_df = taxi_zone_df.dropna()
zone_names_list = taxi_zone_df['Zone'].unique().tolist()

geolocator = Nominatim(user_agent="taxi_zone_locator_v2")
zone_coordinates = {}

borough_lookup = taxi_zone_df.set_index('Zone')['Borough'].to_dict()

for zone in zone_names_list:
    borough = borough_lookup.get(zone)
    if borough:
        query = f"{zone}, {borough}, New York City"
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                zone_coordinates[zone] = (location.latitude, location.longitude)
            else:
                print(f"Could not find coordinates for: {zone} in {borough}")
                zone_coordinates[zone] = None
        except Exception as e:
            print(f"Error geocoding {zone} in {borough}: {e}")
            zone_coordinates[zone] = None
    else:
        print(f"Borough information not found for zone: {zone}")
        zone_coordinates[zone] = None

zone_coordinates'''

'taxi_zone_df = pd.read_csv(\'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv\')\ntaxi_zone_df = taxi_zone_df.dropna()\nzone_names_list = taxi_zone_df[\'Zone\'].unique().tolist()\n\ngeolocator = Nominatim(user_agent="taxi_zone_locator_v2")\nzone_coordinates = {}\n\nborough_lookup = taxi_zone_df.set_index(\'Zone\')[\'Borough\'].to_dict()\n\nfor zone in zone_names_list:\n    borough = borough_lookup.get(zone)\n    if borough:\n        query = f"{zone}, {borough}, New York City"\n        try:\n            location = geolocator.geocode(query, timeout=10)\n            if location:\n                zone_coordinates[zone] = (location.latitude, location.longitude)\n            else:\n                print(f"Could not find coordinates for: {zone} in {borough}")\n                zone_coordinates[zone] = None\n        except Exception as e:\n            print(f"Error geocoding {zone} in {borough}: {e}")\n            zone_coordinates[zone] = None\n    else:\n        print(f"Borough

In [106]:
'''zone_names_list = taxi_zone_df['Zone'].unique().tolist()
borough_lookup = taxi_zone_df.set_index('Zone')['Borough'].to_dict()

geolocator = Nominatim(user_agent="improved_fix_locator")
zone_coordinates_fixed = {k: v for k, v in zone_coordinates.items()} 

zones_to_retry = [zone for zone, coords in zone_coordinates_fixed.items() if coords is None]

for zone in zones_to_retry:
    print(f"Type of zone: {type(zone)}, Value of zone: {zone}")
    borough = borough_lookup.get(zone)
    if borough:
        if "/" in zone:
            parts = zone.split("/")
            for part in parts:
                query = f"{part.strip()}, {borough}, New York City"
                location = geolocator.geocode(query, timeout=10)
                if location:
                    print(f"Found (split): {zone} -> {query} ({location.latitude}, {location.longitude})")
                    zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
                    break
        elif "(" in zone or ")" in zone:
            general_zone = zone.replace("(", "").replace(")", " Neighborhood").strip()
            query = f"{general_zone}, {borough}, New York City"
            location = geolocator.geocode(query, timeout=10)
            if location:
                print(f"Found (parenthesis): {zone} -> {query} ({location.latitude}, {location.longitude})")
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif "North" in zone or "South" in zone or "East" in zone or "West" in zone or "Upper" in zone or "Lower" in zone:
            general_zone = zone.replace(" North", "").replace(" South", "").replace(" East", "").replace(" West", "").replace(" Upper", "").replace(" Lower", "").strip()
            query = f"{general_zone}, {borough}, New York City"
            location = geolocator.geocode(query, timeout=10)
            if location:
                print(f"Found (cardinal): {zone} -> {query} ({location.latitude}, {location.longitude})")
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == "Governor's Island/Ellis Island/Liberty Island":
            location = geolocator.geocode("Governor's Island, New York City")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Newark Airport':
            location = geolocator.geocode("Newark Liberty International Airport, Newark, NJ")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Arden Heights':
            location = geolocator.geocode("Arden Heights, Staten Island, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Bronx Park':
            location = geolocator.geocode("Bronx Park, Bronx, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Central Harlem':
            location = geolocator.geocode("Central Harlem Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Central Harlem North':
            location = geolocator.geocode("Central Harlem North Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'East Chelsea':
            location = geolocator.geocode("Chelsea Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Kew Gardens Hills':
            location = geolocator.geocode("Kew Gardens Hills, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Madison':
            location = geolocator.geocode("Madison Neighborhood, Brooklyn, NY 11229")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'North Corona':
            location = geolocator.geocode("Corona Neighborhood, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Old Astoria':
            location = geolocator.geocode("Astoria, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Prospect-Lefferts Gardens':
            location = geolocator.geocode("Prospect Lefferts Gardens Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Van Cortlandt Village':
            location = geolocator.geocode("Van Cortlandt Village Neighborhood, Bronx, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Williamsburg (North Side)':
            location = geolocator.geocode("North Williamsburg Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Williamsburg (South Side)':
            location = geolocator.geocode("South Williamsburg Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)

zone_coordinates_fixed'''

'zone_names_list = taxi_zone_df[\'Zone\'].unique().tolist()\nborough_lookup = taxi_zone_df.set_index(\'Zone\')[\'Borough\'].to_dict()\n\ngeolocator = Nominatim(user_agent="improved_fix_locator")\nzone_coordinates_fixed = {k: v for k, v in zone_coordinates.items()} \n\nzones_to_retry = [zone for zone, coords in zone_coordinates_fixed.items() if coords is None]\n\nfor zone in zones_to_retry:\n    print(f"Type of zone: {type(zone)}, Value of zone: {zone}")\n    borough = borough_lookup.get(zone)\n    if borough:\n        if "/" in zone:\n            parts = zone.split("/")\n            for part in parts:\n                query = f"{part.strip()}, {borough}, New York City"\n                location = geolocator.geocode(query, timeout=10)\n                if location:\n                    print(f"Found (split): {zone} -> {query} ({location.latitude}, {location.longitude})")\n                    zone_coordinates_fixed[zone] = (location.latitude, location.longitude)\n                    break

In [107]:
#zone_coordinates_fixed['Williamsburg (North Side)'] = (40.7149, -73.9528)
#print(zone_coordinates_fixed['Williamsburg (North Side)'])

In [108]:
#zone_coordinates_fixed['Williamsburg (South Side)'] = (40.7044, -73.9566)
#print(zone_coordinates_fixed['Williamsburg (South Side)'])

In [109]:
#zone_coordinates_fixed['Van Cortlandt Village'] = (40.8837, -73.8931)

#zone_coordinates_fixed['North Corona'] = (40.7544, -73.8669)

#zone_coordinates_fixed['Madison'] = (42.8990, -75.5121)

#zone_coordinates_fixed['East Chelsea'] = (40.7465, -74.0014)

#zone_coordinates_fixed['Central Harlem'] = (40.8089, -73.9482)

#zone_coordinates_fixed['Central Harlem North'] = (40.8089, -73.9482)

#zone_coordinates_fixed['Prospect-Lefferts Gardens'] = (40.6592, -73.9534)
#zone_coordinates_fixed

In [110]:
#taxi_zone_df

In [111]:
'''zone_coordinates_series = pd.Series(zone_coordinates_fixed)
zone_coordinates_df = pd.DataFrame(zone_coordinates_series, columns=['coords'])
zone_coordinates_df = zone_coordinates_df.reset_index()
zone_coordinates_df.rename(columns={'index': 'Zone'}, inplace=True)
zone_coordinates_df'''

"zone_coordinates_series = pd.Series(zone_coordinates_fixed)\nzone_coordinates_df = pd.DataFrame(zone_coordinates_series, columns=['coords'])\nzone_coordinates_df = zone_coordinates_df.reset_index()\nzone_coordinates_df.rename(columns={'index': 'Zone'}, inplace=True)\nzone_coordinates_df"

In [112]:
#zone_coords_ids_df = pd.merge(taxi_zone_df, zone_coordinates_df, on='Zone')
#zone_coords_ids_df

In [113]:
#zone_coords_ids_df.to_csv('../Resources/zone_coords_ids.csv')