In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import math
import datetime as dt
import pytz
import holidays
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
import ast
import requests
from category_encoders import TargetEncoder
import pickle

In [66]:
# read in zone id coordinate df
coords_df = pd.read_csv('../Resources/zone_coords_ids.csv')
coords_df['coords'] = coords_df['coords'].apply(ast.literal_eval)
coords_df[['latitude', 'longitude']] = pd.DataFrame(coords_df['coords'].tolist(), index=coords_df.index)
coords_df = coords_df[['LocationID', 'Zone', 'Borough', 'latitude', 'longitude']]
coords_df.head()

Unnamed: 0,LocationID,Zone,Borough,latitude,longitude
0,1,Newark Airport,EWR,40.689064,-74.177255
1,2,Jamaica Bay,Queens,40.603994,-73.835412
2,3,Allerton/Pelham Gardens,Bronx,40.86543,-73.867365
3,4,Alphabet City,Manhattan,40.725102,-73.979583
4,5,Arden Heights,Staten Island,40.5637,-74.191603


In [67]:
# read in model df with fixed distances and lat long coords
df = pd.read_parquet('Resources/fixed_model_df.parquet')
#df = pd.read_parquet('../Resources/model_coords_df.parquet')

df.columns = df.columns.str.replace("'", "").str.replace("(", "").str.replace(")", "").str.replace(",", "")

#df = df.loc[(df['tolls'] == 0)]

#df = df.drop(['fare', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'durationsec'], axis=1)

df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,PUy,PUx,DOLocationID,...,prcp,temp,distance,durationsec,fare,tolls,airport,congestion,total,class
0,5,1,False,1,0,0,132,40.642948,-73.779373,41,...,0.0,42.80,14.48,1599.0,70.00,6.94,1.75,0.0,95.88,0
1,9,1,False,1,0,0,3,40.865430,-73.867365,51,...,0.0,42.80,2.07,785.0,14.59,0.00,0.00,0.0,14.59,1
2,33,1,False,1,0,0,145,40.741509,-73.956975,112,...,0.0,42.80,1.27,474.0,20.62,0.00,0.00,0.0,20.62,2
3,38,1,False,1,0,0,16,40.763120,-73.770745,73,...,0.0,42.80,0.81,459.0,23.20,0.00,0.00,0.0,23.20,2
4,42,1,False,1,0,0,164,40.749842,-73.984251,161,...,0.0,42.80,1.05,335.0,7.90,0.00,0.00,2.5,15.48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,58,40.839167,-73.819722,213,...,0.0,48.02,2.67,534.0,14.38,0.00,0.00,0.0,14.38,2
5971770,85864,366,False,0,0,0,189,40.677871,-73.968473,62,...,0.0,48.02,1.49,488.0,15.01,0.00,0.00,0.0,15.01,2
5971771,85870,366,False,0,0,0,79,40.729269,-73.987361,79,...,0.0,48.02,0.00,256.0,5.80,0.00,0.00,2.5,10.80,0
5971772,86042,366,False,0,0,0,180,40.676770,-73.843746,63,...,0.0,48.02,2.50,266.0,8.92,0.00,0.00,0.0,8.92,1


In [68]:
# check NaNs
df.isna().sum()/len(df)

second_of_day    0.0
day_of_year      0.0
weekend          0.0
holiday          0.0
morning_rush     0.0
evening rush     0.0
PULocationID     0.0
PUy              0.0
PUx              0.0
DOLocationID     0.0
DOy              0.0
DOx              0.0
prcp             0.0
temp             0.0
distance         0.0
durationsec      0.0
fare             0.0
tolls            0.0
airport          0.0
congestion       0.0
total            0.0
class            0.0
dtype: float64

In [69]:
# double check proper dtypes
df.dtypes

second_of_day      int32
day_of_year        int32
weekend             bool
holiday            int64
morning_rush       int32
evening rush       int32
PULocationID       int32
PUy              float64
PUx              float64
DOLocationID       int32
DOy              float64
DOx              float64
prcp             float64
temp             float64
distance         float64
durationsec      float64
fare             float64
tolls            float64
airport          float64
congestion       float64
total            float64
class              int64
dtype: object

In [70]:
'''# if the test input is calculating distance through lat and long, the model should be trained the same
pickup_lat_col = 'PUy'  
pickup_lon_col = 'PUx'  
dropoff_lat_col = 'DOy' 
dropoff_lon_col = 'DOx' 

def calculate_geodesic_distance_vectorized(row):
    pickup = (row[pickup_lat_col], row[pickup_lon_col])
    dropoff = (row[dropoff_lat_col], row[dropoff_lon_col])
    original_distance = row.get('distance')

    try:
        calculated_distance = geodesic(pickup, dropoff).miles
        if pd.isna(calculated_distance):
            return original_distance
        else:
            return round(calculated_distance, 2)
    except (ValueError, TypeError): 
        return original_distance

df['distance'] = df.apply(calculate_geodesic_distance_vectorized, axis=1)



df'''

"# if the test input is calculating distance through lat and long, the model should be trained the same\npickup_lat_col = 'PUy'  \npickup_lon_col = 'PUx'  \ndropoff_lat_col = 'DOy' \ndropoff_lon_col = 'DOx' \n\ndef calculate_geodesic_distance_vectorized(row):\n    pickup = (row[pickup_lat_col], row[pickup_lon_col])\n    dropoff = (row[dropoff_lat_col], row[dropoff_lon_col])\n    original_distance = row.get('distance')\n\n    try:\n        calculated_distance = geodesic(pickup, dropoff).miles\n        if pd.isna(calculated_distance):\n            return original_distance\n        else:\n            return round(calculated_distance, 2)\n    except (ValueError, TypeError): \n        return original_distance\n\ndf['distance'] = df.apply(calculate_geodesic_distance_vectorized, axis=1)\n\n\n\ndf"

In [71]:
for column in df:
    print(df[column].value_counts())

second_of_day
27000    304
36000    299
28800    291
32400    285
30600    267
        ... 
15399      6
16068      6
12903      5
15191      5
18417      5
Name: count, Length: 86400, dtype: int64
day_of_year
69     37833
62     36098
83     35069
61     32137
68     32072
       ...  
126     4974
141     4970
146     4259
147     4045
148     3872
Name: count, Length: 366, dtype: int64
weekend
False    4161122
True     1810652
Name: count, dtype: int64
holiday
0    5833924
1     137850
Name: count, dtype: int64
morning_rush
0    5346033
1     625741
Name: count, dtype: int64
evening rush
0    5212234
1     759540
Name: count, dtype: int64
PULocationID
132    156307
138    141016
161    137764
237    126886
236    113690
        ...  
99         73
2          12
110         8
199         2
105         1
Name: count, Length: 261, dtype: int64
PUy
40.768311    422362
40.642948    156307
40.775714    141016
40.765064    137764
40.766437    136014
              ...  
40.577744        73


In [72]:
#save fixed df for training
#df.to_parquet('Resources/fixed_model_df.parquet')

In [73]:
# convert airport fee to most consistent values for congestion
df.loc[df['congestion'] > 0, 'congestion'] = 2.50
df.loc[df['congestion'] > 2.50, 'congestion'] = 2.50
df['congestion'].value_counts()

congestion
2.5    3417747
0.0    2554027
Name: count, dtype: int64

In [74]:
df.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'PULocationID', 'PUy', 'PUx', 'DOLocationID', 'DOy',
       'DOx', 'prcp', 'temp', 'distance', 'durationsec', 'fare', 'tolls',
       'airport', 'congestion', 'total', 'class'],
      dtype='object')

In [75]:
# Target encode PU and DO Ids
encoder_pu_price = TargetEncoder(cols=['PULocationID'])
df['PULocationID_price_encoded'] = encoder_pu_price.fit_transform(df['PULocationID'], df['total'])

encoder_do_price = TargetEncoder(cols=['DOLocationID'])
df['DOLocationID_price_encoded'] = encoder_do_price.fit_transform(df['DOLocationID'], df['total'])

encoder_pu_duration = TargetEncoder(cols=['PULocationID'])
df['PULocationID_duration_encoded'] = encoder_pu_duration.fit_transform(df['PULocationID'], df['durationsec'])

encoder_do_duration = TargetEncoder(cols=['DOLocationID'])
df['DOLocationID_duration_encoded'] = encoder_do_duration.fit_transform(df['DOLocationID'], df['durationsec'])

df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,PUy,PUx,DOLocationID,...,fare,tolls,airport,congestion,total,class,PULocationID_price_encoded,DOLocationID_price_encoded,PULocationID_duration_encoded,DOLocationID_duration_encoded
0,5,1,False,1,0,0,132,40.642948,-73.779373,41,...,70.00,6.94,1.75,0.0,95.88,0,74.509773,24.597328,2585.818754,1053.946453
1,9,1,False,1,0,0,3,40.865430,-73.867365,51,...,14.59,0.00,0.00,0.0,14.59,1,18.407239,21.733171,904.829327,1006.117639
2,33,1,False,1,0,0,145,40.741509,-73.956975,112,...,20.62,0.00,0.00,0.0,20.62,2,24.914211,25.832925,1119.940747,1152.268402
3,38,1,False,1,0,0,16,40.763120,-73.770745,73,...,23.20,0.00,0.00,0.0,23.20,2,22.297897,22.231324,965.573471,1062.154145
4,42,1,False,1,0,0,164,40.749842,-73.984251,161,...,7.90,0.00,0.00,2.5,15.48,0,29.209706,27.763912,1090.812392,1120.454953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,58,40.839167,-73.819722,213,...,14.38,0.00,0.00,0.0,14.38,2,22.215159,19.890125,989.741224,950.746821
5971770,85864,366,False,0,0,0,189,40.677871,-73.968473,62,...,15.01,0.00,0.00,0.0,15.01,2,24.636888,22.662937,1209.841759,1221.976983
5971771,85870,366,False,0,0,0,79,40.729269,-73.987361,79,...,5.80,0.00,0.00,2.5,10.80,0,25.647432,25.108819,1006.050324,987.283859
5971772,86042,366,False,0,0,0,180,40.676770,-73.843746,63,...,8.92,0.00,0.00,0.0,8.92,1,19.425836,20.700188,968.550166,1063.037634


In [76]:
# dummy encode class feature
df = pd.get_dummies(df, columns=['class'], prefix='class')
df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,PULocationID,PUy,PUx,DOLocationID,...,airport,congestion,total,PULocationID_price_encoded,DOLocationID_price_encoded,PULocationID_duration_encoded,DOLocationID_duration_encoded,class_0,class_1,class_2
0,5,1,False,1,0,0,132,40.642948,-73.779373,41,...,1.75,0.0,95.88,74.509773,24.597328,2585.818754,1053.946453,True,False,False
1,9,1,False,1,0,0,3,40.865430,-73.867365,51,...,0.00,0.0,14.59,18.407239,21.733171,904.829327,1006.117639,False,True,False
2,33,1,False,1,0,0,145,40.741509,-73.956975,112,...,0.00,0.0,20.62,24.914211,25.832925,1119.940747,1152.268402,False,False,True
3,38,1,False,1,0,0,16,40.763120,-73.770745,73,...,0.00,0.0,23.20,22.297897,22.231324,965.573471,1062.154145,False,False,True
4,42,1,False,1,0,0,164,40.749842,-73.984251,161,...,0.00,2.5,15.48,29.209706,27.763912,1090.812392,1120.454953,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,58,40.839167,-73.819722,213,...,0.00,0.0,14.38,22.215159,19.890125,989.741224,950.746821,False,False,True
5971770,85864,366,False,0,0,0,189,40.677871,-73.968473,62,...,0.00,0.0,15.01,24.636888,22.662937,1209.841759,1221.976983,False,False,True
5971771,85870,366,False,0,0,0,79,40.729269,-73.987361,79,...,0.00,2.5,10.80,25.647432,25.108819,1006.050324,987.283859,True,False,False
5971772,86042,366,False,0,0,0,180,40.676770,-73.843746,63,...,0.00,0.0,8.92,19.425836,20.700188,968.550166,1063.037634,False,True,False


In [77]:
# prep df for duration model
df_2 = df.copy()

df_2 = df_2.drop(['fare', 'PULocationID', 'DOLocationID', 'PULocationID_price_encoded', 'DOLocationID_price_encoded', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'total'],axis=1)

df_2

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,durationsec,airport,congestion,PULocationID_duration_encoded,DOLocationID_duration_encoded,class_0,class_1,class_2
0,5,1,False,1,0,0,0.0,42.80,14.48,1599.0,1.75,0.0,2585.818754,1053.946453,True,False,False
1,9,1,False,1,0,0,0.0,42.80,2.07,785.0,0.00,0.0,904.829327,1006.117639,False,True,False
2,33,1,False,1,0,0,0.0,42.80,1.27,474.0,0.00,0.0,1119.940747,1152.268402,False,False,True
3,38,1,False,1,0,0,0.0,42.80,0.81,459.0,0.00,0.0,965.573471,1062.154145,False,False,True
4,42,1,False,1,0,0,0.0,42.80,1.05,335.0,0.00,2.5,1090.812392,1120.454953,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,0.0,48.02,2.67,534.0,0.00,0.0,989.741224,950.746821,False,False,True
5971770,85864,366,False,0,0,0,0.0,48.02,1.49,488.0,0.00,0.0,1209.841759,1221.976983,False,False,True
5971771,85870,366,False,0,0,0,0.0,48.02,0.00,256.0,0.00,2.5,1006.050324,987.283859,True,False,False
5971772,86042,366,False,0,0,0,0.0,48.02,2.50,266.0,0.00,0.0,968.550166,1063.037634,False,True,False


In [78]:
# prep df for price model
df = df.loc[(df['tolls'] == 0)]

df = df.drop(['fare', 'PULocationID', 'DOLocationID', 'PULocationID_duration_encoded', 'DOLocationID_duration_encoded', 'PUx', 'PUy', 'DOx', 'DOy', 'tolls', 'durationsec'], axis=1)

df

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,airport,congestion,total,PULocationID_price_encoded,DOLocationID_price_encoded,class_0,class_1,class_2
1,9,1,False,1,0,0,0.0,42.80,2.07,0.0,0.0,14.59,18.407239,21.733171,False,True,False
2,33,1,False,1,0,0,0.0,42.80,1.27,0.0,0.0,20.62,24.914211,25.832925,False,False,True
3,38,1,False,1,0,0,0.0,42.80,0.81,0.0,0.0,23.20,22.297897,22.231324,False,False,True
4,42,1,False,1,0,0,0.0,42.80,1.05,0.0,2.5,15.48,29.209706,27.763912,True,False,False
5,42,1,False,1,0,0,0.0,42.80,9.07,2.5,0.0,91.92,60.455697,22.658322,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971769,85846,366,False,0,0,0,0.0,48.02,2.67,0.0,0.0,14.38,22.215159,19.890125,False,False,True
5971770,85864,366,False,0,0,0,0.0,48.02,1.49,0.0,0.0,15.01,24.636888,22.662937,False,False,True
5971771,85870,366,False,0,0,0,0.0,48.02,0.00,0.0,2.5,10.80,25.647432,25.108819,True,False,False
5971772,86042,366,False,0,0,0,0.0,48.02,2.50,0.0,0.0,8.92,19.425836,20.700188,False,True,False


In [79]:
df_2.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'prcp', 'temp', 'distance', 'durationsec', 'airport',
       'congestion', 'PULocationID_duration_encoded',
       'DOLocationID_duration_encoded', 'class_0', 'class_1', 'class_2'],
      dtype='object')

In [80]:
# price model target is total
X = df.copy()
X = X.drop('total', axis=1)

y = df['total'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)


In [81]:
# duration model target is durationsec
X_2 = df_2.copy()
X_2 = X_2.drop('durationsec', axis=1)

y_2 = df_2['durationsec'].values.reshape(-1,1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, random_state=29)


In [82]:
X_train2

Unnamed: 0,second_of_day,day_of_year,weekend,holiday,morning_rush,evening rush,prcp,temp,distance,airport,congestion,PULocationID_duration_encoded,DOLocationID_duration_encoded,class_0,class_1,class_2
5513533,30646,340,False,0,1,0,0.0,37.04,0.46,0.00,2.5,1243.866706,1312.922060,False,False,True
2156274,47886,110,False,0,0,0,0.0,51.08,10.09,0.00,0.0,957.043054,930.813142,False,True,False
5643886,35396,347,False,0,1,0,0.0,37.94,13.71,1.75,2.5,2585.818754,1120.454953,True,False,False
4161738,76927,258,True,0,0,0,0.0,80.96,3.84,0.00,2.5,1023.460974,1381.386238,True,False,False
2553196,82204,154,True,0,0,0,0.0,75.92,13.68,2.50,2.5,2585.818754,1083.701205,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207209,63931,14,True,0,0,0,0.0,42.98,1.31,0.00,2.5,1218.966543,979.134174,True,False,False
5735208,83364,351,False,0,0,0,0.0,48.38,1.20,0.00,2.5,1055.962478,901.801291,False,True,False
4255266,71889,264,False,0,0,0,0.0,82.40,1.08,0.00,2.5,856.351297,982.034065,False,True,False
3418621,70935,209,True,0,0,0,0.0,84.92,1.75,0.00,2.5,905.279218,1132.936973,False,True,False


In [83]:
# price model
xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                        subsample= 0.8,
                        n_estimators= 300,
                        max_depth = 13,
                        learning_rate = 0.2,
                        colsample_bytree= 0.6,
                        seed=29)

In [84]:
# duration model
xgbr2 = xgb.XGBRegressor(objective='reg:squarederror',
                        subsample= 0.8,
                        n_estimators= 500,
                        max_depth = 9,
                        learning_rate = 0.1,
                        gamma= 0.1,
                        colsample_bytree= 0.6,
                        seed=29)

In [85]:
xgbr.fit(X_train, y_train)

In [86]:
xgbr2.fit(X_train2, y_train2)

In [87]:
# price model train score
xgbr.score(X_train, y_train)

0.8648409114721716

In [88]:
# price model test score
xgbr.score(X_test, y_test)

0.7546300611867852

In [89]:
# duration model train score
xgbr2.score(X_train2, y_train2)

0.48825212692364195

In [90]:
# duration model test score
xgbr2.score(X_test2, y_test2)

0.24821528486624234

In [91]:
# price model mse, rmse
y_pred = xgbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print(mse)
print(rmse)

48.74610374726333
6.981840999855506


In [92]:
# duration model mse, rmse
y_pred_2 = xgbr2.predict(X_test2)
mse2 = mean_squared_error(y_test2, y_pred_2)
rmse2 = math.sqrt(mse2)
print(mse2)
print(rmse2)

1398566.9408388485
1182.6102235474073


In [93]:
# price model MAE ($)
mean_absolute_error(y_test, y_pred)

4.085708486062361

In [94]:
# duration model MAE (seconds)
mean_absolute_error(y_test2, y_pred_2)

276.2314804174543

In [95]:
df.columns

Index(['second_of_day', 'day_of_year', 'weekend', 'holiday', 'morning_rush',
       'evening rush', 'prcp', 'temp', 'distance', 'airport', 'congestion',
       'total', 'PULocationID_price_encoded', 'DOLocationID_price_encoded',
       'class_0', 'class_1', 'class_2'],
      dtype='object')

In [96]:
# features for test input price model
features = [
            'second_of_day',
            'day_of_year',
            'weekend',
            'holiday',
            'morning_rush',
            'evening rush',
            'prcp',
            'temp',
            'distance',
            'airport',
            'congestion',
            'PULocationID_price_encoded',
            'DOLocationID_price_encoded',
            'class_0',  
            'class_1',
            'class_2']


In [97]:
# features for test input duration model
duration_features = [
            'second_of_day',
            'day_of_year',
            'weekend',
            'holiday',
            'morning_rush',
            'evening rush',
            'prcp',
            'temp',
            'distance',
            'airport',
            'congestion',
            'PULocationID_duration_encoded',
            'DOLocationID_duration_encoded',
            'class_0', 
            'class_1',
            'class_2'
            
   
]

In [98]:
# setup datetime for NY timezone
timezone = pytz.timezone('America/New_York')
now = dt.datetime.now(timezone)
us_holidays = holidays.US(state='NY')

coords = coords_df


### Location ID Keys for Pickup and Dropoff inputs
#
![Alternative text](../Resources/zone_ids_legend.png)

In [99]:
## Prototype App ##
# test data for models to predict from ## pulls in real time info to fill in variables unknown to users at time of requests
test_input_dict = {
    'second_of_day': [(now.hour * 3600) + (now.minute * 60) + now.second],
    'day_of_year': [now.timetuple().tm_yday],
    'weekend': [now.strftime('%A') in ['Saturday', 'Sunday']],
    'holiday': [1 if now.date() in us_holidays else 0],
    'morning_rush': [1 if now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 7 <= now.hour <= 9 else 0],
    'evening rush': [1 if now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 16 <= now.hour <= 18 else 0],
    # Input Pickup LocationID Key ***
    'PULocationID': [102],
    # Input Dropoff LocationID Key ***
    'DOLocationID': [138],
    'prcp': [0], # do not change
    'temp': [0], # do not change
    'distance': [0.0], # do not change
    'airport': [0.0], # do not change
    'congestion': [2.50 if (now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 7 <= now.hour <= 9) or (now.strftime('%A') in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and 16 <= now.hour <= 18) else 0.0], # do not change
    'class': [0] # do not change
    }

################################################################################

test_input = pd.DataFrame(test_input_dict)

# create a duration test input df for the durationsec model
test_input_df_duration = pd.DataFrame(test_input_dict)

# assigns an airport fee based on location and cab and fhv parameters
airport_do_ids = [1, 32, 138]
fhv_classes = [1, 2]
fhv_airport_check = (test_input['DOLocationID'].isin(airport_do_ids)) & (test_input['class'].isin(fhv_classes))
cab_airport_check = (test_input['PULocationID'] == 132) | (test_input['PULocationID'] == 138) & (test_input['class'] == 0)

test_input.loc[fhv_airport_check, 'airport'] = 2.50
test_input.loc[cab_airport_check, 'airport'] = 1.75


# repeat encoding steps for test input locations
test_input['PULocationID_price_encoded'] = encoder_pu_price.transform(test_input['PULocationID'])
test_input['DOLocationID_price_encoded'] = encoder_do_price.transform(test_input['DOLocationID'])
test_input_df_duration['PULocationID_duration_encoded'] = encoder_pu_duration.transform(test_input_df_duration['PULocationID'])
test_input_df_duration['DOLocationID_duration_encoded'] = encoder_do_duration.transform(test_input_df_duration['DOLocationID'])

# repeat encoding steps for test input duration classes
test_input_df_duration = pd.get_dummies(test_input_df_duration, columns=['class'], prefix='class', drop_first=False)
for i in range(3):
    col_name = f'class_{i}'
    if col_name not in test_input_df_duration.columns:
        test_input_df_duration[col_name] = 0

# repeat encoding steps for test input classes
test_input = pd.get_dummies(test_input, columns=['class'], prefix='class', drop_first=False)
for i in range(3):
    col_name = f'class_{i}'
    if col_name not in test_input.columns:
        test_input[col_name] = 0


# use second duration model on duration test input
predicted_duration_yellow = xgbr2.predict(test_input_df_duration[duration_features])[0]
test_input['durationsec'] = predicted_duration_yellow

# copy the test input features and values to a df for price predictions
test_df = test_input[features].copy()

# geolocator function to calculate distances between each lat and long point of entered PU and DO locations
def calc_distance(coord1, coord2):
    return geodesic(coord1, coord2).miles

# grabs PU and DO values
pickup_location_id = test_input['PULocationID'][0]
dropoff_location_id = test_input['DOLocationID'][0]

# create PU and DO coord variables to reassign values
pickup_coords = None
dropoff_coords = None


# grab PU lat and long
try:
    pickup_row = coords_df[coords_df['LocationID'] == pickup_location_id].iloc[0]
    pickup_coords = (pickup_row['latitude'], pickup_row['longitude'])
    pickup_zone = pickup_row['Zone']
except IndexError:
    print(f"Warning: PULocationID {pickup_location_id} not found in coordinates data.")

# grab DO lat and long
try:
    dropoff_row = coords_df[coords_df['LocationID'] == dropoff_location_id].iloc[0]
    dropoff_coords = (dropoff_row['latitude'], dropoff_row['longitude'])
    dropoff_zone = dropoff_row['Zone']
except IndexError:
    print(f"Warning: DOLocationID {dropoff_location_id} not found in coordinates data.")

# use calc distance function to fill in the distance input
calculated_distance = None
if pickup_coords and dropoff_coords:
    calculated_distance = round(calc_distance(pickup_coords, dropoff_coords), 2)
    test_input['distance'] = [calculated_distance]
else:
    print("Could not determine both pickup and dropoff coordinates.")


# use the lat long values to ping the nearest weather station and return
# current Temperature and Precipitation Data
latitude = pickup_row['latitude']
longitude = pickup_row['longitude']

api_url = f"https://api.weather.gov/points/{latitude},{longitude}"
response = requests.get(api_url)
if response.status_code == 200:
    point_data = response.json()
    forecast_url = point_data['properties']['forecastHourly']
    observation_stations_url = point_data['properties']['observationStations']

    # forecast response holds most recent temp
    forecast_response = requests.get(forecast_url)
    if forecast_response.status_code == 200:
        forecast_data = forecast_response.json()
        periods = forecast_data['properties']['periods']
        if periods:
            current_temperature = periods[0]['temperature']
            test_input['temp'] = [current_temperature]

    #observation response holds most recent prcp
    observation_response = requests.get(observation_stations_url)
    if observation_response.status_code == 200:
        stations_data = observation_response.json()
        if stations_data['features']:
            first_station_url = stations_data['features'][0]['id']

            station_observation_response = requests.get(first_station_url + '/observations/latest')
            if station_observation_response.status_code == 200:
                observation_data = station_observation_response.json()
                precipitation_last_hour_mm = observation_data['properties'].get('precipLastHour', {}).get('value')
                if precipitation_last_hour_mm is not None:
                    test_input['prcp'] = [precipitation_last_hour_mm]
                else:
                    test_input['prcp'] = [0]

###############################################################################
# Predicted Outputs ###
print('*' * 80)

print('Predicted Price Per Service ($4 Variance)')
print('~' * 80)
# print price prediction for class 0 (Yellow Cab)
results_pred = xgbr.predict(test_df)
print(f'Total predicted Cab Price: ${results_pred[0]:.2f}')

# print price prediction for class 1 (Uber)
test_df_2 = test_input.copy()
test_df_2[['class_0', 'class_1', 'class_2']] = [0, 1, 0]
results_pred_2 = xgbr.predict(test_df_2[features])
print(f'Total predicted Uber Price: ${results_pred_2[0]:.2f}')

# print price prediction for class 2 (Lyft)
test_df_3 = test_input.copy()
test_df_3[['class_0', 'class_1', 'class_2']] = [0, 0, 1]
results_pred_3 = xgbr.predict(test_df_3[features])
print(f'Total predicted Lyft Price: ${results_pred_3[0]:.2f}')

print('~' * 80)
# Create a df with the results to compare min and max services
lowest_results = {'Yellow Cab': results_pred[0], 'Uber': results_pred_2[0], 'Lyft': results_pred_3[0]}
lowest_results_df = pd.DataFrame(lowest_results, index=[0])
min_value = lowest_results_df.loc[0].min()
min_service = lowest_results_df.loc[0].idxmin()
max_value = lowest_results_df.loc[0].max()
max_service = lowest_results_df.loc[0].idxmax()

# Print Service Suggestion based on lowest Price
print(f'For Pickup in {pickup_zone} and Dropoff in {dropoff_zone},\n(An avg distance of {calculated_distance} miles),\nWith {test_input["prcp"][0]} mm of Precipitation and a current Temperature of {current_temperature}°F,\n{min_service} has the lowest predicted total price of ${min_value:.2f}.\nWith {max_service} having the highest predicted total price of ${max_value:.2f}.')

# create a copy of test_input_duration for Uber
test_input_uber = test_input_df_duration.copy()
test_input_uber['class_0'] = 0
test_input_uber['class_1'] = 1
test_input_uber['class_2'] = 0

# uber duration predictions
test_input_uber_duration = test_input_uber[duration_features]
predicted_duration_uber = xgbr2.predict(test_input_uber_duration)[0]

# create a copy of test_input_duration for Lyft
test_input_lyft = test_input_df_duration.copy()
test_input_lyft['class_0'] = 0
test_input_lyft['class_1'] = 0
test_input_lyft['class_2'] = 1
# uber duration predictions
test_input_lyft_duration = test_input_lyft[duration_features]
predicted_duration_lyft = xgbr2.predict(test_input_lyft_duration)[0]

# create df with duration results / 60 for predictions in minutes
# compare min and max values
duration_predictions = {'Yellow Cab': round(predicted_duration_yellow / 60, 2),
                        'Uber': round(predicted_duration_uber / 60, 2),
                        'Lyft': round(predicted_duration_lyft / 60, 2)}

duration_df = pd.DataFrame(duration_predictions, index=[0])
shortest_duration = duration_df.loc[0].min()
shortest_service_duration = duration_df.loc[0].idxmin()
longest_duration = duration_df.loc[0].max()
longest_service_duration = duration_df.loc[0].idxmax()

# print Service Suggestion based on Lowest Predicted Duration
print('*' * 80)
print('*' * 80)
print("Predicted Average Trip Durations (~4 minute Variance):")
print('~' * 80)
print(f"  Yellow Cab: {duration_predictions['Yellow Cab']:.2f} minutes")
print(f"  Uber:       {duration_predictions['Uber']:.2f} minutes")
print(f"  Lyft:       {duration_predictions['Lyft']:.2f} minutes")
print(f"\n{shortest_service_duration} is predicted to have the shortest average trip duration of {shortest_duration:.2f} minutes.")
print(f"{longest_service_duration} is predicted to have the longest average trip duration of {longest_duration:.2f} minutes.")
print('*' * 80)

********************************************************************************
Predicted Price Per Service ($4 Variance)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Total predicted Cab Price: $65.32
Total predicted Uber Price: $32.24
Total predicted Lyft Price: $44.71
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For Pickup in Glendale and Dropoff in LaGuardia Airport,
(An avg distance of 5.17 miles),
With 0 mm of Precipitation and a current Temperature of 47°F,
Uber has the lowest predicted total price of $32.24.
With Yellow Cab having the highest predicted total price of $65.32.
********************************************************************************
********************************************************************************
Predicted Average Trip Durations (~4 minute Variance):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Yellow Cab: 4.25 minutes
  Uber:      

In [100]:
# show what values were filled for test input price
print(test_input)

   second_of_day  day_of_year  weekend  holiday  morning_rush  evening rush  \
0          80241           79    False        0             0             0   

   PULocationID  DOLocationID  prcp  temp  distance  airport  congestion  \
0           102           138     0    47      5.17      0.0         0.0   

   PULocationID_price_encoded  DOLocationID_price_encoded  class_0  class_1  \
0                   20.017997                    55.24608     True        0   

   class_2  durationsec  
0        0   255.120178  


In [101]:
# show what values were filled for test input duration
print(test_input_df_duration)

   second_of_day  day_of_year  weekend  holiday  morning_rush  evening rush  \
0          80241           79    False        0             0             0   

   PULocationID  DOLocationID  prcp  temp  distance  airport  congestion  \
0           102           138     0     0       0.0      0.0         0.0   

   PULocationID_duration_encoded  DOLocationID_duration_encoded  class_0  \
0                    1026.340419                    1691.845303     True   

   class_1  class_2  
0        0        0  


In [114]:
#saving models and encoders as pickles for streamlit app

with open('../../streamlit_app/ny_ride_service_predictor/xgbr_price.pkl', 'wb') as f:
    pickle.dump(xgbr, f)

with open('../../streamlit_app/ny_ride_service_predictor/xgbr_duration.pkl', 'wb') as f:
    pickle.dump(xgbr2, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_pu_price.pkl', 'wb') as f:
    pickle.dump(encoder_pu_price, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_do_price.pkl', 'wb') as f:
    pickle.dump(encoder_do_price, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_pu_duration.pkl', 'wb') as f:
    pickle.dump(encoder_pu_duration, f)

with open('../../streamlit_app/ny_ride_service_predictor/encoder_do_duration.pkl', 'wb') as f:
    pickle.dump(encoder_do_duration, f)

with open('../../streamlit_app/ny_ride_service_predictor/coords_df.pkl', 'wb') as f:
    pickle.dump(coords_df, f)

In [103]:
# check what observation data was pulled
observation_data

{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld',
  {'@version': '1.1',
   'wx': 'https://api.weather.gov/ontology#',
   's': 'https://schema.org/',
   'geo': 'http://www.opengis.net/ont/geosparql#',
   'unit': 'http://codes.wmo.int/common/unit/',
   '@vocab': 'https://api.weather.gov/ontology#',
   'geometry': {'@id': 's:GeoCoordinates', '@type': 'geo:wktLiteral'},
   'city': 's:addressLocality',
   'state': 's:addressRegion',
   'distance': {'@id': 's:Distance', '@type': 's:QuantitativeValue'},
   'bearing': {'@type': 's:QuantitativeValue'},
   'value': {'@id': 's:value'},
   'unitCode': {'@id': 's:unitCode', '@type': '@id'},
   'forecastOffice': {'@type': '@id'},
   'forecastGridData': {'@type': '@id'},
   'publicZone': {'@type': '@id'},
   'county': {'@type': '@id'}}],
 'id': 'https://api.weather.gov/stations/KLGA/observations/2025-03-21T00:51:00+00:00',
 'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-73.9, 40.77]},
 'properties': {'@id': 'h

In [104]:
# check what farecast data was pulled
forecast_data

{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld',
  {'@version': '1.1',
   'wx': 'https://api.weather.gov/ontology#',
   'geo': 'http://www.opengis.net/ont/geosparql#',
   'unit': 'http://codes.wmo.int/common/unit/',
   '@vocab': 'https://api.weather.gov/ontology#'}],
 'type': 'Feature',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-73.8866, 40.6884],
    [-73.88210000000001, 40.710100000000004],
    [-73.9107, 40.7135],
    [-73.9152, 40.6918],
    [-73.8866, 40.6884]]]},
 'properties': {'units': 'us',
  'forecastGenerator': 'HourlyForecastGenerator',
  'generatedAt': '2025-03-21T02:02:01+00:00',
  'updateTime': '2025-03-21T00:43:09+00:00',
  'validTimes': '2025-03-20T18:00:00+00:00/P7DT18H',
  'elevation': {'unitCode': 'wmoUnit:m', 'value': 29.8704},
  'periods': [{'number': 1,
    'name': '',
    'startTime': '2025-03-20T22:00:00-04:00',
    'endTime': '2025-03-20T23:00:00-04:00',
    'isDaytime': False,
    'temperature': 47,
    'temperatureUnit': 'F',

### Below was used to geolocate lat and long coords for each location ID
#

In [105]:
'''taxi_zone_df = pd.read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv')
taxi_zone_df = taxi_zone_df.dropna()
zone_names_list = taxi_zone_df['Zone'].unique().tolist()

geolocator = Nominatim(user_agent="taxi_zone_locator_v2")
zone_coordinates = {}

borough_lookup = taxi_zone_df.set_index('Zone')['Borough'].to_dict()

for zone in zone_names_list:
    borough = borough_lookup.get(zone)
    if borough:
        query = f"{zone}, {borough}, New York City"
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                zone_coordinates[zone] = (location.latitude, location.longitude)
            else:
                print(f"Could not find coordinates for: {zone} in {borough}")
                zone_coordinates[zone] = None
        except Exception as e:
            print(f"Error geocoding {zone} in {borough}: {e}")
            zone_coordinates[zone] = None
    else:
        print(f"Borough information not found for zone: {zone}")
        zone_coordinates[zone] = None

zone_coordinates'''

'taxi_zone_df = pd.read_csv(\'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv\')\ntaxi_zone_df = taxi_zone_df.dropna()\nzone_names_list = taxi_zone_df[\'Zone\'].unique().tolist()\n\ngeolocator = Nominatim(user_agent="taxi_zone_locator_v2")\nzone_coordinates = {}\n\nborough_lookup = taxi_zone_df.set_index(\'Zone\')[\'Borough\'].to_dict()\n\nfor zone in zone_names_list:\n    borough = borough_lookup.get(zone)\n    if borough:\n        query = f"{zone}, {borough}, New York City"\n        try:\n            location = geolocator.geocode(query, timeout=10)\n            if location:\n                zone_coordinates[zone] = (location.latitude, location.longitude)\n            else:\n                print(f"Could not find coordinates for: {zone} in {borough}")\n                zone_coordinates[zone] = None\n        except Exception as e:\n            print(f"Error geocoding {zone} in {borough}: {e}")\n            zone_coordinates[zone] = None\n    else:\n        print(f"Borough

In [106]:
'''zone_names_list = taxi_zone_df['Zone'].unique().tolist()
borough_lookup = taxi_zone_df.set_index('Zone')['Borough'].to_dict()

geolocator = Nominatim(user_agent="improved_fix_locator")
zone_coordinates_fixed = {k: v for k, v in zone_coordinates.items()} 

zones_to_retry = [zone for zone, coords in zone_coordinates_fixed.items() if coords is None]

for zone in zones_to_retry:
    print(f"Type of zone: {type(zone)}, Value of zone: {zone}")
    borough = borough_lookup.get(zone)
    if borough:
        if "/" in zone:
            parts = zone.split("/")
            for part in parts:
                query = f"{part.strip()}, {borough}, New York City"
                location = geolocator.geocode(query, timeout=10)
                if location:
                    print(f"Found (split): {zone} -> {query} ({location.latitude}, {location.longitude})")
                    zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
                    break
        elif "(" in zone or ")" in zone:
            general_zone = zone.replace("(", "").replace(")", " Neighborhood").strip()
            query = f"{general_zone}, {borough}, New York City"
            location = geolocator.geocode(query, timeout=10)
            if location:
                print(f"Found (parenthesis): {zone} -> {query} ({location.latitude}, {location.longitude})")
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif "North" in zone or "South" in zone or "East" in zone or "West" in zone or "Upper" in zone or "Lower" in zone:
            general_zone = zone.replace(" North", "").replace(" South", "").replace(" East", "").replace(" West", "").replace(" Upper", "").replace(" Lower", "").strip()
            query = f"{general_zone}, {borough}, New York City"
            location = geolocator.geocode(query, timeout=10)
            if location:
                print(f"Found (cardinal): {zone} -> {query} ({location.latitude}, {location.longitude})")
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == "Governor's Island/Ellis Island/Liberty Island":
            location = geolocator.geocode("Governor's Island, New York City")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Newark Airport':
            location = geolocator.geocode("Newark Liberty International Airport, Newark, NJ")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Arden Heights':
            location = geolocator.geocode("Arden Heights, Staten Island, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Bronx Park':
            location = geolocator.geocode("Bronx Park, Bronx, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Central Harlem':
            location = geolocator.geocode("Central Harlem Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Central Harlem North':
            location = geolocator.geocode("Central Harlem North Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'East Chelsea':
            location = geolocator.geocode("Chelsea Neighborhood, Manhattan, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Kew Gardens Hills':
            location = geolocator.geocode("Kew Gardens Hills, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Madison':
            location = geolocator.geocode("Madison Neighborhood, Brooklyn, NY 11229")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'North Corona':
            location = geolocator.geocode("Corona Neighborhood, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Old Astoria':
            location = geolocator.geocode("Astoria, Queens, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Prospect-Lefferts Gardens':
            location = geolocator.geocode("Prospect Lefferts Gardens Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Van Cortlandt Village':
            location = geolocator.geocode("Van Cortlandt Village Neighborhood, Bronx, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Williamsburg (North Side)':
            location = geolocator.geocode("North Williamsburg Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)
        elif zone == 'Williamsburg (South Side)':
            location = geolocator.geocode("South Williamsburg Neighborhood, Brooklyn, NY")
            if location:
                zone_coordinates_fixed[zone] = (location.latitude, location.longitude)

zone_coordinates_fixed'''

'zone_names_list = taxi_zone_df[\'Zone\'].unique().tolist()\nborough_lookup = taxi_zone_df.set_index(\'Zone\')[\'Borough\'].to_dict()\n\ngeolocator = Nominatim(user_agent="improved_fix_locator")\nzone_coordinates_fixed = {k: v for k, v in zone_coordinates.items()} \n\nzones_to_retry = [zone for zone, coords in zone_coordinates_fixed.items() if coords is None]\n\nfor zone in zones_to_retry:\n    print(f"Type of zone: {type(zone)}, Value of zone: {zone}")\n    borough = borough_lookup.get(zone)\n    if borough:\n        if "/" in zone:\n            parts = zone.split("/")\n            for part in parts:\n                query = f"{part.strip()}, {borough}, New York City"\n                location = geolocator.geocode(query, timeout=10)\n                if location:\n                    print(f"Found (split): {zone} -> {query} ({location.latitude}, {location.longitude})")\n                    zone_coordinates_fixed[zone] = (location.latitude, location.longitude)\n                    break

In [107]:
#zone_coordinates_fixed['Williamsburg (North Side)'] = (40.7149, -73.9528)
#print(zone_coordinates_fixed['Williamsburg (North Side)'])

In [108]:
#zone_coordinates_fixed['Williamsburg (South Side)'] = (40.7044, -73.9566)
#print(zone_coordinates_fixed['Williamsburg (South Side)'])

In [109]:
#zone_coordinates_fixed['Van Cortlandt Village'] = (40.8837, -73.8931)

#zone_coordinates_fixed['North Corona'] = (40.7544, -73.8669)

#zone_coordinates_fixed['Madison'] = (42.8990, -75.5121)

#zone_coordinates_fixed['East Chelsea'] = (40.7465, -74.0014)

#zone_coordinates_fixed['Central Harlem'] = (40.8089, -73.9482)

#zone_coordinates_fixed['Central Harlem North'] = (40.8089, -73.9482)

#zone_coordinates_fixed['Prospect-Lefferts Gardens'] = (40.6592, -73.9534)
#zone_coordinates_fixed

In [110]:
#taxi_zone_df

In [111]:
'''zone_coordinates_series = pd.Series(zone_coordinates_fixed)
zone_coordinates_df = pd.DataFrame(zone_coordinates_series, columns=['coords'])
zone_coordinates_df = zone_coordinates_df.reset_index()
zone_coordinates_df.rename(columns={'index': 'Zone'}, inplace=True)
zone_coordinates_df'''

"zone_coordinates_series = pd.Series(zone_coordinates_fixed)\nzone_coordinates_df = pd.DataFrame(zone_coordinates_series, columns=['coords'])\nzone_coordinates_df = zone_coordinates_df.reset_index()\nzone_coordinates_df.rename(columns={'index': 'Zone'}, inplace=True)\nzone_coordinates_df"

In [112]:
#zone_coords_ids_df = pd.merge(taxi_zone_df, zone_coordinates_df, on='Zone')
#zone_coords_ids_df

In [113]:
#zone_coords_ids_df.to_csv('../Resources/zone_coords_ids.csv')