In [2]:
!where pip     

D:\Personal\self study\uber fare prediction\development\virtualenv_dev_uber_fare\Scripts\pip.exe
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Scripts\pip.exe


In [3]:
!pip list

Package                            Version
---------------------------------- -----------
alembic                            1.13.2
aniso8601                          9.0.1
blinker                            1.8.2
cachetools                         5.5.0
certifi                            2024.7.4
charset-normalizer                 3.3.2
click                              8.1.7
cloudpickle                        3.0.0
colorama                           0.4.6
contourpy                          1.2.1
cycler                             0.12.1
databricks-sdk                     0.30.0
Deprecated                         1.2.14
docker                             7.1.0
entrypoints                        0.4
Flask                              3.0.3
fonttools                          4.53.1
gitdb                              4.0.11
GitPython                          3.1.43
google-auth                        2.34.0
graphene                           3.3
graphql-core                       3.2.3
g

In [4]:
import pandas as pd
import numpy as np
import mlflow
from math import radians, sin, cos, sqrt, atan2
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [5]:
df = pd.read_csv('../uber.csv')

In [6]:
df.drop(columns = ['Unnamed: 0','key'], inplace = True)

In [7]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [8]:
f'Number of rows: {df.shape[0]:,}'

'Number of rows: 200,000'

### Datetime feature engineering

In [9]:
df.pickup_datetime = pd.to_datetime(df.pickup_datetime)

In [10]:
df['dayofyear'] = df.pickup_datetime.dt.dayofyear
df['dayofweek'] = df.pickup_datetime.dt.weekday
df['time'] = df.pickup_datetime.dt.time

In [11]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofyear,dayofweek,time
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,127,3,19:52:06
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,198,4,20:04:56
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,236,0,21:45:00
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,177,4,08:22:21
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,240,3,17:47:00


**Checking Consistency**

In [12]:
np.sort(df.time.unique())

array([datetime.time(0, 0), datetime.time(0, 0, 2),
       datetime.time(0, 0, 3), ..., datetime.time(23, 59, 57),
       datetime.time(23, 59, 58), datetime.time(23, 59, 59)], dtype=object)

In [13]:
df.dayofweek.nunique()

7

In [14]:
df.dayofyear.nunique()

366

**It seems that all datetime features are consistant and that all possible values are available**

### Transforming location features and summerize them to be only distance

Here for simplicity, instead of using an API to get the exact street distance between the 2 locations, I'll use Haversine distance 

In [15]:
def haversine_distance(long1,lat1, long2, lat2):    
    # Convert latitude and longitude to radians
    lat1_rad = radians(lat1)
    long1_rad = radians(long1)
    lat2_rad = radians(lat2)
    long2_rad = radians(long2)

    # Haversine formula
    dlong = long2_rad - long1_rad
    dlat = lat2_rad - lat1_rad
    a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlong/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = 6371 * c  # Earth radius in kilometers
    return distance

In [16]:
df['distance_km'] = df.apply(lambda x: haversine_distance(x.pickup_longitude, x.pickup_latitude, x.dropoff_longitude, x.dropoff_latitude) ,axis = 1 )

In [17]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofyear,dayofweek,time,distance_km
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,127,3,19:52:06,1.683323
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,198,4,20:04:56,2.45759
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,236,0,21:45:00,5.036377
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,177,4,08:22:21,1.661683
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,240,3,17:47:00,4.47545


Lets see the fare amount and distance overtime

In [18]:
df.groupby(df.pickup_datetime.dt.year)[['fare_amount','distance_km']].median()

Unnamed: 0_level_0,fare_amount,distance_km
pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2009,7.7,2.063706
2010,7.7,2.095978
2011,7.7,2.110952
2012,8.5,2.166306
2013,9.5,2.153051
2014,9.5,2.146175
2015,9.5,2.100351


**As seen above, the year didn't really make that difference, so I'll discard the idea of adding the gas price or some feature reflects the inflation just for simplicity**

In [19]:
df.drop(columns = ['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude', 'dropoff_latitude' ], inplace = True)

In [20]:
df.head()

Unnamed: 0,fare_amount,passenger_count,dayofyear,dayofweek,time,distance_km
0,7.5,1,127,3,19:52:06,1.683323
1,7.7,1,198,4,20:04:56,2.45759
2,12.9,1,236,0,21:45:00,5.036377
3,5.3,3,177,4,08:22:21,1.661683
4,16.0,5,240,3,17:47:00,4.47545


### Checking other features

In [21]:
df.passenger_count.value_counts()

passenger_count
1      138425
2       29428
5       14009
3        8881
4        4276
6        4271
0         709
208         1
Name: count, dtype: int64

I'll simply drop any row with passanger count greater than 6 or equal to 0

In [22]:
df = df[(df['passenger_count'] <= 6) & (df['passenger_count'] >= 1)]

In [23]:
f'{df.shape[0]:,}'

'199,290'

Now that all features are ready, I'll start transforming

### Feature Transformation

In [24]:
df['dayofyear_sin'] = np.sin(2*np.pi*df['dayofyear']/366)
df['dayofyear_cos'] = np.cos(2*np.pi*df['dayofyear']/366)
df['dayofweek_sin'] = np.sin(2*np.pi*df['dayofweek']/6)
df['dayofweek_cos'] = np.cos(2*np.pi*df['dayofweek']/6)
df['total_seconds'] = df['time'].apply(lambda t: t.hour * 3600 + t.minute * 60 + t.second )
df['time_sin'] = np.sin(2*np.pi*df['total_seconds']/86400)
df['time_cos'] = np.cos(2*np.pi*df['total_seconds']/86400)

In [25]:
df.head()

Unnamed: 0,fare_amount,passenger_count,dayofyear,dayofweek,time,distance_km,dayofyear_sin,dayofyear_cos,dayofweek_sin,dayofweek_cos,total_seconds,time_sin,time_cos
0,7.5,1,127,3,19:52:06,1.683323,0.819972,-0.572404,1.224647e-16,-1.0,71526,-0.882743,0.469857
1,7.7,1,198,4,20:04:56,2.45759,-0.254671,-0.967028,-0.8660254,-0.5,72296,-0.855063,0.518525
2,12.9,1,236,0,21:45:00,5.036377,-0.789418,-0.613856,0.0,1.0,78300,-0.55557,0.83147
3,5.3,3,177,4,08:22:21,1.661683,0.102821,-0.9947,-0.8660254,-0.5,30141,0.813228,-0.581946
4,16.0,5,240,3,17:47:00,4.47545,-0.829677,-0.558244,1.224647e-16,-1.0,64020,-0.998392,-0.056693


In [26]:
final_df = df.drop(columns = ['dayofyear','dayofweek','time','total_seconds']).reset_index(drop = True)

In [27]:
final_df.head()

Unnamed: 0,fare_amount,passenger_count,distance_km,dayofyear_sin,dayofyear_cos,dayofweek_sin,dayofweek_cos,time_sin,time_cos
0,7.5,1,1.683323,0.819972,-0.572404,1.224647e-16,-1.0,-0.882743,0.469857
1,7.7,1,2.45759,-0.254671,-0.967028,-0.8660254,-0.5,-0.855063,0.518525
2,12.9,1,5.036377,-0.789418,-0.613856,0.0,1.0,-0.55557,0.83147
3,5.3,3,1.661683,0.102821,-0.9947,-0.8660254,-0.5,0.813228,-0.581946
4,16.0,5,4.47545,-0.829677,-0.558244,1.224647e-16,-1.0,-0.998392,-0.056693


**Now that we have the features store ready lets do some ML experiments**

# Modeling

In [28]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc_uber_fare")

2024/08/21 16:38:09 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/08/21 16:38:09 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='file:///D:/Personal/self study/uber fare prediction/development/mlruns/1', creation_time=1724247490097, experiment_id='1', last_update_time=1724247490097, lifecycle_stage='active', name='nyc_uber_fare', tags={}>

In [29]:
X = final_df.drop(['fare_amount'], axis = 1).reset_index(drop = True)
y = final_df['fare_amount']

In [30]:
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
import xgboost

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
import os
import pickle
os.makedirs("models", exist_ok=True)

Trying a run with the default params of xgboost

In [35]:
with mlflow.start_run():
    mlflow.set_tag("model_type", "xgboost")
    mlflow.set_tag('developer', 'Marwan Yasser')
    xgb_r = xgboost.XGBRegressor()
    xgb_r.fit(X_train, y_train)
    pred = xgb_r.predict(X_test) 
    rmse = MSE(y_test, pred, squared = False)
    print('rmse: ', rmse)
    mlflow.log_metric("rmse",rmse)

    with open("models/xgb_r.bin", "wb") as f:
        pickle.dump(xgb_r, f)
    mlflow.log_artifact(local_path="models/xgb_r.bin", artifact_path="models_pickle")

rmse:  5.399106777604885


In [41]:
pred = xgb_r.predict(X_train)
train_rmse = MSE(y_train,pred, squared =False)
pred = xgb_r.predict(X_test) 
test_rmse = MSE(y_test, pred, squared = False)
print('train loss: ', train_rmse, ',  Test loss: ', test_rmse)

train loss:  4.319850361794423 ,  Test loss:  5.399106777604885


Use random search to tune the hyperparameters of xgboost

In [39]:
param_dist = {
    'n_estimators': [100, 200, 300],           # Number of boosting rounds
    'max_depth': [3, 4, 5, 6, 7, 8, 9],              # Maximum depth of a tree
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],        # Step size shrinkage
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],              # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],       # Fraction of features used per tree
    'min_child_weight': [1, 3, 5, 7],                    # Minimum sum of instance weight (Hessian)
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],                    # Minimum loss reduction required for a split
    'reg_alpha': [0, 0.01, 0.1, 1, 10],                  # L1 regularization term on weights
    'reg_lambda': [0, 0.01, 0.1, 1, 10],                 # L2 regularization term on weights
}


In [42]:
with mlflow.start_run():
    mlflow.set_tag("model_type", "xgboost")
    mlflow.set_tag('developer', 'Marwan Yasser')
    xgb_r = xgboost.XGBRegressor( tree_method="gpu_hist")
    # Initialize RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=xgb_r, 
        param_distributions=param_dist, 
        n_iter=150,          # Number of parameter settings that are sampled
        cv=5,                # 5-fold cross-validation
        verbose=2,           # Display progress messages
        random_state=42,     # For reproducibility
    )
    
    random_search.fit(X_train, y_train)
    
    best_params = random_search.best_params_
    print("Best Parameters found by RandomizedSearchCV:")
    print(best_params)
    mlflow.log_params(best_params)
    best_model_xgb= random_search.best_estimator_
    
    pred = best_model_xgb.predict(X_train)
    train_rmse = MSE(y_train,pred, squared =False)
    
    pred = best_model_xgb.predict(X_test) 
    test_rmse = MSE(y_test, pred, squared = False)
    print('Train loss: ', train_rmse, ', Test loss: ', test_rmse)
    mlflow.log_metric('rmse', test_rmse)

    with open("models/xgb_r_tuned.bin", "wb") as f:
        pickle.dump(best_model_xgb, f)
    mlflow.log_artifact(local_path="models/xgb_r_tuned.bin", artifact_path="models_pickle")

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   1.2s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   0.8s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, reg_alpha=1, reg_lambda=0.01, subsample=0.9; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_r

NotFittedError: need to call fit or load_model beforehand

In [48]:
with mlflow.start_run():
    mlflow.set_tag("model_type", "xgboost")
    mlflow.set_tag('developer', 'Marwan Yasser')
    
    pred = best_model_xgb.predict(X_test) 
    test_rmse = MSE(y_test, pred, squared = False)
    print('Train loss: ', train_rmse, ', Test loss: ', test_rmse)
    mlflow.log_metric('rmse', test_rmse)
    mlflow.log_params(best_params)
    with open("models/xgb_r_tuned.bin", "wb") as f:
        pickle.dump(best_model_xgb, f)
    mlflow.log_artifact(local_path="models/xgb_r_tuned.bin", artifact_path="models_pickle")

Train loss:  5.506313232133612 , Test loss:  5.215810195970013


I'll just stop here for simplicity