In [1]:
!where pip     # Use a virtual environment for development

D:\Personal\self study\uber fare prediction\virtualenv_dev_uber_fare\Scripts\pip.exe
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Scripts\pip.exe


In [2]:
!pip list

Package                            Version
---------------------------------- --------------
alembic                            1.13.2
aniso8601                          9.0.1
anyio                              4.4.0
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
arrow                              1.3.0
asttokens                          2.4.1
async-lru                          2.0.4
attrs                              24.2.0
babel                              2.16.0
beautifulsoup4                     4.12.3
bleach                             6.1.0
blinker                            1.8.2
cachetools                         5.4.0
certifi                            2024.7.4
cffi                               1.17.0
charset-normalizer                 3.3.2
click                              8.1.7
cloudpickle                        3.0.0
colorama                           0.4.6
comm                               0.2.2
contourpy                          1

In [3]:
import pandas as pd
import numpy as np
import mlflow
from math import radians, sin, cos, sqrt, atan2
import matplotlib.pyplot as plt
import datetime

In [75]:
import seaborn as sns

In [142]:
df = pd.read_csv('uber.csv')

In [143]:
df.drop(columns = ['Unnamed: 0','key'], inplace = True)

In [144]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [145]:
f'Number of rows: {df.shape[0]:,}'

'Number of rows: 200,000'

### Datetime feature engineering

In [146]:
df.pickup_datetime = pd.to_datetime(df.pickup_datetime)

In [147]:
df['dayofyear'] = df.pickup_datetime.dt.dayofyear
df['dayofweek'] = df.pickup_datetime.dt.weekday
df['time'] = df.pickup_datetime.dt.time

In [148]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofyear,dayofweek,time
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,127,3,19:52:06
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,198,4,20:04:56
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,236,0,21:45:00
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,177,4,08:22:21
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,240,3,17:47:00


**Checking Consistency**

In [149]:
np.sort(df.time.unique())

array([datetime.time(0, 0), datetime.time(0, 0, 2),
       datetime.time(0, 0, 3), ..., datetime.time(23, 59, 57),
       datetime.time(23, 59, 58), datetime.time(23, 59, 59)], dtype=object)

In [150]:
df.dayofweek.nunique()

7

In [151]:
df.dayofyear.nunique()

366

**It seems that all datetime features are consistant and that all possible values are available**

### Transforming location features and summerize them to be only distance

Here for simplicity, instead of using an API to get the exact street distance between the 2 locations, I'll use Haversine distance 

In [152]:
def haversine_distance(long1,lat1, long2, lat2):    
    # Convert latitude and longitude to radians
    lat1_rad = radians(lat1)
    long1_rad = radians(long1)
    lat2_rad = radians(lat2)
    long2_rad = radians(long2)

    # Haversine formula
    dlong = long2_rad - long1_rad
    dlat = lat2_rad - lat1_rad
    a = sin(dlat/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlong/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = 6371 * c  # Earth radius in kilometers
    return distance

In [153]:
df['distance_km'] = df.apply(lambda x: haversine_distance(x.pickup_longitude, x.pickup_latitude, x.dropoff_longitude, x.dropoff_latitude) ,axis = 1 )

In [154]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofyear,dayofweek,time,distance_km
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,127,3,19:52:06,1.683323
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,198,4,20:04:56,2.45759
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,236,0,21:45:00,5.036377
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,177,4,08:22:21,1.661683
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,240,3,17:47:00,4.47545


Lets see the fare amount and distance overtime

In [155]:
df.groupby(df.pickup_datetime.dt.year)[['fare_amount','distance_km']].median()

Unnamed: 0_level_0,fare_amount,distance_km
pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2009,7.7,2.063706
2010,7.7,2.095978
2011,7.7,2.110952
2012,8.5,2.166306
2013,9.5,2.153051
2014,9.5,2.146175
2015,9.5,2.100351


**As seen above, the year didn't really make that difference, so I'll discard the idea of adding the gas price or some feature reflects the inflation just for simplicity**

In [156]:
df.drop(columns = ['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude', 'dropoff_latitude' ], inplace = True)

In [157]:
df.head()

Unnamed: 0,fare_amount,passenger_count,dayofyear,dayofweek,time,distance_km
0,7.5,1,127,3,19:52:06,1.683323
1,7.7,1,198,4,20:04:56,2.45759
2,12.9,1,236,0,21:45:00,5.036377
3,5.3,3,177,4,08:22:21,1.661683
4,16.0,5,240,3,17:47:00,4.47545


### Checking other features

In [158]:
df.passenger_count.value_counts()

passenger_count
1      138425
2       29428
5       14009
3        8881
4        4276
6        4271
0         709
208         1
Name: count, dtype: int64

I'll simply drop any row with passanger count greater than 6 or equal to 0

In [163]:
df = df[(df['passenger_count'] <= 6) & (df['passenger_count'] >= 1)]

In [164]:
f'{df.shape[0]:,}'

'199,290'

Now that all features are ready, I'll start transforming

### Feature Transformation

In [165]:
df['dayofyear_sin'] = np.sin(2*np.pi*df['dayofyear']/366)
df['dayofyear_cos'] = np.cos(2*np.pi*df['dayofyear']/366)
df['dayofweek_sin'] = np.sin(2*np.pi*df['dayofweek']/6)
df['dayofweek_cos'] = np.cos(2*np.pi*df['dayofweek']/6)
df['total_seconds'] = df['time'].apply(lambda t: t.hour * 3600 + t.minute * 60 + t.second )
df['time_sin'] = np.sin(2*np.pi*df['total_seconds']/86400)
df['time_cos'] = np.cos(2*np.pi*df['total_seconds']/86400)

In [166]:
df.head()

Unnamed: 0,fare_amount,passenger_count,dayofyear,dayofweek,time,distance_km,dayofyear_sin,dayofyear_cos,dayofweek_sin,dayofweek_cos,total_seconds,time_sin,time_cos
0,7.5,1,127,3,19:52:06,1.683323,0.819972,-0.572404,1.224647e-16,-1.0,71526,-0.882743,0.469857
1,7.7,1,198,4,20:04:56,2.45759,-0.254671,-0.967028,-0.8660254,-0.5,72296,-0.855063,0.518525
2,12.9,1,236,0,21:45:00,5.036377,-0.789418,-0.613856,0.0,1.0,78300,-0.55557,0.83147
3,5.3,3,177,4,08:22:21,1.661683,0.102821,-0.9947,-0.8660254,-0.5,30141,0.813228,-0.581946
4,16.0,5,240,3,17:47:00,4.47545,-0.829677,-0.558244,1.224647e-16,-1.0,64020,-0.998392,-0.056693


In [170]:
final_df = df.drop(columns = ['dayofyear','dayofweek','time','total_seconds']).reset_index(drop = True)

In [172]:
final_df.head()

Unnamed: 0,fare_amount,passenger_count,distance_km,dayofyear_sin,dayofyear_cos,dayofweek_sin,dayofweek_cos,time_sin,time_cos
0,7.5,1,1.683323,0.819972,-0.572404,1.224647e-16,-1.0,-0.882743,0.469857
1,7.7,1,2.45759,-0.254671,-0.967028,-0.8660254,-0.5,-0.855063,0.518525
2,12.9,1,5.036377,-0.789418,-0.613856,0.0,1.0,-0.55557,0.83147
3,5.3,3,1.661683,0.102821,-0.9947,-0.8660254,-0.5,0.813228,-0.581946
4,16.0,5,4.47545,-0.829677,-0.558244,1.224647e-16,-1.0,-0.998392,-0.056693


**Now that we have the features store ready lets do some ML experiments**

## Modeling

To Do
- Monitor using Evidently, premetheus 
