# Driver Lifetime Value
Source: [StrataScratch data project: Driver Lifetime Value](https://platform.stratascratch.com/data-projects/driver-lifetime-value)

In [66]:
import pandas as pd
import numpy as np

In [4]:
# Load the three raw tables: driver roster, ride attributes, and the
# long-format ride event log (one row per ride event).
driver = pd.read_csv("./datasets/driver_ids.csv")
ride = pd.read_csv("./datasets/ride_ids.csv")
ridets = pd.read_csv("./datasets/ride_timestamps.csv")

# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appends a stray "None" line after each report.
driver.info()
ride.info()
ridets.info()

ridets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937 entries, 0 to 936
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   driver_id            937 non-null    object
 1   driver_onboard_date  937 non-null    object
dtypes: object(2)
memory usage: 14.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193502 entries, 0 to 193501
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   driver_id        193502 non-null  object
 1   ride_id          193502 non-null  object
 2   ride_distance    193502 non-null  int64 
 3   ride_duration    193502 non-null  int64 
 4   ride_prime_time  193502 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 7.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970405 entries, 0 to 970404
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     -

Unnamed: 0,ride_id,event,timestamp
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05
...,...,...,...
970400,ffffccd77f47a3de26dfed9a851464b4,requested_at,2016-05-18 08:44:13
970401,ffffccd77f47a3de26dfed9a851464b4,accepted_at,2016-05-18 08:44:21
970402,ffffccd77f47a3de26dfed9a851464b4,arrived_at,2016-05-18 08:44:36
970403,ffffccd77f47a3de26dfed9a851464b4,picked_up_at,2016-05-18 08:44:42


### (1) Feature engineering

In [32]:
# Reshape the long event log into one row per ride with one timestamp column
# per event, then derive two elapsed-time features.
ridets["timestamp"] = pd.to_datetime(ridets["timestamp"])
ridets_pivot = (
    ridets.pivot(values="timestamp", index="ride_id", columns="event")
    .reset_index()
    .assign(
        # Elapsed time from request to acceptance, in seconds.
        request_to_accept=lambda wide: (
            wide["accepted_at"] - wide["requested_at"]
        ).dt.total_seconds(),
        # Elapsed time from pickup to drop-off, in minutes (note: not seconds).
        pickup_to_dropoff=lambda wide: (
            wide["dropped_off_at"] - wide["picked_up_at"]
        )
        / pd.Timedelta(minutes=1),
    )
)


In [185]:
# Build the trip-level frame. Both joins are inner joins, so a ride is kept
# only if it has timestamp events AND its driver appears in the driver table.
# `validate` makes pandas raise if the expected key cardinality is violated.
trips = pd.merge(
    pd.merge(ride, ridets_pivot, how="inner", on="ride_id", validate="1:1"),
    driver,
    how="inner",
    on="driver_id",
    validate="m:1",
)
print(f"{driver.shape=} {ride.shape=} {ridets_pivot.shape=} {trips.shape=}")

driver.shape=(937, 2) ride.shape=(193502, 5) ridets_pivot.shape=(194081, 8) trips.shape=(184209, 13)


In [186]:
# Fare model parameters, pulled out of the formula so they can be read and
# tuned in one place.
# NOTE(review): the formula assumes ride_duration is in seconds and
# ride_distance is in meters -- confirm against the data dictionary.
BASE_FARE = 2
COST_PER_MINUTE = 0.22
COST_PER_MILE = 1.15
SERVICE_FEE = 1.75
MIN_FARE = 5
MAX_FARE = 400
SECONDS_PER_MINUTE = 60
METERS_PER_MILE = 1609.34

# Calculate fares:
#   bare_fare  = (base + per-minute + per-mile) * prime-time multiplier + fee
#   final_fare = bare_fare clamped to [MIN_FARE, MAX_FARE]
trips = trips.assign(
    bare_fare=lambda x: (
        BASE_FARE
        + COST_PER_MINUTE * x.ride_duration / SECONDS_PER_MINUTE
        + COST_PER_MILE * x.ride_distance / METERS_PER_MILE
    )
    # ride_prime_time is applied as a percentage surcharge (e.g. 25 -> x1.25).
    * (100 + x.ride_prime_time)
    / 100
    + SERVICE_FEE,
    final_fare=lambda x: np.minimum(MAX_FARE, np.maximum(MIN_FARE, x.bare_fare)),
)


In [183]:
# Latest drop-off timestamp present in the merged trips.
trips["dropped_off_at"].max()

Timestamp('2016-06-27 00:50:50')

In [182]:
# List the columns available on the trip-level frame.
trips.columns

Index(['driver_id', 'ride_id', 'ride_distance', 'ride_duration',
       'ride_prime_time', 'accepted_at', 'arrived_at', 'dropped_off_at',
       'picked_up_at', 'requested_at', 'request_to_accept',
       'pickup_to_dropoff', 'driver_onboard_date', 'bare_fare', 'final_fare'],
      dtype='object')

In [222]:
# Per-driver feature table built from the trip-level frame.
#
# Features computed below:
#   - tenure:           days from onboarding to OBSERVATION_END
#   - active_days:      distinct calendar days with at least one accepted ride
#   - last_active_day:  timestamp of the most recent accepted ride
#   - total_*:          trips, distance, duration, fare, prime time,
#                       and number of trips with prime time > 0
#   - lifetime:         days from onboarding to the last accepted ride
#   - pct_active:       active_days / tenure
#   - *_pday:           totals divided by active_days
#   - distance_ptrip:   total_distance / total_trips
#   - fare_ptenure:     total_fare / tenure
#
# TODO: average speed and dominant working hours (morning / afternoon /
# evening / night) were planned but are not computed here yet.

# Reference date for tenure (end of the observation window).
# NOTE(review): the latest drop-off displayed earlier in this notebook is
# 2016-06-27, so this date is a few days past the last observed ride --
# confirm that 2016-06-30 is the intended cut-off.
OBSERVATION_END = pd.Timestamp("2016-06-30")

driveragg = (
    trips.assign(
        # normalize() truncates to midnight and keeps the datetime64 dtype;
        # the distinct-day count is the same as with Python date objects.
        active_date=lambda x: x.accepted_at.dt.normalize(),
        driver_onboard_date=lambda x: pd.to_datetime(x.driver_onboard_date),
        tenure=lambda x: (OBSERVATION_END - x.driver_onboard_date).dt.days,
        prime_trip=lambda x: np.where(x.ride_prime_time > 0, 1, 0),
    )
    # tenure is derived from the driver's onboard date, so adding it to the
    # key simply carries it through as a column.
    .groupby(["driver_id", "tenure"], as_index=False)
    .agg(
        driver_onboard_date=("driver_onboard_date", "min"),
        active_days=("active_date", "nunique"),
        last_active_day=("accepted_at", "max"),
        total_trips=("ride_id", "nunique"),
        total_distance=("ride_distance", "sum"),
        total_duration=("ride_duration", "sum"),
        total_fare=("final_fare", "sum"),
        total_prime=("ride_prime_time", "sum"),
        total_prime_trip=("prime_trip", "sum"),
    )
    .assign(
        lifetime=lambda x: (x.last_active_day - x.driver_onboard_date).dt.days,
        pct_active=lambda x: x.active_days / x.tenure,
        trips_pday=lambda x: x.total_trips / x.active_days,
        distance_pday=lambda x: x.total_distance / x.active_days,
        duration_pday=lambda x: x.total_duration / x.active_days,
        distance_ptrip=lambda x: x.total_distance / x.total_trips,
        prime_pday=lambda x: x.total_prime / x.active_days,
        fare_ptenure=lambda x: x.total_fare / x.tenure,
        primetrip_pday=lambda x: x.total_prime_trip / x.active_days,
    )
)

In [223]:
# Display the per-driver aggregate table (pandas truncates the middle rows).
driveragg

Unnamed: 0,driver_id,tenure,driver_onboard_date,active_days,last_active_day,total_trips,total_distance,total_duration,total_fare,total_prime,total_prime_trip,lifetime,pct_active,trips_pday,distance_pday,duration_pday,distance_ptrip,prime_pday,fare_ptenure,primetrip_pday
0,002be0ffdc997bd5c50703158b7c2491,93,2016-03-29,56,2016-06-23 10:06:30,277,1740287,221238,3560.926071,5375,110,86,0.602151,4.946429,31076.553571,3950.678571,6282.624549,95.982143,38.289528,1.964286
1,007f0389f9c7b03ef97098422f902e62,93,2016-03-29,12,2016-06-22 13:17:44,31,117531,20497,321.494948,625,12,85,0.129032,2.583333,9794.250000,1708.083333,3791.322581,52.083333,3.456935,1.000000
2,011e5c5dfc5c2c92501b8b24d47509bc,86,2016-04-05,12,2016-06-12 20:22:27,34,269653,29205,482.428390,675,16,68,0.139535,2.833333,22471.083333,2433.750000,7930.970588,56.250000,5.609632,1.333333
3,0152a2f305e71d26cc964f8d4411add9,68,2016-04-23,40,2016-06-26 10:16:39,191,1471239,174521,2610.871561,2050,48,64,0.588235,4.775000,36780.975000,4363.025000,7702.821990,51.250000,38.395170,1.200000
4,01674381af7edd264113d4e6ed55ecda,62,2016-04-29,40,2016-06-24 13:03:42,375,3123644,357443,5381.097539,4700,99,56,0.645161,9.375000,78091.100000,8936.075000,8329.717333,117.500000,86.791896,2.475000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,ff419a3476e21e269e340b5f1f05414e,65,2016-04-26,36,2016-06-26 17:58:22,242,2002400,246308,3538.377658,3100,70,61,0.553846,6.722222,55622.222222,6841.888889,8274.380165,86.111111,54.436579,1.944444
833,ff714a67ba8c6a108261cd81e3b77f3a,94,2016-03-28,61,2016-06-17 09:19:04,485,2834765,442492,6287.368597,8625,182,81,0.648936,7.950820,46471.557377,7253.967213,5844.876289,141.393443,66.886900,2.983607
834,fff482c704d36a1afe8b8978d5486283,83,2016-04-08,16,2016-05-12 16:08:46,35,174394,27693,380.661435,175,4,34,0.192771,2.187500,10899.625000,1730.812500,4982.685714,10.937500,4.586282,0.250000
835,fffecccc49436c5389075b13209f0dfa,55,2016-05-06,47,2016-06-26 20:55:49,406,2924913,353974,6052.149594,12050,215,51,0.854545,8.638298,62232.191489,7531.361702,7204.219212,256.382979,110.039084,4.574468
