In [3]:
# Lyft Data Challenge
# Team Name: ikun
# Team Member: Zhen Jiang & Yihua Xu

import pandas as pd
from pandas.plotting import radviz
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm, preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.linear_model import Perceptron

In [4]:
# read the file

def read(file):
    return pd.read_csv(file)

driver_id = read('driver_ids.csv')
ride_id = read('ride_ids.csv')
ride_time = read('ride_timestamps.csv')
ride_time = ride_time[ride_time.event != 'requested_at']
ride_time = ride_time[ride_time.event != 'accepted_at']
ride_time = ride_time[ride_time.event != 'picked_up_at']
ride_time = ride_time[ride_time.event != 'arrived_at']

In [5]:
# convert the file to numpy array

driver_id_np = (driver_id).to_numpy()
ride_id_np = (ride_id).to_numpy()
ride_time_np = (ride_time).to_numpy()

In [6]:
# convert the file to list

driver_id_li = (driver_id_np).tolist()
ride_id_li = (ride_id_np).tolist()
ride_time_li = (ride_time_np).tolist()

In [7]:
# calculate the fare for each ride

for i in range(len(ride_id_li)):
    fare = (ride_id_li[i][2]*1.15*0.000621371 + ride_id_li[i][3]*0.22/60 + 2) * ((100+ride_id_li[i][4])/100) + 1.75
    if fare < 5:
        fare = 5
    if fare >400:
        fare = 400
    ride_id_li[i].append(fare)

In [8]:
print("The fare for this ride is $", ride_id_li[0][5])

The fare for this ride is $ 8.489647469725


In [9]:
# use a dictionary to contain the fare for every ride that a driver takes

driver_faredict = {}
for i in ride_id_li:
    if i[0] not in driver_faredict:
        driver_faredict[i[0]] = [i[5]]
    else:
        driver_faredict[i[0]].append(i[5])
        
print("The fare that a driver got from every ride:\n", driver_faredict['052bba06c5fc0bdea4bc2f9cb92b37c7'])

The fare that a driver got from every ride:
 [6.700114503433333, 14.164552560133334, 11.986016797200001, 8.458346249625, 6.7957250118, 11.112514430216667, 13.425959146166665, 10.387466316000001, 9.183669310133332, 5.80547858925, 14.793924386574998, 16.60572080527083, 79.21695783837501, 5.37209658295, 29.26577196708333, 24.03772408118333, 5, 23.947333165699998, 6.430094137933333, 6.7295773108666666, 15.26709700827083, 7.245827043533334, 13.5179694204375, 10.713279103683332, 5.575009774283333, 7.208218367483333, 8.95951549955, 6.06995168155, 5.916577412983333, 13.146081789541665, 7.023437654016667, 19.40753091528333, 24.403870965183334, 13.142189291249998, 7.01413904415, 7.406763604133333, 9.1067805574, 16.738936807216668, 7.610252021616667, 14.751586773016665, 19.795075953483334, 15.808288824016666, 9.858678273266666, 6.492747924433333, 8.082489179020833, 6.936259544520833, 7.57439552325, 10.460616257716666, 10.02738689055, 6.2671313184583335, 22.807747047766668, 26.362538088483333, 17.

In [10]:
# make a dictionary with the key as driver id and the value as ride id

driver_rideiddict = {}
for i in ride_id_li:
    if i[0] not in driver_rideiddict:
        driver_rideiddict[i[0]] = [i[1]]
    else:
        driver_rideiddict[i[0]].append(i[1])
        
print("This driver has completed these rides(ID):\n",driver_rideiddict['007f0389f9c7b03ef97098422f902e62'])

This driver has completed these rides(ID):
 ['01f133164433ea7682545a41643e6949', '07f9b5246c8431e3e5bac56d9f48b4f9', '13ac4cc1bf311c6944d0e6cd4c93ebb1', '274033cb9ee08c4499d4f16ce9db9a82', '2ca7357aa39d812da3055ae3d8ffa3e5', '2efa365088953fc2427eca4685082353', '319454bab6c79927906ffa747d196093', '51fbbf26de390105a9b5074e3c51397f', '5a6861ae3b9498188027a02c587348a2', '6bae1803f94384d4e763f64bf6124f5e', '833bfa5564b71fcdb8130cc82319cd73', '942d0071fcb8c16f47182f3fe7aa02ea', '9acc3e2e4e4b6dafae49dd866f32dd94', 'acafdb36eff085643765d58b3fd209cd', 'ad54b7bb09f71f8425abb9350056983e', 'b00676fa85988030cb4ecefb7a721e81', 'b6b8293e442483822046dbd73fe4cf34', 'c39f3aefd5fb1d6edaf94bd97ddaf51d', 'ca9b900ed4ebdbbe72f1cfaf21fcd497', 'cee97d3a86ab81ff1ee41e7426a082cc', 'cfd4a9c275c812148d0ecaffb17fa854', 'd04a2fab8f224ee824f66ece82d729c0', 'd31d7c47c230715656f7842f2ef4641a', 'd860426f111362c7d22bff032062bd40', 'db46f1b031e64c66f1559c36d47caa6d', 'e12e0da37fc9cdf44b5d8df10b979149', 'e6c6d6c487f0e797e2

In [11]:
# we reverse the dictionary (use ride id as key and driver id as value)

rideid_driverdict = {}
for i in driver_rideiddict.keys():
    for j in driver_rideiddict[i]:
        rideid_driverdict[j] = [i]
        
print("This ride belongs to this driver",rideid_driverdict['01f133164433ea7682545a41643e6949'])

This ride belongs to this driver ['007f0389f9c7b03ef97098422f902e62']


In [12]:
# make a dictionary using ride id as key and a list of driver id and drop off time as value

rideid_driver_timedict = {}
for i in ride_time_li:
    if i[0] in rideid_driverdict:
        rideid_driverdict[i[0]].append(i[2])
        
for i in rideid_driverdict.keys():
    if len(rideid_driverdict[i]) == 2:
        rideid_driver_timedict[i] = rideid_driverdict[i]
        
print("This ride belongs to this driver, and its drop-off time is attached.", rideid_driver_timedict['07f9b5246c8431e3e5bac56d9f48b4f9'])

#To check every value of the dictionary contains exactly two elements
signal = 0
for i in rideid_driver_timedict.values():
    if len(i) != 2:
        siganl += 1
        
print("Number of elements in the value of the dictionary not equal to 2:", signal)

This ride belongs to this driver, and its drop-off time is attached. ['007f0389f9c7b03ef97098422f902e62', '2016-04-08 20:45:16']
Number of elements in the value of the dictionary not equal to 2: 0


In [13]:
# we create a new dictionary with driver id as key and a list of drop-off time for every ride as value

driver_alltripsdict = {}
for i in rideid_driver_timedict.keys():
    if rideid_driver_timedict[i][0] not in driver_alltripsdict:
        driver_alltripsdict[rideid_driver_timedict[i][0]] = [rideid_driver_timedict[i][1]]
    elif rideid_driver_timedict[i][0] in driver_alltripsdict:
        driver_alltripsdict[rideid_driver_timedict[i][0]].append(rideid_driver_timedict[i][1])
        
print("A driver has completed the rides are on these dates:\n", driver_alltripsdict['007f0389f9c7b03ef97098422f902e62'])

A driver has completed the rides are on these dates:
 ['2016-05-18 21:27:51', '2016-04-08 20:45:16', '2016-05-18 21:48:00', '2016-04-27 21:47:05', '2016-06-14 21:11:59', '2016-03-29 22:48:33', '2016-06-14 21:27:19', '2016-05-06 21:42:55', '2016-04-24 16:58:58', '2016-06-05 17:06:34', '2016-06-05 14:32:07', '2016-05-19 16:18:45', '2016-04-27 21:11:13', '2016-05-06 21:00:19', '2016-06-22 13:16:46', '2016-04-29 20:18:50', '2016-04-27 19:41:07', '2016-05-18 22:07:25', '2016-05-06 21:17:11', '2016-05-18 21:40:01', '2016-06-05 17:28:09', '2016-06-02 16:58:42', '2016-06-22 13:28:38', '2016-04-24 18:51:22', '2016-04-27 20:27:05', '2016-04-24 17:57:44', '2016-06-05 13:47:18', '2016-05-18 20:39:43', '2016-04-29 21:01:30', '2016-04-24 15:53:38', '2016-04-24 18:12:53']


In [14]:
# we modify the current dictionary to just have a list of lists that contain the month and day

for i in driver_alltripsdict.keys():
    for j in range(len(driver_alltripsdict[i])):
        driver_alltripsdict[i][j] = driver_alltripsdict[i][j].split()[0].split("-")[1:]
        
print("A driver has completed the rides on these dates:\n", driver_alltripsdict['007f0389f9c7b03ef97098422f902e62'])

A driver has completed the rides on these dates:
 [['05', '18'], ['04', '08'], ['05', '18'], ['04', '27'], ['06', '14'], ['03', '29'], ['06', '14'], ['05', '06'], ['04', '24'], ['06', '05'], ['06', '05'], ['05', '19'], ['04', '27'], ['05', '06'], ['06', '22'], ['04', '29'], ['04', '27'], ['05', '18'], ['05', '06'], ['05', '18'], ['06', '05'], ['06', '02'], ['06', '22'], ['04', '24'], ['04', '27'], ['04', '24'], ['06', '05'], ['05', '18'], ['04', '29'], ['04', '24'], ['04', '24']]


In [15]:
# modify the value of current dictionary from string type to integer type

for i in driver_alltripsdict.keys():
    for j in range(len(driver_alltripsdict[i])):
        driver_alltripsdict[i][j] = [int(driver_alltripsdict[i][j][0]),int(driver_alltripsdict[i][j][1])]
        
print("A driver has completed the rides on these dates:\n", driver_alltripsdict['052bba06c5fc0bdea4bc2f9cb92b37c7'])

A driver has completed the rides on these dates:
 [[5, 9], [4, 23], [4, 23], [4, 15], [4, 16], [5, 26], [4, 21], [4, 23], [5, 8], [4, 21], [4, 22], [5, 7], [5, 11], [4, 16], [4, 18], [5, 8], [5, 8], [5, 9], [5, 11], [6, 20], [5, 9], [4, 18], [4, 18], [4, 21], [5, 7], [4, 23], [4, 16], [4, 15], [4, 16], [4, 18], [4, 16], [4, 16], [5, 26], [4, 18], [6, 20], [4, 16], [5, 11], [4, 16], [4, 22], [4, 16], [4, 15], [6, 20], [5, 11], [5, 7], [4, 23], [4, 16], [5, 11], [4, 18], [4, 16], [5, 11], [4, 23], [5, 11], [5, 9], [4, 15], [4, 15], [4, 18]]


In [16]:
'''Get last boarding date'''

for i in driver_alltripsdict.keys():
    month = 1
    day = 1
    for j in driver_alltripsdict[i]:
        if j[0] > month:
            month = j[0]
            day = j[1]
        elif j[0] == month:
            if j[1] > day:
                day = j[1]
    driver_alltripsdict[i] = [[month, day]]
    
print("A driver has compelted the last ride on this date",driver_alltripsdict['052bba06c5fc0bdea4bc2f9cb92b37c7'])  

A driver has compelted the last ride on this date [[6, 20]]


In [17]:
# Add the onboard date

for i in driver_alltripsdict.keys():
    for k in driver_id_li:
        if i == k[0]:
            driver_alltripsdict[i].append(k[1].split()[0].split("-")[1:])
            
print("The driver has his end date and his onboard date respectively on", driver_alltripsdict['052bba06c5fc0bdea4bc2f9cb92b37c7']) 

The driver has his end date and his onboard date respectively on [[6, 20], ['04', '15']]


In [18]:
'''convert to int'''

for i in driver_alltripsdict.keys():
    for j in range(len(driver_alltripsdict[i])):
        driver_alltripsdict[i][j] = [int(driver_alltripsdict[i][j][0]),int(driver_alltripsdict[i][j][1])]
print("The driver has his end date and his onboard date respectively on", driver_alltripsdict['052bba06c5fc0bdea4bc2f9cb92b37c7']) 

driver_on_offdict = {}
for i in driver_alltripsdict.keys():
    if len(driver_alltripsdict[i]) == 2:
        driver_on_offdict[i] = driver_alltripsdict[i]
        
# Check if the value of the dictionary contains exactly 2 elements
signal = 0
for i in driver_on_offdict.keys():
    if len(driver_alltripsdict[i]) != 2:
        signal += 1
print("Number of elements in the value of the dictionary not equal to 2:", signal)

The driver has his end date and his onboard date respectively on [[6, 20], [4, 15]]
Number of elements in the value of the dictionary not equal to 2: 0


In [19]:
'''calculate number of days between the onboard date and the end date for each driver'''

for i in driver_on_offdict.keys():
    lastMonth = driver_on_offdict[i][0][0]
    lastDay = driver_on_offdict[i][0][1]
    startMonth = driver_on_offdict[i][1][0]
    startDay = driver_on_offdict[i][1][1]
    
    startDays = (startMonth - 1) * 30 + startDay
    lastDays = (lastMonth - 1) * 30 + lastDay
    
    difference = lastDays - startDays
    driver_on_offdict[i] = difference
    
print("The difference of the start date and the end date for this driver is", driver_on_offdict['fff482c704d36a1afe8b8978d5486283']) 

The difference of the start date and the end date for this driver is 34


In [20]:
'''Average Lifetime Value'''

farelist = []
farelistnew = []
for i in driver_on_offdict.keys():
    for k in driver_faredict.keys():
        if i == k:
            farelist.append(driver_faredict[k])
            
for i in farelist:
    sum3 = 0
    for k in i:
        sum3 += k
    farelistnew.append(sum3)
    
sum4 = 0
for i in farelistnew:
    sum4 +=  i
newaverage = sum4/len(farelistnew)
print("Average Lifetime Value:", "$", round(newaverage, 3))

Average Lifetime Value: $ 2979.856


In [21]:
'''Average days spent in Lyft'''

averageDay = 0
for i in driver_on_offdict.keys():
    averageDay += driver_on_offdict[i]
averageDay /= len(driver_on_offdict)
print("Average days spent in Lyft:", round(averageDay, 3))

Average days spent in Lyft: 54.184


In [22]:
'''feature1: Driver ID'''
# Create numpy array to contain Driver ID

Driver_ID_LIST = []
for i in driver_on_offdict.keys():
    Driver_ID_LIST.append(i)
driver_id_nparray = np.asarray(Driver_ID_LIST).reshape(-1,1)

In [23]:
'''feature2: Days in Lyft'''
#Create a numpy array to contain Days in Lyft

Driver_day_list = []
for i in Driver_ID_LIST:
    if i in driver_on_offdict:
        Driver_day_list.append(driver_on_offdict[i])
driver_days_nparray = np.asarray(Driver_day_list).reshape(-1,1)

In [24]:
'''Target: Life Time Value'''
# Create a numpy array to contain Lifetime Value

LTV_dict = {}
Driver_LTV_list = []
for i in driver_on_offdict.keys():
    for j in driver_faredict.keys():
        if (i == j):
            LTV_dict[j] = driver_faredict[j]
            
for i in LTV_dict.keys():
    total = 0
    for j in LTV_dict[i]:
        total += j
    LTV_dict[i] = total

for i in Driver_ID_LIST:
    if i in LTV_dict:
        Driver_LTV_list.append(LTV_dict[i])
        
driver_LTV_nparrays = np.asarray(Driver_LTV_list).reshape(-1,1)

In [25]:
'''feature3: Primetime Percentage for each driver'''

Driver_total_primetime = {}
for i in range(len(Driver_ID_LIST)):
    for k in range(len(ride_id_li)):
        if Driver_ID_LIST[i] == ride_id_li[k][0]:
            Driver_total_primetime[Driver_ID_LIST[i]] = []
            
for i in range(len(Driver_ID_LIST)):
    for k in range(len(ride_id_li)):
        if Driver_ID_LIST[i] == ride_id_li[k][0]:
            Driver_total_primetime[Driver_ID_LIST[i]].append(ride_id_li[k][4]) 

In [26]:
# the dictionary contains different levels of prime time for each ride for every driver

print("A driver has completed rides during different prime time:\n", Driver_total_primetime['007f0389f9c7b03ef97098422f902e62']) 

A driver has completed rides during different prime time:
 [25, 0, 50, 0, 0, 75, 25, 0, 50, 75, 0, 0, 50, 0, 0, 0, 0, 50, 0, 25, 75, 0, 0, 0, 0, 0, 50, 0, 0, 0, 75]


In [27]:
# Modify the current dictionary: add different levels of prime time together
# and divde by the number of rides to get an average evaluation of prime time for each driver

for i in Driver_total_primetime.keys():
    sum1 = 0
    for k in Driver_total_primetime[i]:
        sum1 += k
    num = len(Driver_total_primetime[i])
    Driver_total_primetime[i] = sum1/num
print("The average prime time estimation for a driver is ", Driver_total_primetime['007f0389f9c7b03ef97098422f902e62']) 

The average prime time estimation for a driver is  20.161290322580644


In [28]:
# Create a numpy array that contains prime time for every driver

Driver_primetime_list = []
for i in Driver_total_primetime:
    Driver_primetime_list.append(Driver_total_primetime[i])
Driver_primetime_nparrays = np.asarray(Driver_primetime_list).reshape(-1,1)
print("The shape of Driver_primetime_nparrays", Driver_primetime_nparrays.shape)

The shape of Driver_primetime_nparrays (837, 1)


In [29]:
'''feature4: total ride distance'''
# Create a numpy array that contains total ride distance

driver_dist_dict = {}
for i in ride_id_li:
    if i[0] not in driver_dist_dict:
        driver_dist_dict[i[0]] = [i[2]]
    else:
        driver_dist_dict[i[0]].append(i[2])


for i in driver_dist_dict.keys():
    total_dist = 0
    num_rides = 0
    for j in driver_dist_dict[i]:
        total_dist += j
        num_rides += 1
    driver_dist_dict[i] = [total_dist, num_rides]
print("The total ride distance and the number of rides for a driver are", driver_dist_dict["002be0ffdc997bd5c50703158b7c2491"])

total_dist_list = []
for i in Driver_ID_LIST:
    if i in driver_dist_dict:
        total_dist_list.append(driver_dist_dict[i][0])

total_dist_nparray = np.asarray(total_dist_list).reshape(-1,1)

The total ride distance and the number of rides for a driver are [1740287, 277]


In [30]:
'''feature5: total number of rides'''
# Create a numpy array to contain number of rides

total_numrides_list = []
for i in Driver_ID_LIST:
    if i in driver_dist_dict:
        total_numrides_list.append(driver_dist_dict[i][1])

total_numrides_nparray = np.asarray(total_numrides_list).reshape(-1,1)

In [31]:
'''feature6: total duration'''
# Create a numpy array to store the total duration of the rides for each driver

driver_time_dict = {}
for i in ride_id_li:
    if i[0] not in driver_time_dict:
        driver_time_dict[i[0]] = [i[3]]
    else:
        driver_time_dict[i[0]].append(i[3])



for i in driver_time_dict.keys():
    total_time = 0
    for j in driver_time_dict[i]:
        total_time += j
    
    driver_time_dict[i] = total_time / 60 
print("The total duration of the rides for this driver is", driver_time_dict["002be0ffdc997bd5c50703158b7c2491"])

total_time_list = []
for i in Driver_ID_LIST:
    if i in driver_time_dict:
        total_time_list.append(driver_time_dict[i])

total_time_nparray = np.asarray(total_time_list).reshape(-1,1)

The total duration of the rides for this driver is 3687.3


In [32]:
"""feature7: month (categorical) ---> march"""
# Create a numpy array for march

driver_onboard_dict = {}
for i in driver_id_li:
    driver_onboard_dict[i[0]] = int(i[1].split()[0].split("-")[1])

print("The month that the driver is onboard is",driver_onboard_dict["11506b81721ca68ef019764de3d8edbd"])

march_list = []
for i in Driver_ID_LIST:
    if i in driver_onboard_dict:
        if driver_onboard_dict[i] == 3:
            march_list.append(1)
        else:
            march_list.append(0)

march_nparray = np.asarray(march_list).reshape(-1,1)

"""feature7: month (categorical) ---> april"""
# Create a numpy array for april

april_list = []
for i in Driver_ID_LIST:
    if i in driver_onboard_dict:
        if driver_onboard_dict[i] == 4:
            april_list.append(1)
        else:
            april_list.append(0)

april_nparray = np.asarray(april_list).reshape(-1,1)


"""feature7: month (categorical) ---> may"""
# Create a numpy array for may

may_list = []
for i in Driver_ID_LIST:
    if i in driver_onboard_dict:
        if driver_onboard_dict[i] == 5:
            may_list.append(1)
        else:
            may_list.append(0)

may_nparray = np.asarray(may_list).reshape(-1,1)


The month that the driver is onboard is 4


In [33]:
'''feature8: accepted to pickup interval time'''
# Create a numpy array that contains the time inteveral between accpeted time to pickup time

# Get rid of requested time, drop-off time, and arrive time
ride_timenew = read('ride_timestamps.csv')
ride_timenew = ride_timenew[ride_timenew.event != 'requested_at']
ride_timenew = ride_timenew[ride_timenew.event != 'dropped_off_at']
ride_timenew = ride_timenew[ride_timenew.event != 'arrived_at']
ride_time_npnew = (ride_timenew).to_numpy()
ride_time_linew = (ride_time_npnew).tolist()
accept_pick_intervaldict = {}

for k in ride_time_linew:
    if k[0] in rideid_driverdict:
        if k[0] not in accept_pick_intervaldict:
            accept_pick_intervaldict[k[0]] = [k[2].split()[1]]
        else:
            accept_pick_intervaldict[k[0]].append(k[2].split()[1])
print("The accepted time and the pickup time for this ride is", accept_pick_intervaldict["00006efeb0d5e3ccad7d921ddeee9900"])

for i in accept_pick_intervaldict.keys():
    accept_time = accept_pick_intervaldict[i][0].split(":")
    pick_time = accept_pick_intervaldict[i][1].split(":")
    
    accept_hour = int(accept_time[0])
    accept_minute = int(accept_time[1])
    accept_second = int(accept_time[2])
    
    pick_hour = int(pick_time[0])
    pick_minute = int(pick_time[1])
    pick_second = int(pick_time[2])
    
    if pick_hour == 0 and accept_hour == 23:
        pick_hour = 24
    time_difference = (pick_hour - accept_hour) * 60 + (pick_minute - accept_minute) + (pick_second - accept_second)/60
    time_difference = round(time_difference, 2)
    accept_pick_intervaldict[i] = [time_difference]
    
for i in rideid_driverdict.keys():
    if i in accept_pick_intervaldict:
        accept_pick_intervaldict[i].append(rideid_driverdict[i][0])
        
print("The time interval between accepted time and pickup time and the driver id is shown to be:\n",accept_pick_intervaldict["00006efeb0d5e3ccad7d921ddeee9900"])
print()

driver_interval_dict = {}
for i in accept_pick_intervaldict.keys():
    if accept_pick_intervaldict[i][1] not in driver_interval_dict:
        driver_interval_dict[accept_pick_intervaldict[i][1]] = [accept_pick_intervaldict[i][0]]
    else:
        driver_interval_dict[accept_pick_intervaldict[i][1]].append(accept_pick_intervaldict[i][0])


for i in driver_interval_dict:
    total = 0
    num = 0
    for j in driver_interval_dict[i]:
        total += j
        num += 1
    driver_interval_dict[i] = total / num
print("The averge interval between accepted time and pick-up time for a driver is", driver_interval_dict["039da9c077e17af98ca8530e4d7975f1"])

accept_pick_list = []
for i in Driver_ID_LIST:
    if i in driver_interval_dict:
        accept_pick_list.append(driver_interval_dict[i])

accept_pick_nparray = np.asarray(accept_pick_list).reshape(-1,1)



The accepted time and the pickup time for this ride is ['19:29:43', '19:35:15']
The time interval between accepted time and pickup time and the driver id is shown to be:
 [5.53, '689bdf87fb2de49f98bf4946cfaa5068']

The averge interval between accepted time and pick-up time for a driver is 2.520227272727271


In [34]:
"""Create a featurelist putting all feature components together"""
# We have used backward elimination to get rid of the features that doesn't contribute a lot
# to our target value: Lifetime Value. 
# The features that we comment are the ones that we removed

# We also have standardized the dataset

#featurelist = np.append(driver_id_nparray,driver_days_nparray,axis = 1)
featurelist = driver_days_nparray
#featurelist = Driver_primetime_nparrays
featurelist = np.append(featurelist, Driver_primetime_nparrays,axis = 1)
featurelist = np.append(featurelist,total_dist_nparray ,axis = 1)
featurelist = np.append(featurelist, total_numrides_nparray,axis = 1)
featurelist = np.append(featurelist, total_time_nparray,axis = 1)
#featurelist = np.append(featurelist,march_nparray ,axis = 1)
#featurelist = np.append(featurelist, april_nparray,axis = 1)
#featurelist = np.append(featurelist,may_nparray ,axis = 1)
#featurelist = np.append(featurelist,accept_pick_nparray,axis = 1)
featurelist = np.array(featurelist)
#feature scaling
featurelist = StandardScaler().fit_transform(featurelist)

print("The first row of the featurelist is", featurelist[0])

The first row of the featurelist is [1.39560151 0.46925237 0.16818195 0.31962893 0.20787298]


In [35]:
"""Reshape our target value: Lifetime Value"""

LFTlist = np.array(driver_LTV_nparrays).reshape(len(driver_LTV_nparrays))


In [36]:
"""split our data into training data and testing data"""
"""We use our training data to make a model, and utilize our testing data to calculate the MAPE"""
"""which stands for mean absolute percent error."""

train_features, test_features, train_labels, test_labels = train_test_split(featurelist, LFTlist, test_size = 0.25, random_state = 42)

In [37]:
"""Make sure the shapes of our training data and testing data are consistent"""

print('Training Features Shape: ', train_features.shape)
print('Training Labels Shape: ', train_labels.shape)
print('Testing Features Shape: ', test_features.shape)
print('Testing Labels Shape: ', test_labels.shape)

Training Features Shape:  (627, 5)
Training Labels Shape:  (627,)
Testing Features Shape:  (210, 5)
Testing Labels Shape:  (210,)


In [38]:
"""method 1: random forest"""

rf = RandomForestRegressor(n_estimators = 1000, random_state = 187)
rf.fit(train_features, train_labels)
#Make Predictions
#Use the forest's predict method on the test data 
predictions = rf.predict(test_features)

#Calculate the absolute errors
errors = abs(predictions - test_labels)

#print out the mean absolute error
print('Mean Absolute Error:' "$", round(np.mean(errors), 3), '.')
# Calculate mean absolute percentage error
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print("Mean Absolute Percent Error (MAPE):", np.mean(mape), '%.')
print('Accuracy:', round(accuracy, 3), '%.')

Mean Absolute Error:$ 103.339 .
Mean Absolute Percent Error (MAPE): 3.4497559611463604 %.
Accuracy: 96.55 %.


In [39]:
"""Method 2: Linear Regression"""

clf = svm.SVR(kernel = "linear")
clf.fit(train_features,train_labels)
print("The coefficient of determination of R2 of the prediction is",clf.score(train_features, train_labels))

The coefficient of determination of R2 of the prediction is 0.885624390293917


In [40]:
# Calulate MAPE and Accuracy for Linear Regression

errors = 0
pmax_pred = np.empty(test_labels.shape)
i = 0
print("Showcase the predicted value and the actual value")
print()
for X,y in zip(test_features, test_labels):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")
    errors += abs(clf.predict([X])[0] - y)
    pmax_pred[i] = clf.predict([X])[0]
    i += 1
errors = errors / test_features.shape[0]

print()
print('Mean Absolute error:', '$',round(errors, 2) , '.')

# Calculate mean absolute percentage error
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)

print("Mean Absolute Percent Error (MAPE)", np.mean(mape), '%.')
print('Accuracy:', round(accuracy, 2), '%.')


Showcase the predicted value and the actual value

Model: 3331.512936742913, Actual: 4226.796682176512
Model: 3631.794064244631, Actual: 4046.4623292206497
Model: 1236.8175579465187, Actual: 812.2827949136291
Model: 1116.5589954542882, Actual: 956.2473532765456
Model: 3089.309358516354, Actual: 3714.3262096078697
Model: 3957.159380930348, Actual: 4670.535149216064
Model: 6127.595416649936, Actual: 8115.041149204423
Model: 1159.6833681073026, Actual: 902.3384603115002
Model: 4616.082992736307, Actual: 6122.861609837136
Model: 1438.3636736943631, Actual: 456.4475758069375
Model: 3084.5781470342636, Actual: 2899.4532139655753
Model: 2802.7414156874293, Actual: 3017.254235402269
Model: 6569.30610923896, Actual: 8899.713414317466
Model: 3707.8400369333867, Actual: 4406.258797995462
Model: 3461.2903362721163, Actual: 3495.996852692181
Model: 5518.98134148965, Actual: 6828.055628634553
Model: 2501.295119647762, Actual: 2369.630901937676
Model: 3171.2382493483674, Actual: 3585.3974141864664
Mo

In [41]:
"""Method 3 Polynomial Regression"""
clf = svm.SVR(kernel = "poly", degree = 6)
clf.fit(train_features,train_labels)
print(clf.score(train_features, train_labels))

0.2104549143564043




In [42]:
# Calculate Accuracy
errors = 0
ltv_pred = np.empty(test_labels.shape)
i = 0
for X,y in zip(test_features, test_labels):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")
    errors += abs(clf.predict([X])[0] - y)
    ltv_pred[i] = clf.predict([X])[0]
    i += 1
errors = errors / test_features.shape[0]

print('Mean Absolute error:',round(errors, 2) , 'newton.')

# Calculate mean absolute percentage error
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.abs(np.mean(mape))
print(ltv_pred)
print('Accuracy:', round(accuracy, 2), '%.')

Model: 2934.6839634111084, Actual: 4226.796682176512
Model: 2937.644469325679, Actual: 4046.4623292206497
Model: 2927.7699956461674, Actual: 812.2827949136291
Model: 2871.3068203075522, Actual: 956.2473532765456
Model: 2938.09832583753, Actual: 3714.3262096078697
Model: 2941.354215871543, Actual: 4670.535149216064
Model: 3213.9464025542234, Actual: 8115.041149204423
Model: 2855.811809295504, Actual: 902.3384603115002
Model: 2792.8708183525314, Actual: 6122.861609837136
Model: -1520.0228302994406, Actual: 456.4475758069375
Model: 2932.6643646259417, Actual: 2899.4532139655753
Model: 2938.065912803372, Actual: 3017.254235402269
Model: 4453.249327566109, Actual: 8899.713414317466
Model: 2938.255650106363, Actual: 4406.258797995462
Model: 2927.01767400334, Actual: 3495.996852692181
Model: 2046.137115364353, Actual: 6828.055628634553
Model: 2938.0684955988045, Actual: 2369.630901937676
Model: 2938.03403027576, Actual: 3585.3974141864664
Model: 2419.0711410051763, Actual: 571.6063733175624
M

Model: 2935.750894156338, Actual: 4142.390443420497
Model: 2938.0681437111707, Actual: 2821.27594437795
Model: 2882.2298320548794, Actual: 494.0673651053917
Model: 2938.072351341207, Actual: 2865.8627055872166
Model: 2941.655182295322, Actual: 321.9376428902958
Model: 3071.58257015486, Actual: 6867.29315821445
Model: 2940.6340930320002, Actual: 4923.447127550085
Model: 2938.0890516839963, Actual: 3316.2985467757644
Model: 2600.297960209139, Actual: 442.72019661700415
Model: 2540.8164313873253, Actual: 441.2077354612625
Model: 2908.4374931614566, Actual: 731.7890402799
Model: 2940.9863086860005, Actual: 1169.2710991686627
Model: 2941.765888102954, Actual: 747.7213759816627
Model: 2937.9371061841894, Actual: 3095.4264090243505
Model: 4139.545857051026, Actual: 9071.052005948008
Model: 2941.5657932135173, Actual: 5012.540537097024
Model: 2916.5839100721932, Actual: 526.0308848297709
Mean Absolute error: 1879.16 newton.
[ 2934.68396341  2937.64446933  2927.76999565  2871.30682031
  2938.09

In [43]:
"""Method 4: Perceptron"""

threshold = 2979.856 
#2979.856 average number for lifetime value
for i in range(test_labels.shape[0]):
    if test_labels[i] > threshold:
        test_labels[i] = 1
    else:
        test_labels[i] = -1
        
# We classify the value above the shrehold as 1, the value below the shrehold as -1
print("The new test_labels data is:\n", test_labels)

for i in range(train_labels.shape[0]):
    if train_labels[i] > threshold:
        train_labels[i] = 1
    else:
        train_labels[i] = -1


The new test_labels data is:
 [ 1.  1. -1. -1.  1.  1.  1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1.  1.
 -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1.  1. -1.  1.  1.  1.  1. -1.
 -1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1. -1.  1.  1.  1.  1.
  1. -1. -1.  1. -1.  1. -1. -1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1.
 -1. -1. -1. -1.  1. -1.  1. -1.  1. -1.  1.  1.  1. -1.  1. -1.  1.  1.
  1.  1.  1. -1.  1. -1.  1. -1. -1. -1.  1.  1.  1.  1.  1. -1. -1.  1.
  1. -1.  1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1. -1.  1. -1.
 -1. -1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1.  1.  1. -1. -1. -1.
 -1. -1.  1. -1.  1.  1.  1.  1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1.
  1. -1.  1. -1.  1.  1.  1. -1. -1. -1. -1. -1.  1.  1. -1. -1. -1.  1.
 -1. -1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
  1.  1.  1. -1. -1. -1. -1. -1.  1.  1.  1. -1.]


In [44]:
# Calculate the Mean Absolute Error for Perceptron

ppn = Perceptron(max_iter = 100, eta0=1, random_state=0)

ltv_pred = np.empty(test_labels.shape)

# Train the perceptron
ppn = ppn.fit(train_features, np.asarray(train_labels, dtype="|S6"))
y_pred = ppn.predict(test_features)

for i in range(y_pred.shape[0]):
    ltv_pred[i] = y_pred[i]
#Calculate the absolute errors
errors = abs(np.asarray(y_pred, dtype=float) - np.asarray(test_labels, dtype=float))

# pmax prediction array
print("The prediction array is:\n",ltv_pred)

#print out the mean absolute error
print('Mean Absolute Error:', '$',round(np.mean(errors), 2), '.')

The prediction array is:
 [ 1.  1. -1. -1.  1.  1.  1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1.  1.
 -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1.  1. -1.  1.  1.  1.  1. -1.
 -1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1. -1.  1.  1.  1.  1.
  1. -1. -1.  1. -1.  1. -1. -1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1.
 -1. -1. -1. -1.  1. -1.  1. -1.  1. -1.  1.  1.  1. -1.  1. -1.  1.  1.
  1.  1.  1. -1.  1. -1.  1. -1. -1. -1.  1.  1.  1.  1.  1. -1. -1.  1.
  1. -1.  1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1. -1.  1. -1.
 -1. -1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1.  1.  1. -1. -1. -1.
 -1. -1.  1. -1.  1.  1.  1.  1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1.
  1. -1.  1. -1.  1.  1.  1.  1. -1. -1. -1. -1.  1.  1. -1. -1. -1.  1.
 -1. -1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
  1.  1.  1. -1. -1. -1. -1. -1.  1.  1.  1. -1.]
Mean Absolute Error: $ 0.01 .


In [45]:
# Calculate MAPE and Accuracy for Perceptron

mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.abs(np.mean(mape))
print("Mean Absolute Percent Error (MAPE):", np.mean(mape), '%.')
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Percent Error (MAPE): -0.9523809523809523 %.
Accuracy: 99.05 %.
