In [None]:
# Data preparation: split the dataset as training/testing, and extract the data based on feature list
import pandas
from datetime import datetime, timedelta

# Columns used as the baseline feature set, listed in the column order the
# selected feature frames will have.
base_features = [
    'city_id',
    'day_of_week',
    'hour_of_day',
    'demand_surge',
    'eta',
    'rounded_eta',
    'fd_eta',
    'eyeball_eta',
    'forward_dispatched',
    'starpower',
    # Location coordinates are currently disabled:
    # 'request_location_latitude', 'request_location_longitude',
    # 'supply_location_latitude', 'supply_location_longitude',
    # 'dropoff_location_latitude', 'dropoff_location_longitude',
    'days_since_signup',
    'days_since_first_trip',
    'trip_distance_haversine',
    'is_commute',
    'is_fifo',
    'driver_surge_multiplier',
    'est_rider_fare_distance_miles',
    'fare_distance_miles',
    'fare_duration_minutes',
    'trip_distance_miles',
    'trip_duration_seconds',
]

# Metric names that are combined with the 'request_average_' and
# 'supply_average_' prefixes below. Their order fixes the order of
# NRT_features: for every metric, the request column precedes the supply one.
_NRT_AVERAGED_METRICS = (
    'demand_surge',
    'eta',
    'rounded_eta',
    'eyeball_eta',
    'trip_distance_haversine',
    'fd_eta',
    'forward_dispatched',
    'driver_surge_multiplier',
    'est_rider_fare_distance_miles',
    'fare_distance_miles',
    'fare_duration_minutes',
    'trip_distance_miles',
    'trip_duration_seconds',
    # Disabled: 'cancel_rate'
    # (i.e. 'request_average_cancel_rate', 'supply_average_cancel_rate')
)

NRT_features = [
    side + '_average_' + metric
    for metric in _NRT_AVERAGED_METRICS
    for side in ('request', 'supply')
]

# Metric names that are wrapped as '<side>_average_<metric>_30' below.
# NRT_30_features lists every request-side column first, then every
# supply-side column, each group following the metric order given here.
_NRT_30_AVERAGED_METRICS = (
    'demand_surge',
    'driver_surge_multiplier',
    'est_rider_fare_distance_miles',
    'eta',
    'eyeball_eta',
    'fare_distance_miles',
    'fare_duration_minutes',
    'fd_eta',
    'forward_dispatched',
    'rating',
    'rounded_eta',
    'supply_surge',
    'surge_diff',
    'trip_distance_haversine',
    'trip_distance_miles',
    'trip_duration_seconds',
)

NRT_30_features = [
    side + '_average_' + metric + '_30'
    for side in ('request', 'supply')
    for metric in _NRT_30_AVERAGED_METRICS
]

def SplitByDate(raw:pandas.DataFrame, split_date:str):
    '''Partition raw into rows before split_date and rows on or after it.

    The 'datestr' column is compared directly against split_date, so rows
    whose 'datestr' equals split_date land in the testing set.

    Returns a (training_set, testing_set) tuple.
    '''
    dates = raw['datestr']
    # Two independent comparisons (not one mask and its negation) so that a
    # row satisfying neither comparison stays out of both sets.
    before_split = dates < split_date
    from_split_on = dates >= split_date

    return (raw[before_split], raw[from_split_on])

def DataPreparation(raw:pandas.DataFrame, split_date, features):
    '''Build training/testing feature frames and labels from raw.

    Every null in raw is replaced with 0, the rows are divided at split_date
    by SplitByDate, and the columns named in features plus the 'canceled'
    label column are then taken from each side.

    Returns a (training_x, training_y, testing_x, testing_y) tuple.
    '''
    # Null handling happens before the split so both sides are filled alike.
    cleaned = raw.fillna(0)
    train_rows, test_rows = SplitByDate(cleaned, split_date)

    label_column = 'canceled'
    return (
        train_rows[features],
        train_rows[label_column],
        test_rows[features],
        test_rows[label_column],
    )

def TemporalDataPreparation(raw:pandas.DataFrame, start_date:str, end_date:str, training_size:int, testing_size:int, step:int, features:list):
    '''Prepare training/testing data with the given features over sliding windows.

    Each window starts at a cursor date, trains on the following training_size
    days and tests on the testing_size days after that. The cursor begins at
    start_date and advances by step days; a window is produced only while the
    (exclusive) end of its testing range is not later than end_date.

    Args:
        raw: frame with a 'datestr' column that is compared against
            'YYYY-MM-DD' strings.
        start_date: first training day of the first window, 'YYYY-MM-DD'.
        end_date: latest allowed exclusive end of a testing range, 'YYYY-MM-DD'.
        training_size: training range length in days.
        testing_size: testing range length in days.
        step: days the window moves forward each iteration; must be positive.
        features: column names passed on to DataPreparation.

    Returns:
        A list with one dict per window holding 'training_x', 'training_y',
        'testing_x' and 'testing_y'. The list is empty when no window fits.

    Raises:
        ValueError: if step is not positive, or a date string does not match
            'YYYY-MM-DD'.
    '''
    # A non-positive step never moves the window past end_date, so the loop
    # below would run forever while result keeps growing.
    if step <= 0:
        raise ValueError('step must be a positive number of days, got %r' % (step,))

    date_format = '%Y-%m-%d'
    start = datetime.strptime(start_date, date_format)
    end = datetime.strptime(end_date, date_format)

    # Loop-invariant offsets, computed once.
    training_span = timedelta(days=training_size)
    testing_span = timedelta(days=testing_size)
    stride = timedelta(days=step)

    result = []

    # cursor is the first training day of the current window
    cursor = start
    split = cursor + training_span
    end_test = split + testing_span

    while end_test <= end:
        # Keep only the rows in [cursor, end_test) and split them at `split`
        window_start = cursor.strftime(date_format)
        window_end = end_test.strftime(date_format)
        filtered = raw[(raw['datestr'] >= window_start) & (raw['datestr'] < window_end)]
        training_x, training_y, testing_x, testing_y = DataPreparation(filtered, split.strftime(date_format), features)
        result.append(
            {
                'training_x': training_x,
                'training_y': training_y,
                'testing_x': testing_x,
                'testing_y': testing_y
            }
        )

        # Move to the next window
        cursor = cursor + stride
        split = cursor + training_span
        end_test = split + testing_span
    return result

# Progress marker printed once the feature lists and helper functions above have been defined.
print('Done')

In [None]:
from sklearn.ensemble import RandomForestClassifier


def _fit_then_predict(model, fit_x, fit_y, holdout_x):
    '''Fit model on (fit_x, fit_y); return (holdout predictions, fit-set predictions).'''
    model.fit(fit_x, fit_y)
    holdout_predictions = model.predict(holdout_x)
    fit_predictions = model.predict(fit_x)
    return holdout_predictions, fit_predictions


raw = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-2022-0701-0715.csv', header=0)
print(raw.shape)

# Same split date for both feature sets: rows dated before 2022-07-10 are
# used for training, rows from that date on for testing.
base_training_x, base_training_y, base_testing_x, base_testing_y = DataPreparation(
    raw, '2022-07-10', base_features)
training_x, training_y, testing_x, testing_y = DataPreparation(
    raw, '2022-07-10', base_features + NRT_30_features)

print(training_x.columns)

# Baseline model: base features only.
print('Training baseline model')
base_model = RandomForestClassifier(n_estimators=4, max_depth=12, random_state=919, n_jobs=4)
base_predict_y, base_training_predict_y = _fit_then_predict(
    base_model, base_training_x, base_training_y, base_testing_x)

# NRT model: base features plus the '_30' NRT features.
print('Training NRT model')
NRT_model = RandomForestClassifier(n_estimators=10, max_depth=15, random_state=919, n_jobs=4)
predict_y, training_predict_y = _fit_then_predict(
    NRT_model, training_x, training_y, testing_x)

print('Done')


In [None]:
# Evaluation metrics
from sklearn.metrics import roc_auc_score, f1_score

# ROC AUC measures how well the model ranks examples, so it has to be computed
# from a continuous score. Feeding it hard 0/1 predictions collapses the ROC
# curve to a single operating point. The score used here is the predicted
# probability of the positive class: predict_proba columns follow
# model.classes_, and column 1 is the larger label, which roc_auc_score treats
# as the positive class in the binary case.
# NOTE(review): assumes 'canceled' is a binary label with both classes present
# in the training data - confirm against the dataset.
# F1 is a thresholded metric and keeps using the hard predictions.

# Baseline model
base_training_score_y = base_model.predict_proba(base_training_x)[:, 1]
base_score_y = base_model.predict_proba(base_testing_x)[:, 1]

base_training_roc_auc = roc_auc_score(base_training_y, base_training_score_y)
base_training_f1 = f1_score(base_training_y, base_training_predict_y)

base_roc_auc = roc_auc_score(base_testing_y, base_score_y)
base_f1 = f1_score(base_testing_y, base_predict_y)

print('Baseline AUC: %s, baseline F1: %s. Training: %s, %s.' % (base_roc_auc, base_f1, base_training_roc_auc, base_training_f1))


# NRT model
training_score_y = NRT_model.predict_proba(training_x)[:, 1]
score_y = NRT_model.predict_proba(testing_x)[:, 1]

training_roc_auc = roc_auc_score(training_y, training_score_y)
training_f1 = f1_score(training_y, training_predict_y)

roc_auc = roc_auc_score(testing_y, score_y)
f1 = f1_score(testing_y, predict_y)

print('NRT AUC: %s, NRT F1: %s. Training: %s, %s.' % (roc_auc, f1, training_roc_auc, training_f1))

print('Done')


In [None]:
# Feature importance

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import plotly.graph_objects as go


# NRT_model is fitted on base_features + NRT_30_features, so the labels and the
# feature count must be derived from that same combination. The NRT_30 names
# already end in '_30'; only the 'NRT_' prefix is added for display.
NRT = ['NRT_' + x for x in NRT_30_features]
feature_labels = base_features + NRT

n_base_features = len(base_features)
n_features = len(feature_labels)
print(n_base_features, n_features)


# NRT model: mean importance per feature, with the spread across the trees
importances = NRT_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in NRT_model.estimators_], axis=0)

# Fail loudly rather than plot importances against the wrong feature names.
if len(importances) != n_features:
    raise ValueError(
        'NRT_model reports %d feature importances but %d feature labels were built'
        % (len(importances), n_features)
    )

print(importances)

fig = go.Figure()
fig.add_trace(go.Bar(
    x=importances, y=feature_labels, error_x=dict(type='data', array=std), orientation='h'
))
fig.update_layout(barmode='group', yaxis={'categoryorder':'total ascending'}, height=1000, width=900)

# One tick per feature so every bar keeps its label
fig.update_yaxes(type='category', nticks=n_features)
fig.update_layout(xaxis_title='Feature importance')
fig.show()





In [None]:
# Training and testing with sliding windows
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score


raw = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-2022-0701-0715.csv', header=0)
print(raw.shape)

# Split the dataset with sliding windows
base_set = TemporalDataPreparation(raw, start_date='2022-07-01', end_date='2022-07-15', training_size=7, testing_size=3, step=1, features=base_features+NRT_30_features)

# One {'AUC', 'F1'} entry per window, for the testing and the training range
metrics = []
training_metrics = []

print('Start training')
for item in base_set:
    training_x = item['training_x']
    training_y = item['training_y']
    testing_x = item['testing_x']
    testing_y = item['testing_y']

    # A fresh model per window, with fixed hyper-parameters and seed
    model = RandomForestClassifier(n_estimators=10, max_depth=15, random_state=919, n_jobs=4)
    model.fit(training_x, training_y)

    # Hard class predictions feed the thresholded F1 metric
    predict_y = model.predict(testing_x)
    predict_training_y = model.predict(training_x)

    # ROC AUC needs a ranking score, not hard labels: use the predicted
    # probability of the positive class. predict_proba columns follow
    # model.classes_, so column 1 is the larger label, which roc_auc_score
    # treats as positive in the binary case.
    # NOTE(review): assumes 'canceled' is binary and both classes occur in
    # every training window - confirm against the dataset.
    score_y = model.predict_proba(testing_x)[:, 1]
    score_training_y = model.predict_proba(training_x)[:, 1]

    # Calculate training metrics
    training_roc_auc = roc_auc_score(training_y, score_training_y)
    training_f1 = f1_score(training_y, predict_training_y)

    # Calculate testing metrics
    roc_auc = roc_auc_score(testing_y, score_y)
    f1 = f1_score(testing_y, predict_y)

    training_metrics.append(
        {
            'AUC': training_roc_auc,
            'F1': training_f1
        }
    )

    metrics.append(
        {
            'AUC': roc_auc,
            'F1': f1
        }
    )

print(training_metrics)
print('***')
print(metrics)


In [None]:
# Plot the metrics
import plotly.graph_objects as go


# One x position per sliding window
x_labels = list(range(1, 6))

# Hard-coded scores, one {'AUC', 'F1'} row per sliding window.

# SF dataset
baseline_metrics = pandas.DataFrame([
    {'AUC': 0.6620574080564671, 'F1': 0.5319849576470521},
    {'AUC': 0.6615395663862847, 'F1': 0.5405177991804042},
    {'AUC': 0.6603894860062521, 'F1': 0.5468657279286574},
    {'AUC': 0.6433148587965223, 'F1': 0.5027744253939144},
    {'AUC': 0.6305539685908852, 'F1': 0.47292936656289397},
])
# Alternative NRT scores, currently disabled:
# NRT_metrics = [{'AUC': 0.6650541650941918, 'F1': 0.5353107466489516}, {'AUC': 0.6678927807503241, 'F1': 0.5520939842094722}, {'AUC': 0.6661744957705457, 'F1': 0.5538367010654788}, {'AUC': 0.6493805667753964, 'F1': 0.5123467193388118}, {'AUC': 0.6400926422522591, 'F1': 0.4874351251781536}]

# 30  (presumably the scores obtained with the '_30' NRT features - confirm)
NRT_metrics = pandas.DataFrame([
    {'AUC': 0.6631622713219711, 'F1': 0.5335935449264445},
    {'AUC': 0.6651563831020831, 'F1': 0.5492524808210929},
    {'AUC': 0.6622734655894722, 'F1': 0.5499634575775696},
    {'AUC': 0.647061616889139, 'F1': 0.509317229397077},
    {'AUC': 0.6345009481324229, 'F1': 0.47731614296803804},
])

training_baseline_metrics = pandas.DataFrame([
    {'AUC': 0.6987256684922597, 'F1': 0.6028353730071503},
    {'AUC': 0.6963268647320288, 'F1': 0.5930674970507902},
    {'AUC': 0.6899193740878362, 'F1': 0.5799469646154118},
    {'AUC': 0.6848123986619821, 'F1': 0.5738008058542632},
    {'AUC': 0.6847363951546127, 'F1': 0.57355100047029},
])
training_NRT_metrics = pandas.DataFrame([
    {'AUC': 0.7275042299546662, 'F1': 0.644549352505522},
    {'AUC': 0.7248268938238777, 'F1': 0.6365058108252813},
    {'AUC': 0.7183830605724439, 'F1': 0.6242910667654262},
    {'AUC': 0.7156956735040735, 'F1': 0.6211565836531951},
    {'AUC': 0.7125724755342857, 'F1': 0.6156996604803265},
])

# Raw training scores, then summary statistics of the testing scores
print(training_baseline_metrics)
print(training_NRT_metrics)
print(baseline_metrics.describe().round(4))
print(NRT_metrics.describe().round(4))

# Figure 1: mean AUC / F1 of the two models as grouped bars
score_names = ['AUC', 'F1']
fig = go.Figure(data=[
    go.Bar(name=model_name, x=score_names, y=[frame[score].mean() for score in score_names], width=0.2)
    for model_name, frame in (('Baseline', baseline_metrics), ('NRT', NRT_metrics))
])
fig.update_layout(barmode='group')
fig.show()

# Figure 2: per-window testing scores as lines
fig = go.Figure()

# Training scores (disabled)
# fig.add_trace(go.Scatter(x=x_labels, y=training_baseline_metrics['AUC'], mode='lines', name='Training Baseline-AUC', line=dict(color='green')))
# fig.add_trace(go.Scatter(x=x_labels, y=training_baseline_metrics['F1'], mode='lines+markers', name='Training Baseline-F1', line=dict(color='green')))
# fig.add_trace(go.Scatter(x=x_labels, y=training_NRT_metrics['AUC'], mode='lines', name='Training NRT-AUC', line=dict(color='orange')))
# fig.add_trace(go.Scatter(x=x_labels, y=training_NRT_metrics['F1'], mode='lines+markers', name='Training NRT-F1', line=dict(color='orange')))

# Testing scores: one colour per model; AUC as a plain line, F1 with markers
for model_name, frame, colour in (('Baseline', baseline_metrics, 'red'), ('NRT', NRT_metrics, 'blue')):
    for score, trace_mode in (('AUC', 'lines'), ('F1', 'lines+markers')):
        fig.add_trace(go.Scatter(
            x=x_labels,
            y=frame[score],
            mode=trace_mode,
            name=model_name + '-' + score,
            line=dict(color=colour),
        ))

# Integer ticks starting at window 1
fig.update_layout(xaxis=dict(tickmode='linear', dtick=1, tick0=1))
fig.update_layout(xaxis_title="Sliding window", yaxis_title="Scores")

fig.show()


In [None]:
# Parameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score


raw = pandas.read_csv('Sample-1-1/report-trip-hexter-NRT-1-1-2022-0701-0715.csv', header=0)
print(raw.shape)

# Baseline model
# training_x, training_y, testing_x, testing_y = DataPreparation(raw, '2022-07-10', base_features)

# NRT model
training_x, training_y, testing_x, testing_y = DataPreparation(raw, '2022-07-10', base_features + NRT_features + NRT_30_features)

# One row per (number of trees, max depth) pair
metrics = []


# i is number of trees, d is the max depth
for i in range(2, 30, 2):
    for d in range(2, 30):
        print(i, d)
        model = RandomForestClassifier(n_estimators=i, max_depth=d, random_state=919, n_jobs=4)
        model.fit(training_x, training_y)

        # Hard class predictions feed the thresholded F1 metric
        predict_y = model.predict(testing_x)
        predict_training_y = model.predict(training_x)

        # ROC AUC needs a ranking score, not hard labels: use the predicted
        # probability of the positive class. predict_proba columns follow
        # model.classes_, so column 1 is the larger label, which
        # roc_auc_score treats as positive in the binary case.
        # NOTE(review): assumes 'canceled' is a binary label with both classes
        # present in the training data - confirm against the dataset.
        score_y = model.predict_proba(testing_x)[:, 1]
        score_training_y = model.predict_proba(training_x)[:, 1]

        training_roc_auc = roc_auc_score(training_y, score_training_y)
        training_f1 = f1_score(training_y, predict_training_y)

        roc_auc = roc_auc_score(testing_y, score_y)
        f1 = f1_score(testing_y, predict_y)

        metrics.append({
            'parameter': str(i)+' - '+str(d),
            'Training_AUC': training_roc_auc,
            'Training_F1': training_f1,
            'Testing_AUC': roc_auc,
            'Testing_F1': f1
        })

metrics = pandas.DataFrame(metrics)
print(metrics)
print('Done')
    
    


In [None]:
# Plot the parameter tuning results
import pandas
import plotly.graph_objects as go

# Persist the scores so the plot can be rebuilt without re-running the search
metrics.to_csv('ParamTunning-NRT-1-1.csv', header=True, index=False)
# metrics = pandas.read_csv('ParamTunning-NRT-1-1.csv', header=0)

# NOTE(review): x_label is not referenced again in this cell.
x_label = str(metrics['parameter'])
print(metrics)

# One line per score column, all sharing the parameter pair as x axis
fig = go.Figure()
for score_column in ('Training_AUC', 'Training_F1', 'Testing_AUC', 'Testing_F1'):
    fig.add_trace(go.Scatter(
        x=metrics['parameter'],
        y=metrics[score_column],
        mode='lines',
        name=score_column,
    ))

fig.update_layout(
    xaxis_title="Parameter pairs (#trees - max depth)",
    yaxis_title="Scores"
)

fig.show()