In [49]:
# Notebook for generating partial dependence plot for driver cancel model
# Partial dependence is useful for understanding model behavior (e.g. feedback loops) and its impact on the business.

# results here: https://drive.google.com/drive/folders/1phCdC9nPplEby7ve1I0wdcm2LGj9asiO

In [43]:
import os 
import pandas as pd, numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [44]:
# Model prediction sample for the 10/24 to 10/30 period.
# Source query: https://querybuilder-ea.uberinternal.com/r/meSaQarnf/run/lTpPurG5/edit
region = "apac"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. With chunked inference a column can end up holding
# a mix of numbers and strings, which distorts nunique() and any dtype-based
# categorical/numeric split done on the frame.
# NOTE(review): the truncated warning printed under this cell looks like pandas'
# mixed-type DtypeWarning -- confirm it is gone after this change.
data = pd.read_csv(
    "data/driver_cancel_{}_prediction_sample.csv".format(region),
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [45]:
# Sanity check of the loaded sample: (rows, columns) and the column labels.
(data.shape, data.columns)

((102717, 268),
 Index(['datestr', 'job_supply_uuid', 'supply_vvid', 'job_uuid', 'supply_uuid',
        'client_uuid', 'client_vvid', 'supply_client_vvid',
        '@published:vehicle_view_id', 'city_id',
        ...
        'auto_transformed_scalar_output_vec', 'probability', 'calibrated_prob',
        '__internal_raw_probability_column_name', '@prediction:predict', 'true',
        'false', '@prediction:prob', 'vehicle_view_id', 'predict'],
       dtype='object', length=268))

In [46]:
# Directory that receives the partial dependence plots for this region.
results_dir = "driver_cancel/{}_results".format(region)
# makedirs also creates the "driver_cancel" parent when it is missing (os.mkdir
# raises in that case), and exist_ok=True makes re-running this cell a no-op
# instead of relying on a separate, racy os.path.exists() check.
os.makedirs(results_dir, exist_ok=True)

In [47]:
# Distinct-value count for every column, in the same order as data.columns.
uniques = []
for column_name in data.columns:
    uniques.append(data[column_name].nunique())

In [None]:
# Create bucketized dependence plots: for every column, chart the mean
# calibrated cancel probability per category (categorical columns) or per value
# bucket (numeric columns) and save the chart as "<feature>.pdf" in results_dir.

# Numeric-typed columns that are excluded from the numeric (bucketized) branch.
NUMERIC_FEATURES_TO_SKIP = ('job_supply_uuid', '@published:vehicle_view_id', 'city_id', 'country_id', 'timestamp')


def _save_dependence_plot(mean_prob_by_group, feature, figsize=None):
    """Draw one bar chart of mean cancel probability per group and save it as a PDF.

    Parameters
    ----------
    mean_prob_by_group : pandas.Series
        Mean ``calibrated_prob`` indexed by category or value bucket.
    feature : str
        Column name; used for the x-axis label, the title and the file name.
    figsize : tuple or None
        Figure size in inches; ``None`` uses matplotlib's default.

    The chart is written to ``<results_dir>/<feature>.pdf`` (``results_dir`` is
    read from the notebook namespace).
    """
    # A dedicated figure per feature: without an explicit ax, a pandas Series
    # plot draws on the current axes, so bars from earlier features would pile
    # up in later charts.
    fig, ax = plt.subplots(figsize=figsize)
    try:
        mean_prob_by_group.plot.bar(ax=ax, rot=10, fontsize=9)
        ax.set_xlabel(feature)
        ax.set_ylabel('cancel probability')
        ax.set_title('Trends in model predicted cancel probability by {}'.format(feature))
        fig.savefig(os.path.join(results_dir, "{}.pdf".format(feature)))
    finally:
        # Always release the figure (also when plotting fails) so a loop over
        # hundreds of columns does not accumulate open figures. As a result the
        # charts are only written to disk, not rendered inline.
        plt.close(fig)


for feature, dtype, unique in zip(data.columns, data.dtypes, uniques):
    if unique == 0:
        # Column has no non-null values: there is nothing to group or plot.
        continue
    try:
        if (dtype == object and unique < 100) or unique < 20:  # if categorical
            mean_prob = data.groupby(data[feature]).calibrated_prob.mean()
            figsize = None
        elif dtype not in (str, object) and feature not in NUMERIC_FEATURES_TO_SKIP:  # if numeric
            # Clip to the 1st-99th percentile range so outliers do not stretch
            # the buckets, then split that range into 10 equal-width bins.
            lower = data[feature].quantile(0.01)
            upper = data[feature].quantile(0.99)
            buckets = pd.cut(data[feature], np.linspace(lower, upper, 11))
            mean_prob = data.groupby(buckets).calibrated_prob.mean()
            figsize = (11, 5)
        else:
            continue
        _save_dependence_plot(mean_prob, feature, figsize)
    except ValueError as err:
        # Best effort: some columns cannot be bucketized or plotted (e.g. the
        # bin edges collapse when the 1st and 99th percentiles are equal).
        # Report the skipped feature instead of dropping it silently.
        print("Skipping {}: {}".format(feature, err))