In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
from sklearn.metrics import roc_auc_score, roc_curve

In [2]:
data_raw = pd.read_csv('taxi.csv')

In [3]:
data_raw.head(10)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,-1.0,-1.0,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,-1.0,-1.0,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1
5,1058203,6,15,6080,360328,55.818963,37.361033,55.814827,37.354074,6.747,9.8,Economy,private,0
6,103326,2,11,6080,615584,55.753508,37.663742,55.757251,37.659064,-1.0,-1.0,Standard,business,1
7,128861,5,20,6080,254374,55.839318,37.248862,55.839134,37.304101,1.517,3.933,Economy,private,0
8,493006,5,20,6080,753429,55.818748,37.33824,55.835764,37.29471,6.222,15.433,Economy,private,0
9,953983,6,7,6080,533820,55.839155,37.251947,55.833198,37.312197,10.41,23.083,Economy,private,0


In [4]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
offer_gk                  100000 non-null int64
weekday_key               100000 non-null int64
hour_key                  100000 non-null int64
driver_gk                 100000 non-null int64
order_gk                  100000 non-null int64
driver_latitude           100000 non-null float64
driver_longitude          100000 non-null float64
origin_order_latitude     100000 non-null float64
origin_order_longitude    100000 non-null float64
distance_km               100000 non-null float64
duration_min              100000 non-null float64
offer_class_group         100000 non-null object
ride_type_desc            100000 non-null object
driver_response           100000 non-null int64
dtypes: float64(6), int64(6), object(2)
memory usage: 10.7+ MB


In [5]:
data_raw['offer_class_group'].value_counts()

Economy     48335
Standard    47288
Premium      1773
Delivery     1290
Kids          588
XL            464
VIP           225
VIP+           29
Test            8
Name: offer_class_group, dtype: int64

In [10]:
def roc_auc_score_func(data_train, data_test, y_train, y_test):
    xgb.fit(data_train, np.array(y_train).reshape(-1,1))
    train_scores = xgb.predict_proba(data_train)[:,1]
    test_scores = xgb.predict_proba(data_test)[:,1]
    print(roc_auc_score(y_train, train_scores))
    print(roc_auc_score(y_test, test_scores))

**One Hot Encode**

In [6]:
dummies = pd.get_dummies(data_raw.offer_class_group, prefix="offer_class", drop_first=True)

In [7]:
data_1_hot = pd.concat([data_raw.drop(['offer_class_group','ride_type_desc'], axis=1),dummies], axis=1)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
xgb = xgboost.XGBClassifier(max_depth=5, n_jobs=-1)

In [27]:
%time
y = data_1_hot['driver_response']
data_train, data_test, y_train, y_test = train_test_split(data_1_hot.drop('driver_response', axis=1), y, test_size=0.3, random_state=42)
roc_auc_score_func(data_train, data_test, y_train, y_test)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.05 µs


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8857078108853049
0.8758153332894616


**Target Encoding**

In [13]:
df = (data_raw[data_raw['driver_response'] == 1]['offer_class_group'].value_counts()/
      data_raw['offer_class_group'].value_counts()
     ).to_dict()

In [21]:
data_target_encode = data_raw.copy()

In [15]:
df

{'Delivery': 0.6775193798449612,
 'Economy': 0.4390400331023068,
 'Kids': 0.8401360544217688,
 'Premium': 0.7715736040609137,
 'Standard': 0.5234943326002368,
 'Test': 0.75,
 'VIP': 0.7733333333333333,
 'VIP+': 0.896551724137931,
 'XL': 0.7112068965517241}

In [22]:
data_target_encode['offer_class_group'] = data_target_encode['offer_class_group'].apply(df.get)

In [28]:
%time
y = data_target_encode['driver_response']
data_train, data_test, y_train, y_test = train_test_split(data_target_encode.drop(['driver_response','ride_type_desc'], axis=1), y, test_size=0.3, random_state=42)
roc_auc_score_func(data_train, data_test, y_train, y_test)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8880648287433465
0.8782894837472699


**Weights of Evidence**

In [29]:
def get_woe_stat(df_train, col, target_col):
    stat = df_train.groupby(col)[target_col].agg(
        [np.mean, np.size, np.count_nonzero], sort=False
    )
    stat.rename(columns={'count_nonzero': 'good', 'size': 'obs', 'mean': 'event_rate'}, inplace=True)
    stat['all_good'] = stat['good'].sum()
    stat['p_good'] = stat['good'] / stat['all_good']
    stat['bad'] = stat['obs'] - stat['good']
    stat['all_bad'] = stat['bad'].sum()
    stat['p_bad'] = stat['bad'] / stat['all_bad']
    stat['WOE'] = np.log(stat['p_good'] / stat['p_bad'])
    stat['IV'] = ((stat['p_good'] - stat['p_bad']) * stat['WOE']).sum()
    stat['variable'] = col
    stat['category'] = stat.index
    stat.reset_index(inplace=True)
    stat.drop(labels=col, inplace=True, axis=1)
    return stat

In [30]:
test = get_woe_stat(data_raw, 'offer_class_group', 'driver_response')

In [34]:
data_woe = pd.concat([data_raw.drop(['offer_class_group','ride_type_desc'], axis=1), test.WOE], axis=1)

In [35]:
%time
y = data_woe['driver_response']
data_train, data_test, y_train, y_test = train_test_split(data_woe.drop('driver_response', axis=1), y, test_size=0.3, random_state=42)
roc_auc_score_func(data_train, data_test, y_train, y_test)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.81 µs


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8798867816250889
0.8715563171861327
