# X5 Retail Hero dataset

In [118]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
import numpy as np

from pymongo import MongoClient

In [10]:
client = MongoClient("localhost", 27017)
db = client['x5']

In [16]:
df_clients = pd.read_csv('../../data/x5-retail-hero/clients.csv', index_col='client_id')
df_train = pd.read_csv('../../data/x5-retail-hero/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('../../data/x5-retail-hero/uplift_test.csv', index_col='client_id')

In [137]:
pd.to_datetime(df_clients['first_issue_date']).min()

Timestamp('2017-04-04 18:24:18')

In [163]:
from sklearn.model_selection import train_test_split

df_features = df_clients.copy()
df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.to_datetime(df_features['first_issue_date']).min()) / pd.Timedelta('365d')
#      - pd.Timestamp('1970-01-01')) // pd.Timedelta('1d')
df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.to_datetime(df_features['first_redeem_date']).min()) / pd.Timedelta('365d')
#      - pd.Timestamp('1970-01-01')) // pd.Timedelta('1d')
df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

df_features = df_features.join(pd.get_dummies(df_features['gender']))
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

In [164]:
df_features

Unnamed: 0_level_0,age,first_issue_time,first_redeem_time,issue_redeem_delay,F,M,U
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000012768d,45,0.336675,0.735365,0.398690,0,0,1
000036f903,72,0.015925,0.033211,0.017286,1,0,0
000048b7a6,68,1.698076,1.208236,0.476130,1,0,0
000073194a,60,0.133622,0.622100,0.488477,1,0,0
00007c7133,67,0.131265,1.724154,1.592889,0,0,1
...,...,...,...,...,...,...,...
fffece623e,67,1.106479,1.713402,0.606924,0,0,1
ffff3dfff8,56,1.577821,1.658431,0.080611,1,0,0
ffffaab9da,23,0.391817,0.661208,0.269391,1,0,0
ffffeb5619,62,0.670666,1.208236,0.476130,0,0,1


In [165]:
X_train = df_features.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_val = df_features.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

X_test = df_features.loc[indices_test, :]

cat_features = ['gender']

# ATE

In [178]:
def get_mean_target(treatment_flg):
    return df_train.loc[indices_valid][df_train['treatment_flg']==treatment_flg]['target'].mean()

get_mean_target(1) - get_mean_target(0)

  


0.0325018461092893

# Double machine learning

In [172]:
from econml.dml import LinearDMLCateEstimator
from lightgbm import LGBMClassifier

est = LinearDMLCateEstimator(model_y=LGBMClassifier(max_depth=5), 
                             model_t=LGBMClassifier(max_depth=5),
                             discrete_treatment=True)
est.fit(y_train, treat_train, X_train)
uplift = est.const_marginal_effect(X_val)[:, 0]
score = uplift_at_k(y_true=y_val, uplift=uplift, treatment=treat_val, strategy='by_group', k=0.3)

print(f"Uplift at 30: {score}")

Uplift at 30: 0.037343776750556334




In [173]:
from econml.dml import NonParamDMLCateEstimator
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

est = NonParamDMLCateEstimator(model_y=LGBMRegressor(),
                               model_t=LGBMRegressor(),
                               model_final=LGBMRegressor())
est.fit(y_train, treat_train, X_train)
uplift = est.effect(X_val.values)
score = uplift_at_k(y_true=y_val, uplift=uplift, treatment=treat_val, strategy='by_group', k=0.3)

print(f"Uplift at 30: {score}")

Uplift at 30: 0.0335854809309446


# Meta Learners

## T-learner

In [168]:
from econml.metalearners import TLearner

est = TLearner(LGBMRegressor())
est.fit(y_train, treat_train, X_train)
uplift = np.squeeze(est.const_marginal_effect(X_val.values))
score = uplift_at_k(y_true=y_val, uplift=uplift, treatment=treat_val, strategy='by_group', k=0.3)

print(f"Uplift at 30: {score}")

Uplift at 30: 0.0456427434867116


## S-learner

In [169]:
from econml.metalearners import SLearner

est = SLearner(LGBMRegressor())
est.fit(y_train, treat_train, X_train)
uplift = np.squeeze(est.const_marginal_effect(X_val.values))
score = uplift_at_k(y_true=y_val, uplift=uplift, treatment=treat_val, strategy='by_group', k=0.3)

print(f"Uplift at 30: {score}")

Uplift at 30: 0.052801192382448625


## X-Learner

In [182]:
from econml.metalearners import XLearner

est = XLearner(LGBMClassifier(), )
est.fit(y_train, treat_train, X_train)
uplift = np.squeeze(est.const_marginal_effect(X_val.values))
score = uplift_at_k(y_true=y_val, uplift=uplift, treatment=treat_val, strategy='by_group', k=0.3)

print(f"Uplift at 30: {score}")



Uplift at 30: 0.03716187296695772
