In [4]:
import pandas as pd

In [5]:
import numpy as np

In [6]:
# import three files, takes very long
df_all = pd.read_csv('criteo-uplift-v2.1.csv', encoding='latin_1')
df_treated = pd.read_csv('criteo_treated.csv', encoding='latin_1')
df_untreated = pd.read_csv('criteo_not_treated.csv', encoding='latin_1')

In [7]:
df_all.columns

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'treatment', 'conversion', 'visit', 'exposure'],
      dtype='object')

In [8]:
#These are the number of rows in each file. Treated and Untreated data were extracted from the total data.
count_all = df_all['conversion'].count()
count_treated = df_treated['conversion'].count()
count_untreated = df_untreated['conversion'].count()
print(count_all,count_treated,count_untreated)

13979592 11882655 2096937


#### By the number of untreated data, select these numbers of data randomly from all/treated/untreated
* 60% train = 1258163
* 20% validation = 419387
* 20% test = 419387
#### To do this, I shuffle each dataset and choose corresponding number of rows from them, so that we are in fact randomly selecting our training, validating and testing set

In [9]:
#Shuffle data to make sure random selection
from sklearn.utils import shuffle
df_untreated = shuffle(df_untreated)
df_treated = shuffle(df_treated)
df_all = shuffle(df_all)

In [10]:
#Set the number of train, validation and test data we want to use
train = 1258163   
val = 419387
test = 419387

## Model 1: All data, predict conversion

In [140]:
#Model 1: run logistic regression for all data (including treated and untreated) to predict conversion, if we use the default solver, the model does not converge
from sklearn.tree import DecisionTreeClassifier
X, y = df_all[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_all['conversion'][:train]
X_val,y_val = df_all[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_all['conversion'][train:train+val]
clf = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X, y)
clf.score(X_val, y_val)

0.9970623791390768

In [143]:
# AUC for this model
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val, clf.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1]
)
metrics.auc(fpr, tpr)

0.9490438439608007

## Model 2: All data, predict visit

In [144]:
#Model 2: run logistic regression for all data (including treated and untreated) to predict visit
y_visit = df_all['visit'][:train]
y_visit_val = df_all['visit'][train:train+val]
clf_visit = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X, y_visit)
clf_visit.score(X_val, y_visit_val)

0.957614327578108

## Model 3: Treated data, predict conversion

In [145]:
X_tr, y_tr = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_treated['conversion'][:train]
X_tr_val,y_tr_val = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_treated['conversion'][train:train+val]
clf_treated = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X_tr, y_tr)
clf_treated.score(X_tr_val, y_tr_val)

0.9970075371911862

In [142]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_val, clf_treated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1])
metrics.auc(fpr, tpr)

0.6085638857740412

## Model 4: Treated data, predict visit

In [146]:
X_tr_visit, y_tr_visit = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_treated['visit'][:train]
X_tr_val,y_tr_val = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_treated['visit'][train:train+val]
clf_treated_visit = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X_tr_visit, y_tr_visit)
clf_treated_visit.score(X_tr_val, y_tr_val)

0.9568680002002923

## Model 5: Untreated data, predict conversion

In [147]:
X_un, y_un = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_untreated['conversion'][:train]
X_un_val,y_un_val = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_untreated['conversion'][train:train+val]
clf_untreated = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X_un, y_un)
clf_untreated.score(X_un_val, y_un_val)

0.9980471497685908

In [123]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_val, clf_untreated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1])
metrics.auc(fpr, tpr)

0.9445403244454027

## Model 6: Untreated data, predict visit

In [148]:
X_un_visit, y_un_visit = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_untreated['visit'][:train]
X_un_val,y_un_val = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_untreated['visit'][train:train+val]
clf_untreated_visit = DecisionTreeClassifier(min_samples_leaf = 10000).fit(X_un_visit, y_un_visit)
clf_untreated_visit.score(X_un_val, y_un_val)

0.9646150214479705

## Model 1 Prediction
### After building the models, we do evaluation and prediction using validation data
* some data in val_set might be the same with training data in untreated/treated data set
* val_set is completely separated from training data of model 1

In [141]:
val_set = df_all[train:train+val]
prediction = clf.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [149]:
m1 = val_set[['treatment','conversion']]
m1['prediction']= 0
m1['1-prediction']= 0
m1[['prediction','1-prediction']]= prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [150]:
m1.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction
4598634,1,0,0.999623,0.000377
12648370,1,0,1.0,0.0
4077297,1,0,0.999873,0.000127
13715930,1,0,0.98268,0.01732
6951978,1,0,1.0,0.0


## Now we see the features of the validation set

In [151]:
#porportion of people treated
targeted = sum(m1['treatment'])/len(m1)
targeted

0.8492490229787761

In [152]:
#conversion rate on treated
treat_all = m1['conversion'].dot(m1['treatment'])/sum(m1['treatment'])
treat_all

0.0031193495131456297

In [153]:
#conversion rate on not treated
treat_none = m1['conversion'].dot(1-m1['treatment'])/sum(1-m1['treatment'])
treat_none

0.001913860462173576

In [154]:
#conversion rate if treat randomly (according to the percentage treated)
treat_random = treat_all*targeted+treat_none*(1-targeted)
treat_random

0.0029376208609232044

## Now, we target top 10% and see the result

In [155]:
target_start = 0
target_end = 41939 #10%
target_decision = np.zeros(len(val_set))
target_decision[target_start:target_end] = 1
target_decision

array([1., 1., 1., ..., 0., 0., 0.])

In [156]:
m1 = m1.sort_values(by = '1-prediction',ascending= False)

In [157]:
m1['target_decision'] = target_decision

In [158]:
m1['treat=assign'] = (m1['target_decision'] == m1['treatment'])


In [159]:
m1['treat=assign'] = m1['treat=assign'].replace(True,1)
m1['treat=assign'] = m1['treat=assign'].replace(False,0)
m1.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
4814137,1,0,0.842551,0.157449,1.0,1.0
6083574,0,0,0.842551,0.157449,1.0,0.0
6008946,0,0,0.842551,0.157449,1.0,0.0
378823,1,0,0.842551,0.157449,1.0,1.0
1719901,1,0,0.842551,0.157449,1.0,1.0


In [160]:
#calculte the reward by treat=assign
reward = m1['conversion']*m1['treat=assign']
for i in range(len(m1)):
    if m1['treatment'].iloc[i] == 1:
        reward.iloc[i] = reward.iloc[i]/targeted
    else:
        reward.iloc[i] = reward.iloc[i]/(1-targeted)     

In [161]:
reward.value_counts()

0.000000    418398
1.177511       972
6.633456        17
dtype: int64

In [162]:
m1['reward'] = reward

In [163]:
#calculate effects
def effect(m):
    treat_with_model = sum(m['reward'])/len(m)
    for_target_treated = m['treatment'].dot(m['conversion']*m['target_decision'])/(m['treatment'].dot(m['target_decision']))
    for_target_untreated = (1-m['treatment']).dot(m['conversion']*m['target_decision'])/((1-m['treatment']).dot(m['target_decision']))
    avg_effect = treat_all-treat_none
    effect_for_targeted = for_target_treated - for_target_untreated
    return treat_with_model,for_target_treated,for_target_untreated,avg_effect,effect_for_targeted

In [164]:
effect1 = effect(m1)

In [165]:
effect1

(0.002997969348507079,
 0.026944613849309754,
 0.01773231031543052,
 0.0012054890509720537,
 0.009212303533879234)

## Model 3 Prediction 

In [166]:
val_set = df_all[train:train+val]
prediction3 = clf_treated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [167]:
m3 = val_set[['treatment','conversion']]
m3['prediction']= 0
m3['1-prediction']= 0
m3[['prediction','1-prediction']]= prediction3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [168]:
m3 = m3.sort_values(by='1-prediction',ascending = False)

### targeted, treat_all, treat_none, treat_random are the same for the same validation set

In [169]:
target_start = 0
target_end = 41939 #10%
target_decision = np.zeros(len(val_set))
target_decision[target_start:target_end] = 1
target_decision

array([1., 1., 1., ..., 0., 0., 0.])

In [170]:
m3['target_decision'] = target_decision

In [171]:
m3['treat=assign'] = (m3['target_decision'] == m3['treatment'])
m3['treat=assign'] = m3['treat=assign'].replace(True,1)
m3['treat=assign'] = m3['treat=assign'].replace(False,0)
m3.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
2260365,1,0,0.833365,0.166635,1.0,1.0
3146085,1,0,0.833365,0.166635,1.0,1.0
13694484,1,1,0.833365,0.166635,1.0,1.0
2298279,1,0,0.833365,0.166635,1.0,1.0
1670617,1,0,0.833365,0.166635,1.0,1.0


In [172]:
reward3 = m3['conversion']*m3['treat=assign']
for i in range(len(m3)): 
    if m3['treatment'].iloc[i] == 1:
        reward3.iloc[i] = reward3.iloc[i]/targeted
    else:
        reward3.iloc[i] = reward3.iloc[i]/(1-targeted)     

In [173]:
reward3.value_counts()

0.000000    418395
1.177511       977
6.633456        15
dtype: int64

In [174]:
m3['reward'] = reward

In [175]:
effect3 = effect(m3)
effect3

(0.002997969348507079,
 0.02703226163466327,
 0.018285319993099878,
 0.0012054890509720537,
 0.008746941641563394)

## Model 5 Prediction

In [176]:
val_set = df_all[train:train+val]
prediction5 = clf_untreated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [177]:
m5 = val_set[['treatment','conversion']]
m5['prediction']= 0
m5['1-prediction']= 0
m5[['prediction','1-prediction']]= prediction5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [178]:
m5 = m5.sort_values(by='1-prediction',ascending = False)

In [179]:
m5['target_decision'] = target_decision

In [180]:
m5['treat=assign'] = (m5['target_decision'] == m5['treatment'])
m5['treat=assign'] = m5['treat=assign'].replace(True,1)
m5['treat=assign'] = m5['treat=assign'].replace(False,0)
m5.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
482842,1,0,0.878548,0.121452,1.0,1.0
582807,1,0,0.878548,0.121452,1.0,1.0
785310,1,0,0.878548,0.121452,1.0,1.0
2507008,1,0,0.878548,0.121452,1.0,1.0
2663541,1,0,0.878548,0.121452,1.0,1.0


In [181]:
reward5 = m5['conversion']*m5['treat=assign']
for i in range(len(m5)):
    if m5['treatment'].iloc[i] == 1:
        reward5.iloc[i] = reward5.iloc[i]/targeted
    else:
        reward5.iloc[i] = reward5.iloc[i]/(1-targeted)     

In [182]:
reward5.value_counts()

0.000000    418393
1.177511       978
6.633456        16
dtype: int64

In [183]:
m5['reward'] = reward

In [184]:
effect5 = effect(m5)
effect5

(0.002997969348507078,
 0.027165157491250487,
 0.017685699848408287,
 0.0012054890509720537,
 0.0094794576428422)

## Uplift Prediction
* Create an uplift model by using prediction of treated - untreated

In [185]:
uplift_prediction = m3['1-prediction']-m5['1-prediction']

In [186]:
up = val_set[['treatment','conversion']]
#up['prediction']= 0
up['1-prediction']= 0
up['1-prediction']= uplift_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [187]:
up = up.sort_values(by='1-prediction',ascending = False)

In [188]:
up['target_decision'] = target_decision
up['treat=assign'] = (up['target_decision'] == up['treatment'])
up['treat=assign'] = up['treat=assign'].replace(True,1)
up['treat=assign'] = up['treat=assign'].replace(False,0)

In [189]:
reward_up = up['conversion']*up['treat=assign']
for i in range(len(up)):
    if up['treatment'].iloc[i] == 1:
        reward_up.iloc[i] = reward_up.iloc[i]/targeted
    else:
        reward_up.iloc[i] = reward_up.iloc[i]/(1-targeted)     

In [190]:
reward_up.value_counts()

0.000000    418443
1.177511       925
6.633456        19
dtype: int64

In [191]:
up['reward'] = reward
effect_up = effect(up)
effect_up

(0.002997969348507078,
 0.025644579983365678,
 0.017379451354574884,
 0.0012054890509720537,
 0.008265128628790794)

### Use Spearmanr correlation to see how closely these models are ranking things

In [192]:
from scipy import stats
stats.spearmanr(m1['1-prediction'], m3['1-prediction'])

SpearmanrResult(correlation=0.995839161291198, pvalue=0.0)

In [193]:
stats.spearmanr(m1['1-prediction'], m5['1-prediction'])

SpearmanrResult(correlation=0.9738691323092424, pvalue=0.0)

In [194]:
stats.spearmanr(m3['1-prediction'], m5['1-prediction'])

SpearmanrResult(correlation=0.9698717227214069, pvalue=0.0)

In [195]:
stats.spearmanr(m1['prediction'], up['1-prediction'])

SpearmanrResult(correlation=-0.9545499500526204, pvalue=0.0)

In [196]:
stats.spearmanr(m3['1-prediction'], up['1-prediction'])

SpearmanrResult(correlation=0.9567715817366147, pvalue=0.0)

In [197]:
stats.spearmanr(m5['1-prediction'], up['1-prediction'])

SpearmanrResult(correlation=0.9502433542360443, pvalue=0.0)

## Summarize effect for Tree models

In [198]:
result = pd.DataFrame(np.array([np.array(effect1),np.array(effect3),np.array(effect5),np.array(effect_up)]),index = ['model_all','model_treated','model_untreated','uplift'],columns = ['treat_with_model','for_target_treated','for_target_untreated','avg_effect','effect_for_targeted'])

In [199]:
result
#treat_with_model = avg of reward column
#for_target_treated = conversion rate of the treated and targeted
#for_target_untreated = conversion rate of the untreated but targeted
#avg_effect = treat_all-treat_none
#effect_for_targeted = for_target_treated - for_target_untreated

Unnamed: 0,treat_with_model,for_target_treated,for_target_untreated,avg_effect,effect_for_targeted
model_all,0.002998,0.026945,0.017732,0.001205,0.009212
model_treated,0.002998,0.027032,0.018285,0.001205,0.008747
model_untreated,0.002998,0.027165,0.017686,0.001205,0.009479
uplift,0.002998,0.025645,0.017379,0.001205,0.008265


* min_samples_leaf=5000/10000
* Tree vs logistic, random forest, etc.
* good at predicting who to target
# Tasks
* untreated/treated is as good as uplift? (data on purchases/new product(cold start): mostly untreated; paper/treated is the common case (don't know the population untreated out there): treated and need to stop treating)

### If not, how much data point needed to catch up? (learning curves vs data points) 
* learning curve for each model (treated, untreated)
* learning curve for all (85% treated)
* learning curve uplift model
* starting uplift with all untreated, increase treated (y uplift, x treated observations/cost, when to reach horizontal(untreated)



### in which settings?

### uplift model: uplift modeling procedure. (different options) eg. 
