In [2]:
import pandas as pd

In [1]:
import numpy as np

In [3]:
# import three files, takes very long
df_all = pd.read_csv('criteo-uplift-v2.1.csv', encoding='latin_1')
df_treated = pd.read_csv('criteo_treated.csv', encoding='latin_1')
df_untreated = pd.read_csv('criteo_not_treated.csv', encoding='latin_1')

In [4]:
df_all.columns

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'treatment', 'conversion', 'visit', 'exposure'],
      dtype='object')

In [5]:
#These are the number of rows in each file. Treated and Untreated data were extracted from the total data.
count_all = df_all['conversion'].count()
count_treated = df_treated['conversion'].count()
count_untreated = df_untreated['conversion'].count()
print(count_all,count_treated,count_untreated)

13979592 11882655 2096937


#### By the number of untreated data, select these numbers of data randomly from all/treated/untreated
* 60% train = 1258163
* 20% validation = 419387
* 20% test = 419387
#### To do this, I shuffle each dataset and choose corresponding number of rows from them, so that we are in fact randomly selecting our training, validating and testing set

In [11]:
#Shuffle data to make sure random selection
from sklearn.utils import shuffle
df_untreated = shuffle(df_untreated)
df_treated = shuffle(df_treated)
df_all = shuffle(df_all)

In [7]:
#Set the number of train, validation and test data we want to use
train = 1258163   
val = 419387
test = 419387

## Model 1: All data, predict conversion

In [12]:
#Model 1: run logistic regression for all data (including treated and untreated) to predict conversion, if we use the default solver, the model does not converge
from sklearn.linear_model import LogisticRegression
X, y = df_all[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_all['conversion'][:train]
X_val,y_val = df_all[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_all['conversion'][train:train+val]
clf = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X, y)
clf.score(X_val, y_val)

0.9971553720072391

In [114]:
# AUC for this model
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val, clf.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1]
)
metrics.auc(fpr, tpr)

0.9504248429979815

## Model 2: All data, predict visit

In [14]:
#Model 2: run logistic regression for all data (including treated and untreated) to predict visit
y_visit = df_all['visit'][:train]
y_visit_val = df_all['visit'][train:train+val]
clf_visit = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X, y_visit)
clf_visit.score(X_val, y_visit_val)

0.9576834761211006

## Model 3: Treated data, predict conversion

In [15]:
X_tr, y_tr = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_treated['conversion'][:train]
X_tr_val,y_tr_val = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_treated['conversion'][train:train+val]
clf_treated = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X_tr, y_tr)
clf_treated.score(X_tr_val, y_tr_val)

0.9968597023751332

In [177]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_val, clf_treated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1])
metrics.auc(fpr, tpr)

0.9432228828309657

## Model 4: Treated data, predict visit

In [16]:
X_tr_visit, y_tr_visit = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_treated['visit'][:train]
X_tr_val,y_tr_val = df_treated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_treated['visit'][train:train+val]
clf_treated_visit = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X_tr_visit, y_tr_visit)
clf_treated_visit.score(X_tr_val, y_tr_val)

0.9567082432216545

## Model 5: Untreated data, predict conversion

In [17]:
X_un, y_un = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_untreated['conversion'][:train]
X_un_val,y_un_val = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_untreated['conversion'][train:train+val]
clf_untreated = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X_un, y_un)
clf_untreated.score(X_un_val, y_un_val)

0.9979374658728096

In [123]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_val, clf_untreated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])[:,1])
metrics.auc(fpr, tpr)

0.9445403244454027

## Model 6: Untreated data, predict visit

In [18]:
X_un_visit, y_un_visit = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][:train],df_untreated['visit'][:train]
X_un_val,y_un_val = df_untreated[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']][train:train+val],df_untreated['visit'][train:train+val]
clf_untreated_visit = LogisticRegression(penalty='l2', C=10.0,solver='newton-cg').fit(X_un_visit, y_un_visit)
clf_untreated_visit.score(X_un_val, y_un_val)

0.9640904462942342

## Model 1 Prediction
### After building the models, we do evaluation and prediction using validation data
* some data in val_set might be the same with training data in untreated/treated data set
* val_set is completely separated from training data of model 1

In [19]:
val_set = df_all[train:train+val]
prediction = clf.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [20]:
m1 = val_set[['treatment','conversion']]
m1['prediction']= 0
m1['1-prediction']= 0
m1[['prediction','1-prediction']]= prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [21]:
m1.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction
2225300,1,0,0.999491,0.000509
12579350,0,0,0.999553,0.000447
13875,1,0,0.999558,0.000442
6915631,1,0,0.999444,0.000556
9285534,1,0,0.99962,0.00038


## Now we see the features of the validation set

In [22]:
#porportion of people treated
targeted = sum(m1['treatment'])/len(m1)
targeted

0.8502886355561808

In [23]:
#conversion rate on treated
treat_all = m1['conversion'].dot(m1['treatment'])/sum(m1['treatment'])
treat_all

0.0029977565900168256

In [24]:
#conversion rate on not treated
treat_none = m1['conversion'].dot(1-m1['treatment'])/sum(1-m1['treatment'])
treat_none

0.0019271505247901637

In [25]:
#conversion rate if treat randomly (according to the percentage treated)
treat_random = treat_all*targeted+treat_none*(1-targeted)
treat_random

0.0028374746952099135

## Now, we target top 10% and see the result

In [26]:
target_start = 0
target_end = 41939 #10%
target_decision = np.zeros(len(val_set))
target_decision[target_start:target_end] = 1
target_decision

array([1., 1., 1., ..., 0., 0., 0.])

In [27]:
m1 = m1.sort_values(by = '1-prediction',ascending= False)

In [28]:
m1['target_decision'] = target_decision

In [29]:
m1['treat=assign'] = (m1['target_decision'] == m1['treatment'])


In [30]:
m1['treat=assign'] = m1['treat=assign'].replace(True,1)
m1['treat=assign'] = m1['treat=assign'].replace(False,0)
m1.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
4087313,1,1,0.006513,0.993487,1.0,1.0
4567293,1,1,0.007244,0.992756,1.0,1.0
2605301,1,1,0.010164,0.989836,1.0,1.0
3045084,1,1,0.011428,0.988572,1.0,1.0
6500346,0,1,0.017164,0.982836,1.0,0.0


In [32]:
#calculte the reward by treat=assign
reward = m1['conversion']*m1['treat=assign']
for i in range(len(m1)):
    if m1['treatment'].iloc[i] == 1:
        reward.iloc[i] = reward.iloc[i]/targeted
    else:
        reward.iloc[i] = reward.iloc[i]/(1-targeted)     

In [33]:
reward.value_counts()

0.000000    418433
1.176071       942
6.679520        12
dtype: int64

In [34]:
m1['reward'] = reward

In [106]:
#calculate effects
def effect(m):
    treat_with_model = sum(m['reward'])/len(m)
    for_target_treated = m['treatment'].dot(m['conversion']*m['target_decision'])/(m['treatment'].dot(m['target_decision']))
    for_target_untreated = (1-m['treatment']).dot(m['conversion']*m['target_decision'])/((1-m['treatment']).dot(m['target_decision']))
    avg_effect = treat_all-treat_none
    effect_for_targeted = for_target_treated - for_target_untreated
    return treat_with_model,for_target_treated,for_target_untreated,avg_effect,effect_for_targeted

In [107]:
effect1 = effect(m1)

In [108]:
effect1

(0.002832737621282799,
 0.026024975135374075,
 0.01897962737245342,
 0.001070606065226662,
 0.007045347762920654)

## Model 3 Prediction 

In [54]:
val_set = df_all[train:train+val]
prediction3 = clf_treated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [55]:
m3 = val_set[['treatment','conversion']]
m3['prediction']= 0
m3['1-prediction']= 0
m3[['prediction','1-prediction']]= prediction3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [56]:
m3 = m3.sort_values(by='1-prediction',ascending = False)

### targeted, treat_all, treat_none, treat_random are the same for the same validation set

In [50]:
target_start = 0
target_end = 41939 #10%
target_decision = np.zeros(len(val_set))
target_decision[target_start:target_end] = 1
target_decision

array([1., 1., 1., ..., 0., 0., 0.])

In [57]:
m3['target_decision'] = target_decision

In [58]:
m3['treat=assign'] = (m3['target_decision'] == m3['treatment'])
m3['treat=assign'] = m3['treat=assign'].replace(True,1)
m3['treat=assign'] = m3['treat=assign'].replace(False,0)
m3.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
4087313,1,1,0.00618,0.99382,1.0,1.0
4567293,1,1,0.007431,0.992569,1.0,1.0
2605301,1,1,0.009852,0.990148,1.0,1.0
3045084,1,1,0.01491,0.98509,1.0,1.0
6500346,0,1,0.016636,0.983364,1.0,0.0


In [59]:
reward3 = m3['conversion']*m3['treat=assign']
for i in range(len(m3)):
    if m3['treatment'].iloc[i] == 1:
        reward3.iloc[i] = reward3.iloc[i]/targeted
    else:
        reward3.iloc[i] = reward3.iloc[i]/(1-targeted)     

In [60]:
reward3.value_counts()

0.000000    418434
1.176071       941
6.679520        12
dtype: int64

In [61]:
m3['reward'] = reward

In [109]:
effect3 = effect(m3)
effect3

(0.002832737621282799,
 0.02600093946008676,
 0.018963117606123868,
 0.001070606065226662,
 0.007037821853962893)

## Model 5 Prediction

In [63]:
val_set = df_all[train:train+val]
prediction5 = clf_untreated.predict_proba(val_set[['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']])

In [64]:
m5 = val_set[['treatment','conversion']]
m5['prediction']= 0
m5['1-prediction']= 0
m5[['prediction','1-prediction']]= prediction5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [65]:
m5 = m5.sort_values(by='1-prediction',ascending = False)

In [66]:
m5['target_decision'] = target_decision

In [67]:
m5['treat=assign'] = (m5['target_decision'] == m5['treatment'])
m5['treat=assign'] = m5['treat=assign'].replace(True,1)
m5['treat=assign'] = m5['treat=assign'].replace(False,0)
m5.head()

Unnamed: 0,treatment,conversion,prediction,1-prediction,target_decision,treat=assign
4087313,1,1,0.012153,0.987847,1.0,1.0
4567293,1,1,0.014287,0.985713,1.0,1.0
3045084,1,1,0.017977,0.982023,1.0,1.0
2605301,1,1,0.018832,0.981168,1.0,1.0
3801469,1,0,0.027074,0.972926,1.0,1.0


In [68]:
reward5 = m5['conversion']*m5['treat=assign']
for i in range(len(m5)):
    if m5['treatment'].iloc[i] == 1:
        reward5.iloc[i] = reward5.iloc[i]/targeted
    else:
        reward5.iloc[i] = reward5.iloc[i]/(1-targeted)     

In [69]:
reward5.value_counts()

0.000000    418436
1.176071       939
6.679520        12
dtype: int64

In [70]:
m5['reward'] = reward

In [110]:
effect5 = effect(m5)
effect5

(0.002832737621282799,
 0.02595571772120408,
 0.018917042693509196,
 0.001070606065226662,
 0.007038675027694882)

## Uplift Prediction
* Create an uplift model by using prediction of treated - untreated

In [85]:
uplift_prediction = m3['1-prediction']-m5['1-prediction']

In [86]:
up = val_set[['treatment','conversion']]
#up['prediction']= 0
up['1-prediction']= 0
up['1-prediction']= uplift_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [88]:
up = up.sort_values(by='1-prediction',ascending = False)

In [89]:
up['target_decision'] = target_decision
up['treat=assign'] = (up['target_decision'] == up['treatment'])
up['treat=assign'] = up['treat=assign'].replace(True,1)
up['treat=assign'] = up['treat=assign'].replace(False,0)

In [90]:
reward_up = up['conversion']*up['treat=assign']
for i in range(len(up)):
    if up['treatment'].iloc[i] == 1:
        reward_up.iloc[i] = reward_up.iloc[i]/targeted
    else:
        reward_up.iloc[i] = reward_up.iloc[i]/(1-targeted)     

In [91]:
reward_up.value_counts()

0.000000    418449
1.176071       926
6.679520        12
dtype: int64

In [111]:
#up['reward'] = reward
effect_up = effect(up)
effect_up

(0.002832737621282799,
 0.025510344637593323,
 0.019326241134751773,
 0.001070606065226662,
 0.0061841035028415495)

### Use Spearmanr correlation to see how closely these models are ranking things

In [93]:
from scipy import stats
stats.spearmanr(m1['1-prediction'], m3['1-prediction'])

SpearmanrResult(correlation=0.9999999999998346, pvalue=0.0)

In [95]:
stats.spearmanr(m1['1-prediction'], m5['1-prediction'])

SpearmanrResult(correlation=0.9999999999998341, pvalue=0.0)

In [96]:
stats.spearmanr(m3['1-prediction'], m5['1-prediction'])

SpearmanrResult(correlation=0.9999999999998339, pvalue=0.0)

In [98]:
stats.spearmanr(m1['prediction'], up['1-prediction'])

SpearmanrResult(correlation=-0.9999999999998325, pvalue=0.0)

In [100]:
stats.spearmanr(m3['1-prediction'], up['1-prediction'])

SpearmanrResult(correlation=0.9999999999998326, pvalue=0.0)

In [101]:
stats.spearmanr(m5['1-prediction'], up['1-prediction'])

SpearmanrResult(correlation=0.9999999999998341, pvalue=0.0)

## Summarize effect for models

In [112]:
result = pd.DataFrame(np.array([np.array(effect1),np.array(effect3),np.array(effect5),np.array(effect_up)]),index = ['model_all','model_treated','model_untreated','uplift'],columns = ['treat_with_model','for_target_treated','for_target_untreated','avg_effect','effect_for_targeted'])

In [113]:
result
#treat_with_model = avg of reward column
#for_target_treated = conversion rate of the treated and targeted
#for_target_untreated = conversion rate of the untreated but targeted
#avg_effect = treat_all-treat_none
#effect_for_targeted = for_target_treated - for_target_untreated

Unnamed: 0,treat_with_model,for_target_treated,for_target_untreated,avg_effect,effect_for_targeted
model_all,0.002833,0.026025,0.01898,0.001071,0.007045
model_treated,0.002833,0.026001,0.018963,0.001071,0.007038
model_untreated,0.002833,0.025956,0.018917,0.001071,0.007039
uplift,0.002833,0.02551,0.019326,0.001071,0.006184
