In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from synthetic_data import UpliftDataSet

Generate training dataset

In [2]:
# Passed to get_train as `n_days`; the A/B-test cells below score users on day `n_days + 1`.
n_days = 1

# Synthetic population shared by every experiment in this notebook.
# NOTE(review): parameter semantics are defined in synthetic_data.UpliftDataSet — confirm there.
dataset = UpliftDataSet(
    age_min=16,
    age_max=60,
    n_cities=6,
    n_users=100_000,
    share_negative_people=0.2,
)

# Training sample. The communication values (5, 10, 15) are used further down as
# percentage discounts: net income is computed as purchases * (1 - communication / 100).
train = dataset.get_train(
    sorted_type_of_communication=(5, 10, 15),
    subsample_for_train=0.3,
    share_communicate=0.75,
    n_days=n_days,
)

# Accessed without a call, so `get_user_data` is presumably a property — confirm in synthetic_data.
users = dataset.get_user_data

### One model

Treatment Dummy approach, Solo model approach, Single model approach, S-Learner (see more details [here](https://www.uplift-modeling.com/en/latest/user_guide/models/solo_model.html))

![image info](./images/one-model-approach.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/solo_model.html)

In [3]:
# S-learner: one regression that receives the treatment ('communication') as an
# ordinary categorical feature next to the user attributes.
numeric_cols = ['day', 'age']
categorical_cols = ['gender', 'city', 'communication']

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)
full_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ]
)

x = train.loc[:, ['gender', 'age', 'city', 'day', 'communication']]
y = train.loc[:, 'purchases']

# Fit a copy so the unfitted `full_model` template stays untouched.
model = deepcopy(full_model).fit(x, y)

AB test

The final uplift is calculated like this:
![image info](./images/one-model-approach-2.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/solo_model.html)

Alternatively, we can predict the expected outcome of each strategy and choose the strategy with the maximum value.

In [4]:
# Score every user on the day right after the training window.
users['day'] = n_days + 1
test = users.copy()
communications = train['communication'].unique()
feature_cols = ['gender', 'age', 'city', 'day', 'communication']

# Predicted net income for every candidate communication: the model's purchase
# prediction scaled by (1 - communication / 100). One column per communication value.
for key in communications:
    test['communication'] = key
    test[key] = model.predict(test[feature_cols]) * (1 - key / 100)

# Assign each user the communication with the highest predicted net income.
# np.argmax returns the first maximum of each row; because `col_comm` is sorted
# ascending, ties resolve to the smallest communication value.
col_comm = np.sort(list(communications))
best_col = np.argmax(test[col_comm].to_numpy(), axis=1)
test['communication'] = col_comm[best_col]

# Random 50/50 split; the control group receives no communication.
test['test_group'] = np.random.choice(['test', 'control'], p=[0.5, 0.5], size=test.shape[0])
test.loc[test['test_group'] == 'control', 'communication'] = 0

# NOTE(review): check_test is defined in synthetic_data; presumably it simulates the
# purchases for the assigned communications — confirm.
test['purchases'] = dataset.check_test(test, add_purchases_value=True, check_median=60, check_std=10)
test['net_income'] = test['purchases'] * (1 - test['communication'] / 100)
test.groupby('test_group')['net_income'].mean()

test_group
control    15.094594
test       15.269744
Name: net_income, dtype: float64

In [5]:
# How many test-group users were assigned each communication value.
test[test['test_group'].eq('test')]['communication'].value_counts()

15.0    49844
0.0        48
Name: communication, dtype: int64

### Two models

Two models approach, T-learner, difference two models (see more details [here](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html))


In this situation we split the data by treatment and fit a separate model for each group:
![image info](./images/two-model-approach.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html)

In [6]:
# T-learner: the treatment is no longer a feature; instead one model is fitted
# per communication value on the rows that received that communication.
numeric_cols = ['day', 'age']
categorical_cols = ['gender', 'city']

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)
full_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ]
)

x = train.loc[:, ['gender', 'age', 'city', 'day']]
y = train.loc[:, 'purchases']

# Maps communication value -> pipeline fitted only on that communication's rows.
model_collection = {}
for communication in tqdm(train['communication'].unique()):
    ind = train['communication'].eq(communication)
    group_model = deepcopy(full_model)
    group_model.fit(x.loc[ind], y.loc[ind])
    model_collection[communication] = group_model

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




AB test

In this situation the uplift is calculated like this:
![image info](./images/two-model-approach-2.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html)

Alternatively, we can predict the expected outcome of each strategy and choose the strategy with the maximum value.

In [7]:
# Score every user on the day right after the training window.
users['day'] = n_days + 1
test = users.copy()
feature_cols = ['gender', 'age', 'city', 'day']

# Predicted net income per communication: each per-communication model's purchase
# prediction scaled by (1 - communication / 100). One column per communication value.
for key in model_collection.keys():
    test[key] = model_collection[key].predict(test[feature_cols]) * (1 - key / 100)

# Assign each user the communication with the highest predicted net income.
# np.argmax yields exactly one column index per row even when several
# communications tie for the maximum (the first, i.e. smallest, one wins because
# `col_comm` is sorted ascending), so the result always aligns with `test`.
col_comm = np.sort(list(model_collection.keys()))
best_col = np.argmax(test[col_comm].to_numpy(), axis=1)
test['communication'] = col_comm[best_col]

# Random 50/50 split; the control group receives no communication.
test['test_group'] = np.random.choice(['test', 'control'], p=[0.5, 0.5], size=test.shape[0])
test.loc[test['test_group'] == 'control', 'communication'] = 0

# NOTE(review): check_test is defined in synthetic_data; presumably it simulates the
# purchases for the assigned communications — confirm.
test['purchases'] = dataset.check_test(test, add_purchases_value=True, check_median=60, check_std=10)
test['net_income'] = test['purchases'] * (1 - test['communication'] / 100)
test.groupby('test_group')['net_income'].mean()

test_group
control    15.109771
test       17.704993
Name: net_income, dtype: float64

In [8]:
# Distribution of the communications chosen for users in the test group.
test[test['test_group'].eq('test')]['communication'].value_counts()

0.0     20962
15.0    15078
10.0    13959
Name: communication, dtype: int64

## Note: the following examples use only communication = 5 to show how each approach works

### Two models (dependent)

Dependent Data Representation, Dependent Feature Representation (see more [here](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html))


![image info](./images/two-model-approach-dependant.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html)

In [9]:
# Dependent two-model approach: fit a control model first, then feed its
# prediction to the treatment model as an additional feature.
feature_cols = ['gender', 'age', 'city', 'day']

# Pipeline for the control model (user attributes only).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['day', 'age']),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ['gender', 'city']),
    ]
)
full_model = Pipeline([('preprocessing', preprocessor),
                       ('model', LinearRegression())])

# Pipeline for the treatment model. ColumnTransformer drops every column that is
# not listed in a transformer (remainder='drop' is the default), so the control
# model's prediction must be listed explicitly or the treatment model never sees it.
preprocessor_treat = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['day', 'age', 'result_from_control_model']),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ['gender', 'city']),
    ]
)
full_model_treat = Pipeline([('preprocessing', preprocessor_treat),
                             ('model', LinearRegression())])

ind_control = train['communication'] == 0
x_control = train.loc[ind_control, feature_cols]
y_control = train.loc[ind_control, 'purchases']

# we choose only one type of treatment, but it's up to you what to do in this situation,
# you can include 'communication' as feature to your model with treatment
ind_treat = train['communication'] == 5
# .copy() so the extra column is added to an independent frame, not to a slice of `train`.
x_treat = train.loc[ind_treat, feature_cols].copy()
y_treat = train.loc[ind_treat, 'purchases']

model_control = deepcopy(full_model).fit(x_control, y_control)
x_treat['result_from_control_model'] = model_control.predict(x_treat[feature_cols])
# NOTE(review): with a purely linear base learner the extra feature is a linear
# function of the other inputs, so it likely adds little here; it matters for
# non-linear learners — confirm.
model_treat = deepcopy(full_model_treat).fit(x_treat, y_treat)

AB test

The final predictions will look like this:
![image info](./images/two-model-approach-dependant-2.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/two_models.html)

In [10]:
# Score every user on the day right after the training window.
users['day'] = n_days + 1
test = users.copy()
feature_cols = ['gender', 'age', 'city', 'day']

# Control prediction first: it is both the baseline and an input column for the treatment model.
test['result_from_control_model'] = model_control.predict(test[feature_cols])
# Treatment prediction converted to net income for communication = 5 (factor 1 - 5/100).
treat_inputs = test[feature_cols + ['result_from_control_model']]
test['result_from_treat_model'] = model_treat.predict(treat_inputs) * (1 - 5 / 100)

# Send communication 5 only where the treated net income is predicted to exceed the control outcome.
predicted_gain = test['result_from_treat_model'] - test['result_from_control_model']
test['communication'] = (predicted_gain > 0) * 5

# Random 50/50 split; the control group receives no communication.
test['test_group'] = np.random.choice(['test', 'control'], p=[0.5, 0.5], size=test.shape[0])
test.loc[test['test_group'] == 'control', 'communication'] = 0

# NOTE(review): check_test is defined in synthetic_data; presumably it simulates the
# purchases for the assigned communications — confirm.
test['purchases'] = dataset.check_test(test, add_purchases_value=True, check_median=60, check_std=10)
test['net_income'] = test['purchases'] * (1 - test['communication'] / 100)
test.groupby('test_group')['net_income'].mean()

test_group
control    15.198776
test       15.360468
Name: net_income, dtype: float64

In [11]:
# Number of test-group users per assigned communication (0 = none, 5 = treated).
test[test['test_group'].eq('test')]['communication'].value_counts()

0    47858
5     2262
Name: communication, dtype: int64

### Two models (cross-dependent)

X-learner

![image info](./images/two-model-approach-cross-dependant.PNG)
![image info](./images/two-model-approach-cross-dependant-2.PNG)
![image info](./images/two-model-approach-cross-dependant-3.PNG)
*picture was taken from  [this source](https://habr.com/ru/companies/ru_mts/articles/485980/)

In [12]:
# X-learner. Stage 1 fits one outcome model per group; stage 2 fits one model per
# group on the imputed uplift obtained by cross-predicting with the other group's
# stage-1 model.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['day', 'age']),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ['gender', 'city']),
    ]
)
full_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ]
)

user_cols = ['gender', 'age', 'city', 'day']

ind_control = train['communication'].eq(0)
x_control = train.loc[ind_control, user_cols]
y_control = train.loc[ind_control, 'purchases']

# Only one treatment type is used here; another option would be to keep all
# treatments and pass 'communication' to the treatment model as a feature.
ind_treat = train['communication'].eq(5)
x_treat = train.loc[ind_treat, user_cols]
y_treat = train.loc[ind_treat, 'purchases']

# Stage 1: outcome models, then cross-predictions on the opposite group.
model_control_1 = deepcopy(full_model).fit(x_control, y_control)
model_treat_1 = deepcopy(full_model).fit(x_treat, y_treat)
y_for_treat_train_control = model_control_1.predict(x_treat)
y_for_control_train_treat = model_treat_1.predict(x_control)

# Imputed uplift per group; treated outcomes are converted to net income
# for communication = 5 (factor 1 - 5/100).
uplift_control = y_for_control_train_treat * (1 - 5 / 100) - y_control
uplift_treat = y_treat * (1 - 5 / 100) - y_for_treat_train_control

# Stage 2: regress the imputed uplift on the user features within each group.
model_control_2 = deepcopy(full_model).fit(x_control, uplift_control)
model_treat_2 = deepcopy(full_model).fit(x_treat, uplift_treat)

AB test

The final uplift is calculated with the following formula:
![image info](./images/two-model-approach-cross-dependant-4.PNG)
*picture was taken from  [this source](https://habr.com/ru/companies/ru_mts/articles/485980/)

In [13]:
# Weight of the control-side uplift model in the blend below; with g = 1 the
# treatment-side model gets weight (1 - g) = 0.
g = 1
users['day'] = n_days + 1
test = users.copy()

x_test = test[['gender', 'age', 'city', 'day']]
uplift_from_control_model = model_control_2.predict(x_test)
uplift_from_treat_model = model_treat_2.predict(x_test)
blended_uplift = g * uplift_from_control_model + (1 - g) * uplift_from_treat_model

# Send communication 5 only to users whose blended uplift is positive.
test['communication'] = (blended_uplift > 0) * 5

# Random 50/50 split; the control group receives no communication.
in_control = test['test_group'] = np.random.choice(['test', 'control'], p=[0.5, 0.5], size=test.shape[0])
test.loc[test['test_group'] == 'control', 'communication'] = 0

# NOTE(review): check_test is defined in synthetic_data; presumably it simulates the
# purchases for the assigned communications — confirm.
test['purchases'] = dataset.check_test(test, add_purchases_value=True, check_median=60, check_std=10)
test['net_income'] = test['purchases'] * (1 - test['communication'] / 100)
test.groupby('test_group')['net_income'].mean()

test_group
control    15.273787
test       15.397655
Name: net_income, dtype: float64

In [14]:
# Number of test-group users per assigned communication (0 = none, 5 = treated).
test[test['test_group'].eq('test')]['communication'].value_counts()

0    47810
5     2336
Name: communication, dtype: int64

### Class transformation 

Class Transformation approach, Class Variable Transformation approach, Revert Label approach (see details [here](https://www.uplift-modeling.com/en/latest/user_guide/models/revert_label.html))

![image info](./images/target-transform.PNG)
![image info](./images/target-transform-1.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/revert_label.html)

##### Non-binary target with a binary (yes/no) communication

In [15]:
# Class (target) transformation: a single model is fitted directly on a
# transformed target z built from the outcome, the treatment flag and the
# treated share.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['day', 'age']),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ['gender', 'city']),
    ]
)
full_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ]
)

# Keep only the control rows (0) and the rows that received communication 5.
ind_treat = train['communication'].isin([0, 5])
subset = train.loc[ind_treat]

x = subset[['gender', 'age', 'city', 'day']]
# Outcome expressed as net income: purchases * (1 - communication / 100).
y = subset['purchases'] * (1 - subset['communication'] / 100)
# you can use binary target
w = (subset['communication'] > 0) * 1  # 1 = treated, 0 = control
p = w.sum() / len(w)                   # share of treated rows in the subset

# Transformed target: z = y * (w - p) / (p * (1 - p)).
z = y * (w - p) / (p * (1 - p))

model = deepcopy(full_model).fit(x, z)

AB test

![image info](./images/target-transform-2.PNG)
*picture was taken from  [this source](https://www.uplift-modeling.com/en/latest/user_guide/models/revert_label.html), you can find more details about why it looks like this if you follow this link

In [16]:
# Score every user on the day right after the training window.
users['day'] = n_days + 1
test = users.copy()

# The model was fitted on the transformed target, so its prediction is used directly as the uplift.
test['uplift'] = model.predict(test[['gender', 'age', 'city', 'day']])
# Send communication 5 only to users with a positive predicted uplift.
test['communication'] = (test['uplift'] > 0) * 5

# Random 50/50 split; the control group receives no communication.
test['test_group'] = np.random.choice(['test', 'control'], p=[0.5, 0.5], size=test.shape[0])
test.loc[test['test_group'] == 'control', 'communication'] = 0

# NOTE(review): check_test is defined in synthetic_data; presumably it simulates the
# purchases for the assigned communications — confirm.
test['purchases'] = dataset.check_test(test, add_purchases_value=True, check_median=60, check_std=10)
test['net_income'] = test['purchases'] * (1 - test['communication'] / 100)
test.groupby('test_group')['net_income'].mean()

test_group
control    15.282697
test       14.969160
Name: net_income, dtype: float64

In [17]:
# Number of test-group users per assigned communication (0 = none, 5 = treated).
test[test['test_group'].eq('test')]['communication'].value_counts()

0    47240
5     2533
Name: communication, dtype: int64