In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


### Задание 1

скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention

In [3]:
# Load the marketing-campaign dataset and preview the first five rows.
DATA_PATH = '../ML_в_бизнесе/data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


### Задание 2

Поле conversion — это целевая переменная, а offer — коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т. е. было какое-то предложение или нет): значение No Offer означает отсутствие коммуникации, а все остальные — наличие.

In [4]:
# Rename the outcome column to 'target' and the communication column to 'treatment'.
column_renames = {'conversion': 'target', 'offer': 'treatment'}
df = df.rename(columns=column_renames)

In [5]:
# Count how many rows fall under each distinct 'treatment' value.
df['treatment'].value_counts()

Buy One Get One    21387
Discount           21307
No Offer           21306
Name: treatment, dtype: int64

In [6]:
# Binarise 'treatment': 0 when there was no communication ('No Offer'), 1 for any other offer.
# The already-encoded 0 is listed as well, so re-running this cell leaves the column unchanged.
# A boolean mask cast to int64 is used instead of Series.replace, which relies on implicit
# object -> int downcasting (deprecated in recent pandas) and silently keeps unknown offer
# names as strings.
df['treatment'] = (~df['treatment'].isin(['No Offer', 0])).astype('int64')

In [7]:
# Display the frame to check the renamed columns and the binary 'treatment' values.
df

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,1,0
1,6,329.08,1,1,Rural,1,Web,0,0
2,7,180.65,0,1,Surburban,1,Web,1,0
3,9,675.83,1,0,Rural,1,Web,1,0
4,2,45.34,1,0,Urban,0,Web,1,0
...,...,...,...,...,...,...,...,...,...
63995,10,105.54,1,0,Urban,0,Web,1,0
63996,5,38.91,0,1,Urban,1,Phone,1,0
63997,6,29.99,1,0,Urban,1,Phone,1,0
63998,1,552.94,1,0,Surburban,1,Multichannel,1,0


### Задание 3

сделать разбиение набора данных на тренировочную и тестовую выборки

In [9]:
# Split the row labels (not the data itself) into a learning part and a validation part;
# the fixed random_state keeps the partition the same between runs.
indices_train = df.index

indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.33,
    random_state=21,
)

In [10]:
# List the column names available for choosing the model features.
df.columns

Index(['recency', 'history', 'used_discount', 'used_bogo', 'zip_code',
       'is_referral', 'channel', 'treatment', 'target'],
      dtype='object')

In [11]:
# Columns that contain strings and are passed to CatBoost as categorical features.
cat_features = ['zip_code', 'channel']

# Model inputs: every column except 'treatment' and 'target'.
features = [
    'recency', 'history', 'used_discount', 'used_bogo',
    'zip_code', 'is_referral', 'channel',
]

# Select the rows of each part once, then pick features / outcome / treatment flag from them.
learn_rows = df.loc[indices_learn]
valid_rows = df.loc[indices_valid]

X_train = learn_rows[features]
y_train = learn_rows['target']
treat_train = learn_rows['treatment']

X_val = valid_rows[features]
y_val = valid_rows['target']
treat_val = valid_rows['treatment']

### Задание 4

провести uplift-моделирование 3 способами:
- одна модель с признаком коммуникации (S-learner)
- модель с трансформацией таргета
- вариант с двумя независимыми моделями

In [12]:
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel

# sklift поддерживает любые модели,
# которые удовлетворяют соглашениям scikit-learn.
# Для примера воспользуемся catboost
from catboost import CatBoostClassifier

##### одна модель с признаком коммуникации (S-learner)

In [13]:
# Latest uplift@k values, keyed by the column name they get in the results table.
# The cells for the other models overwrite these keys with their own scores.
dict_score = {}

# S-learner: a single CatBoost model that receives the treatment flag as an extra feature.
sm = SoloModel(
    CatBoostClassifier(iterations=20, random_state=42, silent=True, cat_features=cat_features)
)
sm = sm.fit(X_train, y_train, treat_train)

uplift_sm = sm.predict(X_val)

# Pair every metric name with its share explicitly instead of branching on float equality.
for metric_name, share in (('uplift@10%', 0.1), ('uplift@20%', 0.2)):
    sm_score = uplift_at_k(
        y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=share
    )
    dict_score[metric_name] = sm_score

# Start the comparison table; values are rounded to 6 digits like the rows added for the
# other models. The name 'modal_results' is kept because later cells refer to it.
modal_results = pd.DataFrame({
    'model': ['SoloModel'],
    'uplift@10%': [round(dict_score['uplift@10%'], 6)],
    'uplift@20%': [round(dict_score['uplift@20%'], 6)],
})


##### модель с трансформацией таргета

In [15]:
from sklift.models import ClassTransformation

# Transformed-target approach: one CatBoost classifier wrapped in ClassTransformation.
ct_estimator = CatBoostClassifier(
    iterations=20,
    random_state=42,
    silent=True,
    cat_features=cat_features,
)
ct = ClassTransformation(ct_estimator).fit(X_train, y_train, treat_train)

uplift_ct = ct.predict(X_val)

# Store uplift@10% and uplift@20% on the validation part under the shared metric keys.
for metric_name, share in (('uplift@10%', 0.1), ('uplift@20%', 0.2)):
    ct_score = uplift_at_k(
        y_true=y_val,
        uplift=uplift_ct,
        treatment=treat_val,
        strategy='by_group',
        k=share,
    )
    dict_score[metric_name] = ct_score

In [16]:
# Add the ClassTransformation metrics to the comparison table.
# DataFrame.append no longer exists in pandas 2.x, so the row is attached with pd.concat.
ct_row = pd.DataFrame([{
    'model': 'ClassTransformation',
    'uplift@10%': round(dict_score['uplift@10%'], 6),
    'uplift@20%': round(dict_score['uplift@20%'], 6),
}])
# Drop a row left by a previous run of this cell so re-running does not duplicate it.
modal_results = pd.concat(
    [modal_results[modal_results['model'] != 'ClassTransformation'], ct_row],
    ignore_index=True,
)


##### вариант с двумя независимыми моделями

In [18]:
from sklift.models import TwoModels

# Two independent CatBoost classifiers with identical settings, passed to TwoModels.
first_estimator = CatBoostClassifier(
    iterations=20, random_state=42, silent=True, cat_features=cat_features
)
second_estimator = CatBoostClassifier(
    iterations=20, random_state=42, silent=True, cat_features=cat_features
)

# 'vanilla' = the two models are independent of each other.
tm = TwoModels(first_estimator, second_estimator, method='vanilla')
tm = tm.fit(X_train, y_train, treat_train)

uplift_tm = tm.predict(X_val)

# Store uplift@10% and uplift@20% on the validation part under the shared metric keys.
for metric_name, share in (('uplift@10%', 0.1), ('uplift@20%', 0.2)):
    tm_score = uplift_at_k(
        y_true=y_val,
        uplift=uplift_tm,
        treatment=treat_val,
        strategy='by_group',
        k=share,
    )
    dict_score[metric_name] = tm_score

In [19]:
# Add the TwoModels metrics to the comparison table.
# DataFrame.append no longer exists in pandas 2.x, so the row is attached with pd.concat.
tm_row = pd.DataFrame([{
    'model': 'TwoModels',
    'uplift@10%': round(dict_score['uplift@10%'], 6),
    'uplift@20%': round(dict_score['uplift@20%'], 6),
}])
# Drop a row left by a previous run of this cell so re-running does not duplicate it.
modal_results = pd.concat(
    [modal_results[modal_results['model'] != 'TwoModels'], tm_row],
    ignore_index=True,
)

### Задание 5

в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% 3 моделей

In [21]:
# Show the combined comparison table of uplift@10% and uplift@20% for the models.
modal_results

Unnamed: 0,model,uplift@10%,uplift@20%
0,SoloModel,0.096185,0.080318
1,ClassTransformation,0.090682,0.077581
2,TwoModels,0.075499,0.076362
