In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split

from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel
from sklift.models import ClassTransformation

import lightgbm as lgbm
from catboost import CatBoostClassifier
from sklift.models import ClassTransformation

from sklift.models import TwoModels
from IPython.display import Image
#from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
#from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion

%matplotlib inline

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

### 1. скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention

In [3]:
df = pd.read_csv('./data/data.csv')
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


### 2.Поле conversion - это целевая переменная, а offer - коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.

In [4]:
df = df.rename(columns={df.columns[-1]: "target", df.columns[-2]: "treatment" })

In [5]:
df['treatment'].value_counts()

Buy One Get One    21387
Discount           21307
No Offer           21306
Name: treatment, dtype: int64

In [6]:
df['treatment'] = df["treatment"].replace({"No Offer": 0, "Discount": 1, "Buy One Get One": 1})

In [7]:
df['treatment'].value_counts()

1    42694
0    21306
Name: treatment, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   recency        64000 non-null  int64  
 1   history        64000 non-null  float64
 2   used_discount  64000 non-null  int64  
 3   used_bogo      64000 non-null  int64  
 4   zip_code       64000 non-null  object 
 5   is_referral    64000 non-null  int64  
 6   channel        64000 non-null  object 
 7   treatment      64000 non-null  int64  
 8   target         64000 non-null  int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 4.4+ MB


In [9]:
numerical_features = df.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {numerical_features.shape[1]}")
numerical_features = numerical_features.columns.to_list()
numerical_features

count of numeric_features 6


['recency',
 'history',
 'used_discount',
 'used_bogo',
 'is_referral',
 'treatment']

In [10]:
cat_feature_num = [
    feature for feature in numerical_features
    if len(df[feature].unique())<20
]
cat_feature = df.select_dtypes(include=[np.object]).columns.to_list()
cat_feature = list(cat_feature + cat_feature_num)
print(f"Discrete Variables Count: {cat_feature}")

Discrete Variables Count: ['zip_code', 'channel', 'recency', 'used_discount', 'used_bogo', 'is_referral', 'treatment']


In [11]:
numerical_features = list(set(numerical_features) - set(cat_feature))
numerical_features

['history']

In [12]:
continuos_transformers = []
cat_transformers = []

for cont_col in numerical_features:
    transfomer =  Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    continuos_transformers.append((cont_col, transfomer))
    
for cat_col in cat_feature:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    cat_transformers.append((cat_col, cat_transformer))

### 3 и 4.сделать разбиение набора данных на тренировочную и тестовую выборки
### сделать feature engineering на ваше усмотрение (допускается свобода выбора методов)

In [13]:
indices_train = df.index
indices_learn, indices_valid = train_test_split(df.index, test_size=0.3, random_state=123)

In [14]:
X_train = df.loc[indices_learn, :]
y_train = df.loc[indices_learn, 'target']
treat_train = df.loc[indices_learn, 'treatment']

X_val = df.loc[indices_valid, :]
y_val = df.loc[indices_valid, 'target']
treat_val =  df.loc[indices_valid, 'treatment']

X_train_full = df.loc[indices_train, :]
y_train_full = df.loc[:, 'target']
treat_train_full = df.loc[:, 'treatment']

models_results = {
    'approach': [],
    'uplift@30%': [],
    'uplift@20%': [],
    'uplift@10%': []
}

In [15]:
feats = FeatureUnion(continuos_transformers+cat_transformers)
feature_processing = Pipeline([('feats', feats)])

X_train = feature_processing.fit_transform(X_train)
X_val = feature_processing.fit_transform(X_val)
X_train_full = feature_processing.fit_transform(X_train_full)

### 5. провести uplift-моделирование 3 способами: одна модель с признаком коммуникации (S learner), модель с трансформацией таргета (трансформация классов п. 2. 1) и вариант с двумя независимыми моделями

In [16]:
sm = SoloModel(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
sm = sm.fit(X_train, y_train, treat_train)

uplift_sm = sm.predict(X_val)

sm_score_at_30 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.3)
sm_score_at_20 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.2)
sm_score_at_10 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.1)

models_results['approach'].append('SoloModel')
models_results['uplift@30%'].append(sm_score_at_30)
models_results['uplift@20%'].append(sm_score_at_20)
models_results['uplift@10%'].append(sm_score_at_10)

# Получим условные вероятности выполнения целевого действия при взаимодействии для каждого объекта
sm_trmnt_preds = sm.trmnt_preds_
# И условные вероятности выполнения целевого действия без взаимодействия для каждого объекта
sm_ctrl_preds = sm.ctrl_preds_
models_results

{'approach': ['SoloModel'],
 'uplift@30%': [0.08271988463538538],
 'uplift@20%': [0.07937884671746775],
 'uplift@10%': [0.07829969702892883]}

In [17]:
ct = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
ct = ct.fit(X_train, y_train, treat_train)

uplift_ct = ct.predict(X_val)

sm_score_at_30 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.3)
sm_score_at_20 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.2)
sm_score_at_10 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.1)

models_results['approach'].append('ClassTransformation')
models_results['uplift@30%'].append(sm_score_at_30)
models_results['uplift@20%'].append(sm_score_at_20)
models_results['uplift@10%'].append(sm_score_at_10)
models_results

  ct = ct.fit(X_train, y_train, treat_train)


{'approach': ['SoloModel', 'ClassTransformation'],
 'uplift@30%': [0.08271988463538538, 0.17436867251754357],
 'uplift@20%': [0.07937884671746775, 0.20361438794450956],
 'uplift@10%': [0.07829969702892883, 0.24006059421422984]}

In [18]:
tm = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    method='vanilla'
)
tm = tm.fit(
    X_train, y_train, treat_train
)

uplift_tm = tm.predict(X_val)

sm_score_at_30 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.3)
sm_score_at_20 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.2)
sm_score_at_10 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.1)

models_results['approach'].append('TwoModels')
models_results['uplift@30%'].append(sm_score_at_30)
models_results['uplift@20%'].append(sm_score_at_20)
models_results['uplift@10%'].append(sm_score_at_10)
models_results


{'approach': ['SoloModel', 'ClassTransformation', 'TwoModels'],
 'uplift@30%': [0.08271988463538538, 0.17436867251754357, 0.07516440841048407],
 'uplift@20%': [0.07937884671746775, 0.20361438794450956, 0.08289431906994921],
 'uplift@10%': [0.07829969702892883, 0.24006059421422984, 0.08455214034401878]}

### 6. в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% этих 3 моделей

In [19]:
models_results = pd.DataFrame(models_results)
pd.pivot_table(models_results, columns = 'approach').reset_index()

approach,index,ClassTransformation,SoloModel,TwoModels
0,uplift@10%,0.240061,0.0783,0.084552
1,uplift@20%,0.203614,0.079379,0.082894
2,uplift@30%,0.174369,0.08272,0.075164


7.построить модель UpliftTreeClassifier и попытаться описать словами полученное дерево