In [1]:
import numpy as np, matplotlib as mpl, matplotlib.pyplot as plt, pandas as pd
import seaborn as sns, math, os, warnings

In [2]:
# Load the synthetic uplift dataset; df_data is kept as the untouched raw frame.
df_data = pd.read_csv('uplift_synthetic_data_100trials.csv')
# Work on a separate copy so later transformations do not alter df_data.
df_model = df_data.copy()

# Check the general information about the dataset
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 43 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   Unnamed: 0                  1000000 non-null  int64  
 1   trial_id                    1000000 non-null  int64  
 2   treatment_group_key         1000000 non-null  object 
 3   conversion                  1000000 non-null  int64  
 4   control_conversion_prob     1000000 non-null  float64
 5   treatment1_conversion_prob  1000000 non-null  float64
 6   treatment1_true_effect      1000000 non-null  float64
 7   x1_informative              1000000 non-null  float64
 8   x2_informative              1000000 non-null  float64
 9   x3_informative              1000000 non-null  float64
 10  x4_informative              1000000 non-null  float64
 11  x5_informative              1000000 non-null  float64
 12  x6_informative              1000000 non-null  float64
 13

In [3]:
# Show the first 5 rows

df_model.head()

Unnamed: 0.1,Unnamed: 0,trial_id,treatment_group_key,conversion,control_conversion_prob,treatment1_conversion_prob,treatment1_true_effect,x1_informative,x2_informative,x3_informative,...,x27_irrelevant,x28_irrelevant,x29_irrelevant,x30_irrelevant,x31_uplift_increase,x32_uplift_increase,x33_uplift_increase,x34_uplift_increase,x35_uplift_increase,x36_uplift_increase
0,0,0,control,1,0.516606,0.572609,0.056002,-1.926651,1.233472,-0.47512,...,-0.378145,-0.110782,1.08718,-1.222069,-0.279009,1.013911,-0.570859,-1.158216,-1.336279,-0.708056
1,1,0,treatment1,1,0.304005,0.73646,0.432454,0.904364,0.868705,-0.285977,...,-0.742847,0.700239,0.001867,-0.069362,0.045789,1.364182,-0.261643,0.478074,0.531477,0.402723
2,2,0,treatment1,0,0.134277,0.480985,0.346709,1.680978,1.320889,0.059273,...,0.748884,-0.856898,-0.268034,-2.181874,1.473214,-1.256641,0.901139,2.029204,-0.280445,0.87397
3,3,0,treatment1,1,0.801968,0.858532,0.056563,-0.335774,-2.940232,-0.302521,...,0.151074,0.067547,-0.839246,0.587575,0.412081,0.141189,0.369611,-0.364984,-1.509045,-1.335023
4,4,0,control,0,0.063552,0.060142,-0.00341,-0.475881,-0.485793,0.978582,...,-1.287117,1.256396,-1.155307,-0.414787,1.163851,0.698114,0.088157,0.478717,-0.680588,-2.73085


In [4]:
# Summarise only the object-dtype columns (count, unique, top, freq).
df_model.describe(include=object)

Unnamed: 0,treatment_group_key
count,1000000
unique,2
top,control
freq,500000


In [5]:
# List every object-dtype column and print how often each distinct value occurs.
object_cols = [col for col in df_model.columns if df_model[col].dtype == "object"]
for obj in object_cols:
    print('\n', obj)
    for unique in df_model[obj].unique():
        # Use the vectorised Series.sum(): the built-in sum() walks the boolean
        # Series element by element in Python, which is very slow on 1M rows.
        print("{} {}".format(unique, (df_model[obj] == unique).sum()))


 treatment_group_key
control 500000
treatment1 500000


## Предобработка данных

In [6]:
# Integer codes for the treatment-group labels.
TREATMENT_CODES = {'control': 0, 'treatment1': 1}

# Rebuild the modelling frame from the untouched raw frame so that re-running
# this cell always produces the same result (rename returns a new frame, so
# df_data itself is not modified).
df_model = df_data.rename(columns={'conversion': 'target'})

# Encode the group label with an explicit mapping. Series.replace relied on
# implicit object -> int downcasting, which newer pandas versions deprecate;
# without it the column would stay object-typed and be one-hot encoded by
# pd.get_dummies in the next cell.
df_model['treatment_group_key'] = df_model['treatment_group_key'].map(TREATMENT_CODES)

# map() turns labels missing from TREATMENT_CODES into NaN; fail loudly instead
# of silently training on them.
assert df_model['treatment_group_key'].notna().all(), \
    "treatment_group_key contains labels missing from TREATMENT_CODES"
df_model['treatment_group_key'] = df_model['treatment_group_key'].astype('int64')

In [7]:
# One-hot encode any remaining object/category columns.
# NOTE(review): treatment_group_key was the only object column reported by
# describe(include=object) and it is integer-encoded in the previous cell, so
# this presumably leaves the columns unchanged — confirm.
df_model = pd.get_dummies(df_model)

In [8]:
# Independent copy of the encoded data for the "control" run.
df_model_control = df_model.copy()
# NOTE(review): treatment_group_key is encoded as 0/1, so the ">= 0" filter keeps
# every row and this frame has the same content as df_model_control — confirm
# that two identical frames are intended.
df_model_treatment = (
    df_model.copy()
    .loc[lambda frame: frame.treatment_group_key >= 0]
    .reset_index(drop=True)
)

In [9]:
def declare_tc(df:pd.DataFrame):
    """Add a four-way ``target_class`` column to ``df`` and return ``df``.

    The label combines the group flag and the outcome:

    * 0 (CN) - ``treatment_group_key == 0`` and ``target == 0``
    * 1 (CR) - ``treatment_group_key == 0`` and ``target != 0``
    * 2 (TN) - ``treatment_group_key != 0`` and ``target == 0``
    * 3 (TR) - ``treatment_group_key != 0`` and ``target != 0``

    The column is written into the passed frame (in place); the same object
    is returned for convenience.
    """
    in_treatment = df.treatment_group_key != 0
    has_response = df.target != 0
    # Treatment contributes 2, a response contributes 1, giving codes 0..3.
    df['target_class'] = in_treatment * 2 + has_response
    return df

In [10]:
# Add the four-way target_class label to both frames (declare_tc writes the
# column into the frame it receives and returns that same frame).
df_model_control = declare_tc(df_model_control)
df_model_treatment = declare_tc(df_model_treatment)

Unnamed: 0.1,Unnamed: 0,trial_id,treatment_group_key,target,control_conversion_prob,treatment1_conversion_prob,treatment1_true_effect,x1_informative,x2_informative,x3_informative,...,x28_irrelevant,x29_irrelevant,x30_irrelevant,x31_uplift_increase,x32_uplift_increase,x33_uplift_increase,x34_uplift_increase,x35_uplift_increase,x36_uplift_increase,target_class
0,0,0,0,1,0.516606,0.572609,0.056002,-1.926651,1.233472,-0.47512,...,-0.110782,1.08718,-1.222069,-0.279009,1.013911,-0.570859,-1.158216,-1.336279,-0.708056,1
1,1,0,1,1,0.304005,0.73646,0.432454,0.904364,0.868705,-0.285977,...,0.700239,0.001867,-0.069362,0.045789,1.364182,-0.261643,0.478074,0.531477,0.402723,3
2,2,0,1,0,0.134277,0.480985,0.346709,1.680978,1.320889,0.059273,...,-0.856898,-0.268034,-2.181874,1.473214,-1.256641,0.901139,2.029204,-0.280445,0.87397,2
3,3,0,1,1,0.801968,0.858532,0.056563,-0.335774,-2.940232,-0.302521,...,0.067547,-0.839246,0.587575,0.412081,0.141189,0.369611,-0.364984,-1.509045,-1.335023,3
4,4,0,0,0,0.063552,0.060142,-0.00341,-0.475881,-0.485793,0.978582,...,1.256396,-1.155307,-0.414787,1.163851,0.698114,0.088157,0.478717,-0.680588,-2.73085,0
5,5,0,0,0,0.079143,0.068283,-0.01086,-0.440743,0.681489,-0.612391,...,-0.090788,-0.827456,1.578622,0.920988,0.257002,1.079024,-0.96625,0.007283,-0.880016,0
6,6,0,0,1,0.820826,0.864836,0.044009,-0.039945,1.317796,-2.270599,...,0.058808,-0.830246,0.0536,-1.053466,0.181414,0.325558,0.238529,-1.153355,-0.2589,1
7,7,0,0,1,0.804691,0.965391,0.160701,0.570992,0.76356,0.782622,...,0.557018,-0.369998,0.623468,0.597901,-1.013172,0.38538,0.767098,0.041324,0.128311,1
8,8,0,1,1,0.103244,0.090507,-0.012737,-0.013684,1.139568,0.667846,...,-0.000784,-0.892132,0.374006,-1.952403,-1.333484,0.044452,-1.641482,0.642491,0.631101,3
9,9,0,0,0,0.144498,0.03924,-0.105258,0.384454,-0.111373,-0.662927,...,-1.283026,1.93617,1.48218,0.978432,-0.143453,-2.082775,0.358376,1.844002,-1.223142,0


## Uplift Modeling

In [11]:
# Functions for Uplift
from sklearn.model_selection import train_test_split
import xgboost as xgb
def uplift_split(df_model:pd.DataFrame, test_size=0.3, random_state=42):
    """Split ``df_model`` into train and test parts for uplift modelling.

    Parameters
    ----------
    df_model : pd.DataFrame
        Frame containing at least the columns ``target``, ``target_class``
        and ``treatment_group_key``.
    test_size : float or int, default 0.3
        Passed to ``train_test_split``; size of the held-out part.
    random_state : int, default 42
        Passed to ``train_test_split``; seed that makes the split reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
        ``X_*`` hold every column except ``target`` and ``target_class``
        (``treatment_group_key`` is kept); ``y_*`` hold ``target_class``.
    """
    X = df_model.drop(columns=['target', 'target_class'])
    y = df_model['target_class']
    # Stratify on the group flag so both parts keep the same
    # control/treatment proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=df_model['treatment_group_key'],
    )
    return X_train, X_test, y_train, y_test


def uplift_model(X_train:pd.DataFrame,
                 X_test:pd.DataFrame,
                 y_train:pd.Series,
                 y_test:pd.Series,
                 non_feature_cols=('treatment_group_key',
                                   'Unnamed: 0',
                                   'trial_id',
                                   'control_conversion_prob',
                                   'treatment1_conversion_prob',
                                   'treatment1_true_effect')):
    """Fit a four-class XGBoost model and derive an uplift score per test row.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Train / test frames as produced by ``uplift_split``.
    y_train, y_test : pd.Series
        Four-way class labels (0=CN, 1=CR, 2=TN, 3=TR).
    non_feature_cols : iterable of str
        Columns that must not be used as model inputs. The default excludes
        the group flag, the row/trial identifiers and the simulator's
        ground-truth columns (true conversion probabilities and true effect);
        training on the latter leaks the answer into the model. Columns that
        are absent from the frame are ignored.

    Returns
    -------
    pd.DataFrame
        Copy of ``X_test`` (all original columns kept) with the added columns
        ``proba_CN``, ``proba_CR``, ``proba_TN``, ``proba_TR``,
        ``uplift_score`` and ``target_class``.

    Raises
    ------
    ValueError
        If the fitted model does not return exactly four class probabilities.
    """
    excluded = list(non_feature_cols)
    train_features = X_train.drop(columns=excluded, errors='ignore')
    test_features = X_test.drop(columns=excluded, errors='ignore')

    # Fit the model (named clf so it does not shadow this function's own name).
    clf = xgb.XGBClassifier().fit(train_features, y_train)

    # Predict class probabilities for the test rows.
    class_proba = clf.predict_proba(test_features)
    if class_proba.shape[1] != 4:
        raise ValueError(
            "expected probabilities for 4 classes (CN, CR, TN, TR), got {}"
            .format(class_proba.shape[1]))

    # The result keeps every original test column, including the excluded ones.
    result = pd.DataFrame(X_test).copy()
    result['proba_CN'] = class_proba[:, 0]
    result['proba_CR'] = class_proba[:, 1]
    result['proba_TN'] = class_proba[:, 2]
    result['proba_TR'] = class_proba[:, 3]

    # Normalise each class probability by its group's total probability, then
    # add the favourable classes (CN, TR) and subtract the unfavourable (TN, CR).
    control_total = result['proba_CN'] + result['proba_CR']
    treated_total = result['proba_TN'] + result['proba_TR']
    result['uplift_score'] = (
        result['proba_CN'] / control_total
        + result['proba_TR'] / treated_total
        - result['proba_TN'] / treated_total
        - result['proba_CR'] / control_total
    )

    # Attach the true class label (aligned on the index).
    result['target_class'] = y_test
    return result


def uplift(df_model:pd.DataFrame):
    """Run the whole pipeline on ``df_model``.

    Splits the frame with ``uplift_split`` and passes the four parts to
    ``uplift_model``; returns the frame produced by ``uplift_model``.
    """
    train_X, test_X, train_y, test_y = uplift_split(df_model)
    return uplift_model(train_X, test_X, train_y, test_y)

In [12]:
# Split, fit and score on df_model_treatment.
# NOTE(review): the ">= 0" filter in cell In [8] keeps every row, so this frame
# appears to equal df_model_control — confirm a second fit is needed.
treatment_uplift = uplift(df_model_treatment)

In [13]:
# Split, fit and score on df_model_control (a full copy of df_model).
control_uplift = uplift(df_model_control)