In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Single seed reused by the train/validation split and by the LightGBM estimators.
random_state = 31

In [None]:
from sklift.datasets import fetch_megafon
# Load the Megafon uplift dataset as three separate objects: the feature
# matrix, the outcome, and the treatment-group labels.
# NOTE(review): the call may trigger a download (the cell's progress bar reports 276M).
data, target, treatment = fetch_megafon(return_X_y_t=True)

Megafon dataset:   0%|          | 0.00/276M [00:00<?, ?iB/s]

In [None]:
# Sanity check: (rows, columns) of the feature matrix.
data.shape

(600000, 50)

In [None]:
# Preview the first rows of the feature matrix.
data.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50
0,39.396577,-0.186548,19.524505,21.250208,55.291264,182.966712,-5.385606,144.573379,-12.534344,-58.279429,...,90.877638,134.363458,-213.584582,-2.092461,-93.973258,-0.155597,-312.130733,44.798182,-125.682413,16.231365
1,38.987694,0.819522,-42.064512,-48.270949,-33.171257,179.459341,-87.15181,-162.693257,20.651652,181.635081,...,-183.840746,72.864779,559.783584,1.142391,80.037124,-1.216185,-111.473936,-127.737977,-117.501171,10.732234
2,-16.693093,1.844558,-8.615192,-18.81874,-22.271188,-116.290369,-63.816746,-38.340763,24.968496,-136.340629,...,-203.637766,2.480242,96.998504,1.100962,-33.275159,0.920926,-679.492242,-91.009397,-18.173358,14.367636
3,-72.040154,-0.226921,39.802607,16.441262,-1.112509,68.128008,23.073147,4.688858,-49.383641,-91.866107,...,172.906875,83.951551,-323.642557,-0.369182,93.221948,-1.96238,-442.466684,-22.298302,-75.916603,11.634299
4,18.296973,0.996437,24.465307,-34.151971,24.623458,-155.455558,-12.159787,26.705778,105.864805,258.607252,...,125.577535,-208.531112,118.902324,-0.808578,-117.497906,1.770635,627.395611,122.019189,194.091195,-11.883858


In [None]:
# Number of rows per experiment arm (labels are still the raw strings here).
treatment.value_counts()

treatment    300368
control      299632
Name: treatment_group, dtype: int64

In [None]:
# Encode the experiment arm as 1 (treatment) / 0 (control).
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: without them,
# running the cell a second time would map the already-encoded values to NaN
# because neither 1 nor 0 is a key of the string-only mapping.
treatment = treatment.map({'treatment': 1, 'control': 0, 1: 1, 0: 0})
treatment.value_counts()

1    300368
0    299632
Name: treatment_group, dtype: int64

In [None]:
# Stratify on the (treatment flag, outcome) pair so that each
# treatment/outcome combination is represented proportionally in both splits.
stratify_cols = pd.concat([treatment, target], axis="columns")

# 70/30 train/validation split; the shared seed makes the partition reproducible.
split_parts = train_test_split(
    data, treatment, target,
    test_size=.3,
    stratify=stratify_cols,
    random_state=random_state,
)
X_train, X_val, trtmnt_train, trtmnt_val, y_train, y_val = split_parts

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

X_train shape: (420000, 50)
X_val shape: (180000, 50)


### Solo Model

In [None]:
from sklift.models import SoloModel

# S-learner: a single LightGBM classifier wrapped in SoloModel ("dummy" method).
estimator = lgb.LGBMClassifier(random_state=random_state)
slearner = SoloModel(method="dummy", estimator=estimator)

# Train on the training split, then predict uplift for the validation rows.
slearner = slearner.fit(X=X_train, y=y_train, treatment=trtmnt_train)
preds = slearner.predict(X=X_val)

In [None]:
# First three validation-set predictions of the "dummy" SoloModel.
preds[:3]

array([0.06341812, 0.01077546, 0.09784032])

In [None]:
# S-learner variant with treatment/feature interactions (2x features).
# A fresh classifier is created so this model does not share state with `slearner`.
estimator = lgb.LGBMClassifier(random_state=random_state)
slearner_ti = SoloModel(method="treatment_interaction", estimator=estimator)

# Train on the training split, then predict uplift for the validation rows.
slearner_ti = slearner_ti.fit(X=X_train, y=y_train, treatment=trtmnt_train)
preds2 = slearner_ti.predict(X=X_val)

In [None]:
# First three validation-set predictions of the "treatment_interaction" SoloModel.
preds2[:3]

array([0.03888616, 0.04865184, 0.08312888])

### T-learner

In [None]:
from sklift.models import TwoModels

# T-learner: separate LightGBM classifiers for the control and treatment
# groups, both created with the same seed.
estimator_control = lgb.LGBMClassifier(random_state=random_state)
estimator_treatment = lgb.LGBMClassifier(random_state=random_state)

t_learner = TwoModels(
    estimator_ctrl=estimator_control,
    estimator_trmnt=estimator_treatment,
    # Two independent models. Alternatives: "ddr_control" or "ddr_treatment".
    method="vanilla",
)

# Train on the training split, then predict uplift for the validation rows.
t_learner = t_learner.fit(X=X_train, y=y_train, treatment=trtmnt_train)
preds3 = t_learner.predict(X=X_val)

In [None]:
# First three validation-set predictions of the T-learner.
preds3[:3]

array([0.04460189, 0.04201339, 0.03639569])

### ClassTransformation

In [None]:
from sklift.models import ClassTransformation

# Class-transformation model built on its own LightGBM classifier.
estimator_ct = lgb.LGBMClassifier(random_state=random_state)
ct_learner = ClassTransformation(estimator=estimator_ct)

# Train on the training split, then predict uplift for the validation rows.
ct_learner = ct_learner.fit(X=X_train, y=y_train, treatment=trtmnt_train)
preds4 = ct_learner.predict(X=X_val)

In [None]:
# First three validation-set predictions of the class-transformation model
# (values can be negative, as the output shows).
preds4[:3]

array([-0.00650106, -0.0172078 ,  0.12447006])

### Metrics

In [None]:
from sklift.metrics import uplift_at_k, uplift_by_percentile, average_squared_deviation, weighted_average_uplift, uplift_auc_score, qini_auc_score

# Fitted models and their validation-set uplift predictions, kept in the same
# order so that position i in one list corresponds to position i in the other.
all_learners = [slearner, slearner_ti, t_learner, ct_learner]
all_preds = [preds, preds2, preds3, preds4]

In [None]:
# uplift_at_k

In [None]:
# uplift@30% on the validation split, one value per model (order of all_preds).
# Arguments that are identical for every model are collected once.
at_k_kwargs = dict(y_true=y_val, treatment=trtmnt_val, strategy="overall", k=.3)
for model_uplift in all_preds:
    uplift_k = uplift_at_k(uplift=model_uplift, **at_k_kwargs)
    print(uplift_k)

0.21252033889247546
0.2166602618155922
0.2157994292441908
0.2077604133256158


In [None]:
# uplift_by_percentile

In [None]:
# Per-percentile breakdown (10 bins) on the validation split, one table per
# model in the order of all_preds.
for p in all_preds:
    uplift_p = uplift_by_percentile(
        y_true=y_val,
        uplift=p,
        treatment=trtmnt_val,
        strategy="overall",
        bins=10
    )
    # Rich display renders every column in a single table; print() wraps wide
    # frames into several text chunks, which made the output hard to read.
    display(uplift_p)

            n_treatment  n_control  response_rate_treatment  \
percentile                                                    
0-10               8948       9052                 0.578453   
10-20              9065       8935                 0.260121   
20-30              9070       8930                 0.223264   
30-40              8895       9105                 0.209444   
40-50              8966       9034                 0.208566   
50-60              9077       8923                 0.195659   
60-70              9098       8902                 0.180149   
70-80              8960       9040                 0.127902   
80-90              8928       9072                 0.091062   
90-100             9103       8897                 0.215533   

            response_rate_control    uplift  
percentile                                   
0-10                     0.152232  0.426222  
10-20                    0.141690  0.118431  
20-30                    0.127212  0.096052  
30-40        

In [None]:
# average_squared_deviation

In [None]:
# Average squared deviation per model, computed from the model's training-set
# and validation-set uplift predictions.
for learner, val_uplift in zip(all_learners, all_preds):
    # Validation predictions are reused from all_preds; training-set
    # predictions are computed here.
    train_uplift = learner.predict(X_train)
    asd = average_squared_deviation(
        y_train, train_uplift, trtmnt_train,
        y_val, val_uplift, trtmnt_val,
    )
    print(asd)

7.30058470551649e-05
0.0001341121931916846
0.0012216105207456466
0.001923269328779707


In [None]:
# weighted_average_uplift

In [None]:
# Weighted average uplift on the validation split, one value per model.
# Arguments that are identical for every model are collected once.
wa_kwargs = dict(y_true=y_val, treatment=trtmnt_val, strategy="overall")
for model_uplift in all_preds:
    wa_uplift = weighted_average_uplift(uplift=model_uplift, **wa_kwargs)
    print(wa_uplift)

0.04893844793744406
0.048957196080265386
0.04866010925727283
0.04918148446888456


In [None]:
# uplift_auc_score

In [None]:
# Uplift AUC on the validation split, one value per model (order of all_preds).
for model_uplift in all_preds:
    auuc = uplift_auc_score(y_true=y_val, uplift=model_uplift, treatment=trtmnt_val)
    print(auuc)

0.16034963885181358
0.16532009908557585
0.1650810743542226
0.15509672012544673


In [None]:
# qini_auc_score

In [None]:
# Qini AUC on the validation split, one value per model (order of all_preds).
for model_uplift in all_preds:
    auqc = qini_auc_score(y_true=y_val, uplift=model_uplift, treatment=trtmnt_val)
    print(auqc)

0.23260639980450665
0.24007343199519318
0.23952807980525728
0.2248584984050581
