### Проверяем работу модели на практике: A/B-тестирование

##### Задание
Взять датасет из google диска: https://drive.google.com/file/d/1MpWBFIbqu4mbiD0BBKYX6YhS-f4mN3Z_. Проверить гипотезу о том, в каком варианте теста (control/personalization) больше конверсия (converted) и значимо ли это отличие статистически.

In [22]:
import pandas as pd

In [23]:
df = pd.read_csv('marketing_campaign.csv')
df

Unnamed: 0,user_id,date_served,marketing_channel,variant,language_displayed,converted
0,a1000,1/1/18,House Ads,personalization,English,True
1,a1001,1/1/18,House Ads,personalization,English,True
2,a1002,1/1/18,House Ads,personalization,English,True
3,a1003,1/1/18,House Ads,personalization,English,True
4,a1004,1/1/18,House Ads,personalization,English,True
...,...,...,...,...,...,...
10032,a11032,1/17/18,Email,control,German,True
10033,a11033,1/17/18,Email,control,German,True
10034,a11034,1/5/18,Instagram,control,German,False
10035,a11035,1/17/18,Email,control,German,True


In [24]:
df.variant.value_counts()

control            5091
personalization    4946
Name: variant, dtype: int64

In [None]:
дисбаланса нет

In [25]:
df.converted.value_counts()

False    8946
True     1076
Name: converted, dtype: int64

In [None]:
дисбаланс есть

In [26]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from scipy.stats import mannwhitneyu
import itertools

import matplotlib.pyplot as plt

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [27]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [32]:
# разделим данные на train/test
# разделим данные на train/test
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop(['converted'],axis=1), df['converted'], random_state=2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=2)

In [33]:
# соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    

class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]


class OHEEncoderBin(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        B = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        self.columns = B[:1]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
    
        return X[self.columns ]  


class NumericPower(BaseEstimator, TransformerMixin):
    def __init__(self, key, p = 2):
        self.key = key
        self.columns = []
        self.p = p+1

    def fit(self, X, y=None):
        B = [self.key+str(i) for i in range(1,self.p) ]
        self.columns = B+['log']
        return self

    def transform(self, X):
        Xp = X.values.reshape(-1,1)
        for i in range(2, self.p):
            Xp = np.hstack([Xp,(X.values.reshape(-1,1) ** i).astype(float)])

        Xp = np.hstack([Xp, np.log(X.values.reshape(-1,1) + 1).astype(float)])    
        B = pd.DataFrame(data = Xp, index = X.index,columns =[self.columns] )
        return B[self.columns]     

In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5645 entries, 8202 to 858
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             5645 non-null   object
 1   date_served         5642 non-null   object
 2   marketing_channel   5642 non-null   object
 3   variant             5645 non-null   object
 4   language_displayed  5645 non-null   object
dtypes: object(5)
memory usage: 264.6+ KB


In [14]:
X_train.describe()

Unnamed: 0,user_id,date_served,marketing_channel,variant,language_displayed
count,5645,5642,5642,5645,5645
unique,5645,31,5,2,4
top,a5266,1/15/18,House Ads,control,English
freq,1,425,2663,2847,5502


In [15]:
X_train.columns

Index(['user_id', 'date_served', 'marketing_channel', 'variant',
       'language_displayed'],
      dtype='object')

In [35]:
X_train.drop(['user_id'],axis=1) # user_id не несёт никакой информации

Unnamed: 0,date_served,marketing_channel,variant,language_displayed
8202,1/11/18,House Ads,personalization,English
6376,1/9/18,Facebook,personalization,English
2147,1/15/18,House Ads,control,English
5471,1/14/18,Facebook,control,English
5441,1/12/18,Instagram,control,English
...,...,...,...,...
7539,1/19/18,Facebook,personalization,English
4989,1/10/18,House Ads,control,English
1824,1/2/18,Push,control,English
2192,1/17/18,House Ads,control,English


In [36]:
categorical_columns = ['marketing_channel', 'language_displayed']
continuous_columns = ['date_served']
categorical_bin_columns = ['variant']


from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

variant = Pipeline([
                ('selector', FeatureSelector(column='variant')),
                ('ohe', OHEEncoderBin(key='variant'))
            ])

variant.fit(X_train)
variant.transform(X_test).head(3)

Unnamed: 0,variant_control
1711,0
1715,0
1415,0


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

marketing_channel = Pipeline([
                ('selector', FeatureSelector(column='marketing_channel')),
                ('ohe', OHEEncoder(key='marketing_channel'))
            ])
marketing_channel.fit(X_train)
marketing_channel.transform(X_test).head(3)

Unnamed: 0,marketing_channel_Email,marketing_channel_Facebook,marketing_channel_House Ads,marketing_channel_Instagram,marketing_channel_Push
1711,1,0,0,0,0
1715,0,1,0,0,0
1415,0,0,0,1,0


In [41]:
language_displayed = Pipeline([
                ('selector', FeatureSelector(column='language_displayed')),
                ('ohe', OHEEncoder(key='language_displayed'))
            ])

language_displayed.fit(X_train)
language_displayed.transform(X_test).head(3)

Unnamed: 0,language_displayed_Arabic,language_displayed_English,language_displayed_German,language_displayed_Spanish
1711,0,1,0,0
1715,0,1,0,0
1415,0,1,0,0


In [42]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))





for bin_col in categorical_bin_columns:
    bin_transformer = Pipeline([
                ('selector', FeatureSelector(column=bin_col)),
                ('ohe', OHEEncoderBin(key=bin_col))
            ])
    final_transformers.append((bin_col, bin_transformer))

In [None]:
final_transformers

In [43]:
feats = FeatureUnion(final_transformers)

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [44]:
# обучим наш пайплайн
pipeline.fit(X_train, y_train)

ValueError: Input contains NaN