# Linearization

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [21]:
def get_data(N: int, lift: int=0) -> pd.DataFrame:
    """Generate a synthetic per-user views/clicks table split into three groups.

    Parameters
    ----------
    N : int
        Number of users to draw before filtering.
    lift : int, default 0
        Number of clicks added to every user assigned to group 'B'.

    Returns
    -------
    pd.DataFrame
        Columns ``views`` (random integers in [700, 1300)), ``clicks``
        (Binomial(1000, 0.05) draws, plus ``lift`` for group 'B') and
        ``group`` ('A1', 'A2' or 'B'). Rows where ``views <= clicks`` are
        dropped. A third of the remaining rows (integer division) goes to
        'A2', another third to 'B', and the rest stays in 'A1'.
    """
    df = pd.DataFrame()
    df['views'] = np.random.randint(700, 1300, N)
    df['clicks'] = np.random.binomial(1000, 0.05, N)
    # Copy the filtered frame so the column assignments below write to an
    # independent object instead of a slice of the unfiltered frame.
    df = df[df['views'] > df['clicks']].copy()

    # Size the groups from the rows that survived the filter and sample from
    # the real index labels, so dropped rows can never be selected.
    group_size = len(df) // 3

    df['group'] = 'A1'
    a2_labels = np.random.choice(df.index, group_size, replace=False)
    df.loc[a2_labels, 'group'] = 'A2'

    # Group 'B' is drawn only from rows still labelled 'A1', keeping the
    # three groups disjoint.
    a1_labels = df.index[df['group'] == 'A1']
    b_labels = np.random.choice(a1_labels, group_size, replace=False)
    df.loc[b_labels, 'group'] = 'B'

    df.loc[df['group'] == 'B', 'clicks'] += lift

    return df

In [22]:
# Draw one sample dataset (N=3000 users, 5 extra clicks for every group 'B'
# user), show its first rows and count how many users fell into each group.
df = get_data(N=3000, lift=5)
display(df.head())
df['group'].value_counts()

Unnamed: 0,views,clicks,group
0,1290,49,B
1,1102,60,B
2,1024,51,A1
3,961,55,A1
4,1079,42,A2


B     1000
A1    1000
A2    1000
Name: group, dtype: int64

In [4]:
def linearization(df: pd.DataFrame, a1: bool=True) -> pd.DataFrame:
    """Add the linearized click metric ``L = clicks - K * views`` to ``df``.

    ``K`` is the pooled CTR of the control rows: total control clicks divided
    by total control views.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``group``, ``clicks`` and ``views``.
    a1 : bool, default True
        If true, rows with ``group == 'A1'`` form the control; otherwise rows
        with ``group == 'A'`` do.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with column ``L`` added (or overwritten) in
        place.

    Raises
    ------
    ValueError
        If the selected control group has no rows or zero total views.
        ``K`` would be 0/0 and every ``L`` value would silently become NaN.
    """
    control_label = 'A1' if a1 else 'A'
    control = df[df['group'] == control_label]

    # The sum over an empty selection is 0, so this one check covers both a
    # missing control group and a control group without any views.
    control_views = np.sum(control['views'])
    if control_views == 0:
        raise ValueError(
            f"control group '{control_label}' is empty or has zero total views; "
            "cannot compute the control CTR"
        )

    K = np.sum(control['clicks']) / control_views

    df['L'] = df['clicks'] - K * df['views']

    return df

In [5]:
def simulate(N: int, generation_conf: dict) -> tuple:
    """Repeat the synthetic experiment ``N`` times and collect per-run statistics.

    Every run draws a fresh dataset with ``get_data(**generation_conf)``,
    linearizes it with the default control group, and compares group 'B'
    against group 'A2'.

    Returns a 5-tuple of NumPy arrays, one value per run:
      0. difference of pooled CTRs, sum(clicks) / sum(views), B minus A2;
      1. difference of the mean per-user CTRs, B minus A2;
      2. difference of the mean linearized metric ``L``, B minus A2;
      3. t-test p-values computed on the per-user CTRs;
      4. t-test p-values computed on the linearized metric.
    """
    pooled_ctr_lifts = []
    per_user_ctr_lifts = []
    linearized_lifts = []
    per_user_pvalues = []
    linearized_pvalues = []

    for _ in tqdm(range(N)):
        data = linearization(get_data(**generation_conf))
        control = data[data['group'] == 'A2']
        treatment = data[data['group'] == 'B']

        # Pooled CTR: all clicks of a group divided by all of its views.
        pooled_control = control['clicks'].sum() / control['views'].sum()
        pooled_treatment = treatment['clicks'].sum() / treatment['views'].sum()
        pooled_ctr_lifts.append(pooled_treatment - pooled_control)

        # Per-user CTR: one ratio per user, averaged within each group.
        user_ctr_control = control['clicks'] / control['views']
        user_ctr_treatment = treatment['clicks'] / treatment['views']
        per_user_ctr_lifts.append(user_ctr_treatment.mean() - user_ctr_control.mean())

        linearized_lifts.append(treatment['L'].mean() - control['L'].mean())

        per_user_pvalues.append(ttest_ind(user_ctr_control, user_ctr_treatment).pvalue)
        linearized_pvalues.append(ttest_ind(treatment['L'], control['L']).pvalue)

    collected = (
        pooled_ctr_lifts,
        per_user_ctr_lifts,
        linearized_lifts,
        per_user_pvalues,
        linearized_pvalues,
    )
    return tuple(np.array(values) for values in collected)

In [6]:
def print_stats(res):
    """Print summary statistics for the five result arrays in ``res``.

    ``res`` is indexed 0..4: the first three entries are lift estimates
    (pooled CTR, per-user CTR, linearized metric) and the last two are
    p-value arrays (per-user CTR, linearized metric). For each lift array the
    mean and variance are printed; for each p-value array the number of
    values below 0.05, the mean and the variance are printed.
    """
    lift_ctr = res[0]
    lift_per_user = res[1]
    lift_linearized = res[2]
    pvals_per_user = res[3]
    pvals_linearized = res[4]

    print(f'Mean lift from ctrs = {np.mean(lift_ctr)}, variance = {np.var(lift_ctr)}')
    print(f'Mean lift from per user ctrs = {np.mean(lift_per_user)}, variance = {np.var(lift_per_user)}')
    print(f'Mean lift from linearized metric = {np.mean(lift_linearized)}, variance = {np.var(lift_linearized)}')

    significant_per_user = np.sum(pvals_per_user < 0.05)
    print(f'P-values for per user ctrs: less than 0.05 = {significant_per_user}, mean p-value = {np.mean(pvals_per_user)}, variance = {np.var(pvals_per_user)}')

    significant_linearized = np.sum(pvals_linearized < 0.05)
    print(f'P-values for per linearized metric: less than 0.05 = {significant_linearized}, mean p-value = {np.mean(pvals_linearized)}, variance = {np.var(pvals_linearized)}')

In [7]:
# 1000 simulated experiments, each drawing 3000 users, with 3 extra clicks
# added to every group 'B' user; then print the summary.
x = simulate(1000, {'N': 3000, 'lift': 3})
print_stats(x)

100%|███████████████████████████████████| 1000/1000 [00:05<00:00, 170.21it/s]

Mean lift from ctrs = 0.003003985773104913, variance = 2.721898783603382e-07
Mean lift from per user ctrs = 0.0031014393390969264, variance = 3.079870500280316e-07
Mean lift from linearized metric = 3.002164588258871, variance = 0.2618106053409646
P-values for per user ctrs: less than 0.05 = 1000, mean p-value = 5.609417526025244e-05, variance = 2.309107789701252e-07
P-values for per linearized metric: less than 0.05 = 1000, mean p-value = 1.691884998663985e-05, variance = 2.0518024730075442e-08





In [8]:
# Same setup (1000 runs, 3000 users each) with a smaller lift of 2 clicks.
x = simulate(1000, {'N': 3000, 'lift': 2})
print_stats(x)

100%|███████████████████████████████████| 1000/1000 [00:05<00:00, 175.07it/s]

Mean lift from ctrs = 0.00199881915987601, variance = 2.5060429761344223e-07
Mean lift from per user ctrs = 0.0020621390261612743, variance = 2.8019821492745906e-07
Mean lift from linearized metric = 1.9975660257388559, variance = 0.24399116851093688
P-values for per user ctrs: less than 0.05 = 972, mean p-value = 0.006016465561094144, variance = 0.0006862740632206404
P-values for per linearized metric: less than 0.05 = 979, mean p-value = 0.004542552930052972, variance = 0.0005590658856373009





In [9]:
# Same setup (1000 runs, 3000 users each) with a lift of 1 click.
x = simulate(1000, {'N': 3000, 'lift': 1})
print_stats(x)

100%|███████████████████████████████████| 1000/1000 [00:05<00:00, 168.64it/s]

Mean lift from ctrs = 0.000992460941150469, variance = 2.639472189940837e-07
Mean lift from per user ctrs = 0.001030175873182916, variance = 2.9954464012048275e-07
Mean lift from linearized metric = 0.9920278566909171, variance = 0.2606455622361018
P-values for per user ctrs: less than 0.05 = 511, mean p-value = 0.15569796243012327, variance = 0.05265565159381121
P-values for per linearized metric: less than 0.05 = 528, mean p-value = 0.14599633442900115, variance = 0.050323616860643655





In [24]:
# Lift of 1 click again, but with 10000 runs and 9000 users drawn per run.
x = simulate(10000, {'N': 9000, 'lift': 1})
print_stats(x)

100%|█████████████████████████████████| 10000/10000 [01:25<00:00, 117.31it/s]

Mean lift from ctrs = 0.0010025836242332923, variance = 8.186673850142936e-08
Mean lift from per user ctrs = 0.0010345302442816946, variance = 9.219969308423104e-08
Mean lift from linearized metric = 1.0020565649365227, variance = 0.08078100889639063
P-values for per user ctrs: less than 0.05 = 9217, mean p-value = 0.016477723075798006, variance = 0.003488095164584525
P-values for per linearized metric: less than 0.05 = 9394, mean p-value = 0.013108677599769768, variance = 0.0026805048680486714



