# A/Б-тесты

In [1]:
import numpy as np
import pandas as pd

import scipy
import statsmodels

from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

## Загрузка данных

In [2]:
# Load the per-variant click logs; each file is read into its own DataFrame
# (one for the green button, one for the red button).
data_green = pd.read_csv('Green_btn.csv')
data_red = pd.read_csv('Red_btn.csv')

In [109]:
# Show the last rows of the green-button frame as a quick load check.
data_green.tail()

Unnamed: 0,Green
995,0
996,0
997,0
998,0
999,0


In [108]:
# Show the last rows of the red-button frame as a quick load check.
data_red.tail()

Unnamed: 0,Red
995,0
996,0
997,0
998,0
999,0


In [111]:
from pandas import concat

In [120]:
# Put the two variants side by side (column-wise) in a single frame,
# red first and green second.
pieces = [data_red, data_green]
data = pd.concat(pieces, axis='columns')

In [121]:
# Show the first ten rows of the combined frame.
data.head(10)

Unnamed: 0,Red,Green
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


## Интервальные оценки долей

$$\frac1{ 1 + \frac{z^2}{n} } \left( \hat{p} + \frac{z^2}{2n} \pm z \sqrt{ \frac{ \hat{p}\left(1-\hat{p}\right)}{n} + \frac{z^2}{4n^2} } \right), \;\; z \equiv z_{1-\frac{\alpha}{2}}$$ 

In [124]:
# Wilson score intervals for the click probability of each variant
# (proportion_confint is called with its default alpha).
conf_interval_red = proportion_confint(count=sum(data['Red']),
                                       nobs=len(data),
                                       method='wilson')
conf_interval_green = proportion_confint(count=sum(data['Green']),
                                         nobs=len(data),
                                         method='wilson')

In [126]:
# "banner a" is the red button, "banner b" is the green button.
# Each interval is a (lower, upper) tuple, which fills the two %f slots.
print('95%% confidence interval for a click probability, banner a: [%f, %f]' % conf_interval_red)
print('95%% confidence interval for a click probability, banner b [%f, %f]' % conf_interval_green)

95% confidence interval for a click probability, banner a: [0.026961, 0.050582]
95% confidence interval for a click probability, banner b [0.040747, 0.068675]


## Z-критерий для разности долей (независимые выборки)

|        | $X_1$ | $X_2$ |
| ------ | ----- | ----- |
| 1      | a     | b     |
| 0      | c     | d     |
| $\sum$ | $n_1$ | $n_2$ |
  
$$ \hat{p}_1 = \frac{a}{n_1}$$

$$ \hat{p}_2 = \frac{b}{n_2}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\; \hat{p}_1 - \hat{p}_2 \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{\hat{p}_1(1 - \hat{p}_1)}{n_1} + \frac{\hat{p}_2(1 - \hat{p}_2)}{n_2}}$$

$$\text{Z-статистика: } Z(X_1, X_2) = \frac{\hat{p}_1 - \hat{p}_2}{\sqrt{P(1 - P)\left(\frac{1}{n_1} + \frac{1}{n_2}\right)}}$$
$$P = \frac{\hat{p}_1{n_1} + \hat{p}_2{n_2}}{{n_1} + {n_2}} $$

In [131]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    sample1, sample2 : sized iterables of numbers
        Observations of the two groups; the proportion of each group is
        computed as ``sum(sample) / len(sample)``.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    # Two-sided standard-normal quantile z_{1 - alpha/2}.
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    size_a, size_b = len(sample1), len(sample2)
    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b

    # Unpooled standard error of the difference, scaled by the quantile.
    half_width = quantile * np.sqrt(share_a * (1 - share_a)/ size_a + share_b * (1 - share_b)/ size_b)
    diff = share_a - share_b

    return (diff - half_width, diff + half_width)

In [132]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of two proportions (independent samples).

    Parameters
    ----------
    sample1, sample2 : sized iterables of numbers
        Observations of the two groups; the proportion of each group is
        computed as ``sum(sample) / len(sample)``.

    Returns
    -------
    float
        ``(p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))`` where ``P`` is the
        pooled proportion of both samples.
    """
    size_a = len(sample1)
    size_b = len(sample2)

    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share_a*size_a + share_b*size_b) / (size_a + size_b)

    pooled_se = np.sqrt(pooled * (1 - pooled) * (1. / size_a + 1. / size_b))
    return (share_a - share_b) / pooled_se

In [133]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-test given an already computed z-statistic.

    Parameters
    ----------
    z_stat : float or array-like
        Value of the z-statistic (standard normal under the null hypothesis).
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Form of the alternative hypothesis.

    Returns
    -------
    float or ndarray
        The p-value for the chosen alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognized values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    standard_normal = scipy.stats.norm

    # Upper-tail probabilities use the survival function rather than
    # ``1 - cdf``: for large |z| the subtraction cancels to exactly 0.0,
    # whereas ``sf`` keeps the (tiny but non-zero) tail probability.
    if alternative == 'two-sided':
        return 2 * standard_normal.sf(np.abs(z_stat))

    if alternative == 'less':
        return standard_normal.cdf(z_stat)

    # alternative == 'greater'
    return standard_normal.sf(z_stat)

In [135]:
# Interval for p_red - p_green (red is passed as sample1, green as sample2);
# alpha is left at the function's default of 0.05.
print("95%% confidence interval for a difference between proportions: [%f, %f]" %\
      proportions_diff_confint_ind(data.Red, data.Green))

95% confidence interval for a difference between proportions: [-0.034157, 0.002157]


In [136]:
# Two-sided test (the default alternative) of equal click proportions, red vs green.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(data.Red, data.Green)))

p-value: 0.084379


In [137]:
# One-sided test: with red as sample1, 'less' gives a small p-value when the
# red proportion is below the green one.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(data.Red, data.Green), 'less'))

p-value: 0.042189


Также нужно убедиться, что наши группы собраны корректно и не содержат в себе случайно внесенных различий. 

Для этого проверим, что между группами А и А' нет значимой разницы.

In [144]:
# Second sample for the red button (group A'), used for the A/A sanity check.
data_red_2 = pd.read_csv('Red_btn_2.csv')

In [145]:
# Show the first rows of the second red-button sample.
data_red_2.head()

Unnamed: 0,Red_2
0,0
1,1
2,0
3,0
4,0


In [148]:
# A/A check: compare the red sample with the second red sample.
# NOTE(review): this uses the one-sided 'less' alternative; a two-sided test is
# the usual choice when checking that two groups do not differ — confirm intent.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(data.Red, 
                                                                          data_red_2.Red_2), 'less'))

p-value: 0.363673


Подробнее в курсе: 

https://www.coursera.org/learn/stats-for-data-analysis/lecture/p5iWT/primier-provierka-ghipotiez-o-doliakh