# Examples of ambrosia core classes usage

In [2]:
import yaml

import numpy as np
import pandas as pd
import scipy.stats as sps

## AggregatePreprocessor

In [3]:
from ambrosia.preprocessing import AggregatePreprocessor

In [4]:
table = pd.read_csv('../tests/test_data/week_metrics.csv')
table

Unnamed: 0,id,gender,watched,sessions,day,platform
0,0,Male,28.440846,4,1,android
1,1,Female,1.825271,2,1,ios
2,2,Female,46.995606,0,1,web
3,3,Female,37.310264,1,1,ios
4,4,Female,147.513105,0,1,web
...,...,...,...,...,...,...
34995,4995,Male,116.832887,0,7,web
34996,4996,Female,13.558919,2,7,web
34997,4997,Male,24.035386,2,7,android
34998,4998,Female,18.482640,2,7,android


#### Create an instance of the class

In [5]:
# These arguments values are also default ones
transformer = AggregatePreprocessor(categorial_method='mode', real_method='sum')

In [6]:
transformer.fit_transform(
    dataframe=table,
    groupby_columns='id',
    real_cols=['watched', 'sessions'],
    categorial_cols=['gender', 'platform']
)

Unnamed: 0,id,watched,sessions,gender,platform
0,0,772.597224,8,Male,ios
1,1,538.076739,15,Female,android
2,2,288.492353,20,Female,android
3,3,373.620408,9,Female,ios
4,4,630.238862,14,Female,ios
...,...,...,...,...,...
4995,4995,390.133588,14,Male,android
4996,4996,544.423724,25,Female,ios
4997,4997,204.713032,19,Male,android
4998,4998,1088.642872,25,Female,web


#### More detailed aggregation is possible

In [7]:
transformer.fit_transform(table, groupby_columns='id', agg_params={
    'watched' : 'sum',
    'sessions' : 'max',
    'gender' : 'simple', # simple - choose the first possible value
    'platform' : 'mode'
})

Unnamed: 0,id,watched,sessions,gender,platform
0,0,772.597224,4,Male,ios
1,1,538.076739,6,Female,android
2,2,288.492353,7,Female,android
3,3,373.620408,3,Female,ios
4,4,630.238862,8,Female,ios
...,...,...,...,...,...
4995,4995,390.133588,9,Male,android
4996,4996,544.423724,15,Female,ios
4997,4997,204.713032,6,Male,android
4998,4998,1088.642872,10,Female,web


#### Let's add a column whether this day is a weekend and group by

In [8]:
table['is_holiday'] = table['day'].apply(lambda x: 0 if x < 6 else 1)
table

Unnamed: 0,id,gender,watched,sessions,day,platform,is_holiday
0,0,Male,28.440846,4,1,android,0
1,1,Female,1.825271,2,1,ios,0
2,2,Female,46.995606,0,1,web,0
3,3,Female,37.310264,1,1,ios,0
4,4,Female,147.513105,0,1,web,0
...,...,...,...,...,...,...,...
34995,4995,Male,116.832887,0,7,web,1
34996,4996,Female,13.558919,2,7,web,1
34997,4997,Male,24.035386,2,7,android,1
34998,4998,Female,18.482640,2,7,android,1


In [9]:
transformer.fit_transform(table, groupby_columns=['id', 'is_holiday'], agg_params={
    'watched' : 'sum',
    'sessions' : 'max',
    'gender' : 'simple',
    'platform' : 'mode'
})

Unnamed: 0,id,is_holiday,watched,sessions,gender,platform
0,0,0,601.893096,4,Male,ios
1,0,1,170.704127,1,Male,android
2,1,0,327.533247,3,Female,web
3,1,1,210.543492,6,Female,ios
4,2,0,271.548875,7,Female,web
...,...,...,...,...,...,...
9995,4997,1,65.368574,2,Male,ios
9996,4998,0,1051.360035,4,Female,web
9997,4998,1,37.282837,10,Female,android
9998,4999,0,245.553217,6,Male,android


## Designer

In [10]:
from ambrosia.designer import Designer
from ambrosia.designer import load_from_config
from ambrosia.designer import design

### Load dataframe

In [11]:
data = table.copy()
table.head()

Unnamed: 0,id,gender,watched,sessions,day,platform,is_holiday
0,0,Male,28.440846,4,1,android,0
1,1,Female,1.825271,2,1,ios,0
2,2,Female,46.995606,0,1,web,0
3,3,Female,37.310264,1,1,ios,0
4,4,Female,147.513105,0,1,web,0


### Use the standard constructor to instantiate a class

In [12]:
designer = Designer(dataframe=data, effects=[1.05, 1.1, 1.2], sizes=[500, 700, 1000], metrics=['watched'])

In [13]:
designer.run('size')

errors,(0.05; 0.2)
effects,Unnamed: 1_level_1
5.0%,6295
10.0%,1575
20.0%,395


In [14]:
designer.run('effect')

errors,(0.05; 0.2)
sample_sizes,Unnamed: 1_level_1
500,17.8%
700,15.0%
1000,12.5%


In [15]:
designer.run('power')

Unnamed: 0_level_0,sample sizes,500,700,1000
First type error,Effect,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.05,5.0%,12.4%,15.4%,20.0%
0.05,10.0%,35.1%,46.3%,60.7%
0.05,20.0%,88.4%,96.2%,99.4%


### Design ``method`` could be chosen and special ``kwargs`` passed

In [16]:
designer.run('power', method='empiric', bs_samples=400)

Unnamed: 0,size_A,size_B,effect,alpha,watched_correctness,watched_power
0,500,500,1.05,0.05,0.97,0.11
1,500,500,1.1,0.05,0.9325,0.3175
2,500,500,1.2,0.05,0.9625,0.81
3,500,700,1.05,0.05,0.955,0.145
4,500,700,1.1,0.05,0.955,0.41
5,500,700,1.2,0.05,0.96,0.87
6,500,1000,1.05,0.05,0.9525,0.15
7,500,1000,1.1,0.05,0.9475,0.43
8,500,1000,1.2,0.05,0.9725,0.9525
9,700,500,1.05,0.05,0.955,0.135


### Class instance attributes could be overrided directly in ``run()`` method

In [17]:
designer

<ambrosia.designer.designer.Designer at 0x12fafdfd0>

### Let's create a binary dataframe

In [19]:
df_binary = pd.DataFrame([[1, 2], [0, 3], [0, 1], [1, 22], [0, 9]], columns=['retention', 'some metric'])
df_binary

Unnamed: 0,retention,some metric
0,1,2
1,0,3
2,0,1
3,1,22
4,0,9


### Use setters to set attributes

In [20]:
designer.set_dataframe(df_binary)

In [21]:
designer.run('size', metrics=['retention'], method='binary', interval_type='yule')

Unnamed: 0_level_0,$\delta$-relative,1.05,1.10,1.20
$\alpha$,$\beta$,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.05,0.2,9572,2378,608


### Designer class could be created from yaml config

In [22]:
config = '''
    !designer
        effects: 
            - 1.2
            - 0.9
        sizes: 
            1000
        first_type_errors: 
            - 0.05
            - 0.01
        second_type_errors:
            - 0.2
            - 0.1
        metrics: watched
'''

In [23]:
designer = yaml.load(config, Loader=yaml.Loader)

In [24]:
designer.set_dataframe(data)

In [25]:
designer.run('size')

errors,(0.05; 0.2),(0.05; 0.1),(0.01; 0.2),(0.01; 0.1)
effects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20.0%,395,528,587,748
-10.0%,1575,2108,2343,2985


### Stand-alone design function is implemented

In [26]:
designer = load_from_config(config)
designer.set_dataframe(data)
designer.run('size')

errors,(0.05; 0.2),(0.05; 0.1),(0.01; 0.2),(0.01; 0.1)
effects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20.0%,395,528,587,748
-10.0%,1575,2108,2343,2985


## CUPED

In [28]:
from ambrosia.preprocessing import Cuped

In [29]:
data_cuped = pd.read_csv('../tests/test_data/var_table.csv')

In [30]:
cuped = Cuped()

In [34]:
cuped.fit_transform(
    dataframe=data_cuped,
    target_column='target',
    covariate_column='feature_2',
    transformed_name='target_cuped',
    inplace=False,
)

ambrosia LOGGER: After transformation СUPED for target, the variance is 67.0818 % of the original
ambrosia LOGGER: Variance transformation 2982.4627 ===> 2000.6892


Unnamed: 0,feature_1,feature_2,feature_3,target,target_cuped
0,-2.426916,5.575498,43.505323,187.385459,204.513107
1,-2.745189,7.995822,19.942889,99.691566,109.350175
2,2.437555,17.254237,33.091612,188.880782,169.968233
3,6.202871,28.913551,25.026746,199.532560,144.639755
4,3.099725,3.771417,26.403917,121.956238,144.651222
...,...,...,...,...,...
2995,1.277060,22.630330,36.479685,216.416345,180.913351
2996,5.124652,58.120888,13.836445,239.307014,94.281340
2997,-0.654616,3.930848,32.036205,139.957720,162.160705
2998,0.401016,29.254561,38.268808,240.608496,184.663346


### For a better design of the experiment, let's try to reduce metric variance using CUPED 

In [35]:
after_cuped = cuped.transform(
    dataframe=data_cuped,
    inplace=False,
)

ambrosia LOGGER: After transformation СUPED for target, the variance is 67.0818 % of the original
ambrosia LOGGER: Variance transformation 2982.4627 ===> 2000.6892


In [36]:
designer = Designer(dataframe=after_cuped,
                    effects=[1.05, 1.1, 1.2],
                    metrics=['target', 'target_cuped']
)


In [37]:
design = designer.run('size', method='empiric')

Group sizes calculation:   0%|          | 0/3 [00:00<?, ?it/s]

In [40]:
design['target']

errors,"(0.05, 0.2)"
effect,Unnamed: 1_level_1
20.0%,61
10.0%,232
5.0%,852


In [41]:
design['target_cuped']

errors,"(0.05, 0.2)"
effect,Unnamed: 1_level_1
20.0%,45
10.0%,163
5.0%,609


## Multi CUPED

**This data is artificial, so everything turned out very well**

In [42]:
from ambrosia.preprocessing import MultiCuped

In [46]:
transformer = MultiCuped()
transformer.fit_transform(data_cuped, 
                          target_column='target', 
                          covariate_columns=['feature_2', 'feature_3'],
                          transformed_name='target_multicuped',
                         ).head(5)

ambrosia LOGGER: After transformation Multi СUPED for target, the variance is 1.2779 % of the original
ambrosia LOGGER: Variance transformation 2982.4627 ===> 38.1133


Unnamed: 0,feature_1,feature_2,feature_3,target,target_multicuped
0,-2.426916,5.575498,43.505323,187.385459,141.715314
1,-2.745189,7.995822,19.942889,99.691566,140.948473
2,2.437555,17.254237,33.091612,188.880782,149.436534
3,6.202871,28.913551,25.026746,199.53256,156.975607
4,3.099725,3.771417,26.403917,121.956238,150.181834


In [48]:
after_cuped = transformer.transform(data_cuped)

ambrosia LOGGER: After transformation Multi СUPED for target, the variance is 1.2779 % of the original
ambrosia LOGGER: Variance transformation 2982.4627 ===> 38.1133


In [49]:
designer = Designer(dataframe=after_cuped,
                    effects=[1.05, 1.1, 1.2],
                    metrics=['target', 'target_multicuped']
)
design = designer.run('size', method='empiric')

Group sizes calculation:   0%|          | 0/3 [00:00<?, ?it/s]

In [50]:
design['target']

errors,"(0.05, 0.2)"
effect,Unnamed: 1_level_1
20.0%,67
10.0%,228
5.0%,875


In [51]:
design['target_multicuped']

errors,"(0.05, 0.2)"
effect,Unnamed: 1_level_1
20.0%,3
10.0%,5
5.0%,13


### Parameters could be saved to evaluate the later experiment 

In [52]:
transformer.store_params('examples_configs/multicuped_coef.json')

In [53]:
evaluating_transformer = MultiCuped()

In [54]:
evaluating_transformer.load_params('examples_configs/multicuped_coef.json')

In [57]:
evaluating_transformer.params['theta']

array([[3.03444797],
       [4.00091935]])

## Splitter

In [51]:
from ambrosia.splitter import Splitter
from ambrosia.splitter import load_from_config

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


### Create an another table with some metrics

In [57]:
df2 = pd.DataFrame({
    'm': np.zeros((200000,)),
    'a': np.random.normal(size=200000),
    'b': np.random.normal(size=200000)
}).reset_index()
df2['l'] = np.where(df2['a'] > 0, 1, 0)
df2['e'] = np.where(df2['b'] > 0, 1, 0)
df2['sub_index'] = np.random.choice(df2['index'], size=df2.shape[0])
df2.head(10)

Unnamed: 0,index,m,a,b,l,e,sub_index
0,0,0.0,-0.289951,0.368164,0,1,118902
1,1,0.0,0.119381,-1.645094,1,0,120625
2,2,0.0,0.305199,1.592921,1,1,2743
3,3,0.0,-0.079907,-1.066632,0,0,186075
4,4,0.0,-0.419471,1.32471,0,1,175752
5,5,0.0,-1.201202,-1.33467,0,0,9306
6,6,0.0,0.301942,-0.737741,1,0,93625
7,7,0.0,-0.348287,1.606529,0,1,28251
8,8,0.0,-0.871868,-0.179864,0,0,31085
9,9,0.0,-0.298841,0.814715,0,1,126084


In [58]:
splitter = Splitter(dataframe=df2, id_column='index')

In [60]:
splitter.run(method='simple', groups_size=275, strat_columns=['l', 'e'])

Unnamed: 0,index,m,a,b,l,e,sub_index,group
7379,7379,0.0,-0.616350,-1.023245,0,0,105626,A
181290,181290,0.0,-0.194203,-0.474174,0,0,188938,A
100503,100503,0.0,-0.218029,-1.256727,0,0,152305,A
67419,67419,0.0,-0.020085,-0.484707,0,0,180153,A
121730,121730,0.0,-0.499549,-1.775768,0,0,40121,A
...,...,...,...,...,...,...,...,...
13989,13989,0.0,0.070487,0.304757,1,1,119146,B
46441,46441,0.0,1.407474,0.038372,1,1,168831,B
84370,84370,0.0,0.830372,1.074070,1,1,20393,B
36817,36817,0.0,0.046332,0.408929,1,1,155098,B


### When using the hashing split method, the number of objects in groups  may differ from the specified one due to their bucketization

In [61]:
splitter.run(groups_size=20000, id_column='index', method='hash')

Unnamed: 0,index,m,a,b,l,e,sub_index,group
0,0,0.0,-0.289951,0.368164,0,1,118902,A
11,11,0.0,0.756690,-0.082379,1,0,50358,A
13,13,0.0,-0.733714,0.056818,0,1,178392,A
15,15,0.0,0.807368,-1.252378,1,0,123966,A
19,19,0.0,0.637358,0.715775,1,1,94269,A
...,...,...,...,...,...,...,...,...
199982,199982,0.0,-0.800075,-0.922606,0,0,69494,B
199985,199985,0.0,-0.323606,0.988292,0,1,49567,B
199987,199987,0.0,0.324330,0.995925,1,1,58494,B
199991,199991,0.0,-0.687163,-0.489333,0,0,12826,B


### Hash split reproducibility

In [63]:
splitter.run(groups_size=500, id_column='index', method='hash', salt='custom salt')

Unnamed: 0,index,m,a,b,l,e,sub_index,group
1296,1296,0.0,0.047592,-1.508111,1,0,34820,A
1646,1646,0.0,-1.182158,0.301553,0,1,129573,A
1854,1854,0.0,0.349794,2.388506,1,1,194624,A
2500,2500,0.0,-0.289352,-0.129468,0,0,86183,A
4205,4205,0.0,1.807681,2.039412,1,1,94609,A
...,...,...,...,...,...,...,...,...
198797,198797,0.0,-0.017162,-0.697513,0,0,130339,B
199001,199001,0.0,1.722714,1.186159,1,1,174343,B
199052,199052,0.0,-0.928821,0.217333,0,1,61964,B
199397,199397,0.0,-0.286315,-2.136864,0,0,115955,B


In [66]:
splitter.run(groups_size=500, id_column='index', method='hash', salt='custom salt')

Unnamed: 0,index,m,a,b,l,e,sub_index,group
1296,1296,0.0,0.047592,-1.508111,1,0,34820,A
1646,1646,0.0,-1.182158,0.301553,0,1,129573,A
1854,1854,0.0,0.349794,2.388506,1,1,194624,A
2500,2500,0.0,-0.289352,-0.129468,0,0,86183,A
4205,4205,0.0,1.807681,2.039412,1,1,94609,A
...,...,...,...,...,...,...,...,...
198797,198797,0.0,-0.017162,-0.697513,0,0,130339,B
199001,199001,0.0,1.722714,1.186159,1,1,174343,B
199052,199052,0.0,-0.928821,0.217333,0,1,61964,B
199397,199397,0.0,-0.286315,-2.136864,0,0,115955,B


### Metric split (NN search)

In [67]:
splitter.run(groups_size=7777, id_column='index', method='metric', fit_columns=['m', 'a'])

Unnamed: 0,index,m,a,b,l,e,sub_index,group
87497,87497,0.0,-0.869553,-0.074045,0,0,96914,A
103039,103039,0.0,0.149356,-1.522059,1,0,101221,A
124215,124215,0.0,-2.139364,1.443533,0,1,96930,A
79995,79995,0.0,1.520384,0.198299,1,1,174364,A
18507,18507,0.0,0.674251,0.493568,1,1,151762,A
...,...,...,...,...,...,...,...,...
117379,117379,0.0,1.124355,-1.064331,1,0,113811,B
192143,192143,0.0,-0.155300,-0.077405,0,0,8550,B
15874,15874,0.0,-1.141718,-0.441516,0,0,48412,B
8848,8848,0.0,-1.896951,-0.310161,0,0,78597,B


### Split with predefined test group ids

In [68]:
test_group_ids = df2.sample(n=1003, random_state=228).index

In [69]:
splitter.run(method='metric', fit_columns=['a', 'm'], test_group_ids=test_group_ids)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
168100,168100,0.0,1.324884,-1.517466,1,0,69900,A
158921,158921,0.0,0.262511,0.445665,1,1,162987,A
31690,31690,0.0,-0.782612,-0.002481,0,0,61338,A
16945,16945,0.0,1.910879,-0.578645,1,0,33190,A
187465,187465,0.0,-3.059154,2.834729,0,1,103666,A
...,...,...,...,...,...,...,...,...
198890,198890,0.0,1.859995,0.365390,1,1,187248,B
199247,199247,0.0,0.402327,-0.531317,1,0,109889,B
199373,199373,0.0,-0.100691,-0.591498,0,0,70854,B
199641,199641,0.0,-1.564912,0.679526,0,1,181214,B


In [70]:
splitter.run(method='simple', test_group_ids=test_group_ids)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
147676,147676,0.0,-1.669761,0.311105,0,1,190340,A
25316,25316,0.0,-1.553770,-1.394555,0,0,94014,A
69464,69464,0.0,-1.231809,0.192928,0,1,108791,A
35681,35681,0.0,-0.509907,-0.439847,0,0,121517,A
150069,150069,0.0,2.339546,0.163894,1,1,138443,A
...,...,...,...,...,...,...,...,...
198890,198890,0.0,1.859995,0.365390,1,1,187248,B
199247,199247,0.0,0.402327,-0.531317,1,0,109889,B
199373,199373,0.0,-0.100691,-0.591498,0,0,70854,B
199641,199641,0.0,-1.564912,0.679526,0,1,181214,B


### Split example with all parameters

In [71]:
splitter.run(
    dataframe=df2,
    id_column='index',
    fit_columns=['a', 'm'],
    strat_columns=['l', 'e'],
    method='metric',
    test_group_ids=test_group_ids,
)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
31690,31690,0.0,-0.782612,-0.002481,0,0,61338,A
154024,154024,0.0,-1.041908,-0.434319,0,0,110126,A
103433,103433,0.0,-0.807263,-0.655817,0,0,72281,A
30931,30931,0.0,-0.519306,-0.581060,0,0,34842,A
119092,119092,0.0,-0.812751,-1.539174,0,0,10308,A
...,...,...,...,...,...,...,...,...
197371,197371,0.0,0.663420,0.180037,1,1,45700,B
197498,197498,0.0,0.499418,1.659127,1,1,180510,B
198363,198363,0.0,0.362169,0.572062,1,1,161440,B
198412,198412,0.0,1.229890,0.912350,1,1,84315,B


### It is possible to create multiple groups

In [76]:
splitter = Splitter(dataframe=df2, strat_columns=['l', 'e'],) 

In [97]:
splitter.run(method='simple',
             groups_size=500,
             groups_number=5,
             salt='salt',
)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
122549,122549,0.0,-2.152860,-0.414842,0,0,110485,A
48824,48824,0.0,-0.977800,-0.729266,0,0,75132,A
197135,197135,0.0,-0.913496,-1.522673,0,0,51717,A
114862,114862,0.0,-0.240139,-0.381745,0,0,122956,A
141893,141893,0.0,-1.387978,-0.664802,0,0,193221,A
...,...,...,...,...,...,...,...,...
116347,116347,0.0,2.074705,0.479152,1,1,112966,E
40511,40511,0.0,0.660019,0.586889,1,1,38100,E
43238,43238,0.0,1.192898,1.098787,1,1,12420,E
192713,192713,0.0,0.025110,0.716599,1,1,46520,E


In [99]:
splitter.run(method='hash',
             groups_size=500,
             groups_number=5,
             salt='salt',
             labels=['first_group', 'second_group', 'third_group', 'fourth_group', 'fith_group']
)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
688,688,0.0,-1.496174,-0.280367,0,0,28460,first_group
1376,1376,0.0,-0.955806,-0.788998,0,0,170162,first_group
3235,3235,0.0,-0.404273,-0.075367,0,0,199597,first_group
3779,3779,0.0,-0.058839,-0.518457,0,0,155747,first_group
7908,7908,0.0,-0.789847,-0.680956,0,0,79319,first_group
...,...,...,...,...,...,...,...,...
184864,184864,0.0,1.506341,1.191761,1,1,105789,fith_group
193874,193874,0.0,1.744473,1.668112,1,1,24137,fith_group
196093,196093,0.0,0.333060,0.395639,1,1,439,fith_group
198015,198015,0.0,0.195784,0.011904,1,1,90441,fith_group


### Splitter class could be created from yaml config

In [100]:
config = '''
    !splitter
        groups_size: 
             500
        id_column: 
            index
'''

In [101]:
splitter = load_from_config(config)

### Use setters set data frame(and other parameters)

In [102]:
splitter.set_dataframe(df2)

### Full table could be splitted into parts, for example 30% / 70%

In [104]:
# 30 % элементов будет A, 70 B
splitter.run('hash', part_of_table=0.3)

Unnamed: 0,index,m,a,b,l,e,sub_index,group
0,0,0.0,-0.289951,0.368164,0,1,118902,A
2,2,0.0,0.305199,1.592921,1,1,2743,A
7,7,0.0,-0.348287,1.606529,0,1,28251,A
9,9,0.0,-0.298841,0.814715,0,1,126084,A
11,11,0.0,0.756690,-0.082379,1,0,50358,A
...,...,...,...,...,...,...,...,...
199992,199992,0.0,-0.950282,-1.929851,0,0,157363,B
199996,199996,0.0,-0.480249,1.660715,0,1,19622,B
199997,199997,0.0,-0.665017,0.961507,0,1,149331,B
199998,199998,0.0,-0.679980,-0.080122,0,0,31180,B


## Tester

In [105]:
from ambrosia.tester import Tester

### Prepare  data and use the standard constructor

In [106]:
table_result = pd.read_csv('../tests/test_data/watch_result.csv')

In [109]:
transformer = AggregatePreprocessor(real_method='sum')

to_test = transformer.fit_transform(table_result, groupby_columns='id', real_cols='watched', categorial_cols='group')
to_test

Unnamed: 0,id,watched,group
0,6,597.833362,A
1,11,549.314234,A
2,20,564.401942,A
3,21,248.735358,A
4,23,926.048946,B
...,...,...,...
1795,4987,454.662125,A
1796,4988,404.600192,B
1797,4997,594.629770,B
1798,4998,1025.918249,B


In [110]:
tester = Tester(dataframe=to_test, metrics='watched', column_groups='group')

### Set on what scale we measure the effect: in the form of an absolute value or relative one

In [111]:
tester.run(effect_type='relative')

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-05,0.079901,"(0.0419, 0.1183)",watched,A,B


In [112]:
tester.run(effect_type='absolute')

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,2.2e-05,55.314679,"(26.54, 84.0893)",watched,A,B


### Tester result could be obtained as list of mesurements

In [113]:
tester.run(as_table=False)

[{'first_type_error': 0.05,
  'pvalue': 2.2238711360266617e-05,
  'effect': 55.31467945319761,
  'confidence_interval': (26.54, 84.0893),
  'metric name': 'watched',
  'group A label': 'A',
  'group B label': 'B'}]

### Criterion could be chosen from wide range

In [114]:
tester.run(method='theory', criterion='mw')

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,3.5e-05,43.598116,"(None, None)",watched,A,B


### Bootstrap criteria are also supported

In [115]:
tester.run(method='empiric', bootstrap_size=10)

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-06,55.314679,"(46.2759, 78.5322)",watched,A,B


### For many criteria, you can build confidence intervals for the relative effect

In [116]:
tester.run(effect_type='relative', method='empiric', bootstrap_size=10)

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-06,0.079901,"(0.0468, 0.125)",watched,A,B


In [117]:
tester.run(effect_type='relative', method='theory')

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-05,0.079901,"(0.0419, 0.1183)",watched,A,B


In [119]:
tester.run(effect_type='relative', method='empiric', bootstrap_size=10)

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-06,0.079901,"(0.0387, 0.1054)",watched,A,B


### For binary data, special criteria are implemented

In [120]:
tester.run(method='binary')

  shift = -quantile * np.sqrt(variation)


Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,4e-06,55.314679,"(nan, nan)",watched,A,B


### If the data is non-binary, then a warning appears

In [123]:
a_retention = sps.bernoulli.rvs(p=0.3, size=1000)
b_retention = sps.bernoulli.rvs(p=0.4, size=1000)
a_conversions = sps.bernoulli.rvs(p=0.6, size=1000)
b_conversions = sps.bernoulli.rvs(p=0.6, size=1000)
df_bin = pd.DataFrame(
    np.vstack([np.hstack([a_retention, b_retention]), np.hstack([a_conversions, b_conversions])]).T,
    columns=['retention', 'conversions']
)
df_bin['group'] = ['A'] * 1000 + ['B'] * 1000
tester.set_dataframe(dataframe=df_bin, column_groups='group')

In [126]:
tester.run(method='binary', metrics=['retention', 'conversions'], first_errors=[0.05, 0.01, 0.001])

Unnamed: 0,first_type_error,pvalue,effect,confidence_interval,metric name,group A label,group B label
0,0.05,0.000256,0.077,"(0.0357, 0.1183)",retention,A,B
1,0.01,0.000256,0.077,"(0.0228, 0.1312)",retention,A,B
2,0.001,0.000256,0.077,"(0.0077, 0.1463)",retention,A,B
3,0.05,0.18298,-0.029,"(-0.0717, 0.0137)",conversions,A,B
4,0.01,0.18298,-0.029,"(-0.0851, 0.0271)",conversions,A,B
5,0.001,0.18298,-0.029,"(-0.1007, 0.0427)",conversions,A,B
