# Example of sequential usage of preprocessing classes

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

## Load data

In [3]:
# Load the example data set from a CSV file (the path is relative to the notebook's working directory).
dataframe = pd.read_csv('../tests/test_data/pipeline_test.csv')
# Preview the first rows to see which columns are available.
dataframe.head()

Unnamed: 0,id,gender,watched,audio,day,platform
0,0,Male,7.912889,2.210973,1,web
1,1,Male,6.67869,0.020715,1,ios
2,2,Female,721.434299,59.99687,1,ios
3,3,Male,135.248218,18.982887,1,ios
4,4,Female,38.962917,8.324667,1,android


Let's try to design an experiment on content views, given a priori information that the ``audio`` metric affects ``watched``.

## The ``Preprocessor`` class lets you perform sequential data transformations

In [4]:
# Entry point for chaining preprocessing transformations on a dataframe.
from ambrozia.preprocessing import Preprocessor

In [5]:
# Wrap the raw dataframe so transformations can be chained on it.
# NOTE(review): verbose=True presumably controls the "LOGGER" messages printed by the steps below — confirm.
transformer = Preprocessor(dataframe, verbose=True)

In [6]:
# One chained pipeline:
#   1. aggregate the rows per ``id``,
#   2. apply the robust transformation to both metrics,
#   3. add the CUPED-transformed ``watched`` (covariate: ``audio``) as ``watched_cuped``,
#   4. return the resulting dataframe.
data = (
    transformer
    .aggregate(
        groupby_columns='id',
        agg_params={
            'watched': 'sum',
            'audio': 'sum',
            'gender': 'simple',  # simple - choose the first possible value
            'platform': 'mode',
        },
    )
    .robust(['watched', 'audio'], alpha=0.001)
    .cuped('watched', by='audio', name='watched_cuped')
    .data()
)

ambrozia LOGGER: Make robust transformation with alpha = 0.001
ambrozia LOGGER: 

ambrozia LOGGER: Change Mean watched: 5343.8899 ===> 5315.7543
ambrozia LOGGER: Change Variance watched: 10951522.1717 ===> 10188504.7263
ambrozia LOGGER: Change IQR watched: 3958.8107 ===> 3937.3920
ambrozia LOGGER: Change Range watched: 35983.1570 ===> 20404.2903
ambrozia LOGGER: 

ambrozia LOGGER: Change Mean audio: 350.3962 ===> 349.8219
ambrozia LOGGER: Change Variance audio: 17724.3973 ===> 16951.7830
ambrozia LOGGER: Change IQR audio: 176.0167 ===> 175.1697
ambrozia LOGGER: Change Range audio: 1098.9677 ===> 777.8102
ambrozia LOGGER: After transformation СUPED for watched, the variance is 7.9445 % of the original
ambrozia LOGGER: Variance transformation 10190551.0217 ===> 809587.6709


In [7]:
# Display the transformed frame; it now also contains the ``watched_cuped`` column.
data

Unnamed: 0,id,watched,audio,gender,platform,watched_cuped
0,0,2489.224016,213.817130,Male,web,5688.285911
1,1,3970.775664,281.958297,Male,ios,5567.041663
2,2,5900.186483,416.944150,Female,ios,4321.356123
3,3,5557.860998,384.782010,Male,web,4735.538754
4,4,7588.374990,448.263748,Female,android,5272.854524
...,...,...,...,...,...,...
4995,4995,1647.603060,167.552826,Male,web,5934.879927
4996,4996,7403.347846,423.972130,Female,android,5659.207480
4997,4997,3243.170373,287.159499,Male,android,4717.095277
4998,4998,12538.349029,615.502371,Female,ios,6289.092310


### We can get a list of all transformations

In [8]:
# List the transformation objects recorded by the chained calls above, in the order they were applied.
transformer.transformations()

[<ambrozia.preprocessing.aggregate.AggregatePreprocessor at 0x131b77c10>,
 <ambrozia.preprocessing.robust.RobustPreprocessor at 0x129a28c40>,
 <ambrozia.preprocessing.cuped.Cuped at 0x129b01d30>]

### For transformations that are fitted, it is important to save their parameters

In [9]:
# Index 2 is the Cuped step (third in the list shown above); inspect its fitted parameters.
transformer.transformations()[2].get_params_dict()

{'theta': 23.521697221599357, 'bias': 349.821852624589}

In [10]:
# Save the Cuped parameters to JSON; the same file is passed as ``load_path`` in the simulation loop later on.
transformer.transformations()[2].store_params('examples_configs/params_cuped.json')

## Now let's design an experiment

In [11]:
from ambrozia.designer import Designer

In [12]:
# Design settings for the preprocessed data: effects=1.05 is reported as a 5.0% effect in the
# result tables below, and the error pair is shown there as (0.05, 0.7).
designer = Designer(dataframe=data, effects=1.05, first_type_errors=0.05, second_type_errors=0.7)

In [13]:
# Empirical group-size calculation for the raw ``watched`` metric.
designer.run('size', method='empiric', metrics='watched')

Group sizes calculation:   0%|          | 0/1 [00:00<?, ?it/s]

errors,"(0.05, 0.7)"
effect,Unnamed: 1_level_1
5.0%,610.0


In [14]:
# Same calculation for the CUPED-transformed metric, to compare the required group size with the raw metric.
designer.run('size', method='empiric', metrics='watched_cuped')

Group sizes calculation:   0%|          | 0/1 [00:00<?, ?it/s]

errors,"(0.05, 0.7)"
effect,Unnamed: 1_level_1
5.0%,49.0


## The ambrozia library can be used not only as a tool, but also for conducting your own research
## Let's check that this sample size is really enough

In [15]:
from ambrozia.preprocessing import Cuped
from ambrozia.splitter import Splitter
from ambrozia.tester import Tester

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [16]:
# Start a fresh preprocessing chain from the raw dataframe (this rebinds the notebook-level ``transformer``).
transformer = Preprocessor(dataframe=dataframe)

In [17]:
# Rebuild the experiment data with the same aggregation and robust settings as before,
# but without the CUPED step: CUPED is applied per split inside the simulation loop.
data_exp = (
    transformer
    .aggregate(
        groupby_columns='id',
        agg_params={
            'watched': 'sum',
            'audio': 'sum',
            'gender': 'simple',
            'platform': 'mode',
        },
    )
    .robust(['watched', 'audio'], alpha=0.001)
    .data()
)

ambrozia LOGGER: Make robust transformation with alpha = 0.001
ambrozia LOGGER: 

ambrozia LOGGER: Change Mean watched: 5343.8899 ===> 5315.7543
ambrozia LOGGER: Change Variance watched: 10951522.1717 ===> 10188504.7263
ambrozia LOGGER: Change IQR watched: 3958.8107 ===> 3937.3920
ambrozia LOGGER: Change Range watched: 35983.1570 ===> 20404.2903
ambrozia LOGGER: 

ambrozia LOGGER: Change Mean audio: 350.3962 ===> 349.8219
ambrozia LOGGER: Change Variance audio: 17724.3973 ===> 16951.7830
ambrozia LOGGER: Change IQR audio: 176.0167 ===> 175.1697
ambrozia LOGGER: Change Range audio: 1098.9677 ===> 777.8102


In [18]:
# Display the aggregated and robust-transformed frame (no ``watched_cuped`` column at this point).
data_exp

Unnamed: 0,id,watched,audio,gender,platform
0,0,2489.224016,213.817130,Male,web
1,1,3970.775664,281.958297,Male,ios
2,2,5900.186483,416.944150,Female,ios
3,3,5557.860998,384.782010,Male,web
4,4,7588.374990,448.263748,Female,android
...,...,...,...,...,...
4995,4995,1647.603060,167.552826,Male,web
4996,4996,7403.347846,423.972130,Female,android
4997,4997,3243.170373,287.159499,Male,android
4998,4998,12538.349029,615.502371,Female,ios


In [19]:
# Simulation settings: number of repeated splits, size of each group, and the significance level.
tests_amounts: int = 500
# Close to the group size of 49 that the designer reported for ``watched_cuped``.
group_size: int = 50
# Starts as a plain int; the `+=` with the comparison result below presumably turns it into a
# pandas object, since it is later read via `.loc[0]` — confirm against Tester.run's return type.
amount_first_type_errors: int = 0
alpha: float = 0.05
    
for exp_num in tqdm(range(tests_amounts)):
    # Checking for I type error: no effect is added to any group, so every rejection is a false positive.
    splitter = Splitter(data_exp, fit_columns='audio')
    # A different salt on every iteration; presumably this yields a different split each time — confirm.
    groups = splitter.run(method='metric', salt=f'exp {exp_num}', groups_size=group_size, strat_columns='gender')
    
    # Apply CUPED using the parameters file saved earlier (``load_path``) to get ``watched_cuped``.
    transformer = Preprocessor(groups, verbose=False)
    exp_data = transformer.cuped(
        target='watched',
        by='audio',
        name='watched_cuped',
        load_path='examples_configs/params_cuped.json').data()

    # Test the groups (identified by the ``group`` column) on the CUPED metric and take the p-value.
    tester = Tester(exp_data, metrics='watched_cuped', column_groups='group')
    pvalue = tester.run(method='empiric')['pvalue']
    amount_first_type_errors += (pvalue < alpha) # Reject equality of means when it is true

  0%|          | 0/500 [00:00<?, ?it/s]

In [20]:
# Share of simulated experiments in which the true null hypothesis was rejected.
print(f'Empirical I type error: {amount_first_type_errors.loc[0] / tests_amounts}')

Empirical I type error: 0.038
