# [Дизайн теста](https://wiki.sbmt.io/display/ANLT/%5BDRAFT%5D+AB-surge+block+coefficient+OOS)

# Contents

* [Импорты](#Импорты)
* [Функции](#Функции)
* [Константы](#Константы)
* [Метрики](#Метрики)
  * [Отмены/замены](#Отмены/замены)
    * [Исходная логика](#Исходная_логика)
    * [По-юзерная логика](#По-юзерная_логика)
  * [Время сборки](#Время_сборки)
    * [Исходная логика](#Исходная__логика)
    * [По-юзерная логика](#По-юзерная__логика)
  * [AOV](#AOV)
  * [AOV первичный](#AOV_первичный)
  * [Retention (?)](#Retention) 
* [MDE](#MDE)
  * [Data collection](#Data_collection)
      * [Отмены/замены](#Отмены/замены_load)
      * [Время сборки](#Время_сборки_load)
  * [Data preprocessing](#Data_preprocessing)
      * [Отмены/замены](#Отмены/замены_prep)
      * [Время сборки](#Время_сборки_prep)
      * [Итог](#Итог)
  * [MDE estimation](#MDE_estimation)
* [AA & AB](#AA&AB)

# Импорты

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
sns.set(rc={'figure.figsize':(15, 5)})

import sys
import os
import warnings

from scipy import stats as st
from scipy.stats import ttest_ind
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.power import tt_ind_solve_power 

sys.path.append('/home/jovyan')
from connectors import read_sql_query
from tqdm import tqdm 


pd.set_option('display.max_columns', None)

In [2]:
warnings.filterwarnings("ignore")

In [3]:
!pip install tabulate



In [4]:
import tabulate

# Функции

In [5]:
def deltamethod(x, y, independent = False, bc = False):
    n = len(x)
    mux = np.mean(x)
    muy = np.mean(y)
    
    v11 = np.var(y,ddof=1)
    v22 = np.var(x,ddof=1)
    
    if independent == True:
        v12 = 0
    else: 
        v12 = np.cov(x,y)[0][1]
    
    est = muy / mux
    
    if bc == True:
        est = est + muy/mux**3*v22/n - 1/mux**2*v12/n
    sd = (v11 / mux**2) - (2 * muy / mux**3 * v12) + (muy**2 / mux**4 * v22)
    sd = np.sqrt(sd)
    return est, sd

In [6]:
def calculate_pvalue(test, control, metric, group_id, df, metric_type='proportion'):
        if metric_type == 'average' or metric_type == 'ratio':
            equal_var = False
        else:
            equal_var = True

        if metric_type == 'ratio':
            c_num_values = df[df[group_id] == control][metric[0]].values
            c_denom_values = df[df[group_id] == control][metric[1]].values
            
            t_num_values = df[df[group_id] == test][metric[0]].values
            t_denom_values = df[df[group_id] == test][metric[1]].values
                       
            control_mean = c_num_values.sum() / c_denom_values.sum()
            test_mean = t_num_values.sum() / t_denom_values.sum()

            c_values = c_num_values - c_denom_values * control_mean
            t_values = t_num_values - t_denom_values * control_mean
        else:
            c_values = df[df[group_id] == control][metric].values
            t_values = df[df[group_id] == test][metric].values
            
            control_mean = c_values.mean()
            test_mean = t_values.mean()

        return ttest_ind(c_values, t_values, equal_var=equal_var).pvalue, control_mean, test_mean

# Константы

In [7]:
end_date = dt.date.today()-dt.timedelta(days=1)

In [8]:
platforms = ["android","web"]
lengths = [2,3,4] #weeks
traffic_proportions = [0.125, 0.25, 0.5, 0.75, 1]
thresholds = [0.05, 0.01]

In [9]:
oos_stores = \
read_sql_query("""
select * from sandbox.stores_with_out_of_stock_model
""").store_id.tolist()

In [130]:
oos_stores_df = read_sql_query("""
select * from sandbox.stores_with_out_of_stock_model
""")

In [132]:
oos_stores_df.to_excel('oos_stores.xlsx',index=False)

In [10]:
len(oos_stores)

17783

In [11]:
oos_stores[:10]

[33127, 14191, 14195, 14202, 14203, 14204, 14205, 14209, 14212, 14214]

# Фильтрующее

event_filter = f"event = 'Shop Selected' and store_id IN {tuple(map(int,oos_stores))}"

In [12]:
event_filter = f"event IN ('Category Viewed', 'Search Results Viewed') and store_id IN {tuple(map(int,oos_stores))}"

In [13]:
sources_new_app = (
    "Order Completed",
    "Landing Viewed",
    "Product Added",
    "Login",
    "Shop Selected",
    "Address Change Initiated",
    "Product Viewed",
    "Search Results Viewed",
    "Category Viewed",
    "Shop Selection Started",
    "Pickup Map Opened",
    "Retailer Selection Started",
    "Checkout Sber Spasibo Form Opened",
    "Map Pickup Shop Selected",
    "Checkout Button Clicked", 
    "Checkout Loaded", 
    "Main Page Viewed", 
    "Checkout Slot Selection Started",
    "Checkout Delivery Slot Selection Started",
    "Checkout Delivery Slot Selected", 
    "Cart Viewed", 
    "Search Started", 
    "Banner Viewed", 
    "Recommendation Block View", 
    "Product Category Viewed", 
    "Catalogue Tab Clicked",
    "Add To Cart Clicked", 
    "Onboarding Banner Viewed", 
    "Order Info Viewed", 
    "Product Sort Initiated",
    "Product Filter Initiated",
    "Product Sorted", 
    "Cart Params Loaded",
    "Address Selected", 
    "Address Search Opened",
    "Address Suggest Clicked",
    "Redirected From SBOL",
    "Sberid Authorization Prompt Shown",
    "Map Pickup Button Clicked",
    "Age Confirmation Viewed",
    "Alcohol Pickup Popup_completed",
    "Favorites Tab Clicked",
    "Address Change Clicked",
    "Retailer Selected"
)

sources_web = (
    "Order Completed",
    "Checkout Button Clicked",
    "Landing Viewed",
    "Product Added",
    "Main Page Viewed",
    "Suggester Viewed",
    "Main Page Viewed",
    "Shop Selected",
    "Product Viewed",
    "Recommendation Product View",
    "Search Results Viewed",
    "Category Viewed",
    "Shop Selection Started",
    "Pickup Map Opened",
    "Cart Viewed",
    "Order Merged",
    "Checkout Loaded",
    "Pre Replacements Product Selected",
    "Pre Replacements Product All Choosed",
    "Address Change Initiated",
    "Retailer Shipping Method Clicked",
    "Button Find Stores Clicked",
    "Favourites Tab Clicked",
    "Checkout Slot Selection Started",
    "Checkout Delivery Slot Selection Started",
    "Checkout Delivery Slot Selected",
    "Search Started",
    "Search Completed",
    "Search Suggest Clicked",
    "Suggester Viewed",
    "Banner Viewed",
    "Recommendation Block View",
    "Product Category Viewed",
    "Catalogue Tab Clicked",
    "Add To Cart Clicked",
    "Onboarding Banner Viewed",
    "Order Info Viewed",
    "Product Sort Initiated",
    "Product Filter Initiated",
    "Product Sorted",
    "Cart Params Loaded",
    "Address Selected",
    "Address Search Opened",
    "Address Suggest Clicked",
    "Redirected From SBOL",
    "Sberid Authorization Prompt Shown",
    "Map Pickup Button Clicked",
    "Age Confirmation Viewed",
    "Alcohol Pickup Button Clicked",
    "Address Change Clicked",
    "Only Pickup Popup Viewed",
    "Retailer Selected"
)

# Метрики

## Отмены/замены

In [14]:
def get_not_found_ratio_oos(platforms, start_date, end_date, sources_web, sources_new_app):

    q = f""" 
    with 
        toDate('{start_date}') as start_date,
        toDate('{end_date}') as end_date,
        
    events as (
        select 
            toString(anonymous_id) as anonymous_id,
            toDate(ts) as dt,
            'web' as platform,
            ifNull(NullIf(tenant, ''), 'sbermarket') as tenant
        from event.web
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and not_bot
            and event in {sources_web}
        group by anonymous_id, dt, platform, tenant

        union all

        select 
            anonymous_id,
            toDate(ts) as dt,
            toString(platform) as platform,
            'sbermarket' as tenant
        from event.new_app
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and event in {sources_new_app}

        group by anonymous_id, dt, platform, tenant
    ),

    filter_events as (
        select
            anonymous_id,
            toDate(min(ts)) as event_dt
        from event.new_app
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id

        UNION ALL 

        select
            toString(anonymous_id) as anonymous_id,
            toDate(min(ts)) as event_dt
        from event.web
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id
    ),

    client_id_x_order_id as (
        select distinct
            date,
            uuid,
            anonymous_id,
            if(platform in ('desktop', 'mobile'), 'web', platform) as platform,
            tenant,
            order_id
        from 
            db_bs_ab_admin.ab_platform_materialization_client_id_x_order_id
        where true 
            and date between start_date and end_date
    ),

    ab_groups_x_events as (
        select distinct
            events.anonymous_id as anonymous_id,
            events.platform as platform,
            events.tenant as tenant,
            client_id_x_order_id.order_id as order_id,
            events.dt as events_dt
        from 
            events

            inner join filter_events 
                on events.anonymous_id = filter_events.anonymous_id

            left join client_id_x_order_id
                on events.anonymous_id = client_id_x_order_id.anonymous_id
                and events.dt = client_id_x_order_id.date
                and events.platform = client_id_x_order_id.platform
                and events.tenant = client_id_x_order_id.tenant

        where true
          and filter_events.event_dt <= events_dt
    ),

        shipments as (
            select platform,
                   order_id, 
                   order_number, 
                   completed_at, 
                   sum(item_count) AS total_items,
                   sum(replaced_items_cnt) AS repl,
                   sum(canceled_items_cnt) AS canc ,
                   sum(replaced_items_cnt + canceled_items_cnt) AS not_found,
                   not_found / (total_items + canc) * 100 AS rc_rate
            from analytics.shipments
            where toDate(completed_at) between start_date and end_date
              AND shipment_state = 'shipped'
              AND item_count>0
              AND store_id IN (SELECT store_id FROM sandbox.stores_with_out_of_stock_model)
              and tenant_id = 'sbermarket'
            GROUP BY user_id,
                     order_id,
                     order_number,
                     completed_at,
                     platform
        )

        select platform
             , anonymous_id
             , ab_groups_x_events.order_id as order_number
             , order_id
             , completed_at
             , total_items
             , repl
             , canc
             , not_found
             , rc_rate
          from ab_groups_x_events 
               join shipments on ab_groups_x_events.order_id = shipments.order_number
                             and toDate(shipments.completed_at) = events_dt
        where platform IN {tuple(platforms)}

    """

    return read_sql_query(q)

In [15]:
read_sql_query("""
SELECT user, query, elapsed, read_rows
  FROM system.processes
""")

Unnamed: 0,user,query,elapsed,read_rows
0,airflow_ml,with\n metrics_data_time as (\n sele...,1976.868962,401289077
1,boot_dhw_user,\n with data_fm as (\n ...,224.555099,147301960
2,valeriy_uvarov,"SELECT ""Custom SQL Query"".""Change"" AS ""Change""...",49.9644,34661800
3,airflow_ml,insert into sandbox.ab_tests_results_tmp\nsett...,49.68998,4880362
4,anastasiya_mavrina,with \n\n dates as (\n select disti...,30.003126,328577878
5,boot_dhw_user,truncate TABLE sandbox.new_users_info,20.887456,0
6,airflow_ml,"with slices as (\nselect\n\tanonymous_id,\n\ts...",18.379575,25423857
7,nikita_kryuchenkov,"SELECT ""Custom SQL Query3"".""a"" AS ""a (Custom S...",9.974009,10447415
8,boot_dhw_user,\n with response_data as\n (sele...,1.047232,3079539
9,andrey_cherevets,"\nSELECT user, query, elapsed, read_rows\n FR...",0.001514,0


In [16]:
%%time
test_nf = get_not_found_ratio_oos(['android','web'], end_date, end_date, sources_web, sources_new_app)

CPU times: user 418 ms, sys: 66.7 ms, total: 485 ms
Wall time: 7.69 s


In [17]:
test_nf.shape

(46421, 10)

In [18]:
test_nf.head()

Unnamed: 0,platform,anonymous_id,order_number,order_id,completed_at,total_items,repl,canc,not_found,rc_rate
0,web,88255aa6-7aba-46c2-a117-17ffa297124b,R574410566,R574410566,2023-11-06 09:46:40,6,0,0,0,0.0
1,web,a5a6f05f-13b7-465f-bcc7-14d63d426a07,R680336473,R680336473,2023-11-06 05:14:38,2,0,0,0,0.0
2,web,61bc2cab-afb3-4bc8-9cb4-0996c9c02ca0,R505318176,R505318176,2023-11-06 10:24:34,18,2,0,2,11.111111
3,web,14055d9f-7762-48e0-bc1a-377aa9ff0e5d,R202717654,R202717654,2023-11-06 09:00:46,5,1,0,1,20.0
4,web,55880faa-0ceb-4d44-bbb1-b2476bdacaff,R282634577,R282634577,2023-11-06 03:40:09,38,0,2,2,5.0


In [19]:
(
    test_nf
    .groupby(['platform'])
    [['total_items','not_found']].sum()
    .assign(rc_rate = lambda _df: _df.not_found/_df.total_items*100)
)

Unnamed: 0_level_0,total_items,not_found,rc_rate
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
android,570858,38210,6.693433
web,51115,3340,6.534285


In [20]:
test_nf.platform.value_counts()

android    42430
web         3991
Name: platform, dtype: int64

## Время_сборки

In [21]:
def get_collection_time_ratio_oos(platforms, start_date, end_date, sources_web, sources_new_app):

    req = f""" 
    with toDate('{start_date}') as start_date, 
         toDate('{end_date}') as end_date, 

    events as (
        select 
            toString(anonymous_id) as anonymous_id,
            toDate(ts) as dt,
            'web' as platform,
            ifNull(NullIf(tenant, ''), 'sbermarket') as tenant
        from event.web
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and not_bot
            and event in {sources_web}
        group by anonymous_id, dt, platform, tenant

        union all

        select 
            anonymous_id,
            toDate(ts) as dt,
            toString(platform) as platform,
            'sbermarket' as tenant
        from event.new_app
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and event in {sources_new_app}

        group by anonymous_id, dt, platform, tenant
    ),

    filter_events as (
        select
            anonymous_id,
            toDate(min(ts)) as event_dt
        from event.new_app
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id

        UNION ALL 

        select
            toString(anonymous_id) as anonymous_id,
            toDate(min(ts)) as event_dt
        from event.web
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id
    ),

    client_id_x_order_id as (
        select distinct
            date,
            uuid,
            anonymous_id,
            if(platform in ('desktop', 'mobile'), 'web', platform) as platform,
            tenant,
            order_id
        from 
            db_bs_ab_admin.ab_platform_materialization_client_id_x_order_id
        where true 
            and date between start_date and end_date
    ),

    ab_groups_x_events as (
        select distinct
            events.anonymous_id as anonymous_id,
            events.platform as platform,
            events.tenant as tenant,
            client_id_x_order_id.order_id as order_id,
            events.dt as events_dt
        from 
            events

            inner join filter_events 
                on events.anonymous_id = filter_events.anonymous_id

            left join client_id_x_order_id
                on events.anonymous_id = client_id_x_order_id.anonymous_id
                and events.dt = client_id_x_order_id.date
                and events.platform = client_id_x_order_id.platform
                and events.tenant = client_id_x_order_id.tenant

        where true
          and filter_events.event_dt <= events_dt
    ),
         

        shpmnts as (
            select platform,
                   order_id, 
                   order_number, 
                   shipment_number AS shipment_number,
                   completed_at
            from analytics.shipments
            where toDate(completed_at) between start_date and end_date
              AND shipment_state = 'shipped'
              AND store_id IN (SELECT store_id FROM sandbox.stores_with_out_of_stock_model)
        ),
        
        collection_time AS (
        SELECT shipment_number AS shipment_number,
               start_assembling_to_first_item * ifNull(start_to_first_item_coef, 1)   
               + items_collection_time * ifNull(items_collection_time_coef, 1)                            
               + last_item_to_cashing_started * ifNull(last_item_to_cashing_coef, 1) as assembly_speed_positions_num 
          FROM sandbox.cte_decomposition_new
         WHERE toDate(completed_at) between start_date and end_date
           AND overall_time_of_order is not null
           AND shipped_at IS NOT NULL
        ),
        
        prep AS (
        SELECT order_number,
               order_id,
               completed_at,
               count(distinct(shipment_number)) AS n_shipments,
               SUM(assembly_speed_positions_num) AS assembly_speed_positions_num
          FROM shpmnts
               JOIN collection_time ON collection_time.shipment_number = shpmnts.shipment_number
         GROUP BY order_number,
                  order_id,
                  completed_at
        )

        select platform
             , anonymous_id
             , ab_groups_x_events.order_id as order_number
             , order_id
             , completed_at
             , n_shipments
             , assembly_speed_positions_num
          from ab_groups_x_events 
               join prep on ab_groups_x_events.order_id = prep.order_number
                        and toDate(prep.completed_at) = events_dt
         where platform IN {tuple(platforms)}
    """

    all_orders = read_sql_query(req)
    return all_orders

In [22]:
%%time
test_ct = get_collection_time_ratio_oos(['android','web'], end_date, end_date, sources_web, sources_new_app)

CPU times: user 323 ms, sys: 30.9 ms, total: 354 ms
Wall time: 8.66 s


In [23]:
test_ct.shape

(30036, 7)

In [24]:
test_ct.head()

Unnamed: 0,platform,anonymous_id,order_number,order_id,completed_at,n_shipments,assembly_speed_positions_num
0,web,31456cad-b1c2-4094-84c9-879358717131,R001412846,R001412846,2023-11-06 13:33:40,1,921.0
1,web,a3835cd0-a694-4d4e-b8d4-ff6f5c7ff230,R164870386,R164870386,2023-11-06 07:28:22,1,1178.5
2,web,0cdbdcf4-504e-4645-9ec7-d930f74a5db4,R188047662,R188047662,2023-11-06 08:22:50,1,1280.5
3,web,0bb9072b-339a-40be-8a7e-61289b7800fb,R145678416,R145678416,2023-11-06 08:59:41,1,1436.0
4,web,b620a5aa-f97b-4c91-be68-5f95a69a2e84,R250408425,R250408425,2023-11-06 09:21:06,1,1476.0


In [25]:
(
    test_ct
    .groupby(['platform'])
    [['n_shipments','assembly_speed_positions_num']].sum()
    .assign(rc_rate = lambda _df: _df.assembly_speed_positions_num/_df.n_shipments)
)

Unnamed: 0_level_0,n_shipments,assembly_speed_positions_num,rc_rate
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
android,26963,38351860.0,1422.388612
web,3074,4168902.0,1356.181616


In [26]:
test_ct.platform.value_counts()

android    26962
web         3074
Name: platform, dtype: int64

## AOV_goods

In [27]:
def get_aov_goods_ratio_oos(platforms, start_date, end_date, sources_web, sources_new_app):

    q = f""" 
    with 
        toDate('{start_date}') as start_date,
        toDate('{end_date}') as end_date,

    events as (
        select 
            toString(anonymous_id) as anonymous_id,
            toDate(ts) as dt,
            'web' as platform,
            ifNull(NullIf(tenant, ''), 'sbermarket') as tenant
        from event.web
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and not_bot
            and event in {sources_web}
        group by anonymous_id, dt, platform, tenant

        union all

        select 
            anonymous_id,
            toDate(ts) as dt,
            toString(platform) as platform,
            'sbermarket' as tenant
        from event.new_app
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and event in {sources_new_app}

        group by anonymous_id, dt, platform, tenant
    ),

    filter_events as (
        select
            anonymous_id,
            toDate(min(ts)) as event_dt
        from event.new_app
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id

        UNION ALL 

        select
            toString(anonymous_id) as anonymous_id,
            toDate(min(ts)) as event_dt
        from event.web
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id
    ),

    client_id_x_order_id as (
        select distinct
            date,
            uuid,
            anonymous_id,
            if(platform in ('desktop', 'mobile'), 'web', platform) as platform,
            tenant,
            order_id
        from 
            db_bs_ab_admin.ab_platform_materialization_client_id_x_order_id
        where true 
            and date between start_date and end_date
    ),

    ab_groups_x_events as (
        select distinct
            events.anonymous_id as anonymous_id,
            events.platform as platform,
            events.tenant as tenant,
            client_id_x_order_id.order_id as order_id,
            events.dt as events_dt
        from 
            events

            inner join filter_events 
                on events.anonymous_id = filter_events.anonymous_id

            left join client_id_x_order_id
                on events.anonymous_id = client_id_x_order_id.anonymous_id
                and events.dt = client_id_x_order_id.date
                and events.platform = client_id_x_order_id.platform
                and events.tenant = client_id_x_order_id.tenant

        where true
          and filter_events.event_dt <= events_dt
    ),

    fin_measures as (
        select 
            order_number,
            completed_at,
            sum(goods) AS goods
        from analytics.financial_measures
        where 1=1
            and toDate(completed_at) between start_date and end_date
            AND store_id IN (SELECT store_id FROM sandbox.stores_with_out_of_stock_model)
            and user_id is not null
            and order_state = 'complete'
            and shipment_state = 'shipped'
            and b2b_order_company_flg != 1
            and b2b_measure = 0
            and order_number not in 
            (
                select order_number
                from analytics.shipments
                where 1=1
                    and toDate(completed_at) between start_date and end_date + interval 14 day
                    and owner_type = 'ServiceAccount'
            )
        group by order_number, completed_at
    )
    
    select platform
         , anonymous_id
         , ab_groups_x_events.order_id as order_number
         , goods
      from ab_groups_x_events 
           join fin_measures ON fin_measures.order_number = ab_groups_x_events.order_id
                             and toDate(fin_measures.completed_at) = events_dt

     where platform IN {tuple(platforms)}

    """

    return read_sql_query(q)

In [28]:
%%time
test_aov = get_aov_goods_ratio_oos(['android','web'], end_date, end_date, sources_web, sources_new_app)

CPU times: user 202 ms, sys: 24.1 ms, total: 226 ms
Wall time: 5.79 s


In [29]:
test_aov.shape

(45022, 4)

In [30]:
test_aov.head()

Unnamed: 0,platform,anonymous_id,order_number,goods
0,web,bae7a329-8395-49a2-baff-17c7d55dcb25,R175335542,1559.579956
1,web,bf028c65-5d44-4130-bbe1-5b454b520c94,R338665354,2630.469971
2,web,d893f643-7c66-4063-9c4d-1afbfa28deab,R401744640,5024.319824
3,web,77374306-461f-451f-bbb5-ff37199c719c,R543260435,1506.900024
4,web,0b2280f4-9be2-4ad0-814b-7bc0dfbecf74,R682548828,3917.280029


In [31]:
test_aov.platform.value_counts()

android    41272
web         3750
Name: platform, dtype: int64

In [32]:
(
    test_aov
    .groupby(['platform','anonymous_id'])
    .agg({
            'order_number':'nunique',
            'goods':'sum'
        })
    .reset_index()
    .groupby('platform')
    [['goods','order_number']].sum()
    .assign(aov = lambda _df: _df.goods/_df.order_number)
)

Unnamed: 0_level_0,goods,order_number,aov
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
android,93382220.0,41272,2262.604773
web,9763435.0,3750,2603.582751


## AOV_первичный

In [33]:
def get_initial_aov_goods_ratio_oos(platforms, start_date, end_date, sources_web, sources_new_app):
        
    req = f""" 
    with 
            toDate('{start_date}') as start_date,
            toDate('{end_date}') as end_date,

        events as (
            select 
                toString(anonymous_id) as anonymous_id,
                toDate(ts) as dt,
                'web' as platform,
                ifNull(NullIf(tenant, ''), 'sbermarket') as tenant
            from event.web
            where 1=1
                and toDate(ts) between start_date and end_date
                and dwh_dt between start_date and end_date
                and not_bot
                and event in {sources_web}
            group by anonymous_id, dt, platform, tenant

            union all

            select 
                anonymous_id,
                toDate(ts) as dt,
                toString(platform) as platform,
                'sbermarket' as tenant
            from event.new_app
            where 1=1
                and toDate(ts) between start_date and end_date
                and dwh_dt between start_date and end_date
                and event in {sources_new_app}

            group by anonymous_id, dt, platform, tenant
        ),

        filter_events as (
            select
                anonymous_id,
                toDate(min(ts)) as event_dt
            from event.new_app
            where 1=1
                and {event_filter}
                and dwh_dt between start_date and end_date
                and toDate(ts) between start_date and end_date
            group by anonymous_id

            UNION ALL 

            select
                toString(anonymous_id) as anonymous_id,
                toDate(min(ts)) as event_dt
            from event.web
            where 1=1
                and {event_filter}
                and dwh_dt between start_date and end_date
                and toDate(ts) between start_date and end_date
            group by anonymous_id
        ),

        client_id_x_order_id as (
            select distinct
                date,
                uuid,
                anonymous_id,
                if(platform in ('desktop', 'mobile'), 'web', platform) as platform,
                tenant,
                order_id
            from 
                db_bs_ab_admin.ab_platform_materialization_client_id_x_order_id
            where true 
                and date between start_date and end_date
        ),

        ab_groups_x_events as (
            select distinct
                events.anonymous_id as anonymous_id,
                events.platform as platform,
                events.tenant as tenant,
                client_id_x_order_id.order_id as order_id,
                events.dt as events_dt

            from 
                events

                inner join filter_events 
                    on events.anonymous_id = filter_events.anonymous_id

                left join client_id_x_order_id
                    on events.anonymous_id = client_id_x_order_id.anonymous_id
                    and events.dt = client_id_x_order_id.date
                    and events.platform = client_id_x_order_id.platform
                    and events.tenant = client_id_x_order_id.tenant

            where true
              and filter_events.event_dt <= events_dt
        ),
        
        li as (

            SELECT order_number,
                   order_completed_at,
                   sumIf(

                   coalesce(coalesce(toFloat32(if(assembly_issue like 'Собран%%шт%%'or assembly_issue like 'Собран%%уп%%', extract(substring(assembly_issue, position(assembly_issue, 'из') + 4), '[\d]*[\d]'), null)),
                    toFloat32(if(assembly_issue like 'Собран%%кг%%', replace(replace(substring(assembly_issue, position(assembly_issue, 'из') + 4), 'кг.', ''), ' ', ''), null))), quantity)*

                    price,

                    li_created_at <= order_completed_at
                   ) AS aov_initial_before,

                   sumIf(price*quantity,li_deleted_at IS NULL) AS aov_initial_after
              FROM line_items
             WHERE store_id IN (SELECT store_id FROM sandbox.stores_with_out_of_stock_model)
               AND order_state = 'complete'
               AND shipment_state = 'shipped'
            -- AND store_id IN (SELECT DISTINCT store_id FROM sandbox.stores_with_out_of_stock_model)
            -- AND li_created_at <= order_completed_at
               AND not (li_deleted_at is not null and assembly_issue is null) -- НЕ сами удалили позицию в рамках заказа
               AND toDate(order_completed_at) between start_date and end_date
               AND order_number not in 
                (
                    select order_number
                    from analytics.shipments
                    where 1=1
                        and toDate(completed_at) between start_date and end_date + interval 14 day
                        and owner_type = 'ServiceAccount'
                    UNION ALL

                    SELECT order_number 
                     from analytics.financial_measures
                    WHERE 1=1
                      AND toDate(completed_at) between start_date and end_date + interval 14 day
                      AND (b2b_order_company_flg = 1 OR b2b_measure != 0)
                )
             GROUP BY order_number, order_completed_at

            )


        select platform
                 , anonymous_id
                 , ab_groups_x_events.order_id as order_number
                 , aov_initial_before
                 , aov_initial_after
              from ab_groups_x_events 
                   join li on ab_groups_x_events.order_id = li.order_number
                           and toDate(li.order_completed_at) = events_dt

             where platform IN {tuple(platforms)}
    """

    all_orders = read_sql_query(req)
    return all_orders

In [34]:
%%time
test_aov_init = get_initial_aov_goods_ratio_oos(['android','web'], end_date, end_date, sources_web, sources_new_app)

CPU times: user 193 ms, sys: 33.2 ms, total: 227 ms
Wall time: 32.4 s


In [35]:
test_aov_init.shape

(45023, 5)

In [36]:
test_aov_init.head()

Unnamed: 0,platform,anonymous_id,order_number,aov_initial_before,aov_initial_after
0,web,88255aa6-7aba-46c2-a117-17ffa297124b,R574410566,2471.149935,2538.379938
1,web,a5a6f05f-13b7-465f-bcc7-14d63d426a07,R680336473,3779.949951,3779.949951
2,web,61bc2cab-afb3-4bc8-9cb4-0996c9c02ca0,R505318176,5587.476456,5102.859966
3,web,14055d9f-7762-48e0-bc1a-377aa9ff0e5d,R202717654,384.819996,442.820004
4,web,55880faa-0ceb-4d44-bbb1-b2476bdacaff,R282634577,2604.940031,2524.960028


In [37]:
test_aov_init.platform.value_counts()

android    41273
web         3750
Name: platform, dtype: int64

In [38]:
(
    test_aov_init
    .groupby(['platform','anonymous_id'])
    .agg({
            'order_number':'nunique',
            'aov_initial_before':'sum',
            'aov_initial_after':'sum'
        })
    .reset_index()
    .groupby('platform')
    [['aov_initial_after','aov_initial_before','order_number']].sum()
    .assign(aov_before = lambda _df: _df.aov_initial_before/_df.order_number)
    .assign(aov_after = lambda _df: _df.aov_initial_after/_df.order_number)
)

Unnamed: 0_level_0,aov_initial_after,aov_initial_before,order_number,aov_before,aov_after
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
android,93345530.0,98890550.0,41273,2396.010599,2261.660788
web,9759176.0,10283810.0,3750,2742.349537,2602.446815


In [39]:
(
    test_aov
    .groupby(['platform','anonymous_id'])
    .agg({
            'order_number':'nunique',
            'goods':'sum'
        })
    .reset_index()
    .groupby('platform')
    [['goods','order_number']].sum()
    .assign(aov = lambda _df: _df.goods/_df.order_number)
)

Unnamed: 0_level_0,goods,order_number,aov
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
android,93382220.0,41272,2262.604773
web,9763435.0,3750,2603.582751


## Retention

In [46]:
def get_general_metrics_oos(platforms, start_date, end_date, sources_web, sources_new_app):
    q = f"""
    with 
        toDate('{start_date}') as start_date,
        toDate('{end_date}') as end_date,

    events as (
        select 
            toString(anonymous_id) as anonymous_id,
            toDate(ts) as dt,
            'web' as platform,
            ifNull(NullIf(tenant, ''), 'sbermarket') as tenant
        from event.web
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and not_bot
            and event in {sources_web}
        group by anonymous_id, dt, platform, tenant

        union all

        select 
            anonymous_id,
            toDate(ts) as dt,
            toString(platform) as platform,
            'sbermarket' as tenant
        from event.new_app
        where 1=1
            and toDate(ts) between start_date and end_date
            and dwh_dt between start_date and end_date
            and event in {sources_new_app}

        group by anonymous_id, dt, platform, tenant
    ),

    filter_events as (
        select
            anonymous_id,
            toDate(min(ts)) as event_dt
        from event.new_app
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id

        UNION ALL 

        select
            toString(anonymous_id) as anonymous_id,
            toDate(min(ts)) as event_dt
        from event.web
        where 1=1
            and {event_filter}
            and dwh_dt between start_date and end_date
            and toDate(ts) between start_date and end_date
        group by anonymous_id
    ),

    client_id_x_order_id as (
        select distinct
            date,
            uuid,
            anonymous_id,
            if(platform in ('desktop', 'mobile'), 'web', platform) as platform,
            tenant,
            order_id
        from 
            db_bs_ab_admin.ab_platform_materialization_client_id_x_order_id
        where true 
            and date between start_date and end_date
    ),

    ab_groups_x_events as (
        select distinct
            events.anonymous_id as anonymous_id,
            events.platform as platform,
            events.tenant as tenant,
            client_id_x_order_id.order_id as order_id,
            events.dt as events_dt
        from 
            events

            inner join filter_events 
                on events.anonymous_id = filter_events.anonymous_id

            left join client_id_x_order_id
                on events.anonymous_id = client_id_x_order_id.anonymous_id
                and events.dt = client_id_x_order_id.date
                and events.platform = client_id_x_order_id.platform
                and events.tenant = client_id_x_order_id.tenant

        where true
          and filter_events.event_dt <= events_dt
    ),

    late_cancelled_orders as (
     select 
           stateful_id as order_id,
           minIf(toDate(created_at), next_state = 'complete') as completed_at,
           minIf(toDate(created_at), next_state = 'canceled') as canceled_at,
           if((canceled_at < completed_at + interval 2 day or canceled_at < end_date) and canceled_at > toDate('1970-01-01'), 1, 0) as late_cancel_flg
      from (
      select stateful_type,
            created_at,
            next_state,
            stateful_id
        from analytics.int_spree_state_changes  
       where stateful_type = 'Spree::Order'
         and toDate(created_at) between start_date and end_date
         and next_state in ('complete', 'canceled')
        order by created_at) issc
     group by stateful_id
    having completed_at is not null
       and late_cancel_flg
    ),

    financial_measures as (
        select 
            uuid,
            order_number,
            toDate(completed_at) as completed_dt,
            max(shipped_at) as shipped_at,
            sumIf(1, type_delivery=='asap') as express_flg,
            sumIf(1, type_delivery=='planned') as planned_flg,
            sumIf(1, type_delivery=='pickup') as pickup_flg,
            sumIf(gmv_advertising, type_delivery = 'asap') + sumIf(gmv_service_fee_net_promo, type_delivery = 'asap') + sumIf(gmv_goods_net_promo, type_delivery = 'asap') as gmv_net_of_promo_express,
            sumIf(gmv_advertising, type_delivery = 'planned') + sumIf(gmv_service_fee_net_promo, type_delivery = 'planned') + sumIf(gmv_goods_net_promo, type_delivery = 'planned') as gmv_net_of_promo_planned,
            sumIf(gmv_advertising, type_delivery = 'pickup') + sumIf(gmv_service_fee_net_promo, type_delivery = 'pickup') + sumIf(gmv_goods_net_promo, type_delivery = 'pickup') as gmv_net_of_promo_pickup,
            sum(gmv_advertising) + sum(gmv_service_fee_net_promo) + sum(gmv_goods_net_promo) as gmv_net_of_promo,
            sumIf(gross_profit, type_delivery = 'asap') as gross_profit_express,
            sumIf(gross_profit, type_delivery = 'planned') as gross_profit_planned,
            sumIf(gross_profit, type_delivery = 'pickup') as gross_profit_pickup,
            sum(gross_profit) as gross_profit_full

        from analytics.financial_measures
        where 1=1
            and store_id IN (SELECT store_id FROM sandbox.stores_with_out_of_stock_model)
            and toDate(completed_at) between start_date and end_date
            and user_id is not null
            and order_state in ('complete', 'canceled')
            and b2b_order_company_flg != 1
            and b2b_measure = 0
            and order_id not in (select order_id from late_cancelled_orders)
            and order_number not in 
            (
                select order_number
                from analytics.shipments
                where 1=1
                    and toDate(completed_at) between start_date and end_date + interval 14 day
                    and owner_type = 'ServiceAccount'
            )
        group by dictGet('analytics.spree_users_dict', 'uuid', toUInt64(user_id)) as uuid, order_number, completed_at
        )

    select 
        anonymous_id,
        platform,
        tenant,
        sumIf(gmv_net_of_promo, completed_dt = events_dt) as gmv_per_user_within_retilist, -- это значение может быть использовано как числитель для ratio метрики среднего чека
        uniqExactIf(order_id, uuid != '' and completed_dt = events_dt) as orders_within_retilist, -- это значение может быть использовано как знаменатель для ratio метрики среднего чека
        max(if(uuid != '' and completed_dt = events_dt, 1, 0)) as conversion_within_retilist,
        minIf(completed_dt, completed_dt = events_dt) as first_order_date,  
        arrayExists(elem -> assumeNotNull(elem <= toDate(first_order_date + INTERVAL 14 DAY) and elem > first_order_date), groupArray(toDate(shipped_at))) as ret_14d_within_retilist
    from ab_groups_x_events
    left join (select * from financial_measures where completed_dt >= start_date and completed_dt <= end_date + interval 14 day) financial_measures
        on order_id = order_number
    where tenant = 'sbermarket'
    group by anonymous_id, platform, tenant
   HAVING platform IN {tuple(platforms)}
    """
    return read_sql_query(q)

In [47]:
test_general = get_general_metrics_oos(['android','web'], end_date, end_date, sources_web, sources_new_app)

In [48]:
test_general.shape

(261466, 8)

In [49]:
test_general.head()

Unnamed: 0,anonymous_id,platform,tenant,gmv_per_user_within_retilist,orders_within_retilist,conversion_within_retilist,first_order_date,ret_14d_within_retilist
0,9ae7a98c748b90e3,android,sbermarket,1869.057169,1,1,2023-11-06,0
1,b301bae22776e8e1,android,sbermarket,0.0,0,0,,0
2,58a97bf43c4f1376,android,sbermarket,2595.019643,1,1,2023-11-06,0
3,90e5f0104adbf1af,android,sbermarket,0.0,0,0,,0
4,10b0d179d5bbe86d,android,sbermarket,645.739809,1,1,2023-11-06,0


In [50]:
test_general.platform.value_counts()

android    219082
web         42384
Name: platform, dtype: int64

In [53]:
(
    test_general
    .groupby('platform')
    .ret_14d_within_retilist.max()
)

platform
android    0
web        0
Name: ret_14d_within_retilist, dtype: int64

# MDE

## Data_collection

In [61]:
import pickle

def dump_data(data_name, file_path):
    """
    Дампинг данных 
    
    :param data_name: файл 
    :param file_path: str, путь до файла 
    """
    with open(file_path, "wb") as f:
        pickle.dump(data_name, f)

def load_data(data_path):
    """
    Загрузка файла 
    
    :param data_path: str, путь до файла 
    """
    with open(data_path, 'rb') as f:
        data_name = pickle.load(f)
    return data_name

### Отмены/замены_load

In [59]:
not_found_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_not_found_ratio_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    not_found_data[name] = tmp

100%|██████████| 3/3 [27:07<00:00, 542.46s/it]


In [62]:
dump_data(not_found_data,'not_found_data.pickle')

### Время_сборки_load

In [60]:
collection_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_collection_time_ratio_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    collection_data[name] = tmp

100%|██████████| 3/3 [27:35<00:00, 551.92s/it]


In [63]:
dump_data(collection_data,'collection_data.pickle')

### AOV_goods_load

In [68]:
aov_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_aov_goods_ratio_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    aov_data[name] = tmp

100%|██████████| 3/3 [26:26<00:00, 528.81s/it]


In [69]:
dump_data(aov_data,'aov_data.pickle')

### AOV_goods_initial_load

In [71]:
aov_initial_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_initial_aov_goods_ratio_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    aov_initial_data[name] = tmp

100%|██████████| 3/3 [27:00<00:00, 540.08s/it]


In [72]:
dump_data(aov_initial_data,'aov_initial_data.pickle')

### Retention

In [75]:
retention_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_general_metrics_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    retention_data[name] = tmp

100%|██████████| 3/3 [26:15<00:00, 525.05s/it]


In [76]:
dump_data(retention_data,'retention_data.pickle')

## Data_preprocessing

### Отмены/замены_prep

In [73]:
%%time
for item in not_found_data:
    t = not_found_data[item]
    t = t.groupby(["anonymous_id", "platform"])[['not_found','total_items']].sum().reset_index()
    not_found_data[item] = t

CPU times: user 6.78 s, sys: 137 ms, total: 6.91 s
Wall time: 6.9 s


In [74]:
not_found_data['4w'].platform.value_counts()

android    683482
web        114477
Name: platform, dtype: int64

### Время_сборки_prep

In [77]:
%%time
for item in collection_data:
    t = collection_data[item]
    t = t.groupby(["anonymous_id", "platform"])[['assembly_speed_positions_num','n_shipments']].sum().reset_index()
    collection_data[item] = t

CPU times: user 4.65 s, sys: 2.66 ms, total: 4.65 s
Wall time: 4.69 s


In [78]:
collection_data['4w'].platform.value_counts()

android    531832
web        100273
Name: platform, dtype: int64

### AOV_goods_prep

In [79]:
%%time
for item in aov_data:
    t = aov_data[item]
    t = t.groupby(["anonymous_id", "platform"]).agg({'goods':'sum','order_number':'nunique'}).reset_index()
    aov_data[item] = t

CPU times: user 7.99 s, sys: 0 ns, total: 7.99 s
Wall time: 8.1 s


In [80]:
aov_data['4w'].platform.value_counts()

android    669200
web        107118
Name: platform, dtype: int64

### AOV_initial_prep

In [83]:
aov_initial_data['2w'].head()

Unnamed: 0,platform,anonymous_id,order_number,aov_initial_before,aov_initial_after
0,web,00540057-4c46-460e-8f4b-352be27d090e,R348772771,2050.470009,2050.470009
1,web,013dd674-8cb4-4869-b5c4-8dc6b21bc1de,R535705004,2768.809998,2340.809998
2,web,017e491f-3956-4a4c-bc4c-0affbb7534e8,R300857222,5256.139893,2628.069946
3,web,031ae5bf-3f65-444c-be06-10b6d31167c0,R271260184,1494.599989,1494.599989
4,web,032a2b2f-e96e-4d15-aa16-73e2e909f4ce,R301703084,10856.035005,11038.729998


In [84]:
%%time
for item in aov_initial_data:
    t = aov_initial_data[item]
    t = t.groupby(["anonymous_id", "platform"]).agg({'aov_initial_before':'sum','aov_initial_after':'sum','order_number':'nunique'}).reset_index()
    aov_initial_data[item] = t

CPU times: user 9.52 s, sys: 20.4 ms, total: 9.54 s
Wall time: 9.56 s


In [85]:
aov_initial_data['4w'].platform.value_counts()

android    669183
web        107152
Name: platform, dtype: int64

### Retention

In [88]:
retention_data['4w'].anonymous_id.value_counts().max()

1

### Итог

In [91]:
final_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    not_found_tmp = not_found_data[name].copy()
    col_tmp = collection_data[name].copy()
    aov_tmp = aov_data[name].copy()
    aov_initial_tmp = aov_initial_data[name].copy()
    retention_data_tmp = retention_data[name].copy()
    out = (
        not_found_tmp
        .merge(col_tmp, on = ['platform','anonymous_id'],how = 'outer')
        .merge(aov_tmp, on = ['platform','anonymous_id'],how = 'outer')
        .merge(aov_initial_tmp, on = ['platform','anonymous_id'],how = 'outer')
        .merge(retention_data_tmp, on = ['platform','anonymous_id'],how='outer')
    )
    final_data[name] = out

100%|██████████| 3/3 [00:31<00:00, 10.66s/it]


In [92]:
final_data['2w'].isna().sum()

anonymous_id                          0
platform                              0
not_found                       1349167
total_items                     1349167
assembly_speed_positions_num    1468325
n_shipments                     1468325
goods                           1363537
order_number_x                  1363537
aov_initial_before              1363521
aov_initial_after               1363521
order_number_y                  1363521
tenant                             4739
gmv_per_user_within_retilist       4739
orders_within_retilist             4739
conversion_within_retilist         4739
first_order_date                1347855
ret_14d_within_retilist            4739
dtype: int64

In [81]:
# Проверям пустые чеки (исходные)

In [93]:
final_data['2w'].query('oos_aov_initial_num != oos_aov_initial_num').query('goods==goods')

UndefinedVariableError: name 'oos_aov_initial_num' is not defined

In [91]:
aoi = get_initial_aov_goods_ratio(platforms, end_date - dt.timedelta(weeks = 2), end_date, oos_stores)

In [92]:
aof = get_aov_goods_ratio(platforms, end_date - dt.timedelta(weeks = 2), end_date, oos_stores)

In [108]:
nf = get_not_found_ratio(platforms, end_date - dt.timedelta(weeks = 2), end_date, oos_stores)

In [119]:
ct = get_collection_time_ratio(platforms, end_date - dt.timedelta(weeks = 2), end_date, oos_stores)

In [93]:
aof.query('anonymous_id == "056f9ad467c29f42"')

Unnamed: 0,platform,anonymous_id,order_number,goods
397153,android,056f9ad467c29f42,R086115625,3039.620117


In [94]:
aoi.query('anonymous_id == "056f9ad467c29f42"')

Unnamed: 0,platform,anonymous_id,order_number,oos_aov_initial_num


In [96]:
read_sql_query("""
SELECT *
  FROM shipments
 WHERE order_number = 'R086115625'
""")

Unnamed: 0,shipment_number,order_number,shipment_id,order_id,shipment_state,order_state,order_payment_state,driver_login,driver_name,driver_phone,shopper_login,shopper_name,shopper_phone,store_id,retailer_id,item_total,promo_total,total_cost,total,shipped_at,item_count,total_quantity,total_weight,starts_at,shipping_category_id,ends_at,completed_at,created_at,tenant_id,shipping_method_kind,retailer_name,b2b,company_document_id,store_name,city_id,city_name,receipt_total,fiscal_checksum,receipts_count,first_paid_at,user_id,phone,new_or_repeated,cohort_tenant,delivery_window_id,ship_address_id,api_client_id,uuid,company_name,company_inn,weight_over_base,nominal_cost,owner_id,owner_type,import_key,b2b_order_company_flg,payment_method_name,replaced_items_cnt,canceled_items_cnt,client_identifier,device_type,os,platform,promo_code,promo_type,shopper_call_flg,courier_call_flg,prev_delivery_window_id,store_express_delivery_flg,on_demand_delivery_flg,rate,comment,delivery_area_id,base_store_id,retailer_size,division_manager,city_manager,supervisor,address_latitude,address_longitude,prime_flg,external_service_state,external_id,b2b_measure,external_assembly_kind,orders_api_integration_type,payment_id,is_pharmacy_flg,type_delivery,type_store_delivery,prime_lvl,cost,invoice_number,invoice_total,zone_name,zone_id,retailer_discount_item_total,loyal_user_flg,assembly_rate,shipping_rate,retailer_category_name,integration_type,delivery_window_created_at
0,H40289121689,R086115625,316833697,253063671,shipped,complete,paid,9996362082,Першина Ксения Михайловна,79384117066,222409044305,Левина Инна Андреевна,79282478506,251,15,3039.620117,0.0,0.0,3039.620117,2023-08-14 11:25:03,1,38,15960,2023-08-14 10:30:00,2,2023-08-14 17:00:00,2023-08-14 09:30:44,2022-12-15 11:00:45,sbermarket,pickup,АШАН,0,,"АШАН, Краснодар, улица Уральская, 79 .",9,Краснодар,3039.620117,2424485804,1,2023-08-14 13:08:00,15331402,2950132536493270676,repeated,auchan,277452504,133863773,10,469e87a8-c2ea-4ba0-9339-310c7b3a61b6,,,0.0,0.0,15331402,Spree::User,10-21,0,Картой онлайн,0,0,SbermarketAndroid,mobile,android,app,[],[],0,0,0,0,0,5.0,,4113,251,hypermarket,Лукомский Олег Владимирович,Джанхот Маргарита Алиевна,Нархова Елена Ивановна,45.025288,39.108463,1,,,0,0,0,159488704,0,pickup,planned,prime+,0.0,,0.0,"ЗОНА Г-Ашан, Уральская",5372,0.0,not_loyal,5,0,Продукты питания,universal,2023-08-07 23:03:22


In [105]:
read_sql_query("""
SELECT *
  FROM line_items
 WHERE order_number = 'R086115625'
   AND toDate(order_completed_at) = toDate('2023-08-14')
""")

Unnamed: 0,line_item_id,offer_id,price,retailer_shelf_price,cost_price,vat_rate,quantity,found_quantity,discount,order_id,shipment_id,tenant_id,store_id,city,order_completed_at,shipment_shipped_at,order_state,shipment_state,product_id,product_name,brand_id,brand_name,manufacturer,manufacturer_id,retailer_sku,assembly_issue,li_created_at,li_deleted_at,b2b,retailer_name,retailer_id,user_id,master_category,master_category_id,parent_category_id,uuid,sku,order_number
0,2799640070,22652954484,345.98999,345.98999,345.98999,20,1,,0.0,253063671,316833697,sbermarket,251,Краснодар,2023-08-14 09:30:44,2023-08-14 11:25:03,complete,shipped,27297833,Наполнитель для кошачьего туалета АШАН Красная...,8261,АШАН Красная птица,Лидинг ООО,6482,842656,,2023-05-31 10:15:15,2023-08-14 09:04:39,0,АШАН,15,15331402,Наполнители,1053,1050,0d641a39-216d-4b0e-adba-1db77ab241f5,15276586,R086115625
1,2694598489,1658494566,176.990005,176.990005,176.990005,20,1,,0.0,253063671,316833697,sbermarket,251,Краснодар,2023-08-14 09:30:44,2023-08-14 11:25:03,complete,shipped,5074492,Конфеты Raffaello с цельным миндальным орехом ...,2119,Raffaello,Ферреро Руссия ЗАО,5940,882974,,2023-05-04 11:55:47,2023-05-04 11:56:50,0,АШАН,15,15331402,Конфеты и сладкие подарочные наборы,61,56,169f8326-71c0-4df3-aaf4-d734eb7d05cc,139256,R086115625


In [None]:
# Проверям пустые чеки (итоговые)

In [96]:
final_data['2w'].query('goods != goods')

Unnamed: 0,anonymous_id,platform,not_found,total_items,assembly_speed_positions_num,n_shipments,goods,order_number_x,aov_initial_before,aov_initial_after,order_number_y,tenant,gmv_per_user_within_retilist,orders_within_retilist,conversion_within_retilist,first_order_date,ret_14d_within_retilist
56,000682bf-b846-4ae8-b700-8a989d379315,web,2.0,18.0,1233.5,1.0,,,,,,sbermarket,0.0,0.0,0.0,,0.0
62,00076b82cdc0c5e2,android,0.0,9.0,899.5,1.0,,,,,,sbermarket,0.0,0.0,0.0,,0.0
70,0008793fa71e4e08,android,0.0,26.0,2070.0,1.0,,,,,,sbermarket,0.0,0.0,0.0,,0.0
88,000a509ab59f8de0,android,1.0,101.0,11335.5,11.0,,,,,,sbermarket,0.0,0.0,0.0,,0.0
111,000c445d2838af2b,android,2.0,23.0,2051.0,1.0,,,,,,sbermarket,0.0,0.0,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877432,006e1965b648a12f,android,,,,,,,,,,sbermarket,0.0,0.0,0.0,,0.0
1877433,274d524f-d81f-47d8-903d-5363188384ff,web,,,,,,,,,,sbermarket,0.0,0.0,0.0,,0.0
1877434,59ca33cd-33f1-478f-8ced-87e9ceae7f54,web,,,,,,,,,,sbermarket,0.0,0.0,0.0,,0.0
1877435,b8f428d25e3000fa,android,,,,,,,,,,sbermarket,0.0,0.0,0.0,,0.0


In [110]:
nf.query('anonymous_id == "000a509ab59f8de0"')

Unnamed: 0,platform,anonymous_id,order_number,order_id,completed_at,total_items,repl,canc,not_found,rc_rate
218518,android,000a509ab59f8de0,R345246042,349754810,2023-08-08 14:01:46,2,0,0,0,0.0
220274,android,000a509ab59f8de0,R754514567,353835672,2023-08-21 04:40:37,4,0,0,0,0.0
220275,android,000a509ab59f8de0,R837783041,354001854,2023-08-21 04:41:22,15,0,0,0,0.0
235832,android,000a509ab59f8de0,R553763856,352009394,2023-08-19 06:56:03,20,2,0,2,10.0
252947,android,000a509ab59f8de0,R765123067,350802793,2023-08-11 04:35:20,15,0,0,0,0.0
252948,android,000a509ab59f8de0,R741020041,350877291,2023-08-11 04:38:53,3,0,0,0,0.0
252949,android,000a509ab59f8de0,R017004467,350877956,2023-08-11 06:32:16,40,3,1,4,9.756098
254813,android,000a509ab59f8de0,R463530013,350905937,2023-08-13 15:30:40,27,0,1,1,3.571429
484669,android,000a509ab59f8de0,R766121107,351745883,2023-08-14 14:14:40,13,1,0,1,7.692308
557543,android,000a509ab59f8de0,R328428704,353834252,2023-08-20 11:59:54,1,0,0,0,0.0


In [112]:
read_sql_query("""
SELECT owner_type,*
  FROM shipments
 WHERE order_number = 'R345246042'
""")

Unnamed: 0,owner_type,shipment_number,order_number,shipment_id,order_id,shipment_state,order_state,order_payment_state,driver_login,driver_name,driver_phone,shopper_login,shopper_name,shopper_phone,store_id,retailer_id,item_total,promo_total,total_cost,total,shipped_at,item_count,total_quantity,total_weight,starts_at,shipping_category_id,ends_at,completed_at,created_at,tenant_id,shipping_method_kind,retailer_name,b2b,company_document_id,store_name,city_id,city_name,receipt_total,fiscal_checksum,receipts_count,first_paid_at,user_id,phone,new_or_repeated,cohort_tenant,delivery_window_id,ship_address_id,api_client_id,uuid,company_name,company_inn,weight_over_base,nominal_cost,owner_id,owner_type.1,import_key,b2b_order_company_flg,payment_method_name,replaced_items_cnt,canceled_items_cnt,client_identifier,device_type,os,platform,promo_code,promo_type,shopper_call_flg,courier_call_flg,prev_delivery_window_id,store_express_delivery_flg,on_demand_delivery_flg,rate,comment,delivery_area_id,base_store_id,retailer_size,division_manager,city_manager,supervisor,address_latitude,address_longitude,prime_flg,external_service_state,external_id,b2b_measure,external_assembly_kind,orders_api_integration_type,payment_id,is_pharmacy_flg,type_delivery,type_store_delivery,prime_lvl,cost,invoice_number,invoice_total,zone_name,zone_id,retailer_discount_item_total,loyal_user_flg,assembly_rate,shipping_rate,retailer_category_name,integration_type,delivery_window_created_at
0,Spree::User,H31353796589,R345246042,429325988,349754810,shipped,complete,paid,342603685224,Новиков Олег Игоревич,79220423079,341600497703,Сафонова Надежда Юрьевна,79047554738,107,1,1013.909973,0.0,59.0,1072.910034,2023-08-08 16:40:59,2,4,3630,2023-08-08 15:00:00,2,2023-08-08 18:00:00,2023-08-08 14:01:46,2023-08-07 10:26:32,sbermarket,by_courier,METRO,1,,"METRO, Волгоград, Историческая ул. 164",12,Волгоград,0.0,,0,,1940749,5651203267827947780,repeated,sbermarket,274008803,208462832,10,4579c507-7b45-48ef-9750-4bbf9586a926,,,0.0,229.0,1940749,Spree::User,1-28,0,Картой онлайн,0,0,SbermarketAndroid,mobile,android,app,[],[],0,0,0,0,0,0.0,,0,0,hypermarket,Синчугов Иван Алексеевич,Треножкин Дмитрий Викторович,Храмов Аркадий Викторович,48.729786,44.5252,1,,,2,0,0,157843931,0,planned,planned,prime+,59.0,0/0/0094/050631,874.530029,Историческая Метро К,12738,56.91,no_info,0,0,Продукты питания,universal,2023-08-01 23:01:37


In [114]:
read_sql_query("""
SELECT b2b_order_company_flg, b2b_measure
  FROM financial_measures
 WHERE order_number = 'R345246042'
""")

Unnamed: 0,b2b_order_company_flg,b2b_measure
0,0,2


In [None]:
# Проверям пустое время сборки

In [116]:
final_data['2w'].query('assembly_speed_positions_num != assembly_speed_positions_num')

Unnamed: 0,anonymous_id,platform,not_found,total_items,assembly_speed_positions_num,n_shipments,goods,order_number_x,oos_aov_initial_num,order_number_y
3,00010adb4bbc712c,android,0.0,22.0,,,3125.300049,2.0,3125.299974,2.0
24,00053fbdd166e105,android,0.0,96.0,,,11565.059814,3.0,11565.060116,3.0
30,000611cda706943d,android,0.0,7.0,,,720.429993,1.0,720.430016,1.0
37,000755ec3f765b50,android,0.0,9.0,,,1589.880005,1.0,1589.880013,1.0
38,00075951e01323ab,android,0.0,8.0,,,942.099976,1.0,942.100014,1.0
...,...,...,...,...,...,...,...,...,...,...
460079,ffffdcfecfa9e7f6,android,0.0,6.0,,,1394.869995,2.0,1394.870014,2.0
460142,270017ce6b507db6,android,,,,,,,3812.059988,1.0
460143,67d02803-437e-4f48-9fb8-286d6ffd94f9,web,,,,,,,519.000000,1.0
460144,783f9a7dba591329,android,,,,,,,26997.000000,1.0


In [122]:
nf.query('anonymous_id == "00010adb4bbc712c"')

Unnamed: 0,platform,anonymous_id,order_number,order_id,completed_at,total_items,repl,canc,not_found,rc_rate
18512,android,00010adb4bbc712c,R484810503,350034006,2023-08-13 10:41:40,10,0,0,0,0.0
308741,android,00010adb4bbc712c,R813060778,351646601,2023-08-18 09:51:39,12,0,0,0,0.0


In [128]:
read_sql_query("""
SELECT shipment_number, shipped_at FROM shipments where order_number IN ('R484810503','R813060778')
""")

Unnamed: 0,shipment_number,shipped_at
0,H71808562601,2023-08-13 11:25:35
1,H94553473094,2023-08-18 10:26:14


In [None]:
read_sql_query("""
select toDate(shipped_at) as Date, 
           store_id as store_id_click, 
           start_assembling_to_first_item * ifNull(start_to_first_item_coef, 1)   
               + items_collection_time * ifNull(items_collection_time_coef, 1)                            
               + last_item_to_cashing_started * ifNull(last_item_to_cashing_coef, 1) as assembly_speed_positions_num 
    from sandbox.cte_decomposition_new
    where toDate(shipped_at) >= toDate('{date_from}') 
    and toDate(shipped_at) <= toDate('{date_to}')
    and overall_time_of_order is not null
    фтв
""")

In [130]:
read_sql_query("""
SELECT toDate(shipped_at) as Date, 
           store_id as store_id_click, 
           start_assembling_to_first_item * ifNull(start_to_first_item_coef, 1)   
               + items_collection_time * ifNull(items_collection_time_coef, 1)                            
               + last_item_to_cashing_started * ifNull(last_item_to_cashing_coef, 1) as assembly_speed_positions_num 
  FROM sandbox.cte_decomposition_new 
 where shipment_number in ('H71808562601','H94553473094') 
   AND toDate(shipped_at) IN (toDate('2023-08-13'),toDate('2023-08-18'))
 limit 10""")

Unnamed: 0,Date,store_id_click,assembly_speed_positions_num
0,2023-08-18,33860,
1,2023-08-13,31354,


## MDE_estimation

In [102]:
res = {"platform": [], 
       "length": [], 
       "traffic_proportion": [], 
       "alpha":[], 
       "nobs":[],
       "est":[],
       "mde_abs": [], 
       "mde_percent": [],
       "metric": []
      }

In [103]:
# data = {f'{i+min(lengths)}w':old_data[j] for i,j in enumerate(old_data.keys())}
data = final_data.copy()

In [104]:
# format: (num, denum), if denum is None => average metric
metrics = [
    ("not_found", "total_items"),
    ("assembly_speed_positions_num", "n_shipments"),
    ("goods", "order_number_x"),
    ("aov_initial_before", "order_number_y"),
    ("aov_initial_after", "order_number_y"),
    ("ret_14d_within_retilist", None)
]

In [105]:
ratio = 1
power = 0.8

for platform in platforms:
    for length in data.keys():
        for traffic_proportion in traffic_proportions:
            for metric in tqdm(metrics):
                df = data[length].loc[data[length]["platform"]==platform].sample(frac=traffic_proportion)
                num_col = metric[0]
                denum_col = metric[1]
                
                if num_col == "gmv_per_user" and denum_col == "order_count": # aov preprocessing
                    df = df[df["order_count"]>0]
                if num_col == "assembly_speed_positions_num" and denum_col == "n_shipments": # aov preprocessing
                    df = df[df["n_shipments"]>0]
                if num_col == "not_found" and denum_col == "total_items": # aov preprocessing
                    df = df[df["total_items"]>0]
                if num_col == "goods" and denum_col == "order_number_x": # aov preprocessing
                    df = df[df["order_number_x"]>0]
                if num_col == "aov_initial_before" and denum_col == "order_number_y": # aov preprocessing
                    df = df[df["order_number_y"]>0]
                if num_col == "aov_initial_after" and denum_col == "order_number_y": # aov preprocessing
                    df = df[df["order_number_y"]>0]
                if num_col == "ret_14d_within_retilist": # aov preprocessing
                    df = df[df["ret_14d_within_retilist"]==df["ret_14d_within_retilist"]]                    
                    
                df = df.sample(frac=0.5) # select one group
                
                if denum_col:
                    num = df[num_col]
                    denum = df[denum_col]
                    est, sd = deltamethod(denum, num, independent=False, bc=True)
                else:
                    est = df[num_col].mean()
                    sd = df[num_col].std()
                
                nobs_test = df[num_col].shape[0]
                
                for threshold in thresholds:
                    alpha = threshold
                    effect_size =  tt_ind_solve_power(power=power, nobs1=nobs_test, alpha=alpha, ratio=ratio)
                    mde_percent = effect_size*sd/est
                    mde_abs = mde_percent*est
                    res["platform"].append(platform)
                    res["length"].append(length)
                    res["traffic_proportion"].append(traffic_proportion)
                    res["alpha"].append(alpha)
                    res["nobs"].append(nobs_test)
                    res["est"].append(est)
                    res["mde_abs"].append(mde_abs)
                    res["mde_percent"].append(mde_percent)
                    res["metric"].append(metric)
                

100%|██████████| 6/6 [00:03<00:00,  1.53it/s]
100%|██████████| 6/6 [00:04<00:00,  1.21it/s]
100%|██████████| 6/6 [00:07<00:00,  1.21s/it]
100%|██████████| 6/6 [00:10<00:00,  1.68s/it]
100%|██████████| 6/6 [00:12<00:00,  2.07s/it]
100%|██████████| 6/6 [00:04<00:00,  1.30it/s]
100%|██████████| 6/6 [00:06<00:00,  1.00s/it]
100%|██████████| 6/6 [00:09<00:00,  1.53s/it]
100%|██████████| 6/6 [00:12<00:00,  2.07s/it]
100%|██████████| 6/6 [00:16<00:00,  2.79s/it]
100%|██████████| 6/6 [00:06<00:00,  1.03s/it]
100%|██████████| 6/6 [00:07<00:00,  1.29s/it]
100%|██████████| 6/6 [00:11<00:00,  1.97s/it]
100%|██████████| 6/6 [00:15<00:00,  2.55s/it]
100%|██████████| 6/6 [00:20<00:00,  3.46s/it]
100%|██████████| 6/6 [00:01<00:00,  3.18it/s]
100%|██████████| 6/6 [00:02<00:00,  2.73it/s]
100%|██████████| 6/6 [00:02<00:00,  2.20it/s]
100%|██████████| 6/6 [00:03<00:00,  1.78it/s]
100%|██████████| 6/6 [00:03<00:00,  1.58it/s]
100%|██████████| 6/6 [00:02<00:00,  2.67it/s]
100%|██████████| 6/6 [00:02<00:00,

In [106]:
res.keys()

dict_keys(['platform', 'length', 'traffic_proportion', 'alpha', 'nobs', 'est', 'mde_abs', 'mde_percent', 'metric'])

In [107]:
mde_df = pd.DataFrame(res)
mde_df["mde_abs"] = mde_df["mde_abs"].round(4)
mde_df["mde_percent"] = (mde_df["mde_percent"]*100).round(2).astype(str) + "%"
# mde_df["metric"] = mde_df["metric"].astype(str).replace(metric_names)

In [108]:
(
    mde_df
    .query('alpha == 0.05')
    .query('traffic_proportion == 1')
    .sort_values(by = ['metric','platform','length'])
)

Unnamed: 0,platform,length,traffic_proportion,alpha,nobs,est,mde_abs,mde_percent,metric
56,android,2w,1.0,0.05,225132,2492.287723,18.6968,0.75%,"(aov_initial_after, order_number_y)"
116,android,3w,1.0,0.05,283919,2490.515726,16.4591,0.66%,"(aov_initial_after, order_number_y)"
176,android,4w,1.0,0.05,334592,2517.113628,15.5559,0.62%,"(aov_initial_after, order_number_y)"
236,web,2w,1.0,0.05,31826,2886.240884,70.0084,2.43%,"(aov_initial_after, order_number_y)"
296,web,3w,1.0,0.05,42718,2852.41455,55.3826,1.94%,"(aov_initial_after, order_number_y)"
356,web,4w,1.0,0.05,53576,2852.771036,48.3469,1.69%,"(aov_initial_after, order_number_y)"
54,android,2w,1.0,0.05,225132,2669.791938,41.9773,1.57%,"(aov_initial_before, order_number_y)"
114,android,3w,1.0,0.05,283919,2657.210199,32.8805,1.24%,"(aov_initial_before, order_number_y)"
174,android,4w,1.0,0.05,334592,2684.785728,22.7306,0.85%,"(aov_initial_before, order_number_y)"
234,web,2w,1.0,0.05,31826,3090.028753,173.0836,5.6%,"(aov_initial_before, order_number_y)"


In [111]:
mde_df.reset_index(drop=True).assign(metric = lambda _df: _df.metric.astype(str)).to_feather('mde_df.feather')

In [113]:
mde_df

Unnamed: 0,platform,length,traffic_proportion,alpha,nobs,est,mde_abs,mde_percent,metric
0,android,2w,0.125,0.05,28720,0.060070,0.0016,2.69%,"(not_found, total_items)"
1,android,2w,0.125,0.01,28720,0.060070,0.0020,3.28%,"(not_found, total_items)"
2,android,2w,0.125,0.05,21921,1445.594640,33.0128,2.28%,"(assembly_speed_positions_num, n_shipments)"
3,android,2w,0.125,0.01,21921,1445.594640,40.2730,2.79%,"(assembly_speed_positions_num, n_shipments)"
4,android,2w,0.125,0.05,27906,2497.876223,48.9896,1.96%,"(goods, order_number_x)"
...,...,...,...,...,...,...,...,...,...
355,web,4w,1.000,0.01,53576,3066.389657,81.7145,2.66%,"(aov_initial_before, order_number_y)"
356,web,4w,1.000,0.05,53576,2852.771036,48.3469,1.69%,"(aov_initial_after, order_number_y)"
357,web,4w,1.000,0.01,53576,2852.771036,58.9796,2.07%,"(aov_initial_after, order_number_y)"
358,web,4w,1.000,0.05,401648,0.067233,0.0016,2.33%,"(ret_14d_within_retilist, None)"


In [120]:
metric_names = dict.fromkeys(mde_df.metric.astype(str).unique())
metric_names

{"('not_found', 'total_items')": None,
 "('assembly_speed_positions_num', 'n_shipments')": None,
 "('goods', 'order_number_x')": None,
 "('aov_initial_before', 'order_number_y')": None,
 "('aov_initial_after', 'order_number_y')": None,
 "('ret_14d_within_retilist', None)": None}

In [121]:
metric_names = \
{"('not_found', 'total_items')": "Доля отмен/замен",
 "('assembly_speed_positions_num', 'n_shipments')": "Среднее время сборки",
 "('goods', 'order_number_x')": "Средний чек",
 "('aov_initial_before', 'order_number_y')": "Средний чек по line_items (до)",
 "('aov_initial_after', 'order_number_y')": "Средний чек по line_items (после)",
 "('ret_14d_within_retilist', None)": "Ретеншн 14д"}

In [127]:
print((
    mde_df
    .assign(metric_name = lambda _df: _df.metric.astype(str).replace(metric_names))
).to_markdown(index=False))

| platform   | length   |   traffic_proportion |   alpha |   nobs |          est |   mde_abs | mde_percent   | metric                                          | metric_name                       |
|:-----------|:---------|---------------------:|--------:|-------:|-------------:|----------:|:--------------|:------------------------------------------------|:----------------------------------|
| android    | 2w       |                0.125 |    0.05 |  28720 |    0.0600704 |    0.0016 | 2.69%         | ('not_found', 'total_items')                    | Доля отмен/замен                  |
| android    | 2w       |                0.125 |    0.01 |  28720 |    0.0600704 |    0.002  | 3.28%         | ('not_found', 'total_items')                    | Доля отмен/замен                  |
| android    | 2w       |                0.125 |    0.05 |  21921 | 1445.59      |   33.0128 | 2.28%         | ('assembly_speed_positions_num', 'n_shipments') | Среднее время сборки              |
| android    | 

# AA & AB

In [139]:
synthetic_data = {}
for l in tqdm(lengths):
    name = f"{l}w"
    tmp = get_not_found_ratio_oos(platforms, end_date - dt.timedelta(weeks = l), end_date,  sources_web, sources_new_app)
    synthetic_data[name] = tmp

100%|██████████| 3/3 [42:51<00:00, 857.01s/it]


In [140]:
alpha = 0.05
sample_size = 0.5
effect = -0.1

In [141]:
traffic_proportions = [1]

## AA

In [142]:
%%time
out = {"platform": [], 
       "length": [], 
       "traffic_proportion": [], 
       "power":[]
}
p_values = {}

for platform in platforms:
    for length in data.keys():
        for traffic_proportion in traffic_proportions:
            
            state = f'{platform}_{length}_{traffic_proportion}'
            
            df_users = synthetic_data[length][['anonymous_id']].drop_duplicates().sample(frac=traffic_proportion)
            df_raw = synthetic_data[length].query('platform == @platform').query('anonymous_id.isin(@df_users.anonymous_id.tolist())', engine='python')
                        
            df_raw['weekday'] = df_raw.completed_at.dt.weekday + 1 
            
            p_values[state] = []
            
            for _ in tqdm(range(100)):
                
                df = df_raw.sort_values(by = 'not_found', ascending = False).copy()
                
                test_users = df_users.sample(frac=0.5)
                df['group'] = np.where(df.anonymous_id.isin(test_users.anonymous_id.values),'test','control')
                
                df_metrics = df.groupby(["anonymous_id", "platform","group"])[['not_found','total_items']].sum().reset_index()
                
                p, _, _ = calculate_pvalue('test', 'control', ('not_found','total_items'), 'group', df_metrics, metric_type='ratio')
                p_values[state].append(p)
                

            errors = (np.array(p_values[state])>alpha).astype(int)
            power = 1 - np.mean(errors)
                
            out['platform'].append(platform)
            out['length'].append(length)
            out['traffic_proportion'].append(traffic_proportion)
            out['power'].append(power)                

100%|██████████| 100/100 [04:51<00:00,  2.92s/it]
100%|██████████| 100/100 [06:59<00:00,  4.19s/it]
100%|██████████| 100/100 [09:12<00:00,  5.53s/it]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [01:11<00:00,  1.40it/s]
100%|██████████| 100/100 [01:25<00:00,  1.16it/s]

CPU times: user 24min 16s, sys: 21.1 s, total: 24min 37s
Wall time: 24min 38s





In [143]:
pd.DataFrame(out)

Unnamed: 0,platform,length,traffic_proportion,power
0,android,2w,1,0.05
1,android,3w,1,0.04
2,android,4w,1,0.05
3,web,2w,1,0.03
4,web,3w,1,0.07
5,web,4w,1,0.03


In [109]:
np.sum(np.array(p_values['web_2w_1'])>0.05)

100

In [110]:
p_values['web_2w_1']

[0.36898239203911387,
 0.33937531707227586,
 0.30875933255099847,
 0.3440126553493108,
 0.292256820155299,
 0.33359924888017645,
 0.29562825438984636,
 0.34977656211211117,
 0.25518476354985037,
 0.3237396451878439,
 0.39342655685748085,
 0.32017998346441584,
 0.2682432301481323,
 0.36069049711422185,
 0.32883655137085277,
 0.3192357159804766,
 0.38004870182756023,
 0.35693704152081196,
 0.3800836526263771,
 0.34644846131669427,
 0.31301718612660206,
 0.32730386931465283,
 0.31585108833239695,
 0.3166405157374499,
 0.32444230714531586,
 0.2503063411936794,
 0.27207257464251416,
 0.33971896257504897,
 0.30725518833695237,
 0.4242375740709726,
 0.24887214453063766,
 0.3560841384714546,
 0.32241109542533875,
 0.3748666502510756,
 0.35450875238633606,
 0.2981656677553302,
 0.4055451862830265,
 0.3493535833802641,
 0.35601983561516515,
 0.2940545515982298,
 0.37829408041660106,
 0.3489306692751132,
 0.3044311384870983,
 0.3365760650037406,
 0.32434286959825254,
 0.322242105660448,
 0.305537

### AB

In [148]:
effect = -0.05

In [149]:
%%time
out = {"platform": [], 
       "length": [], 
       "traffic_proportion": [], 
       "power":[]
}
p_values = {}

for platform in platforms:
    for length in data.keys():
        for traffic_proportion in traffic_proportions:
            
            state = f'{platform}_{length}_{traffic_proportion}'
            
            df_users = synthetic_data[length][['anonymous_id']].drop_duplicates().sample(frac=traffic_proportion)
            df_raw = synthetic_data[length].query('platform == @platform').query('anonymous_id.isin(@df_users.anonymous_id.tolist())', engine='python')
                        
            df_raw['weekday'] = df_raw.completed_at.dt.weekday + 1
            
            p_values[state] = []
            
            for _ in tqdm(range(100)):
                
                df = df_raw.sort_values(by = 'not_found', ascending = False).copy()
                
                test_users = df_users.sample(frac=0.5)
                df['group'] = np.where(df.anonymous_id.isin(test_users.anonymous_id.values),'test','control')
                tmp = (
                    df
                    .query('group=="test"')
                    .query("weekday.isin([5,6,7,1])", engine='python')
                    [['anonymous_id','platform','order_number','not_found']]
                )
                
                old_not_found_total = tmp.not_found.sum()
                new_not_found_total = int(old_not_found_total*(1+effect))
                tmp['weight'] = tmp.not_found / tmp.not_found.sum()
                tmp['new_not_found'] = np.round(tmp.weight*new_not_found_total)
                tmp['kostyl'] = tmp.new_not_found.cumsum()
                tmp.loc[tmp.kostyl>new_not_found_total, 'new_not_found'] = 0
                
                df = df.merge(tmp[['anonymous_id','platform','order_number','new_not_found']], on = ['anonymous_id','platform','order_number'], how = 'left')
                df['final_not_found'] = df.not_found
                df.loc[df['new_not_found'].notna(), 'final_not_found'] = df['new_not_found']
                
                df_metrics = df.groupby(["anonymous_id", "platform","group"])[['final_not_found','total_items']].sum().reset_index()
                
                p, _, _ = calculate_pvalue('test', 'control', ('final_not_found','total_items'), 'group', df_metrics, metric_type='ratio')
                p_values[state].append(p)
                

            errors = (np.array(p_values[state])>alpha).astype(int)
            power = 1 - np.mean(errors)
                
            out['platform'].append(platform)
            out['length'].append(length)
            out['traffic_proportion'].append(traffic_proportion)
            out['power'].append(power)                

100%|██████████| 100/100 [08:19<00:00,  4.99s/it]
100%|██████████| 100/100 [12:02<00:00,  7.23s/it]
100%|██████████| 100/100 [16:12<00:00,  9.73s/it]
100%|██████████| 100/100 [01:05<00:00,  1.52it/s]
100%|██████████| 100/100 [01:41<00:00,  1.01s/it]
100%|██████████| 100/100 [02:07<00:00,  1.28s/it]

CPU times: user 40min 7s, sys: 1min 30s, total: 41min 38s
Wall time: 41min 40s





In [150]:
pd.DataFrame(out)

Unnamed: 0,platform,length,traffic_proportion,power
0,android,2w,1,1.0
1,android,3w,1,1.0
2,android,4w,1,1.0
3,web,2w,1,0.9
4,web,3w,1,0.98
5,web,4w,1,0.98


In [146]:
p_values

{'android_2w_1': [2.5709574176189915e-77,
  7.135668096207158e-81,
  4.250476604311184e-89,
  7.098531939336588e-79,
  1.241327868542202e-85,
  9.704407793047085e-78,
  2.899874046530344e-54,
  2.0921322263528e-90,
  2.7389492462767956e-72,
  1.2950266322627779e-87,
  1.0598767550622386e-69,
  1.358287213235657e-78,
  9.460556727161592e-88,
  8.164073913702496e-84,
  1.1416273828025342e-91,
  1.214196954025985e-78,
  7.003667817585489e-80,
  5.932734639017576e-72,
  2.8718014003322634e-77,
  5.383682761868047e-91,
  6.868125668483939e-82,
  2.03071131991529e-79,
  4.535219624266471e-65,
  9.337395859510062e-80,
  3.625803746066295e-82,
  4.177026193193421e-95,
  1.312533609963917e-88,
  2.495624368831387e-84,
  2.6046626376416986e-89,
  6.305025443387996e-88,
  1.859488857186489e-71,
  9.795228519333371e-73,
  1.9127785081040375e-86,
  3.092480139300434e-88,
  4.206572834151088e-76,
  1.1385043040492983e-85,
  1.0840210737311506e-84,
  1.142336348887695e-91,
  1.3823729030789968e-84,
 