In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/Users/ilamosin/abTest_pet/app_logs.csv', parse_dates=['date'])
df.head(5)

Unnamed: 0,date,user_id,exp_group,session_id,session_length,device,order_cnt,price,quantity_cnt
0,2022-06-21,1727468184902936832,0,16557919071727468184902936832,1752,mobile,0,0.0,0
1,2022-06-21,3556219827372626733,0,16558186423556219827372626733,86,mobile,0,0.0,0
2,2022-06-21,2136855388003761026,0,16558402842136855388003761026,222,mobile,0,0.0,0
3,2022-06-21,4593970126020001597,0,16558003654593970126020001597,400,mobile,0,0.0,0
4,2022-06-21,3962568860806737949,1,16558324233962568860806737949,1300,mobile,0,0.0,0


## EDA (exploratory data analysis)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269142 entries, 0 to 269141
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            269142 non-null  datetime64[ns]
 1   user_id         269142 non-null  int64         
 2   exp_group       269142 non-null  int64         
 3   session_id      269142 non-null  object        
 4   session_length  269142 non-null  int64         
 5   device          269142 non-null  object        
 6   order_cnt       269142 non-null  int64         
 7   price           269142 non-null  float64       
 8   quantity_cnt    269142 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(2)
memory usage: 18.5+ MB


In [4]:
print(f'Devices which users use: {df.device.unique()}')
print(f'Unique users count: {df.user_id.nunique()}')
print(f'Unique session count: {df.session_id.nunique()}')
print(f'Max date: {df.date.max()}')
print(f'Min date: {df.date.min()}')
print(f'Diff between count session_id and unique session_id : {df.session_id.count()-df.session_id.nunique()}')

Devices which users use: ['mobile' 'desktop' 'tablet']
Unique users count: 68287
Unique session count: 267929
Max date: 2022-07-13 00:00:00
Min date: 2022-06-21 00:00:00
Diff between count session_id and unique session_id : 1213


In [5]:
two_more_sess = df\
    .groupby(['user_id', 'session_id', 'device'])\
    .agg({'session_id':'count'})\
    .rename(columns={'session_id':'count_s'})\
    .query('count_s>1')\
    .sort_values('count_s',  ascending=False)\
    .reset_index()\
    .head(20)

In [6]:
two_more_sess

Unnamed: 0,user_id,session_id,device,count_s
0,16965268489840455,165584166516965268489840455,desktop,2
1,6188767753121332219,16571344236188767753121332219,desktop,2
2,6179372293126118898,16574857006179372293126118898,mobile,2
3,6178250370357987265,16558445156178250370357987265,mobile,2
4,6162080960945238324,16573105496162080960945238324,desktop,2
5,6162080960945238324,16572272406162080960945238324,desktop,2
6,6162080960945238324,16562757896162080960945238324,desktop,2
7,6157659880517790061,16567937066157659880517790061,desktop,2
8,6157032325675880426,16570546826157032325675880426,mobile,2
9,6149419174013868201,16575728676149419174013868201,desktop,2


In [7]:
df[df['session_id']=='16569673066180726904333254161']

Unnamed: 0,date,user_id,exp_group,session_id,session_length,device,order_cnt,price,quantity_cnt
157890,2022-07-04,6180726904333254161,1,16569673066180726904333254161,1065,desktop,0,0.0,0
175317,2022-07-05,6180726904333254161,1,16569673066180726904333254161,1958,desktop,0,0.0,0


In [8]:
df[df['session_id']=='16567944336087415308146188769']

Unnamed: 0,date,user_id,exp_group,session_id,session_length,device,order_cnt,price,quantity_cnt
143798,2022-07-02,6087415308146188769,1,16567944336087415308146188769,1149,mobile,0,0.0,0
149475,2022-07-03,6087415308146188769,1,16567944336087415308146188769,895,mobile,0,0.0,0


## Aggregating signals to the metrics

In [30]:
con = sqlite3.connect('db')

In [31]:
df.to_sql('metrics', con, index=False, if_exists='replace')

269142

In [295]:
sql = '''
with cte as (
    SELECT
        date, 
        exp_group,
        user_id,
        session_id,
        session_length,
        order_cnt,
        price,
        quantity_cnt,
        IIF(sum(price)>0, 1, 0) AS conversion 
    FROM
        metrics
    GROUP BY
        date,
        exp_group,
        user_id,
        session_id
)
SELECT
    DATE(date) as date, 
    exp_group,
    user_id,
    conversion,
    session_id,
    sum(session_length) as sum_ses_len,
    ROUND((sum(price)),2) as rev,
    sum(quantity_cnt) as sum_quant,
    CAST(((((COUNT(CASE WHEN price>0 THEN 1 END))*0.1)/count(session_id))*1000) as INT) as p_secs_pay_ses
FROM
    cte
GROUP BY
    user_id,
    date
'''

In [296]:
df_f = pd.read_sql(sql, con)

In [297]:
df_f

Unnamed: 0,date,exp_group,user_id,conversion,session_id,sum_ses_len,rev,sum_quant,p_secs_pay_ses
0,2022-07-06,0,125741107230500,0,1657093005125741107230500,1037,0.00,0,0
1,2022-07-05,0,667585499425385,0,1657003845667585499425385,432,0.00,0,0
2,2022-07-11,0,681720336100105,0,1657485675681720336100105,4453,0.00,0,0
3,2022-06-30,1,767791454221621,0,1656573195767791454221621,89,0.00,0,0
4,2022-07-04,1,767791454221621,0,1656928523767791454221621,220,0.00,0,0
...,...,...,...,...,...,...,...,...,...
188844,2022-07-07,1,9223272807582961860,1,16571695339223272807582961860,1488,19887.33,8,100
188845,2022-07-08,1,9223272807582961860,1,16572540399223272807582961860,471,19887.33,8,100
188846,2022-07-09,1,9223272807582961860,1,16573422619223272807582961860,312,9943.66,4,100
188847,2022-07-10,1,9223272807582961860,1,16574340469223272807582961860,146,14915.50,6,100


In [285]:
df_f.groupby('exp_group')\
    .agg({'secs_pay_ses':'mean'})

Unnamed: 0_level_0,secs_pay_ses
exp_group,Unnamed: 1_level_1
0,6.922402
1,7.145106


In [214]:
df_f[df_f['user_id']==3253297642743067]

Unnamed: 0,date,exp_group,user_id,conversion,session_id,sum_ses_len,rev,sum_quant
19,2022-06-22 00:00:00,1,3253297642743067,1,16558936393253297642743067,1703,4040.05,3
