Here we'll put the bulk of the work.

In [1]:
import pandas as pd
import numpy as np

v_df = pd.read_csv(
    'datasets/visits_log_us.csv',
    parse_dates=['start_time', 'end_time'],
    dtype=
        {
            'device': 'category',
            'source_id': 'category'
        }
    )
o_df = pd.read_csv(
    'datasets/orders_log_us.csv',
    parse_dates=['purchase_time']
)
c_df = pd.read_csv(
    'datasets/costs_us.csv',
    parse_dates=['date'],
    dtype=
        {
            'source_id': 'category'
        }
)

# for checking on the data, but only one line needs to be commented out or un-commented out
check_data = False
# check_data = True
if check_data:
    v_df.info()
    print(v_df.head())
    print()
    o_df.info()
    print(o_df.head())
    print()
    c_df.info()
    print(c_df.head())

In [2]:
v_df['session_year']  = v_df['start_time'].dt.isocalendar().year
v_df['session_month'] = v_df['start_time'].dt.month
v_df['session_week']  = v_df['start_time'].dt.isocalendar().week
v_df['session_date'] = v_df['start_time'].dt.date

print(v_df.sample(7))

         device            end_time source_id          start_time  \
208606    touch 2017-11-11 19:33:00         3 2017-11-11 19:32:00   
40435   desktop 2017-10-02 17:55:00         5 2017-10-02 17:52:00   
234600  desktop 2017-09-25 12:06:00         4 2017-09-25 12:06:00   
236919  desktop 2018-01-02 01:57:00         5 2018-01-02 01:56:00   
355041    touch 2017-12-05 20:13:00         4 2017-12-05 19:57:00   
236234  desktop 2017-12-29 19:52:00         3 2017-12-29 19:50:00   
210584    touch 2018-05-24 14:21:00         1 2018-05-24 13:36:00   

                         uid  session_year  session_month  session_week  \
208606    726182164025950557          2017             11            45   
40435   12467258775864758748          2017             10            40   
234600  13738821311423048524          2017              9            39   
236919   9483480442402464301          2018              1             1   
355041  15662027884326541793          2017             12            49 

In [3]:
dau_total = (
    v_df.groupby('session_date')
    .agg({'uid': 'nunique'})
    .mean()
)

wau_total = (
    v_df.groupby(['session_year', 'session_week'])
    .agg({'uid': 'nunique'})
    .mean()
)

mau_total = (
    v_df.groupby(['session_year', 'session_month'])
    .agg({'uid': 'nunique'})
    .mean()
)

sticky_wau = (dau_total / wau_total) * 100
sticky_mau = (dau_total / mau_total) * 100

print(f'dau = {int(dau_total)} wau = {int(wau_total)} mau = {int(mau_total)}')
print(f'sticky_wau = {float(sticky_wau):.2f}% sticky_mau = {float(sticky_mau):.2f}%')

dau = 907 wau = 5716 mau = 23228
sticky_wau = 15.88% sticky_mau = 3.91%


In [4]:
v_df['session_duration_mins'] = (
    v_df['end_time'] - v_df['start_time']
).dt.seconds / 60

print(v_df.sample(10))

         device            end_time source_id          start_time  \
41423   desktop 2018-05-25 09:42:00         4 2018-05-25 09:22:00   
84362     touch 2017-07-28 17:35:00         4 2017-07-28 17:33:00   
42283   desktop 2018-05-17 11:18:00         2 2018-05-17 11:17:00   
91223   desktop 2017-07-24 13:48:00         3 2017-07-24 13:45:00   
75025   desktop 2017-06-23 11:46:00         2 2017-06-23 11:45:00   
100630    touch 2018-03-16 11:25:00        10 2018-03-16 11:19:00   
168964  desktop 2018-01-21 19:10:00         4 2018-01-21 19:00:00   
27776   desktop 2017-09-21 12:05:00         9 2017-09-21 12:04:00   
247790  desktop 2017-11-29 09:31:00         2 2017-11-29 09:30:00   
232281  desktop 2018-02-19 11:44:00         3 2018-02-19 11:35:00   

                         uid  session_year  session_month  session_week  \
41423    4557690209991592540          2018              5            21   
84362    2521821748719908325          2017              7            30   
42283   1677682

In [5]:
v_by_day = v_df.groupby('session_date').agg({
    'start_time': 'count'
})
v_by_day = v_by_day.rename(
    columns={
        'start_time': 'sessions'
    }
)
print(v_by_day.sample(10))

              sessions
session_date          
2017-07-17        1278
2017-10-09        1362
2018-03-29        1128
2017-11-01        1339
2017-09-15         774
2017-08-10         470
2017-08-21         558
2018-04-10         965
2018-02-11        1124
2018-01-27        1044


In [6]:
first_session_date = v_df.groupby(['uid'])['session_date'].min()
first_session_date.name = 'first_session_date'

v_df = v_df.join(first_session_date, on='uid')

In [8]:
first_session_week = v_df.groupby(['uid'])['session_week'].min()
first_session_week.name = 'first_session_week'

v_df = v_df.join(first_session_week, on='uid')

In [9]:
first_session_month = v_df.groupby(['uid'])['session_month'].min()
first_session_month.name = 'first_session_month'

v_df = v_df.join(first_session_month, on='uid')

In [10]:
v_df['cohort_lifetime_months'] = ((
        v_df['session_date']
            -
        v_df['first_session_date']
    ) / np.timedelta64(1, 'M')
).astype(int)

print(v_df.sample(10))

         device            end_time source_id          start_time  \
165109    touch 2018-03-28 18:12:00         1 2018-03-28 18:02:00   
150093  desktop 2018-02-08 22:31:00         5 2018-02-08 22:30:00   
13695     touch 2017-08-07 14:03:00         3 2017-08-07 13:49:00   
103965  desktop 2017-10-22 20:52:00         3 2017-10-22 20:52:00   
14541   desktop 2018-03-30 09:24:00         4 2018-03-30 08:32:00   
164850    touch 2017-09-08 14:45:00         3 2017-09-08 14:40:00   
248016  desktop 2018-01-05 10:42:00         4 2018-01-05 10:35:00   
221960  desktop 2017-06-28 00:57:00         3 2017-06-28 00:34:00   
149830    touch 2018-03-03 08:16:00         1 2018-03-03 07:52:00   
319662  desktop 2018-03-05 11:30:00         4 2018-03-05 10:45:00   

                         uid  session_year  session_month  session_week  \
165109   7937065672578171549          2018              3            13   
150093   2243689000952188372          2018              2             6   
13695   1372108

In [11]:
cohorts = (
    v_df.groupby(['first_session_month', 'cohort_lifetime_months'])
    .agg({'uid': 'nunique'})
    .reset_index()
)
initial_users_count = cohorts[cohorts['cohort_lifetime_months'] == 0][ ['first_session_month', 'uid'] ]
initial_users_count = initial_users_count.rename( columns={'uid': 'cohort_users'} ) 

cohorts = cohorts.merge(initial_users_count, on='first_session_month')

cohorts['retention'] = cohorts['uid'] / cohorts['cohort_users']

print(cohorts.sample(10))

    first_session_month  cohort_lifetime_months    uid  cohort_users  \
63                    6                       3    433         11309   
39                    4                       3    235         17468   
10                    1                      10    272         28716   
45                    4                       9    226         17468   
82                    9                       3    188         14790   
76                    8                       2    375          8945   
74                    8                       0   8945          8945   
12                    2                       0  26007         26007   
36                    4                       0  17468         17468   
32                    3                       8    361         23510   

    retention  
63   0.038288  
39   0.013453  
10   0.009472  
45   0.012938  
82   0.012711  
76   0.041923  
74   1.000000  
12   1.000000  
36   1.000000  
32   0.015355  


In [12]:
# make and print table
retention_pivot = cohorts.pivot_table(
    index='first_session_month',
    columns='cohort_lifetime_months',
    values='retention',
    aggfunc='sum',
)

print(retention_pivot)

cohort_lifetime_months   0         1         2         3         4         5   \
first_session_month                                                             
1                       1.0  0.118053  0.098865  0.084343  0.062021  0.048997   
2                       1.0  0.061945  0.065675  0.051102  0.037913  0.026839   
3                       1.0  0.041684  0.035006  0.034368  0.033730  0.027690   
4                       1.0  0.028681  0.006526  0.013453  0.026219  0.024216   
5                       1.0  0.008554  0.004990  0.004574  0.011287  0.019425   
6                       1.0  0.034397  0.032187  0.038288  0.034928  0.033336   
7                       1.0  0.035284  0.036070  0.033188  0.033362  0.014847   
8                       1.0  0.043823  0.041923  0.034097  0.013639       NaN   
9                       1.0  0.051454  0.043475  0.012711       NaN       NaN   
10                      1.0  0.044922  0.017683       NaN       NaN       NaN   
11                      1.0 