In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import json
import argparse
import os
from datetime import date, datetime, timedelta
from pprint import pprint

# import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
# from dash import Dash, dcc, html
# from plotnine import *
from pymongo import MongoClient

import sys
sys.path.append("./app/")
from log import get_logger

In [2]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5
LOOKBACK_PERIOD_DAYS = 30
ACTIVATION_WINDOW = 20

# SESSION_COUNT_THRESHOLDS = [1, 3, 5]
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_user or not mongo_password:
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    exit(1)


In [3]:
dt = datetime.today().strftime("%Y-%m-%d")
# dt = datetime.strptime("2022-12-19", "%Y-%m-%d").strftime("%Y-%m-%d")

window = LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW + 1

In [4]:
client = MongoClient(
    CONNECTION_TEMPLATE.format(user=mongo_user, password=mongo_password),
    unicode_decode_error_handler='ignore',
)

main_db = client.main
events_collection = main_db.log_events
user_collection = main_db.users


end = datetime.strptime(
    dt, "%Y-%m-%d").astimezone(pytz.timezone("US/Pacific"))

In [5]:
# print(end.date(), window)

In [131]:
# get user data

cursor = user_collection.find()
df_users = pd.DataFrame(list(cursor))
df_users = df_users.rename(columns={"_id": "user_id"}, errors="raise")
df_users = df_users[["user_id", "email", "name", "created_at"]]
df_users["user_id"] = df_users["user_id"].astype(str)
df_users = df_users.rename(columns={"created_at": "signed_up_at"})


print(df_users.shape)
df_users.head()

(2223, 4)


Unnamed: 0,user_id,email,name,signed_up_at
0,61463e014cf3dfd537c399fb,john@generaltask.com,John Reinstra,NaT
1,6146889f4cf3dfd537c8d987,jack_hamilton@me.com,Jack Hamilton,NaT
2,616cd20a2dbdcd0a7ba1642c,jreinstra@gmail.com,John Reinstra,NaT
3,61722cb1e1abefac8feddc31,scottmai702@gmail.com,Scott Mai,NaT
4,617343f4e1abefac8f00ab42,nolan1299@gmail.com,Nolan Jimenez,NaT


In [7]:
# generate event level data

# query events table
date_filter = {"created_at": {
    "$gt": end - timedelta(days=window)
    # , "$lt": end
}}
cursor = events_collection.find(date_filter)
events_df = pd.DataFrame(list(cursor))
events_df["user_id"] = events_df["user_id"].astype(str)
print(events_df.shape)

# merge with users
events_df = events_df.merge(df_users, on="user_id", how="left")
print(events_df.shape)

# add PST timestamps and sort by user and timestamps
events_df = events_df.rename(columns={"_id": "event_id"}, errors="raise")
events_df["ts_pst"] = events_df.created_at.dt.tz_localize(
    pytz.utc).dt.tz_convert('US/Pacific')
events_df["dt"] = events_df.ts_pst.dt.date  # date in PST

print(events_df.shape)
events_df.head(3)

(9378025, 4)
(9378025, 6)
(9378025, 8)


Unnamed: 0,event_id,user_id,event_type,created_at,email,name,ts_pst,dt
0,63c8cd63ed503dccb5843fda,63c8c88bd9c144308101b202,api_hit_/recurring_task_templates/modify/63c8c...,2023-01-19 04:56:03.432,kdy0250@gmail.com,김동영,2023-01-18 20:56:03.432000-08:00,2023-01-18
1,63c8cd62ed503dccb5843fd9,63c794e78a6901a5fded6e87,api_hit_/events/,2023-01-19 04:56:02.952,milkade01@gmail.com,Jinn Cha,2023-01-18 20:56:02.952000-08:00,2023-01-18
2,63c8cd612e8cc1f830512fdc,6394f4a61ab04f28d8dfb6cc,api_hit_/events/,2023-01-19 04:56:01.110,laikhtewari1@gmail.com,Laikh Tewari,2023-01-18 20:56:01.110000-08:00,2023-01-18


In [8]:
# events_df.to_csv('events_df.csv')

In [29]:
events_df.event_type.str.contains('time_spent')

0          False
1          False
2          False
3          False
4          False
           ...  
9378020    False
9378021    False
9378022    False
9378023    False
9378024    False
Name: event_type, Length: 9378025, dtype: bool

In [47]:
time_spent_raw = (
    events_df
    # .groupby('user_')
    [events_df.event_type.str.contains('time_spent')]
)
time_spent_raw.head()

Unnamed: 0,event_id,user_id,event_type,created_at,email,name,ts_pst,dt
7,63c8cd5eed503dccb5843fd1,63c758fd8a6901a5fdd5e26a,"{""type"":""time_spent"",""time_focused"":28343,""tim...",2023-01-19 04:55:58.022,skdev24@gmail.com,shivam dev,2023-01-18 20:55:58.022000-08:00,2023-01-18
45,63c8cd4e2e8cc1f830512fb1,63c758fd8a6901a5fdd5e26a,"{""type"":""time_spent"",""time_focused"":119992,""ti...",2023-01-19 04:55:42.104,skdev24@gmail.com,shivam dev,2023-01-18 20:55:42.104000-08:00,2023-01-18
46,63c8cd4ded503dccb5843fa4,63c868f6d9c1443081df44bb,"{""type"":""time_spent"",""time_focused"":0,""time_vi...",2023-01-19 04:55:41.566,clny8912@gmail.com,Chris Lee,2023-01-18 20:55:41.566000-08:00,2023-01-18
57,63c8cd47ed503dccb5843f97,63c794e78a6901a5fded6e87,"{""type"":""time_spent"",""time_focused"":116430,""ti...",2023-01-19 04:55:35.662,milkade01@gmail.com,Jinn Cha,2023-01-18 20:55:35.662000-08:00,2023-01-18
83,63c8cd3a2e8cc1f830512f7f,63c8b557d9c1443081fb6dfe,"{""type"":""time_spent"",""time_focused"":0,""time_vi...",2023-01-19 04:55:22.289,oka@lxgic.com,Hiroshi Oka,2023-01-18 20:55:22.289000-08:00,2023-01-18


In [66]:
time_spent_concat = (
    pd.concat(
        [time_spent_raw,
         time_spent_raw.event_type.apply(lambda x: pd.Series(json.loads(x)))],
        axis=1,
    )
)
time_spent_concat 

Unnamed: 0,event_id,user_id,event_type,created_at,email,name,ts_pst,dt,type,time_focused,time_visible
7,63c8cd5eed503dccb5843fd1,63c758fd8a6901a5fdd5e26a,"{""type"":""time_spent"",""time_focused"":28343,""tim...",2023-01-19 04:55:58.022,skdev24@gmail.com,shivam dev,2023-01-18 20:55:58.022000-08:00,2023-01-18,time_spent,28343,120008
45,63c8cd4e2e8cc1f830512fb1,63c758fd8a6901a5fdd5e26a,"{""type"":""time_spent"",""time_focused"":119992,""ti...",2023-01-19 04:55:42.104,skdev24@gmail.com,shivam dev,2023-01-18 20:55:42.104000-08:00,2023-01-18,time_spent,119992,119992
46,63c8cd4ded503dccb5843fa4,63c868f6d9c1443081df44bb,"{""type"":""time_spent"",""time_focused"":0,""time_vi...",2023-01-19 04:55:41.566,clny8912@gmail.com,Chris Lee,2023-01-18 20:55:41.566000-08:00,2023-01-18,time_spent,0,119999
57,63c8cd47ed503dccb5843f97,63c794e78a6901a5fded6e87,"{""type"":""time_spent"",""time_focused"":116430,""ti...",2023-01-19 04:55:35.662,milkade01@gmail.com,Jinn Cha,2023-01-18 20:55:35.662000-08:00,2023-01-18,time_spent,116430,119993
83,63c8cd3a2e8cc1f830512f7f,63c8b557d9c1443081fb6dfe,"{""type"":""time_spent"",""time_focused"":0,""time_vi...",2023-01-19 04:55:22.289,oka@lxgic.com,Hiroshi Oka,2023-01-18 20:55:22.289000-08:00,2023-01-18,time_spent,0,119998
...,...,...,...,...,...,...,...,...,...,...,...
713376,63bf43edabc6d211e5004a4e,6346075975ebd7528bd59371,"{""type"":""time_spent"",""time_focused"":693,""time_...",2023-01-11 23:19:09.439,micahdkim1@gmail.com,Micah Kim,2023-01-11 15:19:09.439000-08:00,2023-01-11,time_spent,693,723
713641,63bf4375728a680326e989c2,6346075975ebd7528bd59371,"{""type"":""time_spent"",""time_focused"":1590,""time...",2023-01-11 23:17:09.433,micahdkim1@gmail.com,Micah Kim,2023-01-11 15:17:09.433000-08:00,2023-01-11,time_spent,1590,1637
713911,63bf42f7abc6d211e5004884,6346075975ebd7528bd59371,"{""type"":""time_spent"",""time_focused"":858,""time_...",2023-01-11 23:15:03.756,micahdkim1@gmail.com,Micah Kim,2023-01-11 15:15:03.756000-08:00,2023-01-11,time_spent,858,5607
714160,63bf4288abc6d211e50047d7,6346075975ebd7528bd59371,"{""type"":""time_spent"",""time_focused"":5483,""time...",2023-01-11 23:13:12.436,micahdkim1@gmail.com,Micah Kim,2023-01-11 15:13:12.436000-08:00,2023-01-11,time_spent,5483,6781


In [72]:
list(time_spent_concat[time_spent_concat.time_visible < 0].iloc[:5].event_type)

['{"type":"time_spent","time_focused":72675,"time_visible":-32288113}',
 '{"type":"time_spent","time_focused":5227,"time_visible":-32288116}',
 '{"type":"time_spent","time_focused":44908,"time_visible":-3484479}',
 '{"type":"time_spent","time_focused":16830,"time_visible":-7084188}']

In [85]:
len(time_spent_concat[time_spent_concat.time_visible > 130*1000])

424

In [91]:
len(time_spent_concat[(time_spent_concat.time_visible > 130*1000) | (time_spent_concat.time_focused > 130*1000)]) / len(time_spent_concat) * 100

2.022584009795474

In [107]:
len(time_spent_concat[time_spent_concat.time_focused > -1]) / len(time_spent_concat) * 100

100.0

In [112]:
# time_spent_concat[(time_spent_concat.time_focused > -5) & (time_spent_concat.time_focused <= 0)]

In [117]:
time_spent= (
    time_spent_concat
    [['email', 'dt', 'ts_pst', 'time_focused', 'time_visible']]
    [
        True
        & (time_spent_concat.time_visible >= 0)
        & (time_spent_concat.time_focused >= 0)
        & (time_spent_concat.time_visible < 130*1000)
        & (time_spent_concat.time_focused < 130*1000)
    ]
)
len(time_spent)/ len(time_spent_concat)*100

97.95927622330053

In [120]:
time_spent['time_focused_s'] = time_spent['time_focused']/1000
time_spent['time_visible_s'] = time_spent['time_visible']/1000
(
    time_spent
    .sort_values(['time_visible'])
    .describe()
)

Unnamed: 0,time_focused,time_visible,time_focused_s,time_visible_s
count,21601.0,21601.0,21601.0,21601.0
mean,39795.144669,85445.781955,39.795145,85.445782
std,51698.806089,48886.797901,51.698806,48.886798
min,0.0,0.0,0.0,0.0
25%,0.0,32400.0,0.0,32.4
50%,3144.0,119994.0,3.144,119.994
75%,116506.0,120002.0,116.506,120.002
max,129823.0,129823.0,129.823,129.823


In [121]:
time_spent

Unnamed: 0,email,dt,ts_pst,time_focused,time_visible,time_focused_s,time_visible_s
7,skdev24@gmail.com,2023-01-18,2023-01-18 20:55:58.022000-08:00,28343,120008,28.343,120.008
45,skdev24@gmail.com,2023-01-18,2023-01-18 20:55:42.104000-08:00,119992,119992,119.992,119.992
46,clny8912@gmail.com,2023-01-18,2023-01-18 20:55:41.566000-08:00,0,119999,0.000,119.999
57,milkade01@gmail.com,2023-01-18,2023-01-18 20:55:35.662000-08:00,116430,119993,116.430,119.993
83,oka@lxgic.com,2023-01-18,2023-01-18 20:55:22.289000-08:00,0,119998,0.000,119.998
...,...,...,...,...,...,...,...
713376,micahdkim1@gmail.com,2023-01-11,2023-01-11 15:19:09.439000-08:00,693,723,0.693,0.723
713641,micahdkim1@gmail.com,2023-01-11,2023-01-11 15:17:09.433000-08:00,1590,1637,1.590,1.637
713911,micahdkim1@gmail.com,2023-01-11,2023-01-11 15:15:03.756000-08:00,858,5607,0.858,5.607
714160,micahdkim1@gmail.com,2023-01-11,2023-01-11 15:13:12.436000-08:00,5483,6781,5.483,6.781


In [148]:
time_spent_daily = (
    time_spent[['email', 'dt', 'time_visible_s', 'time_focused_s']]
    .groupby(['email', 'dt'])
    .sum()
    .reset_index()
)
time_spent_daily

Unnamed: 0,email,dt,time_visible_s,time_focused_s
0,1206.sachin@gmail.com,2023-01-17,2705.161,23.841
1,1206.sachin@gmail.com,2023-01-18,767.895,631.279
2,200wilson@gmail.com,2023-01-17,36.623,36.579
3,2343777@gmail.com,2023-01-18,32.423,31.322
4,3sco@mpyr3.com,2023-01-18,16501.518,36.871
...,...,...,...,...
1297,zhuxiaote@gmail.com,2023-01-15,2314.795,1038.219
1298,zhuxiaote@gmail.com,2023-01-18,55.363,55.327
1299,zinjifrah@gmail.com,2023-01-17,114.757,114.429
1300,zivnavoth@gmail.com,2023-01-17,201.573,179.128


In [178]:
(
    time_spent_daily
    [
        True
        & (time_spent_daily.time_visible_s > 0)
        & (time_spent_daily.time_focused_s > 0)
    ]
    .describe()
)

Unnamed: 0,time_visible_s,time_focused_s
count,1291.0,1291.0
mean,1403.460771,640.100325
std,4125.669309,2441.085342
min,0.018,0.015
25%,49.4685,30.7595
50%,196.835,111.772
75%,777.6545,344.289
max,56258.096,34680.481


In [179]:
users_since_product_hunt = df_users[(df_users.signed_up_at > "2023-01-16")]
users_since_product_hunt.email

1097           bjorn.antell@gmail.com
1098       charliedonnelly4@gmail.com
1099             vasu.nandi@gmail.com
1100          claytonhalim3@gmail.com
1101            mar38094306@gmail.com
                    ...              
2218            ananeagu012@gmail.com
2219                  rindo@width.app
2220          yamadarei.000@gmail.com
2221              rockygrey@gmail.com
2222    jeremy.fiest@safetyculture.io
Name: email, Length: 1126, dtype: object

In [180]:
(
    time_spent_daily
    [
        True
        & (time_spent_daily.time_visible_s > 0)
        & (time_spent_daily.time_focused_s > 0)
        & ~time_spent_daily.email.isin(users_since_product_hunt.email)
    ]
    .describe()
)

Unnamed: 0,time_visible_s,time_focused_s
count,280.0,280.0
mean,3255.401168,1645.236729
std,7292.913499,4730.668283
min,0.018,0.015
25%,132.36025,50.06775
50%,475.998,219.8555
75%,2325.18175,824.659
max,56258.096,34680.481


In [185]:
(
    time_spent_daily
    [
        True
        & (time_spent_daily.time_visible_s > 0)
        & (time_spent_daily.time_focused_s > 0)
        & ~time_spent_daily.email.isin(users_since_product_hunt.email)
    ]
    .groupby('email')
    .mean()
    .describe()
)

  time_spent_daily


Unnamed: 0,time_visible_s,time_focused_s
count,106.0,106.0
mean,1955.271236,1074.249188
std,3851.631197,2431.435587
min,0.879,0.82
25%,139.5925,68.929
50%,386.7541,208.6955
75%,1374.707333,584.666268
max,20420.662429,13624.454


In [175]:
(
    time_spent_daily
    [
        True
        & (time_spent_daily.time_visible_s > 0)
        & (time_spent_daily.time_focused_s > 0)
        & ~time_spent_daily.email.isin(users_since_product_hunt.email)
        & time_spent_daily.dt.apply(lambda x: x.weekday() < 5)
    ]
    .describe()
)

Unnamed: 0,time_visible_s,time_focused_s
count,235.0,235.0
mean,3193.945183,1542.010094
std,7411.76192,4715.51416
min,0.018,0.015
25%,134.92,50.853
50%,444.089,213.776
75%,2329.513,762.599
max,56258.096,34680.481
