In [55]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import argparse
import os
from datetime import date, datetime, timedelta
from pprint import pprint

import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
from dash import Dash, dcc, html
from plotnine import *
from pymongo import MongoClient

import sys
sys.path.append("./app/")
from log import get_logger

In [2]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5
LOOKBACK_PERIOD_DAYS = 30
ACTIVATION_WINDOW = 20

# SESSION_COUNT_THRESHOLDS = [1, 3, 5]
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_user or not mongo_password:
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    exit(1)


In [83]:
# dt = datetime.today().strftime("%Y-%m-%d")
dt = datetime.strptime("2022-12-19", "%Y-%m-%d").strftime("%Y-%m-%d")

window = LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW + 1

In [84]:
client = MongoClient(
    CONNECTION_TEMPLATE.format(user=mongo_user, password=mongo_password),
    unicode_decode_error_handler='ignore',
)

main_db = client.main
events_collection = main_db.log_events
user_collection = main_db.users


end = datetime.strptime(
    dt, "%Y-%m-%d").astimezone(pytz.timezone("US/Pacific"))

In [82]:
# print(end.date(), window)

In [85]:
# get user data

cursor = user_collection.find()
df_users = pd.DataFrame(list(cursor))
df_users = df_users.rename(columns={"_id": "user_id"}, errors="raise")
df_users = df_users[["user_id", "email", "name"]]
df_users["user_id"] = df_users["user_id"].astype(str)

print(df_users.shape)
df_users.head()

(964, 3)


Unnamed: 0,user_id,email,name
0,61463e014cf3dfd537c399fb,john@generaltask.com,John Reinstra
1,6146889f4cf3dfd537c8d987,jack_hamilton@me.com,Jack Hamilton
2,616cd20a2dbdcd0a7ba1642c,jreinstra@gmail.com,John Reinstra
3,61722cb1e1abefac8feddc31,scottmai702@gmail.com,Scott Mai
4,617343f4e1abefac8f00ab42,nolan1299@gmail.com,Nolan Jimenez


In [86]:
# generate event level data

# query events table
date_filter = {"created_at": {
    "$gt": end - timedelta(days=window)
    # , "$lt": end
}}
cursor = events_collection.find(date_filter)
events_df = pd.DataFrame(list(cursor))
events_df["user_id"] = events_df["user_id"].astype(str)
print(events_df.shape)

# merge with users
events_df = events_df.merge(df_users, on="user_id", how="left")
print(events_df.shape)

# add PST timestamps and sort by user and timestamps
events_df = events_df.rename(columns={"_id": "event_id"}, errors="raise")
events_df["ts_pst"] = events_df.created_at.dt.tz_localize(
    pytz.utc).dt.tz_convert('US/Pacific')
events_df["dt"] = events_df.ts_pst.dt.date  # date in PST

print(events_df.shape)
events_df.head(3)

(12758914, 4)
(12758914, 6)
(12758914, 8)


Unnamed: 0,event_id,user_id,event_type,created_at,email,name,ts_pst,dt
0,63b06651e762c71630549f27,62587d69ab75e6bfa2e919e9,api_hit_/tasks/v3/,2022-12-31 16:41:53.546,hans@generaltask.com,Hans van de Bruggen,2022-12-31 08:41:53.546000-08:00,2022-12-31
1,63b06651e762c71630549f26,62587d69ab75e6bfa2e919e9,api_hit_/overview/views/,2022-12-31 16:41:53.546,hans@generaltask.com,Hans van de Bruggen,2022-12-31 08:41:53.546000-08:00,2022-12-31
2,63b066516a1181e38c804f36,62587d69ab75e6bfa2e919e9,api_hit_/tasks/fetch/,2022-12-31 16:41:53.472,hans@generaltask.com,Hans van de Bruggen,2022-12-31 08:41:53.472000-08:00,2022-12-31


# Event Counts by Type

In [87]:
background_events = [
    "api_hit_/events/",
    "api_hit_/tasks/fetch/",
    "api_hit_/tasks/v3/",
    "api_hit_/pull_requests/fetch/",
    "api_hit_/pull_requests/",
]

df_event_type_counts = (
    events_df
    [~events_df.event_type.isin(background_events)]
    .groupby(["email", "dt", "event_type"])
    .agg(num_events=('event_id', 'count'))
    # .sort_values(["name", "dt", "num_events"], ascending=False)
    .reset_index()
)
df_event_type_counts.head(2)

Unnamed: 0,email,dt,event_type,num_events
0,127.0.0.69@gmail.com,2022-11-21,"""open_auth_window_https://api.generaltask.com/...",1
1,127.0.0.69@gmail.com,2022-11-21,"""open_auth_window_https://api.generaltask.com/...",1


In [166]:
df_event_type_counts["total_num_events"] = (
    df_event_type_counts
    .groupby(["dt", "email"])["num_events"]
    .transform(np.sum)
)
# df_event_type_counts["number_of_active_days"] = (
#     df_event_type_counts
#     .groupby(["email"])["num_events"]
#     .transform(np.sum)
# )
df_event_type_counts = (
    df_event_type_counts
    .sort_values(["dt", "total_num_events", "num_events"], ascending=[True, False, False])
)
df_event_type_counts.head()

Unnamed: 0,email,dt,event_type,num_events,total_num_events
18927,jerrylinew@gmail.com,2022-10-28,api_hit_/overview/views/,114,119
18924,jerrylinew@gmail.com,2022-10-28,api_hit_/linked_accounts/,1,119
18925,jerrylinew@gmail.com,2022-10-28,api_hit_/linked_accounts/supported_types/,1,119
18926,jerrylinew@gmail.com,2022-10-28,api_hit_/overview/supported_views/,1,119
18928,jerrylinew@gmail.com,2022-10-28,api_hit_/settings/,1,119


In [167]:
# df_event_type_counts.to_csv("./user_daily_events_by_type_all_types.csv")

In [168]:
activity_per_day = (
    df_event_type_counts
    .drop_duplicates(['email', 'dt', 'total_num_events'])
    [['email', 'dt', 'total_num_events']]
    # .group
)
# activity_per_day['num_days_active'] = (
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
# )
activity_per_day

Unnamed: 0,email,dt,total_num_events
18927,jerrylinew@gmail.com,2022-10-28,119
12018,garyshen.garyshen@gmail.com,2022-10-28,16
37295,ole@nordstar.com,2022-10-28,13
18933,jerrylinew@gmail.com,2022-10-29,737
12030,garyshen.garyshen@gmail.com,2022-10-29,217
...,...,...,...
27311,julian@generaltask.com,2022-12-31,28
5815,bwstearns@petaurus.co,2022-12-31,8
10941,emily@atlantic.money,2022-12-31,7
17234,info@codeagency.be,2022-12-31,4


In [169]:
# (
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
#     # .transform(np.unique)
    
# )

In [170]:
# (
#     df_event_type_counts
#     .groupby(["dt", "email"])["num_events"]
#     .transform(np.sum)
# )

In [171]:
# dt = datetime.today()
dt = datetime.strptime("2022-12-19", "%Y-%m-%d")

print('dt', (dt).strftime("%Y-%m-%d"))
print('dt - 20 days', (dt - timedelta(days=20)).strftime("%Y-%m-%d"))

print('dt - 30 days', (dt - timedelta(days=30)).strftime("%Y-%m-%d"))
print('dt - 30 - 20', (dt - timedelta(days=30+ACTIVATION_WINDOW)).strftime("%Y-%m-%d"))

min(df_event_type_counts.dt)
max(df_event_type_counts.dt)


dt 2022-12-19
dt - 20 days 2022-11-29
dt - 30 days 2022-11-19
dt - 30 - 20 2022-10-30


datetime.date(2022, 10, 28)

datetime.date(2022, 12, 31)

In [172]:
# num_days_active = pd.DataFrame(
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
# ).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
# activated_users = num_days_active[num_days_active.num_days_active > 4]
# # TODO: we should be doing this to calculate "activated" users on both of the dataframes separately
# activated_users

In [173]:
# activity_per_day = activity_per_day[activity_per_day.email.isin(activated_users.email)]

In [179]:
max(events_df.dt), min(events_df.dt)
max(activity_per_day.dt), min(activity_per_day.dt)
max(df_event_type_counts.dt), min(df_event_type_counts.dt)

(datetime.date(2022, 12, 31), datetime.date(2022, 10, 28))

(datetime.date(2022, 12, 31), datetime.date(2022, 10, 28))

(datetime.date(2022, 12, 31), datetime.date(2022, 10, 28))

In [177]:
# import qgrid
# from pandasgui import show
# import tabloo
import dtale

In [178]:
# activity_per_day

num_days_active = pd.DataFrame(
    activity_per_day
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise").sort_values(['num_days_active'], ascending=False)
num_days_active['days_usage_perc'] = num_days_active.num_days_active / (max(activity_per_day.dt) - min(activity_per_day.dt)).days
dtale.show(num_days_active)



In [161]:
num_days_active.to_csv('num_days_active.csv')

In [192]:
num_days_active_threshold = 2

In [193]:
activated_users_today = (
    activity_per_day
    [
        (activity_per_day.dt >= (dt - timedelta(days=ACTIVATION_WINDOW)).date())
        & (activity_per_day.dt <= (dt).date())
        & (activity_per_day.total_num_events > 0)
        # & (activity_per_day.total_num_events > 1)
    ]
)
# activated_users_today
activated_users_today.shape


num_days_active = pd.DataFrame(
    activated_users_today
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
activated_users = num_days_active[num_days_active.num_days_active > num_days_active_threshold]


activated_users_today = activated_users_today[activated_users_today.email.isin(activated_users.email)]
activated_users_today.shape

(1225, 3)

(985, 3)

In [194]:
min(activity_per_day.dt)
max(activity_per_day.dt)

datetime.date(2022, 10, 28)

datetime.date(2022, 12, 31)

In [195]:
activated_users_30_days_ago = (
    activity_per_day
    [
        (activity_per_day.dt >= (dt - timedelta(days=30+ACTIVATION_WINDOW)).date())
        & (activity_per_day.dt <= (dt - timedelta(days=30)).date())
        & (activity_per_day.total_num_events > 0)
        # & (activity_per_day.total_num_events > 1)
    ]
)

activated_users_30_days_ago.shape


num_days_active = pd.DataFrame(
    activated_users_30_days_ago
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
activated_users = num_days_active[num_days_active.num_days_active > num_days_active_threshold]


activated_users_30_days_ago = activated_users_30_days_ago[activated_users_30_days_ago.email.isin(activated_users.email)]
activated_users_30_days_ago.shape

(606, 3)

(452, 3)

In [196]:
s_activated_users_today = set(activated_users_today.email)
s_activated_users_30_days_ago = set(activated_users_30_days_ago.email)
churned_users = s_activated_users_30_days_ago.difference(s_activated_users_today)
users_retained = s_activated_users_30_days_ago.intersection(s_activated_users_today)
len(s_activated_users_today)
len(s_activated_users_30_days_ago)
len(churned_users)
len(users_retained)

115

52

23

29

In [197]:
len(users_retained)/len(s_activated_users_30_days_ago)

0.5576923076923077

In [56]:
'agaldy@standvast.com' in s_activated_users_30_days_ago
'agaldy@standvast.com' in churned_users

False

False

In [78]:
events_df[events_df.email == 'rob@vanta.com'].sort_values(['dt'])

Unnamed: 0,event_id,user_id,event_type,created_at,email,name,time_since_previous_event_this_day,ts_pst,dt
10377468,63705b796fe5b2bba051e48b,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-13 02:50:33.052,rob@vanta.com,Rob Picard,NaT,2022-11-12 18:50:33.052000-08:00,2022-11-12
10364215,637079c4c49a61eb2840d21a,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-13 04:59:48.245,rob@vanta.com,Rob Picard,0 days 00:00:06.130000,2022-11-12 20:59:48.245000-08:00,2022-11-12
10364217,637079bec49a61eb2840d218,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-13 04:59:42.115,rob@vanta.com,Rob Picard,0 days 00:00:28,2022-11-12 20:59:42.115000-08:00,2022-11-12
10364258,637079a2c49a61eb2840d1e1,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-13 04:59:14.115,rob@vanta.com,Rob Picard,0 days 00:00:27.873000,2022-11-12 20:59:14.115000-08:00,2022-11-12
10364317,63707986c49a61eb2840d1b0,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-13 04:58:46.242,rob@vanta.com,Rob Picard,0 days 00:00:06.086000,2022-11-12 20:58:46.242000-08:00,2022-11-12
...,...,...,...,...,...,...,...,...,...
6850431,6384b8492e9c097db9fd7920,634d6957f6a8432dcc2d5b27,api_hit_/tasks/v3/,2022-11-28 13:31:53.855,rob@vanta.com,Rob Picard,0 days 00:00:02.574000,2022-11-28 05:31:53.855000-08:00,2022-11-28
6850430,6384b849f04b2799ef1c6db0,634d6957f6a8432dcc2d5b27,api_hit_/overview/views/,2022-11-28 13:31:53.890,rob@vanta.com,Rob Picard,0 days 00:00:00.035000,2022-11-28 05:31:53.890000-08:00,2022-11-28
6850333,6384b8862e9c097db9fd7993,634d6957f6a8432dcc2d5b27,api_hit_/events/,2022-11-28 13:32:54.848,rob@vanta.com,Rob Picard,0 days 00:01:00.958000,2022-11-28 05:32:54.848000-08:00,2022-11-28
6850331,6384b8872e9c097db9fd7995,634d6957f6a8432dcc2d5b27,api_hit_/overview/views/,2022-11-28 13:32:55.115,rob@vanta.com,Rob Picard,0 days 00:00:00.225000,2022-11-28 05:32:55.115000-08:00,2022-11-28


In [71]:
activity_per_day[activity_per_day.email == 'mmcconnell@nextdoor.com']

Unnamed: 0,email,dt,total_num_events
31158,mmcconnell@nextdoor.com,2022-11-10,980
31201,mmcconnell@nextdoor.com,2022-11-11,833
31217,mmcconnell@nextdoor.com,2022-11-12,770
31235,mmcconnell@nextdoor.com,2022-11-13,2176
31256,mmcconnell@nextdoor.com,2022-11-14,2468
31280,mmcconnell@nextdoor.com,2022-11-15,498
31288,mmcconnell@nextdoor.com,2022-11-16,223
31297,mmcconnell@nextdoor.com,2022-11-17,778
31316,mmcconnell@nextdoor.com,2022-11-18,262
31328,mmcconnell@nextdoor.com,2022-11-20,283


In [70]:
(
    activity_per_day[activity_per_day.email.isin(churned_users)]
    .sort_values(['email', 'dt'])
).to_csv('churned_users_correct_5_days.csv')

# Scratch

In [21]:
# df_daily_event_counts = (
#     df_event_type_counts
#     .groupby(["dt", "name"])
#     .agg(total_num_events=('num_events', 'sum'))
#     .sort_values(["dt", "total_num_events"], ascending=[True, False])

# )
# df_daily_event_counts

In [22]:
# df_daily_event_counts.to_csv("./user_daily_event_counts.csv")

In [23]:
# set(df_event_type_counts.event_type)