In [55]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import argparse
import os
from datetime import date, datetime, timedelta
from pprint import pprint

import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
from dash import Dash, dcc, html
from plotnine import *
from pymongo import MongoClient

import sys
sys.path.append("./app/")
from log import get_logger

In [2]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5
LOOKBACK_PERIOD_DAYS = 30
ACTIVATION_WINDOW = 20

# SESSION_COUNT_THRESHOLDS = [1, 3, 5]
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_user or not mongo_password:
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    exit(1)


In [3]:
dt = datetime.today().strftime("%Y-%m-%d")
window = LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW + 1

In [4]:
client = MongoClient(
    CONNECTION_TEMPLATE.format(user=mongo_user, password=mongo_password),
    unicode_decode_error_handler='ignore',
)

main_db = client.main
events_collection = main_db.log_events
user_collection = main_db.users


end = datetime.strptime(
    dt, "%Y-%m-%d").astimezone(pytz.timezone("US/Pacific"))

In [5]:
# print(end.date(), window)

In [6]:
# get user data

cursor = user_collection.find()
df_users = pd.DataFrame(list(cursor))
df_users = df_users.rename(columns={"_id": "user_id"}, errors="raise")
df_users = df_users[["user_id", "email", "name"]]
df_users["user_id"] = df_users["user_id"].astype(str)

print(df_users.shape)
df_users.head()

(963, 3)


Unnamed: 0,user_id,email,name
0,61463e014cf3dfd537c399fb,john@generaltask.com,John Reinstra
1,6146889f4cf3dfd537c8d987,jack_hamilton@me.com,Jack Hamilton
2,616cd20a2dbdcd0a7ba1642c,jreinstra@gmail.com,John Reinstra
3,61722cb1e1abefac8feddc31,scottmai702@gmail.com,Scott Mai
4,617343f4e1abefac8f00ab42,nolan1299@gmail.com,Nolan Jimenez


In [7]:
# generate event level data

# query events table
date_filter = {"created_at": {
    "$gt": end - timedelta(days=window)
    # , "$lt": end
}}
cursor = events_collection.find(date_filter)
events_df = pd.DataFrame(list(cursor))
events_df["user_id"] = events_df["user_id"].astype(str)
print(events_df.shape)

# merge with users
events_df = events_df.merge(df_users, on="user_id", how="left")
print(events_df.shape)

# add PST timestamps and sort by user and timestamps
events_df = events_df.rename(columns={"_id": "event_id"}, errors="raise")
events_df["ts_pst"] = events_df.created_at.dt.tz_localize(
    pytz.utc).dt.tz_convert('US/Pacific')
events_df["dt"] = events_df.ts_pst.dt.date  # date in PST

print(events_df.shape)
events_df.head(3)

(11369882, 4)
(11369882, 6)
(11369882, 9)


Unnamed: 0,event_id,user_id,event_type,created_at,email,name,time_since_previous_event_this_day,ts_pst,dt
0,63af6f45e762c71630532c4c,62587d69ab75e6bfa2e919e9,api_hit_/events/,2022-12-30 23:07:49.376,hans@generaltask.com,Hans van de Bruggen,0 days 00:00:04.457000,2022-12-30 15:07:49.376000-08:00,2022-12-30
1,63af6f406a1181e38c7ee016,62587d69ab75e6bfa2e919e9,api_hit_/overview/views/,2022-12-30 23:07:44.919,hans@generaltask.com,Hans van de Bruggen,0 days 00:00:00.034000,2022-12-30 15:07:44.919000-08:00,2022-12-30
2,63af6f406a1181e38c7ee015,62587d69ab75e6bfa2e919e9,api_hit_/tasks/v3/,2022-12-30 23:07:44.885,hans@generaltask.com,Hans van de Bruggen,0 days 00:00:00.079000,2022-12-30 15:07:44.885000-08:00,2022-12-30


# Event Counts by Type

In [8]:
background_events = [
    "api_hit_/events/",
    "api_hit_/tasks/fetch/",
    "api_hit_/tasks/v3/",
    "api_hit_/pull_requests/fetch/",
    "api_hit_/pull_requests/",
]

df_event_type_counts = (
    events_df
    [~events_df.event_type.isin(background_events)]
    .groupby(["email", "dt", "event_type"])
    .agg(num_events=('event_id', 'count'))
    # .sort_values(["name", "dt", "num_events"], ascending=False)
    .reset_index()
)
df_event_type_counts.head(2)

Unnamed: 0,email,dt,event_type,num_events
0,127.0.0.69@gmail.com,2022-11-21,"""open_auth_window_https://api.generaltask.com/...",1
1,127.0.0.69@gmail.com,2022-11-21,"""open_auth_window_https://api.generaltask.com/...",1


In [9]:
df_event_type_counts["total_num_events"] = (
    df_event_type_counts
    .groupby(["dt", "email"])["num_events"]
    .transform(np.sum)
)
# df_event_type_counts["number_of_active_days"] = (
#     df_event_type_counts
#     .groupby(["email"])["num_events"]
#     .transform(np.sum)
# )
df_event_type_counts = (
    df_event_type_counts
    .sort_values(["dt", "total_num_events", "num_events"], ascending=[True, False, False])
)
df_event_type_counts.head()

Unnamed: 0,email,dt,event_type,num_events,total_num_events
18526,john@generaltask.com,2022-11-08,list_pull_requests,1680,2937
18524,john@generaltask.com,2022-11-08,get_pull_request_info,1008,2937
18519,john@generaltask.com,2022-11-08,api_hit_/overview/views/,156,2937
18525,john@generaltask.com,2022-11-08,get_pull_requests,56,2937
18516,john@generaltask.com,2022-11-08,api_hit_/linked_accounts/,9,2937


In [10]:
# df_event_type_counts.to_csv("./user_daily_events_by_type_all_types.csv")

In [60]:
activity_per_day = (
    df_event_type_counts
    .drop_duplicates(['email', 'dt', 'total_num_events'])
    [['email', 'dt', 'total_num_events']]
    # .group
)
# activity_per_day['num_days_active'] = (
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
# )
activity_per_day

Unnamed: 0,email,dt,total_num_events
18526,john@generaltask.com,2022-11-08,2937
29540,maz@generaltask.com,2022-11-08,2047
25791,lea.broudo@zocdoc.com,2022-11-08,1049
12605,hans@generaltask.com,2022-11-08,880
17595,jiyoon@generaltask.com,2022-11-08,345
...,...,...,...
4071,bradley@newbridgemg.com,2022-12-30,32
15683,info@codeagency.be,2022-12-30,15
36031,rkelch@montecarlodata.com,2022-12-30,9
38876,sohyoonahn@gmail.com,2022-12-30,5


In [12]:
# (
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
#     # .transform(np.unique)
    
# )

In [13]:
# (
#     df_event_type_counts
#     .groupby(["dt", "email"])["num_events"]
#     .transform(np.sum)
# )

In [14]:
print((datetime.today()).strftime("%Y-%m-%d"))
print((datetime.today() - timedelta(days=20)).strftime("%Y-%m-%d"))

print((datetime.today() - timedelta(days=30)).strftime("%Y-%m-%d"))
print((datetime.today() - timedelta(days=30+ACTIVATION_WINDOW)).strftime("%Y-%m-%d"))

2022-12-30
2022-12-10
2022-11-30
2022-11-10


In [24]:
num_days_active = pd.DataFrame(
    activity_per_day
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
activated_users = num_days_active[num_days_active.num_days_active > 4]
# TODO: we should be doing this to calculate "activated" users on both of the dataframes separately
activated_users

Unnamed: 0,email,num_days_active
1,14farresa@gmail.com,12
4,abishan.sutharshan@gmail.com,8
11,agaldy@standvast.com,13
12,aks.jain.1990@gmail.com,24
14,alessio.galdy@gmail.com,10
...,...,...
338,will.macdonald@gmail.com,5
346,xanderm@justappraised.com,10
347,xav@otta.com,16
348,yashkishore.y@gmail.com,10


In [25]:
# activity_per_day = activity_per_day[activity_per_day.email.isin(activated_users.email)]

In [64]:
activated_users_today = (
    activity_per_day
    [
        (activity_per_day.dt >= (datetime.today() - timedelta(days=ACTIVATION_WINDOW)).date())
        & (activity_per_day.dt <= (datetime.today()).date())
        & (activity_per_day.total_num_events > 0)
        # & (activity_per_day.total_num_events > 1)
    ]
)
# activated_users_today
activated_users_today.shape


num_days_active = pd.DataFrame(
    activated_users_today
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
activated_users = num_days_active[num_days_active.num_days_active > 4]


activated_users_today = activated_users_today[activated_users_today.email.isin(activated_users.email)]
activated_users_today.shape





(913, 3)

(640, 3)

In [66]:
activated_users_30_days_ago = (
    activity_per_day
    [
        (activity_per_day.dt >= (datetime.today() - timedelta(days=30+ACTIVATION_WINDOW)).date())
        & (activity_per_day.dt <= (datetime.today() - timedelta(days=30)).date())
        & (activity_per_day.total_num_events > 0)
        # & (activity_per_day.total_num_events > 1)
    ]
)

activated_users_30_days_ago.shape


num_days_active = pd.DataFrame(
    activated_users_30_days_ago
    .groupby(["email"])["dt"]
    .nunique()
).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
activated_users = num_days_active[num_days_active.num_days_active > 4]


activated_users_30_days_ago = activated_users_30_days_ago[activated_users_30_days_ago.email.isin(activated_users.email)]
activated_users_30_days_ago.shape




(1489, 3)

(687, 3)

In [67]:
s_activated_users_today = set(activated_users_today.email)
s_activated_users_30_days_ago = set(activated_users_30_days_ago.email)
churned_users = s_activated_users_30_days_ago.difference(s_activated_users_today)
users_retained = s_activated_users_30_days_ago.intersection(s_activated_users_today)
len(s_activated_users_today)
len(s_activated_users_30_days_ago)
len(churned_users)
len(users_retained)

58

72

45

27

In [69]:
len(users_retained)/len(s_activated_users_30_days_ago)

0.375

In [56]:
'agaldy@standvast.com' in s_activated_users_30_days_ago
'agaldy@standvast.com' in churned_users

False

False

True

False

In [42]:
activity_per_day[activity_per_day.email == 'rob@vanta.com']

Unnamed: 0,email,dt,total_num_events
36047,rob@vanta.com,2022-11-14,1748
36063,rob@vanta.com,2022-11-15,1405
36069,rob@vanta.com,2022-11-16,1009
36081,rob@vanta.com,2022-11-17,932
36089,rob@vanta.com,2022-11-18,589
36096,rob@vanta.com,2022-11-21,780
36105,rob@vanta.com,2022-11-22,956
36111,rob@vanta.com,2022-11-23,293
36112,rob@vanta.com,2022-11-24,485
36113,rob@vanta.com,2022-11-25,508


In [70]:
(
    activity_per_day[activity_per_day.email.isin(churned_users)]
    .sort_values(['email', 'dt'])
).to_csv('churned_users_correct_5_days.csv')

# Scratch

In [21]:
# df_daily_event_counts = (
#     df_event_type_counts
#     .groupby(["dt", "name"])
#     .agg(total_num_events=('num_events', 'sum'))
#     .sort_values(["dt", "total_num_events"], ascending=[True, False])

# )
# df_daily_event_counts

In [22]:
# df_daily_event_counts.to_csv("./user_daily_event_counts.csv")

In [23]:
# set(df_event_type_counts.event_type)