In [1]:
import argparse
import os
from datetime import date, datetime, timedelta
from pprint import pprint

import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
from dash import Dash, dcc, html
from plotnine import *
from pymongo import MongoClient

import sys
sys.path.append("./app/")
from log import get_logger

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc


In [2]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5
LOOKBACK_PERIOD_DAYS = 20
# SESSION_COUNT_THRESHOLDS = [1, 3, 5]
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_user or not mongo_password:
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    exit(1)


In [3]:
dt = datetime.today().strftime("%Y-%m-%d")
window = LOOKBACK_PERIOD_DAYS

In [4]:
client = MongoClient(
    CONNECTION_TEMPLATE.format(user=mongo_user, password=mongo_password),
    unicode_decode_error_handler='ignore',
)

main_db = client.main
events_collection = main_db.log_events
user_collection = main_db.users


end = datetime.strptime(
    dt, "%Y-%m-%d").astimezone(pytz.timezone("US/Pacific"))

In [5]:
print(end.date(), window)

2022-11-30 20


In [6]:
# get user data

cursor = user_collection.find()
df_users = pd.DataFrame(list(cursor))
df_users = df_users.rename(columns={"_id": "user_id"}, errors="raise")
df_users = df_users[["user_id", "email", "name"]]
df_users["user_id"] = df_users["user_id"].astype(str)

print(df_users.shape)
df_users.head()

(813, 3)


Unnamed: 0,user_id,email,name
0,61463e014cf3dfd537c399fb,john@generaltask.com,John Reinstra
1,6146889f4cf3dfd537c8d987,jack_hamilton@me.com,Jack Hamilton
2,616cd20a2dbdcd0a7ba1642c,jreinstra@gmail.com,John Reinstra
3,61722cb1e1abefac8feddc31,scottmai702@gmail.com,Scott Mai
4,617343f4e1abefac8f00ab42,nolan1299@gmail.com,Nolan Jimenez


In [7]:
# generate event level data

# query events table
date_filter = {"created_at": {
    "$gt": end - timedelta(days=window), "$lt": end}}
cursor = events_collection.find(date_filter)
events_df = pd.DataFrame(list(cursor))
events_df["user_id"] = events_df["user_id"].astype(str)
print(events_df.shape)

# merge with users
events_df = events_df.merge(df_users, on="user_id", how="outer")
print(events_df.shape)

# # add PST timestamps and sort by user and timestamps
# events_df = events_df.rename(columns={"_id": "event_id"}, errors="raise")
# events_df["time_since_previous_event_this_day"] = (
#     events_df
#     .sort_values(by=["user_id", "created_at"])
#     .groupby(by='user_id')["created_at"]
#     .diff()
# )
# events_df["ts_pst"] = events_df.created_at.dt.tz_localize(
#     pytz.utc).dt.tz_convert('US/Pacific')
# events_df["dt"] = events_df.ts_pst.dt.date  # date in PST

# print(events_df.shape)
# events_df.head(3)

(4994876, 4)
(4995078, 6)


In [13]:
churned_users = (
    events_df
    [events_df.event_type.isna()]
    # .groupby("user_id")
    # .agg({})
)

active_users = (
    events_df
    [events_df.event_type.notna()]
    # .groupby("user_id")
    # .agg({})
)

In [14]:
churned_users

Unnamed: 0,_id,user_id,event_type,created_at,email,name
4994876,,616cd20a2dbdcd0a7ba1642c,,NaT,jreinstra@gmail.com,John Reinstra
4994877,,617343f4e1abefac8f00ab42,,NaT,nolan1299@gmail.com,Nolan Jimenez
4994878,,6194444356e4da3c4754c8b4,,NaT,najimene@usc.edu,Nolan Jimenez
4994879,,61a9106e56e4da3c47f36b80,,NaT,mlandolfi99@gmail.com,Michael l
4994880,,61d5391e152bde5512133184,,NaT,jifagbemi@gmail.com,Jare Fagbemi
...,...,...,...,...,...,...
4995073,,6388dc0fceea7a6dd0bd0140,,NaT,raphael.sisa@gmail.com,Raphael Sisa
4995074,,6388e076ceea7a6dd0c3d1e3,,NaT,jackpellegrini83@gmail.com,Giacomo Pellegrini
4995075,,6388e148ceea7a6dd0c5294a,,NaT,spcasey99@gmail.com,Sean Casey
4995076,,6388e975ceea7a6dd0d3286a,,NaT,bsanchez.dncu@gmail.com,Brian Sanchez


In [15]:
active_users

Unnamed: 0,_id,user_id,event_type,created_at,email,name
0,638842df0c27add3a9e02f8f,6388359ba56e57746efaee22,api_hit_/tasks/fetch/,2022-12-01 05:59:59.965,michelle.j.caron@gmail.com,Michelle Caron
1,638842c866f6132ff25b17fd,6388359ba56e57746efaee22,api_hit_/overview/views/,2022-12-01 05:59:36.961,michelle.j.caron@gmail.com,Michelle Caron
2,638842c866f6132ff25b17fb,6388359ba56e57746efaee22,api_hit_/pull_requests/,2022-12-01 05:59:36.944,michelle.j.caron@gmail.com,Michelle Caron
3,638842c866f6132ff25b17f9,6388359ba56e57746efaee22,api_hit_/pull_requests/fetch/,2022-12-01 05:59:36.817,michelle.j.caron@gmail.com,Michelle Caron
4,638842c20c27add3a9e02f64,6388359ba56e57746efaee22,api_hit_/events/,2022-12-01 05:59:30.853,michelle.j.caron@gmail.com,Michelle Caron
...,...,...,...,...,...,...
4994871,636de598fefbe459a7e52e36,636de5979a159213694c9e6e,api_hit_/tasks/v3/,2022-11-11 06:03:04.829,kev.guo123@gmail.com,Kevin Guo
4994872,636de598fefbe459a7e52e35,636de5979a159213694c9e6e,api_hit_/pull_requests/fetch/,2022-11-11 06:03:04.827,kev.guo123@gmail.com,Kevin Guo
4994873,636de5981345dc62e19ab089,636de5979a159213694c9e6e,api_hit_/events/,2022-11-11 06:03:04.823,kev.guo123@gmail.com,Kevin Guo
4994874,636de5981345dc62e19ab088,636de5979a159213694c9e6e,api_hit_/user_info/,2022-11-11 06:03:04.820,kev.guo123@gmail.com,Kevin Guo


Unnamed: 0,_id,user_id,event_type,created_at,email,name
4994876,,616cd20a2dbdcd0a7ba1642c,,NaT,jreinstra@gmail.com,John Reinstra
4994877,,617343f4e1abefac8f00ab42,,NaT,nolan1299@gmail.com,Nolan Jimenez
4994878,,6194444356e4da3c4754c8b4,,NaT,najimene@usc.edu,Nolan Jimenez
4994879,,61a9106e56e4da3c47f36b80,,NaT,mlandolfi99@gmail.com,Michael l
4994880,,61d5391e152bde5512133184,,NaT,jifagbemi@gmail.com,Jare Fagbemi
...,...,...,...,...,...,...
4995073,,6388dc0fceea7a6dd0bd0140,,NaT,raphael.sisa@gmail.com,Raphael Sisa
4995074,,6388e076ceea7a6dd0c3d1e3,,NaT,jackpellegrini83@gmail.com,Giacomo Pellegrini
4995075,,6388e148ceea7a6dd0c5294a,,NaT,spcasey99@gmail.com,Sean Casey
4995076,,6388e975ceea7a6dd0d3286a,,NaT,bsanchez.dncu@gmail.com,Brian Sanchez


In [12]:
churned_users[['name', 'email', 'user_id']].to_csv("./churned_users_dec_1_20_day_lookback.csv")

In [11]:
df_users

Unnamed: 0,user_id,email,name
0,61463e014cf3dfd537c399fb,john@generaltask.com,John Reinstra
1,6146889f4cf3dfd537c8d987,jack_hamilton@me.com,Jack Hamilton
2,616cd20a2dbdcd0a7ba1642c,jreinstra@gmail.com,John Reinstra
3,61722cb1e1abefac8feddc31,scottmai702@gmail.com,Scott Mai
4,617343f4e1abefac8f00ab42,nolan1299@gmail.com,Nolan Jimenez
...,...,...,...
808,6388dc0fceea7a6dd0bd0140,raphael.sisa@gmail.com,Raphael Sisa
809,6388e076ceea7a6dd0c3d1e3,jackpellegrini83@gmail.com,Giacomo Pellegrini
810,6388e148ceea7a6dd0c5294a,spcasey99@gmail.com,Sean Casey
811,6388e975ceea7a6dd0d3286a,bsanchez.dncu@gmail.com,Brian Sanchez
