In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import argparse
import os
import sys
from datetime import date, datetime, timedelta
from pprint import pprint
from urllib.parse import quote_plus

import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
from dash import Dash, dcc, html
# from plotnine import *
from pymongo import MongoClient

sys.path.append("./app/")
from log import get_logger

In [None]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5
# Both values are day counts: they are summed and passed to timedelta(days=...).
LOOKBACK_PERIOD_DAYS = 30
ACTIVATION_WINDOW = 20

# SESSION_COUNT_THRESHOLDS = [1, 3, 5]
# MongoDB URI with {user} / {password} placeholders that are filled in when
# the client is created.
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

# Credentials come from the environment; stop right away when either one is
# missing or empty.
mongo_user = os.environ.get("MONGO_USER")
mongo_password = os.environ.get("MONGO_PASSWORD")
if not (mongo_user and mongo_password):
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    exit(1)


In [None]:
# Reference date of the analysis as a "YYYY-MM-DD" string. It is pinned to a
# fixed day; the commented line below switches it back to the current date.
# dt = datetime.today().strftime("%Y-%m-%d")
dt = datetime(2022, 12, 19).strftime("%Y-%m-%d")

# Number of days of events to load: lookback period + activation window + 1.
window = ACTIVATION_WINDOW + LOOKBACK_PERIOD_DAYS + 1

In [None]:
# Connect to the MongoDB cluster. A username or password embedded in a
# connection URI has to be percent-encoded, otherwise reserved characters such
# as "@", ":" or "/" produce an invalid URI. MONGO_USER / MONGO_PASSWORD are
# therefore expected to hold the raw (not pre-encoded) values.
client = MongoClient(
    CONNECTION_TEMPLATE.format(
        user=quote_plus(mongo_user),
        password=quote_plus(mongo_password),
    ),
    # Decoding errors in stored strings are ignored rather than raised.
    unicode_decode_error_handler='ignore',
)

# Handles for the database and the two collections read by this notebook.
main_db = client.main
events_collection = main_db.log_events
user_collection = main_db.users


# Reference timestamp: midnight at the start of `dt` in US/Pacific.
# `localize` attaches the zone to the naive timestamp directly. Calling
# `astimezone` on a naive datetime would first interpret it in the machine's
# local timezone, so `end` would shift depending on where the notebook runs.
end = pytz.timezone("US/Pacific").localize(datetime.strptime(dt, "%Y-%m-%d"))

In [None]:
# print(end.date(), window)

In [None]:
# get user data

# Load every user document, expose the Mongo `_id` as a string `user_id`, and
# keep only the columns that are used downstream.
cursor = user_collection.find()
df_users = (
    pd.DataFrame(list(cursor))
    .rename(columns={"_id": "user_id"}, errors="raise")
    .loc[:, ["user_id", "email", "name"]]
    .assign(user_id=lambda users: users["user_id"].astype(str))
)

print(df_users.shape)
df_users.head()

In [None]:
# generate event level data

# query events table: everything created after the start of the window
# (the `"$lt": end` upper bound is currently disabled)
date_filter = {"created_at": {"$gt": end - timedelta(days=window)}}
cursor = events_collection.find(date_filter)
events_df = pd.DataFrame(list(cursor))
events_df["user_id"] = events_df["user_id"].astype(str)
print(events_df.shape)

# merge with users (left join: events without a matching user are kept)
events_df = pd.merge(events_df, df_users, on="user_id", how="left")
print(events_df.shape)

# add US/Pacific timestamps and the matching calendar date; `created_at` is
# localized as UTC before the conversion. No sorting happens here.
events_df = events_df.rename(columns={"_id": "event_id"}, errors="raise")
events_df["ts_pst"] = (
    events_df["created_at"]
    .dt.tz_localize(pytz.utc)
    .dt.tz_convert('US/Pacific')
)
events_df["dt"] = events_df["ts_pst"].dt.date  # date in US/Pacific

print(events_df.shape)
events_df.head(3)

In [None]:
# Snapshot the merged event-level table to disk (row index included).
events_df.to_csv(path_or_buf='events_df.csv')

# Event Counts by Type

In [None]:
# Event types that are excluded from the per-type counts below.
background_events = [
    "api_hit_/events/",
    "api_hit_/tasks/fetch/",
    "api_hit_/tasks/v3/",
    "api_hit_/pull_requests/fetch/",
    "api_hit_/pull_requests/",
    "api_hit_/overview/views/",
    "api_hit_/recurring_task_templates/backfill_tasks/",
]

# Number of events per (email, dt, event_type), ignoring the excluded types.
is_background = events_df["event_type"].isin(background_events)
df_event_type_counts = (
    events_df
    .loc[~is_background]
    .groupby(["email", "dt", "event_type"])["event_id"]
    .count()
    .rename("num_events")
    # .sort_values(["name", "dt", "num_events"], ascending=False)
    .reset_index()
)
df_event_type_counts.head(2)

In [None]:
# Total events per user per day, repeated on every (email, dt, event_type) row.
# The string alias "sum" selects pandas' own groupby sum; passing the callable
# `np.sum` yields the same values but is deprecated in recent pandas releases
# (it emits a FutureWarning).
df_event_type_counts["total_num_events"] = (
    df_event_type_counts
    .groupby(["dt", "email"])["num_events"]
    .transform("sum")
)
# df_event_type_counts["number_of_active_days"] = (
#     df_event_type_counts
#     .groupby(["email"])["num_events"]
#     .transform(np.sum)
# )
# Order rows by day, then busiest user first, then most frequent event type.
df_event_type_counts = (
    df_event_type_counts
    .sort_values(["dt", "total_num_events", "num_events"], ascending=[True, False, False])
)
df_event_type_counts.head()

In [None]:
# df_event_type_counts.to_csv("./user_daily_events_by_type_all_types.csv")

In [None]:
# A user counts as activated with strictly more than this many active days.
num_days_active_threshold = 4


# NOTE: from here on `dt` is a naive datetime; it replaces the "YYYY-MM-DD"
# string that was bound to the same name earlier in the notebook.
# dt = datetime.today()
dt = datetime.strptime("2022-12-19", "%Y-%m-%d")


# Print the window boundaries. They are derived from the configuration
# constants so the labels and dates cannot drift away from the windows that
# the later cells actually use.
print('dt', dt.strftime("%Y-%m-%d"))
print(
    f'dt - {ACTIVATION_WINDOW} days',
    (dt - timedelta(days=ACTIVATION_WINDOW)).strftime("%Y-%m-%d"),
)

print(
    f'dt - {LOOKBACK_PERIOD_DAYS} days',
    (dt - timedelta(days=LOOKBACK_PERIOD_DAYS)).strftime("%Y-%m-%d"),
)
print(
    f'dt - {LOOKBACK_PERIOD_DAYS} - {ACTIVATION_WINDOW}',
    (dt - timedelta(days=LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW)).strftime("%Y-%m-%d"),
)

# Date range actually present in the aggregated events.
min(df_event_type_counts.dt)
max(df_event_type_counts.dt)

In [None]:
# Collapse the per-event-type rows into the distinct
# (email, dt, total_num_events) combinations.
activity_per_day = (
    df_event_type_counts
    .loc[:, ['email', 'dt', 'total_num_events']]
    .drop_duplicates()
    # .group
)
# activity_per_day['num_days_active'] = (
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
# )
activity_per_day

In [None]:
# Display the per-user daily activity table (the previous cell ends by showing
# the same frame).
activity_per_day

In [None]:
# num_days_active = pd.DataFrame(
#     activity_per_day
#     .groupby(["email"])["dt"]
#     .nunique()
# ).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise").sort_values(['num_days_active'], ascending=False)
# num_days_active['days_usage_perc'] = num_days_active.num_days_active / (max(activity_per_day.dt) - min(activity_per_day.dt)).days
# # dtale.show(num_days_active)
# num_days_active.head()

In [None]:
# Start of the full analysis span (lookback period + activation window before
# `dt`) and the reference date itself. The configured lookback constant is used
# instead of a hard-coded 30.
dt - timedelta(days=LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW)
dt.date()

In [None]:
# # activity_per_day

# Count each user's distinct active days over the whole analysis span: from
# (lookback period + activation window) days before `dt` up to `dt`, both ends
# inclusive. The span uses LOOKBACK_PERIOD_DAYS rather than a hard-coded 30.
span_start = (dt - timedelta(days=LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW)).date()
span_end = dt.date()
num_days_active = (
    activity_per_day
    [(activity_per_day.dt >= span_start) & (activity_per_day.dt <= span_end)]
    .groupby(["email"])["dt"]
    .nunique()
    .reset_index(name="num_days_active")
)
# Activated = strictly more than `num_days_active_threshold` distinct days.
activated_users = num_days_active[num_days_active.num_days_active > num_days_active_threshold]
# TODO: we should be doing this to calculate "activated" users on both of the dataframes separately
# activated_users


# From here on, keep only the activity rows of activated users.
activity_per_day = activity_per_day[activity_per_day.email.isin(activated_users.email)]

In [None]:
# max(events_df.dt), min(events_df.dt)
# max(activity_per_day.dt), min(activity_per_day.dt)
# max(df_event_type_counts.dt), min(df_event_type_counts.dt)

In [None]:
# # import qgrid
# # from pandasgui import show
# # import tabloo
# import dtale
# # num_days_active.to_csv('num_days_active.csv')

In [None]:
# Activity rows that fall inside the most recent activation window,
# [dt - ACTIVATION_WINDOW days, dt] (both ends inclusive), with at least one
# event on that day.
recent_window_start = (dt - timedelta(days=ACTIVATION_WINDOW)).date()
in_recent_window = (
    (activity_per_day["dt"] >= recent_window_start)
    & (activity_per_day["dt"] <= dt.date())
    & (activity_per_day["total_num_events"] > 0)
    # & (activity_per_day.total_num_events > 1)
)
activated_users_today = activity_per_day[in_recent_window]
# activated_users_today
activated_users_today.shape

# num_days_active = pd.DataFrame(
#     activated_users_today
#     .groupby(["email"])["dt"]
#     .nunique()
# ).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
# activated_users = num_days_active[num_days_active.num_days_active > num_days_active_threshold]



In [None]:
# Earliest and latest dates present in the activity table.
min(activity_per_day["dt"])
max(activity_per_day["dt"])

In [None]:
# Activity rows inside the activation window that ended LOOKBACK_PERIOD_DAYS
# before `dt` (both ends inclusive), with at least one event on that day. The
# bounds use the configured lookback constant instead of a hard-coded 30, so
# this cohort stays aligned with the configured windows.
prior_window_start = (dt - timedelta(days=LOOKBACK_PERIOD_DAYS + ACTIVATION_WINDOW)).date()
prior_window_end = (dt - timedelta(days=LOOKBACK_PERIOD_DAYS)).date()
activated_users_30_days_ago = (
    activity_per_day
    [
        (activity_per_day.dt >= prior_window_start)
        & (activity_per_day.dt <= prior_window_end)
        & (activity_per_day.total_num_events > 0)
        # & (activity_per_day.total_num_events > 1)
    ]
)

activated_users_30_days_ago.shape


# num_days_active = pd.DataFrame(
#     activated_users_30_days_ago
#     .groupby(["email"])["dt"]
#     .nunique()
# ).reset_index().rename(columns={"dt": "num_days_active"}, errors="raise")
# activated_users = num_days_active[num_days_active.num_days_active > num_days_active_threshold]


# activated_users_30_days_ago = activated_users_30_days_ago[activated_users_30_days_ago.email.isin(activated_users.email)]
# activated_users_30_days_ago.shape

In [None]:
# Turn both cohorts into sets of e-mail addresses.
s_activated_users_today = set(activated_users_today["email"])
s_activated_users_30_days_ago = set(activated_users_30_days_ago["email"])
# Churned: in the earlier cohort but not the current one. Retained: in both.
churned_users = s_activated_users_30_days_ago - s_activated_users_today
users_retained = s_activated_users_30_days_ago & s_activated_users_today
# Cohort sizes, in order: current, earlier, churned, retained.
len(s_activated_users_today)
len(s_activated_users_30_days_ago)
len(churned_users)
len(users_retained)

In [None]:
# Retention rate: share of the earlier cohort that is also in the current one.
# An empty earlier cohort has no defined rate, so show NaN instead of failing
# with ZeroDivisionError.
(
    len(users_retained) / len(s_activated_users_30_days_ago)
    if s_activated_users_30_days_ago
    else float("nan")
)

In [None]:
# Spot check for one specific account: is it in the earlier cohort, and is it
# counted as churned?
'agaldy@standvast.com' in s_activated_users_30_days_ago
'agaldy@standvast.com' in churned_users

In [None]:
# Spot check: every raw event of one account, ordered by date.
events_df.loc[events_df["email"] == 'rob@vanta.com'].sort_values(by=['dt'])

In [None]:
# Spot check: daily activity rows of one account.
activity_per_day.loc[activity_per_day["email"] == 'mmcconnell@nextdoor.com']

In [None]:
# Export the daily activity of churned users, ordered by user and then date.
churned_activity = (
    activity_per_day
    .loc[activity_per_day["email"].isin(churned_users)]
    .sort_values(by=['email', 'dt'])
)
churned_activity.to_csv('churned_users_correct_5_days.csv')

# Scratch

In [None]:
# df_daily_event_counts = (
#     df_event_type_counts
#     .groupby(["dt", "name"])
#     .agg(total_num_events=('num_events', 'sum'))
#     .sort_values(["dt", "total_num_events"], ascending=[True, False])

# )
# df_daily_event_counts

In [None]:
# df_daily_event_counts.to_csv("./user_daily_event_counts.csv")

In [None]:
# set(df_event_type_counts.event_type)