In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression statement in
# a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import argparse
import os
from datetime import date, datetime, timedelta
from pprint import pprint

# import dash_auth
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import pytz
# from dash import Dash, dcc, html
# from plotnine import *
from pymongo import MongoClient

import sys
sys.path.append("./app/")
from log import get_logger

In [None]:
# ACTIVITY_COOLOFF_MINS = 10
# NUM_SESSIONS_THRESHOLD = 5

# Study parameters, in days. Their sum (plus one) sizes the event query window
# further down in the notebook.
LOOKBACK_PERIOD_DAYS = 30
ACTIVATION_WINDOW = 20

# SESSION_COUNT_THRESHOLDS = [1, 3, 5]

# MongoDB connection URI; the {user} and {password} placeholders are filled in
# with str.format() when the client is created.
CONNECTION_TEMPLATE = """mongodb://{user}:{password}@cluster0-shard-00-00.dbkij.mongodb.net:27017,cluster0-shard-00-01.dbkij.mongodb.net:27017,cluster0-shard-00-02.dbkij.mongodb.net:27017/myFirstDatabase?authSource=admin&replicaSet=atlas-xn7hxv-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"""
logger = get_logger(__name__)

# Credentials are read from the environment so they are never stored in the
# notebook itself.
mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")
if not mongo_user or not mongo_password:
    logger.fatal("MONGO_USER or MONGO_PASSWORD not set!")
    # Raise SystemExit explicitly: the bare `exit` helper is injected by the
    # `site` module / the interactive shell, is not guaranteed to exist, and
    # takes a different argument under an IPython kernel.
    raise SystemExit(1)


In [None]:
# Number of days of event history to pull: the lookback period, the activation
# window, and one extra day.
window = 1 + ACTIVATION_WINDOW + LOOKBACK_PERIOD_DAYS

# Report date as a "YYYY-MM-DD" string. Alternatives kept for reference:
# dt = datetime.today().strftime("%Y-%m-%d")
# dt = datetime.strptime("2022-12-19", "%Y-%m-%d").strftime("%Y-%m-%d")
dt = datetime.strptime("2023-01-07", "%Y-%m-%d").strftime("%Y-%m-%d")

In [None]:
# Open the MongoDB connection with the credentials read from the environment.
# Undecodable bytes in stored strings are dropped instead of raising.
# NOTE(review): user and password are interpolated into the URI verbatim; if
# they can contain reserved characters (":", "@", "/", "%") they need to be
# percent-escaped first — confirm how the env values are stored.
client = MongoClient(
    CONNECTION_TEMPLATE.format(password=mongo_password, user=mongo_user),
    unicode_decode_error_handler='ignore',
)

# Handles to the "main" database and the two collections used below.
main_db = client["main"]
user_collection = main_db["users"]
events_collection = main_db["log_events"]


# Anchor the report date at midnight US/Pacific.
# The parsed datetime is naive, so it must be *localized*: calling
# .astimezone() on a naive value first interprets it in the machine's local
# timezone, which moves `end` (and every date derived from it) depending on
# where the notebook runs.
end = pytz.timezone("US/Pacific").localize(datetime.strptime(dt, "%Y-%m-%d"))

In [None]:
# print(end.date(), window)

In [None]:
# Load every document of the users collection into a DataFrame and expose the
# Mongo "_id" field as "user_id" (fails loudly if "_id" is missing).
cursor = user_collection.find()
df_users_raw = (
    pd.DataFrame(list(cursor))
    .rename(columns={"_id": "user_id"}, errors="raise")
)

In [None]:
# Signup date in US/Pacific: `created_at` is treated as naive UTC, converted to
# Pacific time, and truncated to a calendar date.
df_users_raw["signup_dt_pst"] = (
    df_users_raw["created_at"]
    .dt.tz_localize(pytz.utc)
    .dt.tz_convert('US/Pacific')
    .dt.date
)
# Preview only: the raw user table holds personal data (names, emails) and
# should not be dumped in full into the notebook output.
df_users_raw.head()

In [None]:
# Slim user table used for joins; user_id is cast to str so it matches the
# string user_id column built for the events frame.
df_users = (
    df_users_raw
    .loc[:, ["user_id", "email", "name", "signup_dt_pst"]]
    .assign(user_id=lambda frame: frame["user_id"].astype(str))
)

print(df_users.shape)
df_users.head()

In [None]:
# Sanity check: first and last instant of the analysis window.
(end - timedelta(days=window), end)

In [None]:
# Build the event-level dataset.

# Pull all events created strictly inside (end - window days, end + 2 days).
date_filter = {
    "created_at": {
        "$gt": end - timedelta(days=window),
        "$lt": end + timedelta(days=2),
    }
}
cursor = events_collection.find(date_filter)

# user_id is cast to str so it can be joined against df_users.
events_df_raw = pd.DataFrame(list(cursor)).astype({"user_id": str})
print(events_df_raw.shape)

In [None]:
# Attach user attributes to every event; the left join keeps events whose
# user_id has no match in df_users.
events_df = pd.merge(events_df_raw, df_users, how="left", on="user_id")
print(events_df.shape)

In [None]:
# Expose the Mongo "_id" as "event_id", then derive the US/Pacific timestamp
# (`created_at` is treated as naive UTC) and the Pacific calendar date.
events_df = (
    events_df
    .rename(columns={"_id": "event_id"}, errors="raise")
    .assign(
        ts_pst=lambda frame: (
            frame["created_at"].dt.tz_localize(pytz.utc).dt.tz_convert('US/Pacific')
        ),
        dt=lambda frame: frame["ts_pst"].dt.date,  # date in PST
    )
)

print(events_df.shape)
events_df.head(3)

In [None]:
# events_df.to_csv('events_df.csv')

# Event Counts by Type

In [None]:
# Event types listed as "background". Only referenced by the commented-out
# filter below, so at the moment they are NOT excluded from the counts.
background_events = [
    "api_hit_/events/",
    "api_hit_/tasks/fetch/",
    "api_hit_/tasks/v3/",
    "api_hit_/pull_requests/fetch/",
    "api_hit_/pull_requests/",
    "api_hit_/overview/views/",
    "api_hit_/recurring_task_templates/backfill_tasks/",
]

# Number of events per (email, PST date, event type).
df_event_type_counts = (
    events_df
    # .loc[~events_df.event_type.isin(background_events)]
    .groupby(["email", "dt", "event_type"])["event_id"]
    .count()
    .reset_index(name="num_events")
)
df_event_type_counts.head(2)

In [None]:
# Total number of events per user per day, repeated on every
# (email, dt, event_type) row of that user/day.
# The aggregation is given by name: passing the np.sum callable to
# GroupBy.transform() is deprecated in recent pandas releases.
df_event_type_counts["total_num_events"] = (
    df_event_type_counts
    .groupby(["dt", "email"])["num_events"]
    .transform("sum")
)

# Chronological order; within a day the busiest users come first, and within a
# user their most frequent event types come first.
df_event_type_counts = df_event_type_counts.sort_values(
    ["dt", "total_num_events", "num_events"],
    ascending=[True, False, False],
)
df_event_type_counts.head()

In [None]:
# Signup cohort boundaries (inclusive calendar dates). They are derived from
# the notebook constants instead of repeating the literals 20 and 30, so the
# cohort stays consistent with `window` if either constant is changed.
signup_start = (end - timedelta(days=ACTIVATION_WINDOW + LOOKBACK_PERIOD_DAYS)).date()
signup_end = (end - timedelta(days=LOOKBACK_PERIOD_DAYS)).date()
signup_start, signup_end

In [None]:
# Users whose Pacific signup date falls inside the cohort window (both ends
# inclusive).
in_signup_window = df_users["signup_dt_pst"].between(signup_start, signup_end)
study_users = df_users[in_signup_window]
study_users

In [None]:
# One row per (email, dt, total_num_events): collapse the per-event-type rows
# by keeping only the daily columns and dropping the repeats.
activity_per_day = (
    df_event_type_counts[['email', 'dt', 'total_num_events']]
    .drop_duplicates()
)
activity_per_day.head()

In [None]:
# Daily activity restricted to the study cohort, with each user's signup date
# joined on by email.
is_study_user = activity_per_day["email"].isin(study_users["email"])
study_activity_per_day = pd.merge(
    activity_per_day[is_study_user],
    df_users[['signup_dt_pst', 'email']],
    how="left",
    on="email",
)
study_activity_per_day.head()

In [None]:
# Calendar dates 1, 7 and 28 days after signup, stored as d1_date, d7_date and
# d28_date.
for day_offset in (1, 7, 28):
    study_activity_per_day[f'd{day_offset}_date'] = (
        study_activity_per_day['signup_dt_pst'] + timedelta(days=day_offset)
    )
study_activity_per_day.head()

In [None]:
# Flag the activity rows that fall exactly on the user's day-1, day-7 or
# day-28 date (active_d1, active_d7, active_d28).
for day_offset in (1, 7, 28):
    study_activity_per_day[f'active_d{day_offset}'] = study_activity_per_day['dt'].eq(
        study_activity_per_day[f'd{day_offset}_date']
    )
study_activity_per_day.head()

In [None]:
# Cohort size and the earliest / latest signup date it contains.
(
    len(study_users),
    min(study_users["signup_dt_pst"]),
    max(study_users["signup_dt_pst"]),
)

In [None]:
# Column totals over the cohort's activity rows. Only numeric/boolean columns
# are summed: summing the e-mail and date columns is meaningless and raises a
# TypeError on pandas >= 2.0.
counts = study_activity_per_day.sum(numeric_only=True)

# Share (in %) of study users active on day 1 / 7 / 28 after signup. The flags
# are selected by label rather than by position so the result does not depend
# on the column order of the frame.
counts[['active_d1', 'active_d7', 'active_d28']] / len(study_users) * 100

# Scratch

In [None]:
# df_daily_event_counts = (
#     df_event_type_counts
#     .groupby(["dt", "name"])
#     .agg(total_num_events=('num_events', 'sum'))
#     .sort_values(["dt", "total_num_events"], ascending=[True, False])

# )
# df_daily_event_counts

In [None]:
# df_daily_event_counts.to_csv("./user_daily_event_counts.csv")

In [None]:
# set(df_event_type_counts.event_type)