In [1]:
import time
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# OLS 模型 (來自 0429.ipynb & linear models.ipynb)
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

# 固定效應 (Fixed Effects) 模型 (來自 test.ipynb)
import pyfixest as pf

# 匯入我們重構的 utils 模組
# (匯總 0429.ipynb 和 linear models.ipynb 所需的函式)
from utils import (
    get_events, 
    get_track_event, 
    insert_date, 
    trim, 
    get_background, 
    insert_gender, 
    drop_null_target
)

# Configure the pandas display format and the project paths
# Floats are rendered with four decimal places in every DataFrame/Series repr.
pd.options.display.float_format = '{:.4f}'.format
# Directory for figures.
FIG_PATH = Path('/home/cmc1503/Desktop/JD_exploration/figures')
# Directory holding the CSV / feather inputs read by the loader functions below.
DATA_PATH = Path('/home/cmc1503/Desktop/JD_exploration/data')
# Short alias of DATA_PATH; get_card_pool reads its CSV shards through this name.
p = DATA_PATH

### Functions

In [None]:
def get_all_track_event(period: str, cleared=True, data_dir=None) -> pd.DataFrame:
    """Load one of the pre-built ``all_track_event`` feather snapshots.

    Parameters
    ----------
    period : str
        ``'before'`` loads ``all_track_event_before.feather``.
        ``'after'`` loads the after-period file (see ``cleared``).
        Any other value loads ``all_track_event_combined_cleared.feather``.
    cleared : bool, default True
        Only consulted when ``period == 'after'``: ``True`` selects
        ``all_track_event_after_cleared.feather``, ``False`` selects
        ``all_track_event_after.feather``.
    data_dir : path-like, optional
        Directory containing the feather files. Defaults to the notebook-level
        ``DATA_PATH``.

    Returns
    -------
    pd.DataFrame
        The contents of the selected feather file.
    """
    base = Path(DATA_PATH if data_dir is None else data_dir)

    if period == 'before':
        filename = 'all_track_event_before.feather'
    elif period == 'after':
        filename = (
            'all_track_event_after_cleared.feather'
            if cleared
            else 'all_track_event_after.feather'
        )
    else:
        filename = 'all_track_event_combined_cleared.feather'

    # The notebook never imports a `feather` module, so the original
    # `feather.read_feather` call raised NameError; pandas ships its own reader.
    return pd.read_feather(base / filename)

def get_card_pool(start=9, end=25, gender='F', data_dir=None):
    """Read the weekly card-pool CSV shards and stack them into one frame.

    For every week in ``start..end`` (inclusive) the ten files
    ``week{WW}_{gender}_{0..9}.csv`` are read, concatenated, and tagged with a
    ``week`` column. The elapsed seconds for each week are printed.

    Parameters
    ----------
    start, end : int
        First and last week number (inclusive). Single-digit weeks are
        zero-padded in the file name (``week09_...``).
    gender : str, default 'F'
        Gender code embedded in the file name.
    data_dir : path-like, optional
        Directory containing the CSV shards. Defaults to the notebook-level
        alias ``p``.

    Returns
    -------
    pd.DataFrame
        All shards stacked in week order with the original per-file row index
        preserved; an empty DataFrame when ``start > end``.
    """
    base = Path(p if data_dir is None else data_dir)
    weekly_frames = []

    for week in range(start, end + 1):
        t1 = time.perf_counter()

        # Collect the ten shards first and concatenate once; growing a frame
        # with pd.concat inside the loop re-copies all earlier rows each time.
        shards = [
            pd.read_csv(base / f"week{week:02d}_{gender}_{i}.csv")
            for i in range(10)
        ]
        week_df = pd.concat(shards)
        week_df["week"] = week
        weekly_frames.append(week_df)

        t2 = time.perf_counter()
        print(t2 - t1)

    # pd.concat raises on an empty list, so keep the original's empty result.
    if not weekly_frames:
        return pd.DataFrame()
    return pd.concat(weekly_frames)

def get_all_pool(gender='F', data_dir=None):
    """Load the pre-built card-pool feather file for one gender.

    Parameters
    ----------
    gender : str, default 'F'
        ``'F'`` loads ``all_pool_female.feather``; any other value loads
        ``all_pool_male.feather``.
    data_dir : path-like, optional
        Directory containing the feather files. Defaults to the notebook-level
        ``DATA_PATH``.

    Returns
    -------
    pd.DataFrame
        The contents of the selected feather file.
    """
    base = Path(DATA_PATH if data_dir is None else data_dir)
    filename = 'all_pool_female.feather' if gender == 'F' else 'all_pool_male.feather'
    # `feather` is never imported in this notebook; pandas' reader replaces the
    # original `feather.read_feather` call, which raised NameError.
    return pd.read_feather(base / filename)

def get_all_track_event_with_stars(data_dir=None):
    """Load the track-event snapshot that carries star and "previous behaviour" columns.

    According to the original author's note, the file covers the period after
    the experiment started, after clearing, restricted to female users with
    ``page == meet`` and ``act`` equal to ``interestYes`` or ``interestNo``.
    An earlier variant without the "previous behaviour" information is
    ``all_track_event_after_cleared_fem_meet_with_stars.feather``.

    Parameters
    ----------
    data_dir : path-like, optional
        Directory containing the feather file. Defaults to the notebook-level
        ``DATA_PATH``.

    Returns
    -------
    pd.DataFrame
        The contents of
        ``all_track_event_after_cleared_fem_meet_with_stars_and_prev.feather``.
    """
    base = Path(DATA_PATH if data_dir is None else data_dir)
    # `feather` is never imported in this notebook; pandas' reader replaces the
    # original `feather.read_feather` call, which raised NameError.
    return pd.read_feather(
        base / 'all_track_event_after_cleared_fem_meet_with_stars_and_prev.feather'
    )

def get_background(columns=['uCode', 'gender'], gender='F', last_login=202307, data_dir=None) -> pd.DataFrame:
    """Load user background data for one gender, limited to recently active accounts.

    NOTE(review): the first cell also imports ``get_background`` from ``utils``;
    this definition shadows that import once the cell runs.

    Parameters
    ----------
    columns : list of str, default ['uCode', 'gender']
        Columns to return. The default list is never mutated here.
    gender : str, default 'F'
        ``'F'`` reads ``users_females.csv``; any other value reads
        ``users_males.csv``.
    last_login : int, default 202307
        Keep only rows whose ``last_login_ymd`` is greater than or equal to
        this value.
    data_dir : path-like, optional
        Directory containing the CSV files. Defaults to the notebook-level
        ``DATA_PATH``.

    Returns
    -------
    pd.DataFrame
        An independent copy of the filtered rows restricted to ``columns``.
    """
    base = Path(DATA_PATH if data_dir is None else data_dir)
    filename = 'users_females.csv' if gender == 'F' else 'users_males.csv'
    users = pd.read_csv(base / filename)

    # Keep only accounts whose last login is at or after `last_login`.
    recent = users['last_login_ymd'] >= last_login

    # A single .loc plus an explicit copy replaces the chained df[filt][columns],
    # so callers can add or overwrite columns without SettingWithCopyWarning.
    return users.loc[recent, columns].copy()

def get_prob_accounts_by_BD(gender='F') -> pd.Series:
    """Return the uCodes of accounts whose birthday looks implausible.

    The background data for ``gender`` is loaded through ``get_background`` and
    an account is flagged when its ``birthday``:

    * is earlier than 1954-01-01, or
    * is on or after 2007-01-01, or
    * is missing or cannot be parsed as ``YYYY-MM-DD``.

    NOTE(review): the original header comment described the rule as
    "age <= 0 or > 70"; the cut-offs above are what the code applies.

    Parameters
    ----------
    gender : str, default 'F'
        Passed through to ``get_background``.

    Returns
    -------
    pd.Series
        The ``uCode`` values of the flagged rows.
    """
    background = get_background(columns=['uCode', 'birthday'], gender=gender)

    # Parse into a separate Series instead of writing back into the frame.
    birthday = pd.to_datetime(background['birthday'], errors="coerce", format="%Y-%m-%d")

    # pd.Timestamp replaces np.datetime64(date(...)): `date` was never imported,
    # so the original comparison raised NameError.
    too_early = birthday < pd.Timestamp(1954, 1, 1)
    too_late = birthday >= pd.Timestamp(2007, 1, 1)
    unparseable = birthday.isna()

    return background.loc[too_early | too_late | unparseable, 'uCode']

def get_prob_accounts_by_Yes(df, threshold=0.999) -> pd.Series:
    """Return the uCodes of female users who sent unusually many ``interestYes``.

    Only rows with ``gender == 'F'`` and ``act == 'interestYes'`` are counted.
    A user is flagged when their count is strictly greater than the
    ``threshold`` quantile of those per-user counts.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``uCode``, ``gender`` and ``act`` columns.
    threshold : float, default 0.999
        Quantile of the per-user count distribution used as the cut-off.

    Returns
    -------
    pd.Series
        Flagged ``uCode`` values with a fresh 0..n-1 index.
    """
    female_yes = df['gender'].eq('F') & df['act'].eq('interestYes')
    yes_per_user = df.loc[female_yes, ['uCode', 'act']].groupby('uCode').count()

    cutoff = yes_per_user['act'].quantile(threshold)
    heavy_senders = yes_per_user.loc[yes_per_user['act'] > cutoff]

    return heavy_senders.reset_index()['uCode']

def drop_prob_accounts(df, threshold=0.999):
    """Drop every row that involves a problematic account.

    An account is problematic when either rule applies:

    1. Its birthday is before 1954-01-01, on or after 2007-01-01, or missing
       (checked for both female and male background data via
       ``get_prob_accounts_by_BD``).
    2. Its ``interestYes`` count is strictly above the ``threshold`` quantile
       (via ``get_prob_accounts_by_Yes``, which counts female rows only).

    A row is removed when its ``uCode`` or its ``target_uCode`` is problematic.
    The CPU seconds spent in each of the five stages are printed.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``uCode``, ``target_uCode``, ``gender`` and ``act``.
    threshold : float, default 0.999
        Quantile forwarded to ``get_prob_accounts_by_Yes``.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that involve no problematic account.
    """
    # `time` comes from the notebook's import cell; it was missing originally,
    # which made this function raise NameError.
    t1 = time.process_time()
    prob_accounts_by_BD_female = get_prob_accounts_by_BD('F')

    t2 = time.process_time()
    prob_accounts_by_BD_male = get_prob_accounts_by_BD('M')

    t3 = time.process_time()
    prob_accounts_by_Yes = get_prob_accounts_by_Yes(df, threshold=threshold)

    t4 = time.process_time()
    prob_accounts = set(prob_accounts_by_Yes).union(
        set(prob_accounts_by_BD_female), set(prob_accounts_by_BD_male)
    )
    # Flag rows where either side of the interaction is a problematic account.
    prob_account_filt = (
        df['uCode'].isin(prob_accounts) | df['target_uCode'].isin(prob_accounts)
    )

    t5 = time.process_time()
    non_prob_df = df[~prob_account_filt]

    t6 = time.process_time()
    print(t2-t1, t3-t2, t4-t3, t5-t4, t6-t5)
    return non_prob_df

def insert_date(data, tscol='ts'):
    """Add ``datetime`` and ``date`` columns to ``data`` in place.

    Each value of ``data[tscol]`` is converted with ``datetime.fromtimestamp``
    (local time of the machine running the notebook); ``date`` is the calendar
    date of that result. Nothing is returned.
    """
    stamps = data[tscol].apply(datetime.fromtimestamp)
    data['datetime'] = stamps
    data['date'] = data['datetime'].dt.date
    return None

def insert_gender(data, gender='F'):
    """Add a ``gender`` column to ``data`` in place by looking up ``uCode``.

    The lookup table comes from ``get_background`` for the requested
    ``gender``; uCodes absent from it map to NaN. Nothing is returned.
    """
    background = get_background(columns=['uCode', 'gender'], gender=gender)
    code_to_gender = dict(zip(background['uCode'], background['gender']))
    data['gender'] = data['uCode'].map(code_to_gender)
    return None

def insert_group(data):
    """Add a ``group`` column to ``data`` in place from the treatment-arm file.

    ``fem_treat_arms_sep_jan_20w.csv`` supplies a ``uCode`` -> ``divide``
    mapping; uCodes absent from it map to NaN. Nothing is returned.
    """
    arms_file = Path('/home/cmc1503/Desktop/JD_exploration/data') / 'fem_treat_arms_sep_jan_20w.csv'
    arms = pd.read_csv(arms_file)
    code_to_arm = dict(zip(arms['uCode'], arms['divide']))
    data['group'] = data['uCode'].map(code_to_arm)
    return None

def insert_background(data, col, gender, last_login=202307):
    """Attach one background attribute to ``data`` in place.

    The attribute ``col`` is read via ``get_background`` for ``gender``.
    For ``gender == 'F'`` it is matched on ``data['uCode']`` and stored as
    ``uCode_<col>``; for any other gender it is matched on
    ``data['target_uCode']`` and stored as ``target_<col>``. Nothing is
    returned.
    """
    background = get_background(columns=['uCode', col], gender=gender, last_login=last_login)
    lookup = dict(zip(background['uCode'], background[col]))

    if gender == 'F':
        key_column, prefix = 'uCode', 'uCode'
    else:
        key_column, prefix = 'target_uCode', 'target'

    data[f'{prefix}_{col}'] = data[key_column].map(lookup)
    return None

def get_pie_chart(df, col='page'):
    """Draw a pie chart of the value frequencies of ``df[col]``.

    Every category appears in the legend as ``name (share%)``; only wedges
    whose share exceeds 5% are labelled directly on the pie.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes holding the chart.
    """
    counts = df[col].value_counts()
    shares = counts / counts.sum()

    legend_labels = []
    wedge_labels = []
    for category, share in zip(counts.index, shares):
        text = f'{category} ({share:.1%})'
        legend_labels.append(text)
        # Small slices get an empty wedge label and are described by the legend only.
        wedge_labels.append(text if share > 0.05 else '')

    fig, ax = plt.subplots()
    ax.pie(counts, labels=wedge_labels)
    ax.legend(labels=legend_labels, bbox_to_anchor=(1.8,1.2))

    return fig, ax

def get_summary(ols_results, to_latex=False):
    """Pull coefficients, standard errors and observation count from a fit result.

    Parameters
    ----------
    ols_results
        Object exposing ``params``, ``bse`` and ``nobs`` attributes.
    to_latex : bool, default False
        When true, ``params`` and ``bse`` are returned as the output of their
        ``to_latex()`` method instead of the raw objects.

    Returns
    -------
    tuple
        ``(params, bse, nobs)``, with the first two rendered to LaTeX when
        ``to_latex`` is true.
    """
    coefficients, std_errors = ols_results.params, ols_results.bse
    n_obs = ols_results.nobs

    if not to_latex:
        return coefficients, std_errors, n_obs
    return coefficients.to_latex(), std_errors.to_latex(), n_obs

# ================================================
# Not needed for now
# ================================================


def drop_null_target(data):
    """Remove, in place, every row of ``data`` whose ``target_uCode`` is missing.

    Nothing is returned; the caller's frame is modified directly.
    """
    required = ['target_uCode']
    data.dropna(subset=required, inplace=True)
    return None

# events dataset
def trim_by_event(data, event, bound=0.999, show_outlier=False):
    """Remove users whose count of ``event`` is at or above a quantile cut-off.

    Per-user counts are built for the four tracked events. A user is an
    outlier when their count of ``event`` is greater than or equal to the
    ``bound`` quantile of that event's per-user counts.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``uCode`` and ``event`` columns.
    event : str
        One of the tracked events; it must occur in ``data``.
    bound : float, default 0.999
        Quantile used as the cut-off.
    show_outlier : bool, default False
        When true, return the outliers' per-user count table instead of the
        trimmed data.

    Returns
    -------
    pd.DataFrame
        Either ``data`` without the outliers' rows, or (``show_outlier=True``)
        the per-user counts of the tracked events for the outliers.

    Raises
    ------
    KeyError
        If ``event`` never occurs in ``data`` or is not a tracked event.
    """
    # NOTE(review): 'SingIn' is kept exactly as written in the original;
    # confirm it matches the event name used in the data.
    tracked_events = ['SingIn', 'attnExposure', 'interestNo', 'interestYes']

    counts = data.groupby(['uCode', 'event']).size().unstack()
    if event not in counts.columns:
        # An all-zero column would put the cut-off at 0 and flag every user.
        raise KeyError(event)

    # reindex tolerates tracked events that are absent from `data` (filled with
    # 0 below); the original `[lis]` raised KeyError even when the missing
    # event was not the one being trimmed on.
    event_counts_by_uCode = counts.reindex(columns=tracked_events).fillna(0)

    cutoff = event_counts_by_uCode[event].quantile(bound)
    is_outlier = event_counts_by_uCode[event] >= cutoff

    if show_outlier:
        return event_counts_by_uCode[is_outlier]

    outliers = event_counts_by_uCode[is_outlier].index
    return data[~data.uCode.isin(outliers)]

# track_event dataset
def trim_by_act(data, act='interestYes', bound=0.999, show_outliers=False):
    """Remove users whose count of ``act`` is at or above a quantile cut-off.

    Per-user counts are built for every act observed in ``data``. A user is an
    outlier when their count of ``act`` is greater than or equal to the
    ``bound`` quantile of that act's per-user counts.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``uCode`` and ``act`` columns.
    act : str, default 'interestYes'
        The act to trim on; it must occur in ``data``.
    bound : float, default 0.999
        Quantile used as the cut-off.
    show_outliers : bool, default False
        When true, return the outliers' per-user count table instead of the
        trimmed data.

    Returns
    -------
    pd.DataFrame
        Either ``data`` without the outliers' rows, or (``show_outliers=True``)
        the per-user act counts for the outliers.

    Raises
    ------
    KeyError
        If ``act`` has no counted rows in ``data``.
    """
    counts = data.groupby(['uCode', 'act']).size().unstack()

    # Keep columns in first-appearance order as before, but skip labels that
    # groupby never produces (e.g. NaN acts); selecting those raised KeyError.
    observed_acts = [a for a in data['act'].dropna().unique() if a in counts.columns]
    act_counts_by_uCode = counts[observed_acts].fillna(0)

    cutoff = act_counts_by_uCode[act].quantile(bound)
    is_outlier = act_counts_by_uCode[act] >= cutoff

    if show_outliers:
        return act_counts_by_uCode[is_outlier]

    outliers = act_counts_by_uCode[is_outlier].index
    return data[~data.uCode.isin(outliers)]

def get_graph(data, group, event):
    """Count the occurrences of ``event`` per ``date``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``event`` and ``date`` columns.
    group
        Currently unused by the computation; it only appeared in the file name
        of a plotting step that is disabled.
    event
        Event value to keep before counting.

    Returns
    -------
    pd.DataFrame
        Indexed by ``date`` with a single ``event`` column holding the counts.
    """
    matching = data['event'] == event
    daily_counts = data.loc[matching, ["event", "date"]].groupby('date').count()
    return daily_counts