In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from pyarrow import feather

# OLS 模型
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

# Fixed Effects 模型
import pyfixest as pf

from utils import (
    get_events, 
    get_track_event, 
    insert_date, 
    insert_gender,
    insert_group,
    trim, 
    get_background, 
    insert_gender, 
    get_prob_accounts_by_BD,
    get_prob_accounts_by_Yes,
    drop_prob_accounts,
    drop_null_target
)

# 設定顯示格式與路徑
pd.options.display.float_format = '{:.4f}'.format
FIG_PATH = Path('/home/cmc1503/Desktop/JD_exploration/figures')
DATA_PATH = Path('/home/cmc1503/Desktop/JD_exploration/data')
p = DATA_PATH

### Functions

In [2]:
def get_all_track_event(period: str, cleared=True) -> pd.DataFrame:
    """
    從 Feather 檔案讀取 'track_event' 資料。
    根據 'period' (before/after/other) 和 'cleared' (True/False) 選擇不同檔案。

    Args:
        period (str): 'before', 'after', 或 'other' (代表 combined)。
        cleared (bool): 是否讀取 "cleared" (已清理) 版本 (僅在 period='after' 時有效)。

    Returns:
        pd.DataFrame: 讀取的 track_event 資料。
    """
    path = Path('/home/cmc1503/Desktop/JD_exploration/data')
    if period == 'before':
        df = feather.read_feather(path / 'all_track_event_before.feather')
    elif period == 'after':
        if cleared:
            df = feather.read_feather(path / 'all_track_event_after_cleared.feather')
        else:
            df = feather.read_feather(path / 'all_track_event_after.feather')
    else:
        # 預設讀取 combined_cleared 版本
        df = feather.read_feather(path / 'all_track_event_combined_cleared.feather')
    return df

def get_all_track_event_with_stars():
    """
    讀取包含 'stars' 和 'prev' (先前行為) 的特定 Feather 檔案。
    實驗開始後、清除過後、女性、page == meet、act == interestYes/No 的資料

    Returns:
        pd.DataFrame: 讀取的特定 track_event 資料。
    """
    # df = feather.read_feather(DATA_PATH / 'all_track_event_after_cleared_fem_meet_with_stars.feather')
    df = feather.read_feather(DATA_PATH / 'all_track_event_after_cleared_fem_meet_with_stars_and_prev.feather') # 有「先前」的行為
    return df

def get_card_pool(start=9, end=25, gender='F'):
    """
    從多個分片的 CSV 檔案中讀取 'card_pool' 資料。

    Args:
        start (int): 開始週次 (包含)。
        end (int): 結束週次 (包含)。
        gender (str): 'F' (女性) 或 'M' (男性)。

    Returns:
        pd.DataFrame: 包含指定週次範圍的所有 card_pool 資料。
    """
    
    data = pd.DataFrame()
    
    for week in range(start, end+1):
        t1 = time.perf_counter()
        df = pd.DataFrame()
        for i in range(10):
            if week < 10:
                temp = pd.read_csv(p / f"week0{week}_{gender}_{i}.csv")
            else:
                temp = pd.read_csv(p / f"week{week}_{gender}_{i}.csv")
            df = pd.concat([df, temp])
        
        df["week"] = week
        data = pd.concat([data, df])
        t2 = time.perf_counter()

        print(f"Week {week} loaded in {t2 - t1:.2f} seconds")
    return data

def get_all_pool(gender='F'):
    """
    從 Feather 檔案讀取 'all_pool' (女性或男性) 資料。

    Args:
        gender (str): 'F' (女性) 或 'M' (男性)。

    Returns:
        pd.DataFrame: 讀取的 all_pool 資料。
    """
    path = Path('/home/cmc1503/Desktop/JD_exploration/data')
    if gender == 'F':
        df = feather.read_feather(path / 'all_pool_female.feather')
    else:
        df = feather.read_feather(path / 'all_pool_male.feather')
    return df

def get_summary(ols_results, to_latex=False):
    """
    從 OLS (迴歸) 結果物件中提取 Beta、Standard Error 和 Number of Observations。

    Args:
        ols_results: statsmodels OLS 執行 .fit() 後的結果物件。
        to_latex (bool): 是否將 Beta 和 StdErr 轉換為 LaTeX 格式 (預設 False)。

    Returns:
        (beta, std_err, nobs): 三個值 (或兩個字串和一個值)。
    """
    beta = ols_results.params
    std_err = ols_results.bse
    nobs = ols_results.nobs
    if to_latex:
        return beta.to_latex(), std_err.to_latex(), nobs
    else:
        return beta, std_err, nobs



## Regression Models

### Load data and Make Filters

In [None]:
# Load data
# data = get_all_track_event_with_stars()
# female_uCode = get_background(columns=['uCode', 'gender'], gender='F')
# male_uCode = get_background(columns=['uCode', 'gender'], gender='M')
# all_uCode = pd.concat([female_uCode, male_uCode], axis=0)

# data = insert_date(data)
# data = insert_gender(data, background_df=all_uCode)
# data = insert_group(data)

data

In [None]:
# Filters

gender_filt = (data['gender'] == 'F')
group_na_filt = (~data['group'].isna())
act_na_filt = (~data['act'].isna())
na_filt = group_na_filt & act_na_filt

week_filt = data['week'] <= 13 # 前五週
reveal_filt = data['reveal_stars'] == 1
group_filt = data['group'] == 'D'

filters = {
    'gender': gender_filt,
    'group_na': group_na_filt,
    'act_na': act_na_filt,
    'na_filt': group_na_filt & act_na_filt
}

# add column: month
data['month'] = data.date.apply(lambda x: x.month)

base_week = 9

### Model 1
考慮每個 InterestYes (swipe) 對 group, week, vip 的迴歸，但要注意，可能同個人會一直被按讚，$Y_i$ 間不獨立。

a. 
$$
\mathbb{I}(\text{InterestYes})_i \sim \text{group}_i
$$

b.
$$
\mathbb{I}(\text{InterestYes})_i \sim \text{group}_i + \text{week}_i
$$

c.
$$
\mathbb{I}(\text{InterestYes})_i \sim \text{group}_i + \text{week}_i + \text{vip}_i
$$

d.
$$
\mathbb{I}(\text{InterestYes})_i \sim \text{group}_i + \text{week}_i + \text{group}\times \text{week}_i
$$

其中 $i$ 表示第$i$筆資料。

In [None]:
def Model1(data, model):
    """
    根據資料，回傳 ols fit 的 results

    Args:
        data: 要跑迴歸的 track_event
        model: a, b, c, d

    Return:
        fit 的結果，可使用 summary() 以報表形式查看
    """
    page_filt = (data['page'] == 'meet')
    act_filt = (data['act'] == 'interestNo') | (data['act'] == 'interestYes')
    data = data[gender_filt & page_filt & act_filt & group_na_filt]

    # swipe 欄已包含在 all_track_event_with_stars 裡面
    
    # conditions = [(data["act"] == "interestNo"), (data['act'] == "interestYes")]
    # choices = [0, 1]
    # data['swipe'] = np.select(conditions, choices, default=np.nan)
    # data['swipe'] = data['swipe'].astype('Int64')

    if model == 'a':
        lm = smf.ols('swipe ~ C(group, Treatment(reference="D"))', data=data)
    elif model == 'b':
        lm = smf.ols('swipe ~ C(group, Treatment(reference="D")) + C(week)', data=data)
    elif model == 'c':
        lm = smf.ols('swipe ~ C(group, Treatment(reference="D")) + C(week) + vip', data=data)
    elif model == 'd':
        lm = smf.ols('swipe ~ C(group, Treatment(reference="D")) + C(week) + C(group, Treatment(reference="D")):C(week)', data=data)
    else:
        print('請選擇 a, b, c, d')

    return lm.fit()

Model1(data, 'a').summary()

### Model 2

考慮每週收到的 'interestYes' 的總和，對 group, week 的迴歸，一樣要注意有些人可能只有一週的數據，有些人則多週都有收到 'interestYes'，$Y_i$ 間不獨立。

a.
$$
\#(\text{InterestYes})_{\text{i, t}} \sim \text{group}_i + \text{week}_t
$$

b.
$$
\#(\text{InterestYes})_{\text{i, t}} \sim \text{group}_i + \text{week}_t + \text{group}_i \times \text{week}_t
$$

其中 $i$ 表示使用者，$t$ 表示週次，所以 $\text{week}_t = t$。

In [None]:
def Model2(data, model):
    """
    根據資料，回傳 ols fit 的 results

    Args:
        data: 要跑迴歸的 track_event
        model: a, b

    Return:
        fit 的結果，可使用 summary() 以報表形式查看
    """
    page_filt = (data['page'] == 'meet')
    act_filt = (data['act'] == 'interestYes')

    columns = ['uCode', 'act', 'week', 'group']

    data = data[gender_filt & page_filt & act_filt & group_na_filt][columns]
    
    data.groupby(['uCode', 'act', 'week']).count().reset_index()

    if model == 'a':
        lm = smf.ols('act ~ C(group, Treatment(reference="D")) + C(week)', data=df)
    elif model == 'b':
        lm = smf.ols('act ~ C(group, Treatment(reference="D")) + C(week) + C(group, Treatment(reference="D")):C(week)', data=df)
    else:
        print('請選擇 a, b')

    return lm.fit()

Model2(data, model='a').summary()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,act
uCode,group,week,Unnamed: 3_level_1
AA1636121728,B,13,189
AA1745126491,D,9,921
AA1745126491,D,10,722
AA1745126491,D,11,457
AA1745126491,D,12,560
...,...,...,...
ZZ9405386190,B,18,3649
ZZ9405386190,B,19,946
ZZ9405386190,B,20,58
ZZ9405386190,B,21,6


### Model 8