-------------
### ライブラリのインポート＆その他の設定（Importing Libraries & Other Settings）

In [1]:
# 基本的なライブラリ
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from scipy import stats

# Scikit-learn関連
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.utils.validation import check_X_y
from sklearn.metrics import silhouette_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# 抽象基底クラス (ABC)
from abc import ABCMeta, abstractmethod

# タイピングのサポート
from typing import Optional


# 可視化の設定
import matplotlib.pyplot as plt
import japanize_matplotlib
import itertools

# Apply matplotlib's "ggplot" style to the figures drawn after this point
plt.style.use("ggplot")

--------
### 実験設定

In [2]:
TARGET = "GMV"  # target metric: "GMV" or "BCR"
DATA = "part"  # "all" or "part"
DATA_SIZE = 20000  # number of rows kept for the "part" subset (train + test combined)
TEST_SIZE = 0.5  # fraction of rows held out as test data
THRESHOLD = 2  # z-score cutoff used for outlier removal

N_TRIALS = 10000  # number of trials (how many sample means are drawn)
SAMPLE_SIZE = 100  # size of each sample
RANDOM_STATE = 0  # random seed

-----------
### データの前処理（Data Preprocessing）

##### 外れ値を除去する関数

In [3]:
# Outlier removal
def remove_outliers_zscore(
    data: pd.DataFrame, metric: str, threshold: float = 2
) -> pd.DataFrame:
    """Drop the rows whose ``metric`` value is a z-score outlier.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.
    metric : str
        Name of the numeric column the z-scores are computed on.
    threshold : float, default 2
        Rows are kept only when ``|z| < threshold``.

    Returns
    -------
    pd.DataFrame
        An independent copy of the retained rows with their original index
        labels, so the caller can add columns without triggering pandas'
        SettingWithCopyWarning.

    Notes
    -----
    Missing values in ``metric`` are ignored when the mean and standard
    deviation are estimated (``nan_policy="omit"``). Rows whose ``metric`` is
    missing are dropped, because a NaN z-score never satisfies
    ``|z| < threshold``.
    """
    z_scores = np.abs(stats.zscore(data[metric].to_numpy(), nan_policy="omit"))
    keep = z_scores < threshold
    # Explicit copy: the result must not be treated as a slice of `data`
    return data.loc[keep].copy()

##### メルカリデータ（df1：全部, df2：一部）

In [4]:
# df1 (all the data)
# NOTE(review): hardcoded absolute path on a local Windows drive; the notebook
# only runs on a machine where this exact file exists.
df1 = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\aug_first_cpn_data_for_ab_test_sensibility_tsukuba.csv"
)

# Columns of df1 used as candidate covariates (features)
features_list = [
    "hist_4_day_buy_num",
    "hist_4_day_gmv",
    "his_4_day_is_buy",  # NOTE(review): "his_" rather than "hist_" -- presumably matches the CSV header; confirm
    "hist_30_day_buy_days",
    "hist_30_day_buy_num",
    "hist_30_day_gmv",
    "hist_30_day_buy_recency",
    "hist_30_day_pay_days",
    "hist_30_day_atpu",
    "hist_30_day_gpv",
    "hist_30_day_pay_recency",
    "hist_30_day_list_days",
    "hist_30_day_list_num",
    "hist_30_day_list_recency",
    "hist_30_day_like_count",
    "hist_30_day_like_count_not_deleted",
    "hist_30_day_like_recency",
]

# df2 (subset of the data): the first DATA_SIZE rows of df1
df2 = df1.iloc[:DATA_SIZE]

##### 外れ値除去と標準化

In [5]:
# Select the full data set or the DATA_SIZE-row subset; an unknown DATA value
# fails loudly instead of leaving `df` undefined (or stale from an earlier run)
if DATA == "all":
    df_selected = df1
elif DATA == "part":
    df_selected = df2
else:
    raise ValueError(f'DATA must be "all" or "part", got {DATA!r}')

# Actual row count before filtering (DATA_SIZE is only correct for "part")
n_rows_before = len(df_selected)

# Remove GMV outliers, renumber the rows, then add the binary BCR target
# (1 when GMV > 0, else 0). reset_index/assign return new frames, so no column
# is ever written into a slice of df1/df2.
df = (
    remove_outliers_zscore(data=df_selected, metric="GMV", threshold=THRESHOLD)
    .reset_index(drop=True)
    .assign(BCR=lambda frame: np.where(frame["GMV"] > 0, 1, 0))
)

# Taken after the index reset so X_all / y_all share row labels with df
X_all = df[features_list]
y_all = df[TARGET]

print("外れ値除外前のデータ数（訓練+テスト）:", n_rows_before)
print("外れ値除外後のデータ数（訓練+テスト）:", len(df))

外れ値除外前のデータ数（訓練+テスト）: 20000
外れ値除外後のデータ数（訓練+テスト）: 19801


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["BCR"] = np.where(df["GMV"] > 0, 1, 0)


##### 訓練とテストに分割

In [6]:
# Split into train and test; the seed comes from the config cell (RANDOM_STATE)
# rather than a literal, so changing the seed there affects the split too
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print("訓練データのデータ数:", len(X_train))
print("テストデータのデータ数:", len(X_test))

訓練データのデータ数: 9900
テストデータのデータ数: 9901


##### 目的変数と最も相関が高い共変量を選ぶ

In [7]:
# Correlation of each training covariate with the training target
correlations = X_train.corrwith(y_train)

# Select the covariate with the strongest correlation
most_correlated_var = correlations.abs().idxmax()  # label with the largest absolute correlation
max_correlation = correlations[most_correlated_var]

print(
    f"{TARGET}と最も相関の高い変数: {most_correlated_var}, 相関係数: {max_correlation}"
)

GMVと最も相関の高い変数: hist_30_day_gmv, 相関係数: 0.37772488020585426


-------------
### CUPED

In [8]:
def CUPED(
    X: pd.DataFrame,
    y: pd.Series,
    covariate: str,
    sample_size: int,
    n_trials: int,
    metric="GMV",
):
    """Estimate the variance of the CUPED-adjusted sample mean by resampling.

    The target is adjusted with one covariate,
    ``y_cuped = y - alpha * (x - mean(x))`` where
    ``alpha = cov(y, x) / var(x)``, and ``n_trials`` samples of
    ``sample_size`` rows are drawn from ``y_cuped`` without replacement.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing the covariate column. It is combined with ``y`` by
        position, so both must list the same rows in the same order.
    y : pd.Series
        Target values.
    covariate : str
        Name of the column of ``X`` used for the adjustment.
    sample_size : int
        Number of rows drawn in each trial.
    n_trials : int
        Number of trials; trial ``i`` uses ``random_state=i``.
    metric : str, default "GMV"
        Unused; kept so existing calls keep working.

    Returns
    -------
    float
        Variance (ddof=0) of the ``n_trials`` sample means of ``y_cuped``.

    Notes
    -----
    The covariance and the variance in ``alpha`` both come from the same
    ``np.cov`` matrix, so they share one degrees-of-freedom convention.
    The covariate is only centred, not rescaled: a rescaling would be absorbed
    by ``alpha`` and leave ``y_cuped`` unchanged.
    A covariate with zero variance carries no information; ``alpha`` is then 0
    and ``y`` is left unadjusted instead of becoming NaN.
    """
    x = X[covariate].to_numpy(dtype=float)
    x_centered = x - x.mean()

    # 2x2 sample covariance matrix: [0, 1] is cov(y, x), [1, 1] is var(x)
    cov_matrix = np.cov(y.to_numpy(dtype=float), x_centered)
    x_var = cov_matrix[1, 1]
    alpha = cov_matrix[0, 1] / x_var if x_var > 0 else 0.0

    y_cuped = y - alpha * x_centered

    sample_means = np.zeros(n_trials)
    for i in range(n_trials):
        # Draw a random sample from y_cuped; the seed is the trial number
        sample_means[i] = y_cuped.sample(n=sample_size, random_state=i).mean()

    return np.var(sample_means)

In [9]:
# Variance of the CUPED-adjusted sample mean on the test split
# (despite the "_std" suffix, CUPED returns a variance, not a standard deviation)
cuped_std = CUPED(
    X=X_test,
    y=y_test,
    covariate=most_correlated_var,
    n_trials=N_TRIALS,
    sample_size=SAMPLE_SIZE,
)

In [11]:
# Baseline: variance of the unadjusted sample mean under simple random sampling
# (despite the "_std" suffix, random_std holds a variance)
np.random.seed(RANDOM_STATE)
y_hats = []
for _ in range(N_TRIALS):
    sample = np.random.choice(y_test, SAMPLE_SIZE, replace=False)
    y_hats.append(sample.mean())

# Computed once after the loop; recomputing it on every trial is quadratic work
# for the same final value
random_std = np.array(y_hats).var()

# Variance reduction achieved by CUPED relative to the baseline, in percent
reduction_rate = (1 - cuped_std / random_std) * 100
print(reduction_rate)
print(random_std)

8.066076597762152
208604.41171572442
