-------------
# １．ライブラリのインポート

In [1]:
# 基本的なライブラリ
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from scipy import stats

# Scikit-learn関連
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_X_y

# タイピングのサポート
from typing import Optional


# 可視化の設定
import matplotlib.pyplot as plt
import japanize_matplotlib
import itertools

plt.style.use("ggplot")

--------
# ２．実験設定

In [2]:
### 実験データの設定 ###
TARGET = "PM_US Post"
DATA = "all"  # "all" or "part"
THRESHOLD = 2  # 外れ値除外の閾値

### 実験設定 ###
N_TRIALS = 10000  # 試行回数（標本平均を求める回数）
SAMPLE_SIZE = 10000  # 標本サイズ
RANDOM_STATE = 0  # 乱数シード
SEED = 0

-----------
# ３．データの前処理

## 3.1 データ読み込み

In [3]:
df_Beijing = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\中国気象データ\BeijingPM20100101_20151231.csv"
)
df_Chengdu = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\中国気象データ\ChengduPM20100101_20151231.csv"
)
df_Guangzhou = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\中国気象データ\GuangzhouPM20100101_20151231.csv"
)
df_Shanghai = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\中国気象データ\ShanghaiPM20100101_20151231.csv"
)
df_Shenyang = pd.read_csv(
    R"C:\Users\HaruMomozu\Documents\中国気象データ\ShenyangPM20100101_20151231.csv"
)

if DATA == "all":
    df_all = pd.concat(
        [df_Beijing, df_Chengdu, df_Guangzhou, df_Shanghai, df_Shenyang],
        axis=0,
        ignore_index=True,
    )
if DATA == "part":
    df_all = df_Beijing

features_list = [
    "DEWP",
    "TEMP",
    "HUMI",
    "PRES",
    "Iws",
    "precipitation",
    "Iprec",
]

variables_list = [
    "PM_US Post",
    "DEWP",
    "TEMP",
    "HUMI",
    "PRES",
    "Iws",
    "precipitation",
    "Iprec",
]

df_2014 = df_all[df_all["year"] == 2014][variables_list]
df_2015 = df_all[df_all["year"] == 2015][variables_list]

ALL_FEATURES_INDEX = [i for i in range(len(features_list))]

## 3.2 欠損値除去

In [4]:
def remove_nan(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    # 各列に対して、pd.to_numericを使用して数値に変換（エラーがあればNaNにする）
    df_numeric = data.apply(lambda col: pd.to_numeric(col, errors="coerce"))

    # 数値に変換できなかった行を抽出（NaNを含む行）
    df_excluded = data[df_numeric.isna().any(axis=1)]

    # NaNを含む行を削除
    df_clean = df_numeric.dropna()

    return df_excluded, df_clean

In [5]:
_, df_2014_clean = remove_nan(df_2014)
_, df_2015_clean = remove_nan(df_2015)

print("全データ数（訓練）：", len(df_2014))
print("全データ数（テスト）：", len(df_2015))
print("欠損値除去後のデータ数（訓練）：", len(df_2014_clean))
print("欠損値除去後のデータ数（テスト）：", len(df_2015_clean))

全データ数（訓練）： 43800
全データ数（テスト）： 43800
欠損値除去後のデータ数（訓練）： 40334
欠損値除去後のデータ数（テスト）： 39098


## 3.3 外れ値除去

In [6]:
def remove_outliers_zscore(
    data: pd.DataFrame, metric: str, threshold: float
) -> pd.DataFrame:
    z_scores = np.abs(stats.zscore(data[metric]))
    data = data[(z_scores < threshold)]
    return data

In [7]:
df_train = remove_outliers_zscore(
    data=df_2014_clean, metric=TARGET, threshold=THRESHOLD
)
df_train = df_train.reset_index(drop=True)  # 行を詰める
df_test = remove_outliers_zscore(data=df_2015_clean, metric=TARGET, threshold=THRESHOLD)
df_test = df_test.reset_index(drop=True)  # 行を詰める

print("欠損値と外れ値除外後のデータ数（訓練）:", len(df_train))
print("欠損値と外れ値除外後のデータ数（テスト）:", len(df_test))

欠損値と外れ値除外後のデータ数（訓練）: 38559
欠損値と外れ値除外後のデータ数（テスト）: 37237


## 3.4 X, y に分割

In [8]:
X_train = df_train[features_list].to_numpy()
y_train = df_train[TARGET].to_numpy()

X_test = df_test[features_list].to_numpy()
y_test = df_test[TARGET].to_numpy()

-----------------------------
# 4. 共変量の選択（訓練データ)

In [9]:
X_train_df = pd.DataFrame(X_train)
y_train_series = pd.Series(y_train)

correlations = X_train_df.corrwith(y_train_series)

# 最も相関が高い変数の選択
most_correlated_var_index = correlations.abs().idxmax()  # 絶対値が最大の変数を取得
max_correlation = correlations[most_correlated_var_index]

print(
    f"{TARGET}と最も相関の高い変数のインデックス: {most_correlated_var_index}, 相関係数: {round(max_correlation, 3)}"
)

PM_US Postと最も相関の高い変数のインデックス: 1, 相関係数: -0.226


-----------------
# 5．CUPED

## 5.1 αの計算（訓練データ）

In [10]:
scaler = StandardScaler()
scaled_cov = scaler.fit_transform(X_train[:, [most_correlated_var_index]])
alpha = np.cov(y_train, scaled_cov[:, 0])[0, 1] / np.var(scaled_cov[:, 0])

print("alpha : ", alpha)

alpha :  -9.673673177441103


## 5.2 CUPEDの実施（テストデータ）

In [11]:
def cauculate_var_cuped(
    X: NDArray,
    y: NDArray,
    seed: int,
    alpha: float,
    most_correlated_var_index: int,
    sample_size: int,
    n_trials: int,
) -> float:
    np.random.seed(seed)

    scaler = StandardScaler()
    scaled_cov = scaler.fit_transform(X[:, [most_correlated_var_index]])
    y_cuped = y - alpha * scaled_cov[:, 0]

    sample_means = np.zeros(n_trials)
    for i in range(n_trials):
        # y_cupedからランダムにサンプリング
        sample = np.random.choice(y_cuped, sample_size, replace=False)
        sample_means[i] = sample.mean()

    # 標本分散を計算
    variance = np.var(sample_means)

    return variance

-------------------
# 6. COSS

In [12]:
def cauculate_var_coss(
    X: NDArray,
    y: NDArray,
    seed: int,
    most_correlated_var_index: int,
    sample_size: int,
    n_trials: int,
) -> float:
    np.random.seed(seed)

    X = pd.DataFrame(X)
    y = pd.Series(y)

    sample_means_t = np.zeros(n_trials)
    sample_means_c = np.zeros(n_trials)
    for i in range(n_trials):
        rows = np.random.choice(X.index, sample_size * 2, replace=False)

        sample_X = X.iloc[rows, most_correlated_var_index]
        sample_X_sorted = sample_X.sort_values(ascending=False)
        # print("rows:", rows)
        # print("sample_X_sorted:", sample_X_sorted.values)

        rows_t = sample_X_sorted.index[::2]
        rows_c = sample_X_sorted.index[1::2]
        # print("rows_t:", rows_t)
        # print("rows_c:", rows_c)

        sample_t = y.loc[rows_t].to_numpy()
        sample_c = y.loc[rows_c].to_numpy()

        sample_means_t[i] = sample_t.mean()
        sample_means_c[i] = sample_c.mean()

    variance_t = np.var(sample_means_t)
    variance_c = np.var(sample_means_c)

    return variance_t, variance_c

----------------
# 7. 評価

In [13]:
def cauculate_var_random(y: NDArray, seed: int, n_trials: int, sample_size) -> float:
    np.random.seed(seed)
    y_hats = []
    for i in range(n_trials):
        sample = np.random.choice(y, sample_size, replace=False)
        y_hat_random = sample.mean()
        y_hats.append(y_hat_random)
    var_random = np.array(y_hats).var()

    return var_random

In [14]:
VAR_RANDOM_TRAIN = cauculate_var_random(
    y=y_train, seed=SEED, n_trials=N_TRIALS, sample_size=SAMPLE_SIZE
)
VAR_RANDOM_TEST = cauculate_var_random(
    y=y_test, seed=SEED, n_trials=N_TRIALS, sample_size=SAMPLE_SIZE
)

var_cuped_train = cauculate_var_cuped(
    X=X_train,
    y=y_train,
    seed=SEED,
    alpha=alpha,
    most_correlated_var_index=most_correlated_var_index,
    sample_size=SAMPLE_SIZE,
    n_trials=N_TRIALS,
)

var_cuped_test = cauculate_var_cuped(
    X=X_test,
    y=y_test,
    seed=SEED,
    alpha=alpha,
    most_correlated_var_index=most_correlated_var_index,
    sample_size=SAMPLE_SIZE,
    n_trials=N_TRIALS,
)

var_coss_train_t, var_coss_train_c = cauculate_var_coss(
    X=X_train,
    y=y_train,
    seed=SEED,
    most_correlated_var_index=most_correlated_var_index,
    sample_size=SAMPLE_SIZE,
    n_trials=N_TRIALS,
)

var_coss_test_t, var_coss_test_c = cauculate_var_coss(
    X=X_test,
    y=y_test,
    seed=SEED,
    most_correlated_var_index=most_correlated_var_index,
    sample_size=SAMPLE_SIZE,
    n_trials=N_TRIALS,
)

### 確認用

In [15]:
print("全データ数（訓練）：", len(df_2014))
print("全データ数（テスト）：", len(df_2015))
print("欠損値除去後のデータ数（訓練）：", len(df_2014_clean))
print("欠損値除去後のデータ数（テスト）：", len(df_2015_clean))
print("欠損値と外れ値除外後のデータ数（訓練）:", len(df_train))
print("欠損値と外れ値除外後のデータ数（テスト）:", len(df_test))

全データ数（訓練）： 43800
全データ数（テスト）： 43800
欠損値除去後のデータ数（訓練）： 40334
欠損値除去後のデータ数（テスト）： 39098
欠損値と外れ値除外後のデータ数（訓練）: 38559
欠損値と外れ値除外後のデータ数（テスト）: 37237


In [16]:
print("var_random_train =", VAR_RANDOM_TRAIN)
print("var_random_test =", VAR_RANDOM_TEST)

var_random_train = 0.13669576793727964
var_random_test = 0.10967935865886788


In [17]:
# CUPED の分散削減率
reduction_rate_cuped = (1 - var_cuped_test / VAR_RANDOM_TEST) * 100
print("CUPEDの分散削減率 : ", reduction_rate_cuped)

reduction_rate_coss_t = (1 - var_coss_test_t / VAR_RANDOM_TEST) * 100
reduction_rate_coss_c = (1 - var_coss_test_c / VAR_RANDOM_TEST) * 100
print("COSSの分散削減率(test) : ", reduction_rate_coss_t)
print("COSSの分散削減率(control) : ", reduction_rate_coss_c)

CUPEDの分散削減率 :  6.955600172971533
COSSの分散削減率(test) :  7.454781376456198
COSSの分散削減率(control) :  7.91254729542048


In [18]:
print("var_cuped_train =", var_cuped_train)
print("var_cuped_test =", var_cuped_test)

var_cuped_train = 0.1299281831547518
var_cuped_test = 0.1020505009982776


In [19]:
print("var_coss_train =", var_coss_train_t)
print("var_coss_test =", var_coss_test_t)

var_coss_train = 0.13040194188211104
var_coss_test = 0.10150300225575
