# Doubly Robust 法

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression

In [5]:
# Read the processed panel from the Excel file (path is relative to this notebook).
filepath = '../../../../data/processed/df_filtered_5years.xlsx'
raw_df = pd.read_excel(filepath)

# Append log(income) as a new column, then keep only rows without missing values.
# NOTE(review): np.log returns -inf for an income of 0 and dropna() does not
# remove infinities by default -- assumes income is strictly positive; confirm.
df = raw_df.assign(log_income=lambda frame: np.log(frame['income'])).dropna()
df.head()

Unnamed: 0,island,year,island_id,region_code,region_name,prefecture_code,population,dummy_island_has_bridge,dummy_island_is_connected_mainland,year_bridge_opened,dummy_after_bridge_opened,year_connect_mainland,dummy_after_connect_mainland,income,prefecture_name,area_km2,distance_m,log_income
2,島後,2005,0,32528,隠岐の島町,32,17259.0,0,0,0,0,0,0,2030.888121,島根県,241.53,67840.0,7.616228
3,島後,2010,0,32528,隠岐の島町,32,15930.0,0,0,0,0,0,0,1725.990958,島根県,241.53,67840.0,7.453557
4,島後,2015,0,32528,隠岐の島町,32,14901.0,0,0,0,0,0,0,1613.5317,島根県,241.53,67840.0,7.386181
5,島後,2020,0,32528,隠岐の島町,32,13882.0,0,0,0,0,0,0,1686.1734,島根県,241.53,67840.0,7.430217
6,中ノ島,1985,1,32525,海士町,32,3339.0,0,0,0,0,0,0,363.212405,島根県,32.29,49730.0,5.894988


## 処置変数と共変量の指定

In [6]:
# Covariates X that are fed to the propensity model.
covariates = df.loc[:, ['log_income', 'area_km2', 'distance_m']]
# Treatment indicator T (dummy column).
treatment = df.loc[:, 'dummy_after_bridge_opened']
# Outcome Y.
target = df.loc[:, 'population']

## 回帰モデル

In [7]:
# Design matrix of the outcome model: treatment dummy plus the covariates.
X = df.loc[:, ['dummy_after_bridge_opened', 'log_income', 'area_km2', 'distance_m']]

# Linear outcome model of population on the treatment dummy and covariates.
reg1 = LinearRegression()
reg1.fit(X, target)

# Predicted outcome for every row with the treatment dummy forced to 0.
treatment_0 = X.assign(dummy_after_bridge_opened=0)
target_0 = reg1.predict(treatment_0)

# Predicted outcome for every row with the treatment dummy forced to 1.
treatment_1 = X.assign(dummy_after_bridge_opened=1)
target_1 = reg1.predict(treatment_1)

## 傾向スコアを求める

In [8]:
# Propensity model: logistic regression of the treatment dummy on the covariates.
reg2 = LogisticRegression()
reg2.fit(covariates, treatment)

print(f'Intercept: {reg2.intercept_}')
print(f'Coefficients: {reg2.coef_}')

# Class probabilities per row; columns follow reg2.classes_, so for a 0/1
# treatment column 0 is P(T=0|X) and column 1 is P(T=1|X).
treatment_pred = reg2.predict_proba(covariates)
# NOTE(review): the recorded output shows P(T=1|X) of roughly 1e-41 to 1e-51
# for the first rows. Scores this close to 0 make inverse-probability weights
# unstable -- consider scaling the covariates or clipping the scores; confirm.
print(f'Predicted probabilities: {treatment_pred[:5]}')
print('---')
print(treatment.iloc[:5])

Intercept: [-1.17470098]
Coefficients: [[ 0.21169135  0.06112427 -0.00192777]]
Predicted probabilities: [[1.00000000e+00 6.37642983e-51]
 [1.00000000e+00 6.16058741e-51]
 [1.00000000e+00 6.07334315e-51]
 [1.00000000e+00 6.13022423e-51]
 [1.00000000e+00 1.79449700e-41]]
---
2    0
3    0
4    0
5    0
6    0
Name: dummy_after_bridge_opened, dtype: int64


## ATE

変数

- T: 処置変数
- Y: 被説明変数
- X: 共変量


$$
E[Y_i | T_i = 1] =
    \frac{1}{N}\sum_{i=1}^{N}\left[
    \frac{T_i}{P(T = 1 | X_i)}Y_i
+ \left(1 - \frac{T_i}{P(T = 1 | X_i)}\right)\hat{Y^1_i}
    \right]
$$


- 第1項: 傾向スコアの逆数で重み付けした処置群の観測アウトカム
- 第2項: 回帰モデルによる「処置を受けた場合」の予測アウトカム $\hat{Y^1_i}$ を用いた補正項

$$
E[Y_i | T_i = 0] =
    \frac{1}{N}\sum_{i=1}^{N}\left[
    \frac{1 - T_i}{P(T = 0 | X_i)}Y_i
+ \left(1 - \frac{1 - T_i}{P(T = 0 | X_i)}\right)\hat{Y^0_i}
    \right]
$$

- 第1項: 傾向スコアの逆数で重み付けした対照群の観測アウトカム
- 第2項: 回帰モデルによる「処置を受けなかった場合」の予測アウトカム $\hat{Y^0_i}$ を用いた補正項

$$
ATE = E[Y_i | T_i = 1] - E[Y_i | T_i = 0] \\
$$

In [9]:
# Propensity columns: index 0 is used for the control arm, index 1 for the treated arm.
p_control = treatment_pred[:, 0]
p_treated = treatment_pred[:, 1]

# Per-row doubly robust term for the treated arm: inverse-probability-weighted
# observed outcome plus a correction built from the regression prediction target_1.
ITE_1_i = target / p_treated * treatment + (1 - treatment / p_treated) * target_1

# Per-row doubly robust term for the control arm, built the same way from target_0.
ITE_0_i = target / p_control * (1 - treatment) + (1 - (1 - treatment) / p_control) * target_0

# Estimated ATE: average over rows of the treated-minus-control terms.
ATE = 1 / len(target) * (ITE_1_i - ITE_0_i).sum()
print(f"推定したATE: {ATE:.2f}")

推定したATE: 2147.42


In [69]:
# Bootstrap settings. The fixed seed makes the standard error and the
# confidence interval reproducible under Restart Kernel -> Run All.
n_bootstraps = 1000
bootstrap_seed = 0


def estimate_dr_ate(sample):
    """Compute the doubly robust ATE estimate on one data frame.

    Fits a linear outcome model and a logistic propensity model on ``sample``
    and combines them with the same formula as the point-estimate cell.

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain the columns 'population', 'dummy_after_bridge_opened',
        'log_income', 'area_km2' and 'distance_m'.

    Returns
    -------
    float
        Estimated average treatment effect for ``sample``.
    """
    outcome = sample['population']
    treated = sample['dummy_after_bridge_opened']
    confounders = sample[['log_income', 'area_km2', 'distance_m']]
    design = sample[['dummy_after_bridge_opened', 'log_income', 'area_km2', 'distance_m']]

    # Outcome model and its predictions with the treatment dummy forced to 0 / 1.
    outcome_model = LinearRegression().fit(design, outcome)
    outcome_if_control = outcome_model.predict(design.assign(dummy_after_bridge_opened=0))
    outcome_if_treated = outcome_model.predict(design.assign(dummy_after_bridge_opened=1))

    # Propensity scores; column 0 is used for the control arm, column 1 for the treated arm.
    propensity = LogisticRegression().fit(confounders, treated).predict_proba(confounders)
    p_control = propensity[:, 0]
    p_treated = propensity[:, 1]

    # Per-row doubly robust terms for each arm.
    dr_treated = outcome / p_treated * treated + (1 - treated / p_treated) * outcome_if_treated
    dr_control = outcome / p_control * (1 - treated) + (1 - (1 - treated) / p_control) * outcome_if_control

    return 1 / len(outcome) * (dr_treated - dr_control).sum()


# Resample rows with replacement and re-estimate the ATE on every replicate.
# A single seeded generator is shared so each replicate gets a different draw.
# NOTE(review): rows are resampled independently, although the displayed data
# contain several years per island; a bootstrap that resamples whole islands
# may be more appropriate -- confirm.
bootstrap_rng = np.random.RandomState(bootstrap_seed)
bootstrapped_ates = [
    estimate_dr_ate(df.sample(n=len(df), replace=True, random_state=bootstrap_rng))
    for _ in range(n_bootstraps)
]

# Bootstrap standard error: sample standard deviation (ddof=1) of the replicates.
standard_error = np.std(bootstrapped_ates, ddof=1)

# Percentile 95% confidence interval.
lower_bound = np.percentile(bootstrapped_ates, 2.5)
upper_bound = np.percentile(bootstrapped_ates, 97.5)

print(f'ATE: {ATE:.2f}')
print(f'Standard Error: {standard_error:.2f}')
print(f'95% Confidence Interval: [{lower_bound:.2f}, {upper_bound:.2f}]')

ATE: 2147.42
Standard Error: 284.40
95% Confidence Interval: [1573.89, 2701.97]
