## 1.4.1 RCTを行ったデータの準備

In [154]:
# ライブラリインポート
import pandas as pd
from scipy import stats
import random
import numpy as np

In [155]:
# データ読み込み
email_data = pd.read_csv("http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")
email_data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [156]:
# 女性向けのメールが配信されたデータを削除する
male_df = email_data[email_data["segment"] != "Womens E-Mail"]

# 介入を表すtreatment変数を追加
male_df["treatment"] = male_df["segment"].apply(lambda x: 1 if x == "Mens E-Mail" else 0)
male_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63994,7,1) $0 - $100,86.46,0,1,Urban,0,Web,Mens E-Mail,0,0,0.0,1
63995,10,2) $100 - $200,105.54,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1
63996,5,1) $0 - $100,38.91,0,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1
63997,6,1) $0 - $100,29.99,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1


## 1.4.2 RCTデータの集計と有意差検定

In [157]:
# メールが配信されたグループとされなかったグループで，購入の発生確率と購入額の平均を算出
mean_comp = male_df.groupby("treatment")[["conversion", "spend"]].mean()
mean_comp

Unnamed: 0_level_0,conversion,spend
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.006,0.653
1,0.013,1.423


In [158]:
print("[RCTデータの場合]")

# conversion rateの差
diff_conv = mean_comp.iloc[1, 0] - mean_comp.iloc[0, 0]
print(f"conversion rateの差：{diff_conv:.3g}")

# spendの差
diff_spend = mean_comp.iloc[1, 1] - mean_comp.iloc[0, 1]
print(f"spendの差：{diff_spend:.3g}")

[RCTデータの場合]
conversion rateの差：0.00681
spendの差：0.77


In [159]:
# male_dfから各グループの売上金額データを抽出
mens_mail = male_df[male_df["treatment"] == 1].spend
no_mail = male_df[male_df["treatment"] == 0].spend

# 有意差検定
stats.ttest_ind(mens_mail, no_mail, equal_var = True)

Ttest_indResult(statistic=5.300090294465472, pvalue=1.163200872605869e-07)

## 1.4.3 バイアスのあるデータによる効果の検証

In [160]:
# シード
random.seed(0)

# 条件に反応するサンプルの量を半分にする
obs_rate_c = 0.5
obs_rate_t = 0.5

# male_dfに列obs_rate_cとobs_rate_tを作成
conditions = [(male_df["history"] > 300) | (male_df["recency"] < 6) | (male_df["channel"] == "Multichannel")]
male_df["obs_rate_c"] = np.select(conditions, [obs_rate_c], default = 1)
male_df["obs_rate_t"] = np.select(conditions, [1], default = obs_rate_t)
#male_df.head()

# バイアスのあるデータを作成
random_number = np.random.rand(len(male_df))
biased_data = male_df[((male_df["treatment"] == 0) & (random_number < male_df["obs_rate_c"])) | ((male_df["treatment"] == 1) & (random_number < male_df["obs_rate_t"]))]
biased_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,obs_rate_c,obs_rate_t
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0,0.5,1.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,0.5,1.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,0.5,1.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1,0.5,1.0
15,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0,0.5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63992,1,5) $500 - $750,519.69,1,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,0.5,1.0
63994,7,1) $0 - $100,86.46,0,1,Urban,0,Web,Mens E-Mail,0,0,0.0,1,1.0,0.5
63996,5,1) $0 - $100,38.91,0,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,0.5,1.0
63997,6,1) $0 - $100,29.99,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1,1.0,0.5


### バイアスのあるデータの集計と有意差の検定

In [161]:
# セレクションバイアスのあるデータで平均を比較
mean_comp_biased = biased_data.groupby("treatment")[["conversion", "spend"]].mean()
mean_comp_biased

Unnamed: 0_level_0,conversion,spend
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.005,0.622
1,0.014,1.535


In [162]:
print("[バイアスのあるデータの場合]")

# conversion rateの差
diff_conv_biased = mean_comp_biased.iloc[1, 0] - mean_comp_biased.iloc[0, 0]
print(f"conversion rateの差：{diff_conv_biased:.3g}")

# spendの差
diff_spend_biased = mean_comp_biased.iloc[1, 1] - mean_comp_biased.iloc[0, 1]
print(f"spendの差：{diff_spend_biased:.3g}")

[バイアスのあるデータの場合]
conversion rateの差：0.00838
spendの差：0.913


In [163]:
# biased_dataから各グループの売上金額データを抽出
mens_mail_biased = biased_data[biased_data["treatment"] == 1].spend
no_mail_biased = biased_data[biased_data["treatment"] == 0].spend

# 有意差検定
stats.ttest_ind(mens_mail_biased, no_mail_biased, equal_var = True)

Ttest_indResult(statistic=5.175763285567286, pvalue=2.2835475063398481e-07)