# 1.4 Rによるメールマーケティングの効果の検証（Pythonによる実装）

## 1.4.1 RCTを行ったデータの準備

In [13]:
# Library imports
import pandas as pd
from scipy import stats
import random
import numpy as np

# Do not display warnings.
# NOTE: filterwarnings("ignore") silences every warning category process-wide,
# so deprecation notices raised by pandas / scipy are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [14]:
# Load the e-mail campaign dataset directly from its public URL into a DataFrame.
data_url = "http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
email_data = pd.read_csv(data_url)

# Preview the first five rows.
email_data.head(5)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [15]:
# Drop every row whose segment is "Womens E-Mail", leaving only the
# "Mens E-Mail" and "No E-Mail" rows. `.copy()` makes male_df an independent
# frame so the column assignment below does not write into a view of email_data.
male_df = email_data.loc[email_data["segment"] != "Womens E-Mail"].copy()

# Add the intervention indicator: 1 when the men's e-mail was sent, 0 otherwise.
# A vectorized comparison replaces the row-wise `apply(lambda ...)`; it yields
# the same 0/1 values without invoking a Python function once per row.
# The dtype is pinned to int64 to match what the lambda-based version produced.
male_df["treatment"] = (male_df["segment"] == "Mens E-Mail").astype("int64")
male_df

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63994,7,1) $0 - $100,86.46,0,1,Urban,0,Web,Mens E-Mail,0,0,0.0,1
63995,10,2) $100 - $200,105.54,1,0,Urban,0,Web,Mens E-Mail,0,0,0.0,1
63996,5,1) $0 - $100,38.91,0,1,Urban,1,Phone,Mens E-Mail,0,0,0.0,1
63997,6,1) $0 - $100,29.99,1,0,Urban,1,Phone,Mens E-Mail,0,0,0.0,1


## 1.4.2 RCTデータの集計と有意差検定

In [16]:
# Per treatment group (e-mail sent vs. not sent), compute the mean of
# `conversion` (purchase rate) and `spend` (average purchase amount).
outcome_columns = ["conversion", "spend"]
grouped_by_treatment = male_df.groupby("treatment")
mean_comp = grouped_by_treatment[outcome_columns].mean()
mean_comp

Unnamed: 0_level_0,conversion,spend
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.005726,0.652789
1,0.012531,1.422617


In [17]:
print("[RCTデータの場合]")

# mean_comp has one row per treatment value: position 0 is the group without
# the e-mail, position 1 the group that received it.
control_means = mean_comp.iloc[0]
treated_means = mean_comp.iloc[1]

# Difference in conversion rate (treated minus control)
diff_conv = treated_means["conversion"] - control_means["conversion"]
print(f"conversion rateの差：{diff_conv:.3g}")

# Difference in mean spend (treated minus control)
diff_spend = treated_means["spend"] - control_means["spend"]
print(f"spendの差：{diff_spend:.3g}")

[RCTデータの場合]
conversion rateの差：0.00681
spendの差：0.77


In [18]:
# Pull the spend column for each group out of male_df.
received_mail = male_df["treatment"] == 1
received_nothing = male_df["treatment"] == 0
mens_mail = male_df.loc[received_mail, "spend"]
no_mail = male_df.loc[received_nothing, "spend"]

# Two-sample t-test on mean spend, assuming equal variances in both groups.
stats.ttest_ind(mens_mail, no_mail, equal_var=True)

Ttest_indResult(statistic=5.300090294465472, pvalue=1.163200872605869e-07)

## 1.4.3 バイアスのあるデータによる効果の検証

In [19]:
# Build a deliberately biased dataset.

# A user counts as having a high purchase propensity when ANY of these holds:
# past spend above 300, fewer than 6 on `recency`, or the "Multichannel" channel.
high_history = male_df["history"] > 300
low_recency = male_df["recency"] < 6
multichannel = male_df["channel"] == "Multichannel"
conditions = high_history | low_recency | multichannel

treated = male_df["treatment"] == 1
control = male_df["treatment"] == 0

# Make the data look as if e-mails had been targeted at high-propensity users
# by randomly keeping only half of the rows in two of the four cells:
#   - high propensity AND no e-mail sent
#   - low propensity AND e-mail sent
# The other two cells are kept in full. A fixed random_state makes the
# subsampling reproducible.
biased_parts = [
    # High-propensity users
    male_df[conditions & control].sample(frac=0.5, random_state=1),
    male_df[conditions & treated],
    # Low-propensity users
    male_df[~conditions & control],
    male_df[~conditions & treated].sample(frac=0.5, random_state=1),
]

# Stack the four pieces row-wise and renumber the index from 0.
biased_data = pd.concat(biased_parts, axis=0, ignore_index=True)
biased_data

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
0,8,5) $500 - $750,572.65,1,0,Urban,1,Web,No E-Mail,0,0,0.0,0
1,5,1) $0 - $100,42.38,1,0,Urban,1,Phone,No E-Mail,1,0,0.0,0
2,1,"7) $1,000 +",3003.48,1,1,Urban,1,Phone,No E-Mail,0,0,0.0,0
3,1,5) $500 - $750,662.10,0,1,Urban,1,Web,No E-Mail,0,0,0.0,0
4,5,1) $0 - $100,44.37,0,1,Urban,0,Web,No E-Mail,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31920,12,1) $0 - $100,29.99,1,0,Surburban,1,Web,Mens E-Mail,0,0,0.0,1
31921,6,2) $100 - $200,156.37,1,0,Surburban,0,Web,Mens E-Mail,0,0,0.0,1
31922,11,1) $0 - $100,62.56,0,1,Urban,0,Phone,Mens E-Mail,0,0,0.0,1
31923,11,2) $100 - $200,149.71,1,0,Surburban,1,Phone,Mens E-Mail,0,0,0.0,1


### バイアスのあるデータの集計と有意差の検定

In [20]:
# Compare group means of `conversion` and `spend` on the selection-biased data.
biased_outcome_columns = ["conversion", "spend"]
biased_by_treatment = biased_data.groupby("treatment")
mean_comp_biased = biased_by_treatment[biased_outcome_columns].mean()
mean_comp_biased

Unnamed: 0_level_0,conversion,spend
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.00454,0.557954
1,0.013572,1.541704


In [21]:
print("[バイアスのあるデータの場合]")

# mean_comp_biased has one row per treatment value: position 0 is the group
# without the e-mail, position 1 the group that received it.
control_means_biased = mean_comp_biased.iloc[0]
treated_means_biased = mean_comp_biased.iloc[1]

# Difference in conversion rate (treated minus control)
diff_conv_biased = treated_means_biased["conversion"] - control_means_biased["conversion"]
print(f"conversion rateの差：{diff_conv_biased:.3g}")

# Difference in mean spend (treated minus control)
diff_spend_biased = treated_means_biased["spend"] - control_means_biased["spend"]
print(f"spendの差：{diff_spend_biased:.3g}")

[バイアスのあるデータの場合]
conversion rateの差：0.00903
spendの差：0.984


In [22]:
# Pull the spend column for each group out of biased_data.
received_mail_biased = biased_data["treatment"] == 1
received_nothing_biased = biased_data["treatment"] == 0
mens_mail_biased = biased_data.loc[received_mail_biased, "spend"]
no_mail_biased = biased_data.loc[received_nothing_biased, "spend"]

# Two-sample t-test on mean spend, assuming equal variances in both groups.
stats.ttest_ind(mens_mail_biased, no_mail_biased, equal_var=True)

Ttest_indResult(statistic=5.595867225527975, pvalue=2.21319841336543e-08)