In [1]:
from dowhy import CausalModel
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [None]:
# Select the font family for all figures (presumably chosen so the Hangul
# axis labels and titles used below render correctly — confirm the font is installed).
plt.rc("font", family="서울한강 장체L")

# Read the full dataset and show its dimensions plus the first rows.
river = pd.read_csv("data/all.csv")
print(river.shape)
river.head()

In [None]:
def month_to_season(x):
    """Map a month number to a season name.

    Args:
        x: Month number; 3-5 map to "spring", 6-8 to "summer", 9-11 to "autumn".

    Returns:
        The season name as a string. Every value outside 3-11 (including
        12, 1, 2 and any unexpected input) maps to "winter".
    """
    season_months = (
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
        ("autumn", (9, 10, 11)),
    )
    for season, months in season_months:
        if x in months:
            return season
    # Fallback: anything that is not a spring/summer/autumn month.
    return "winter"

def weather(x):
    """Translate a numeric weather code into a label.

    Args:
        x: Weather code; 1 is "clear", 2 is "cloudy", 3 is "light_rainy".

    Returns:
        The label as a string. Any other value maps to "heavy_rainy".
    """
    code_labels = (
        (1, "clear"),
        (2, "cloudy"),
        (3, "light_rainy"),
    )
    for code, label in code_labels:
        if x == code:
            return label
    # Fallback for every code other than 1, 2 or 3.
    return "heavy_rainy"

In [None]:
# 1. Create one-hot (dummy) columns for the two categorical columns and
#    append them to the right of the existing frame.
land_dummies, restore_dummies = (
    pd.get_dummies(river[column]) for column in ("부동산더미", "복원하천더미")
)
river = pd.concat([river, land_dummies, restore_dummies], axis="columns")

# 2. Remove columns that are not needed.
river = river.drop(columns=["casual", "registered"])

In [None]:

# Bar charts of four variables grouped by station (관측소) on a 2x2 grid.
fig, axes = plt.subplots(2, 2)
(ax1, ax2), (ax3, ax4) = axes
fig.set_size_inches(10, 10)

bar_panels = ((ax1, "미세먼지"), (ax2, "교통량"), (ax3, "토지"), (ax4, "녹지"))
for panel_ax, column in bar_panels:
    sns.barplot(x="관측소", y=column, data=river, ax=panel_ax)

# Titles are applied after all panels are drawn; the last call remains the
# final expression of the cell.
ax1.set(title="관측소별 미세먼지")
ax2.set(title="관측소별 교통량")
ax3.set(title="관측소별 토지집중도")
ax4.set(title="관측소별 녹지면적")

In [None]:
# Box plots of the same four variables per station (관측소) on a 2x2 grid.
fig, axes = plt.subplots(2, 2)
(ax1, ax2), (ax3, ax4) = axes
fig.set_size_inches(10, 10)

box_panels = ((ax1, "미세먼지"), (ax2, "교통량"), (ax3, "토지"), (ax4, "녹지"))
for panel_ax, column in box_panels:
    sns.boxplot(x="관측소", y=column, data=river, ax=panel_ax)

# Titles are applied after all panels are drawn; the last call remains the
# final expression of the cell.
ax1.set(title="관측소별 미세먼지")
ax2.set(title="관측소별 교통량")
ax3.set(title="관측소별 토지집중도")
ax4.set(title="관측소별 녹지면적")

In [None]:
# Five stacked point plots of 미세먼지 by station (관측소): one overall, then one
# per hue column.
fig, axes = plt.subplots(nrows=5)
ax1, ax2, ax3, ax4, ax5 = axes
fig.set_size_inches(18, 25)

# Arguments shared by every panel.
point_kwargs = dict(data=river, x="관측소", y="미세먼지")

sns.pointplot(ax=ax1, **point_kwargs)
sns.pointplot(hue="교통량", ax=ax2, **point_kwargs)
sns.pointplot(hue="토지이용", ax=ax3, **point_kwargs)
sns.pointplot(hue="녹지면적", ax=ax4, **point_kwargs)
sns.pointplot(hue="복원하천유무", ax=ax5, **point_kwargs)

In [None]:
# Boolean indicator columns derived from existing columns.
# Vectorised comparisons give the same bool columns as a row-wise
# `apply(lambda x: True if ... else False)` without a Python call per row.
river["is_restore"] = river["restore"] == 1          # restore flag equals 1
river["is_ready"] = river["복원하천면적"] > 0          # restored-stream area is positive
river["is_토지이용"] = river["토지종"].isin(["상업지구", "준공업지구"])  # one of the two listed land types
river["is_2017"] = river["year"] == 2017              # observation year is 2017

In [None]:
# Causal model: effect of the boolean `is_restore` flag on 미세먼지, adjusting
# for the listed common causes.
restore_confounders = "교통량+토지+녹지면적+시간더미".split("+")

restore_effect = CausalModel(
    data=river,
    treatment="is_restore",
    outcome="미세먼지",
    common_causes=restore_confounders,
)

# Identify the estimand once, then estimate it with two propensity-score methods.
restore_identified = restore_effect.identify_effect()

restore_matching = restore_effect.estimate_effect(
    restore_identified,
    method_name="backdoor.propensity_score_matching",
)
restore_stratify = restore_effect.estimate_effect(
    restore_identified,
    method_name="backdoor.propensity_score_stratification",
)

print("Causal Estimate of Matching: {}".format(restore_matching.value))
print("Causal Estimate of Stratification: {}".format(restore_stratify.value))

In [None]:
# Model with an effect modifier.
# Effect modifiers (here 하천폭) change the size of the treatment effect on the
# outcome; variables that influence both the treatment and the outcome are the
# common causes (confounders) and are listed separately in `common_causes`.
#
# The propensity-score estimators used below only support a binary treatment,
# so the treatment is the boolean flag `is_ready` (복원하천면적 > 0) rather than
# the raw area column.
# NOTE(review): this estimates the effect of having any restored-stream area;
# a dose-response effect of the area itself needs a different estimator
# (e.g. a regression-based one) — confirm which question is intended.
mass_effect = CausalModel(
    data=river,
    treatment="is_ready",
    outcome="미세먼지",
    common_causes="교통량+녹지면적+토지이용+시간더미".split("+"),
    effect_modifiers="하천폭".split("+"),
)

mass_identified = mass_effect.identify_effect()

mass_matching = mass_effect.estimate_effect(
    mass_identified,
    method_name="backdoor.propensity_score_matching",
)
mass_stratify = mass_effect.estimate_effect(
    mass_identified,
    method_name="backdoor.propensity_score_stratification",
)

print("Causal Estimate of Matching: {}".format(mass_matching.value))
print("Causal Estimate of Stratification: {}".format(mass_stratify.value))

In [None]:
def _refute_restore(estimate, method_name, **refuter_options):
    """Run one refuter of the restoration model against `estimate`.

    Args:
        estimate: Estimate object returned by `restore_effect.estimate_effect`.
        method_name: Name of the refutation method to run.
        **refuter_options: Extra keyword arguments forwarded to the refuter.

    Returns:
        Whatever `restore_effect.refute_estimate` returns for this refuter.
    """
    return restore_effect.refute_estimate(
        restore_identified, estimate, method_name=method_name, **refuter_options
    )


# --- Refutations of the propensity-score matching estimate ---
# 1. Add an unrelated (random) common cause.
restore_random = _refute_restore(restore_matching, "random_common_cause")
# 2. Replace the treatment with a placebo (permuted) treatment.
restore_placebo = _refute_restore(
    restore_matching, "placebo_treatment_refuter", placebo_type="permute"
)
# 3. Re-estimate on a subset of the data (original author's note: change the subset).
restore_subset = _refute_restore(
    restore_matching, "data_subset_refuter", subset_fraction=0.8
)

# --- The same three refutations for the stratification estimate ---
restore_random_str = _refute_restore(restore_stratify, "random_common_cause")
restore_placebo_str = _refute_restore(
    restore_stratify, "placebo_treatment_refuter", placebo_type="permute"
)
restore_subset_str = _refute_restore(
    restore_stratify, "data_subset_refuter", subset_fraction=0.8
)

In [None]:
# Print every refutation result: matching estimate first, then stratification.
refutation_results = (
    restore_random,
    restore_placebo,
    restore_subset,
    restore_random_str,
    restore_placebo_str,
    restore_subset_str,
)
for refutation in refutation_results:
    print(refutation)