## ``Causal_sampler`` Demo. 02: 

### Simulation on the semi-synthetic background-MNIST classification task

> Front-door confounding

The front-door confounding scenario assumes that multiple latent confounders affect both the cause and the effect variables, while a mediator variable intercepts all of the causal effect from the cause to the effect.

In this demonstration, we suppose $Z$ is the mediator, $Y$ is the cause and $X$ is the effect.

In [1]:
import causal_sampler.pipeline as cs_pipe
from scipy.ndimage import maximum_filter
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Global matplotlib text settings, applied in a single update call.
# NOTE(review): 'Calibri' is presumably only present where that font is
# installed — confirm the fallback is acceptable on other machines.
plt.rcParams.update({
    'font.family': 'Calibri',
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "axes.labelsize": 14,
    "legend.fontsize": 14,
    "axes.titlesize": 16,
})

# Relative path of the folder holding the semi-synthetic front-door CSV files.
semisyn_data_dir = r"../test_data/semi_synthetic_data/frontdoor_data/"

def maxPooling_imgArr(img_flatArr, kernel_size, padding = "nearest", flatten = False):
    """Down-sample a batch of flattened square images with a strided maximum filter.

    Each row of ``img_flatArr`` is reshaped to a square image, passed through
    ``scipy.ndimage.maximum_filter`` and then sub-sampled by keeping every
    ``kernel_size``-th row and column.

    Note that ``maximum_filter`` centres its window on each pixel, so the kept
    values are maxima of windows centred on pixels 0, kernel_size,
    2*kernel_size, ... rather than maxima of disjoint tiles.

    Parameters
    ----------
    img_flatArr : array-like of shape (n_imgs, n_pixels)
        One flattened image per row; ``n_pixels`` must be a perfect square.
    kernel_size : int
        Side length of the maximum-filter window and the sub-sampling stride.
    padding : str, default "nearest"
        Forwarded to ``maximum_filter`` as its ``mode`` argument.
    flatten : bool, default False
        If True, each pooled image is flattened back into a single row.

    Returns
    -------
    numpy.ndarray
        Pooled images, either stacked as 2-D images or, when ``flatten`` is
        True, as one flat row per image.

    Raises
    ------
    ValueError
        If the row length is not a perfect square.
    """
    img_flatArr = np.asarray(img_flatArr)
    n_imgs = img_flatArr.shape[0]
    n_pixels = img_flatArr.shape[1]

    # Round rather than truncate so a tiny floating-point error in the square
    # root cannot shrink the side length, then verify the row really is square.
    img_size = int(round(n_pixels ** 0.5))
    if img_size * img_size != n_pixels:
        raise ValueError(
            "maxPooling_imgArr expects flattened square images, but each row "
            f"has {n_pixels} values, which is not a perfect square."
        )

    img_arr = img_flatArr.reshape(n_imgs, img_size, img_size)
    resized_imgs = np.array([
        maximum_filter(img, size=kernel_size, mode=padding)[::kernel_size, ::kernel_size]
        for img in img_arr
    ])
    if flatten:
        resized_imgs = resized_imgs.reshape(n_imgs, -1)
    return resized_imgs

0. Define the front-door causal graph

In [None]:
# Graph definition passed to both sampler pipelines below: nodes Y, X and Z,
# directed edges Y -> Z and Z -> X, and a bidirected edge Y <-> X.
# NOTE(review): presumably "<->" marks latent confounding between Y and X and
# the leading quoted token is the graph's name — confirm against the
# causal_sampler documentation.
causal_graph = '"Front-door Causal Graph"; \
                Y; X; Z; \
                Y->Z; \
                Z->X; \
                Y<->X; '

1. Load datasets

In [2]:
# Confounded training split. Each X row is treated as a flattened square image
# and down-sampled with a 3-pixel kernel; Y and Z become single-column arrays.
X_train_conf = maxPooling_imgArr(
    np.array(pd.read_csv(semisyn_data_dir + "X_train_conf.csv")),
    kernel_size=3,
    flatten=True,
)
Y_train_conf = np.array(pd.read_csv(semisyn_data_dir + "Y_train_conf.csv")).reshape(-1, 1)
Z_train_conf = np.array(pd.read_csv(semisyn_data_dir + "Z_train_conf.csv")).reshape(-1, 1)

# Non-confounded test split, pre-processed exactly like the training images.
X_test_unconf = maxPooling_imgArr(
    np.array(pd.read_csv(semisyn_data_dir + "X_test_unconf.csv")),
    kernel_size=3,
    flatten=True,
)
Y_test_unconf = np.array(pd.read_csv(semisyn_data_dir + "Y_test_unconf.csv")).reshape(-1, 1)

# Confounded test split, pre-processed exactly like the training images.
X_test_conf = maxPooling_imgArr(
    np.array(pd.read_csv(semisyn_data_dir + "X_test_conf.csv")),
    kernel_size=3,
    flatten=True,
)
Y_test_conf = np.array(pd.read_csv(semisyn_data_dir + "Y_test_conf.csv")).reshape(-1, 1)

2. Assign the key-value pair for each variable and prepare essential parameters

In [None]:
# Number of samples requested from each resampling call below.
n_samples = 5000

# Per-variable bin specification handed to the samplers.
# NOTE(review): presumably [0] tells the histogram estimator to treat the
# variable as discrete — confirm against the causal_sampler documentation.
n_bins = {var_name: [0] for var_name in ("Y'", "Z")}

# Confounded training arrays keyed by variable name. The cause is stored under
# the key "Y'" while the samplers are given cause_var_name="Y".
# NOTE(review): presumably the prime marks the observed (pre-intervention)
# cause — confirm against the causal_sampler documentation.
conf_train_data = dict([
    ("X", X_train_conf),
    ("Y'", Y_train_conf),
    ("Z", Z_train_conf),
])

3. Initialize a CW-GMM based deconfounding pipeline and fit a deconfounded KNN

In [4]:
# Build the CW-GMM pipeline on the confounded training data, intervening on
# every distinct label value found in Y_train_conf.
cwgmm_flow = cs_pipe.CausalGMMSampler(
    causal_graph=causal_graph,
    cause_var_name="Y",
    effect_var_name="X",
    intv_values=np.unique(Y_train_conf),
    data_dict=conf_train_data,
    est_method="histogram",
    n_bins=n_bins,
)

# Fitting options collected in one place so they are easy to tune.
# NOTE(review): random_seed=None presumably leaves the fit unseeded, so results
# will differ between runs — pass an integer if reproducibility matters.
cwgmm_fit_options = dict(
    comp_k=1000,
    max_iter=500,
    tol=1e-3,
    init_method="kmeans++",
    cov_type="diag",
    cov_reg=1e-6,
    min_variance_value=1e-6,
    random_seed=None,
    weight_kernel=None,
    verbose=2,
    return_model=True,
)
cwgmm_model = cwgmm_flow.fit(**cwgmm_fit_options)

CW-GMMs fitting:   0%|          | 0/2 [00:00<?, ?model/s]

EM iter:   0%|          | 0/500 [00:00<?, ?it/s]

EM iter:   0%|          | 0/500 [00:00<?, ?it/s]

In [5]:
# Draw a deconfounded sample from the fitted CW-GMM pipeline.
# NOTE(review): random_seed=None presumably means the draw is not reproducible.
cwgmm_resample_options = dict(
    n_samples=n_samples,
    shuffle=True,
    return_samples=True,
    random_seed=None,
)
deconf_cwgmm_X, deconf_cwgmm_Y = cwgmm_flow.resample(**cwgmm_resample_options)

# Train a 3-nearest-neighbour classifier through the pipeline.
cwgmm_knn = KNeighborsClassifier(n_neighbors=3)
deconf_gmm_clf = cwgmm_flow.fit_deconf_model(ml_model=cwgmm_knn)

> Optional: You can save the fitted CW-GMM model easily for later resampling use.

In [6]:
# Persist the fitted model; the recorded output shows that save() writes
# "frontdoor_cwgmm_model.pkl" in the working directory.
cwgmm_model.save("frontdoor_cwgmm_model")

# Reload it to demonstrate the round trip.
# NOTE: only unpickle files you created yourself — pickle.load can execute
# arbitrary code from an untrusted file.
with open("frontdoor_cwgmm_model.pkl", "rb") as model_file:
    cwgmm_model_loaded = pickle.load(model_file)

# Last expression of the cell: display the metadata of the reloaded model.
cwgmm_model_loaded.model_meta

Model saved successfully at c:\Users\jxm1417\Documents\experiment_code\causal_sampler\package\v0.0.1\Tutorial\frontdoor_cwgmm_model.pkl.


{'model_0': {'base_model': <causal_sampler.gmmSampler.WeightedGMM at 0x2647c1c63d0>,
  'intv_value': 1,
  'hyperparams': {'K': 1000,
   'cov_type': 'diag',
   'cov_reg': 1e-06,
   'min_variance_value': 1e-06,
   'max_iter': 500,
   'tol': 0.001,
   'init_method': 'kmeans++',
   'user_assigned_mus': None}},
 'model_1': {'base_model': <causal_sampler.gmmSampler.WeightedGMM at 0x2647c1c6940>,
  'intv_value': 2,
  'hyperparams': {'K': 1000,
   'cov_type': 'diag',
   'cov_reg': 1e-06,
   'min_variance_value': 1e-06,
   'max_iter': 500,
   'tol': 0.001,
   'init_method': 'kmeans++',
   'user_assigned_mus': None}}}

4. Initialize a causal bootstrapping based deconfounding pipeline and fit a deconfounded KNN

In [7]:
# Build the causal-bootstrapping pipeline with the same graph, variables and
# density-estimation settings used for the CW-GMM pipeline.
cb_flow = cs_pipe.CausalBootstrapSampler(
    causal_graph=causal_graph,
    cause_var_name="Y",
    effect_var_name="X",
    intv_values=np.unique(Y_train_conf),
    data_dict=conf_train_data,
    est_method="histogram",
    n_bins=n_bins,
)

# Resampling options.
# NOTE(review): random_seed=None presumably means the draw is not reproducible.
cb_resample_options = dict(
    n_samples=n_samples,
    kernel=None,
    cb_mode="fast",
    shuffle=True,
    return_samples=True,
    random_seed=None,
    verbose=1,
)
deconf_cb_X, deconf_cb_Y = cb_flow.resample(**cb_resample_options)

# Train a 3-nearest-neighbour classifier through the pipeline.
cb_knn = KNeighborsClassifier(n_neighbors=3)
deconf_cb_clf = cb_flow.fit_deconf_model(ml_model=cb_knn)

CB Resampling:   0%|          | 0/2 [00:00<?, ?it/s]

5. Train another two KNNs using the original confounded dataset and the non-confounded (test) dataset.

In [8]:
# Baseline 1: KNN fitted on the non-confounded split.
# NOTE(review): this model is fitted on X_test_unconf / Y_test_unconf, so any
# score it obtains on that same split is an in-sample score, not a held-out one.
nonconf_clf = KNeighborsClassifier(n_neighbors=3).fit(
    X_test_unconf, Y_test_unconf.ravel()
)

# Baseline 2: KNN fitted directly on the confounded training data.
conf_clf = KNeighborsClassifier(n_neighbors=3).fit(
    X_train_conf, Y_train_conf.ravel()
)

6. Compare their performance on the non-confounded and confounded test sets

In [9]:
# Score all four classifiers on the non-confounded test split and print one
# classification report per model, separated by a dashed rule.
print("Test on the non-confounded test set:")

y_pred_gmm_deconf_unconf = deconf_gmm_clf.predict(X_test_unconf)
print("Report of CW-GMM based deconfounded model:")
print(classification_report(Y_test_unconf, y_pred_gmm_deconf_unconf, digits=4))
print("-"*20)
y_pred_cb_deconf_unconf = deconf_cb_clf.predict(X_test_unconf)
print("Report of CB-based deconfounded model:")
print(classification_report(Y_test_unconf, y_pred_cb_deconf_unconf, digits=4))
print("-"*20)
# NOTE(review): if nonconf_clf was fitted on this same split, the report below
# is an in-sample score — confirm before comparing it with the other models.
y_pred_unconf_unconf = nonconf_clf.predict(X_test_unconf)
print("Report of non-confounded model:")
print(classification_report(Y_test_unconf, y_pred_unconf_unconf, digits=4))
print("-"*20)
y_pred_conf_unconf = conf_clf.predict(X_test_unconf)
# Label spelling fixed ("confonded" -> "confounded").
print("Report of confounded model:")
print(classification_report(Y_test_unconf, y_pred_conf_unconf, digits=4))

Test on the non-confounded test set:
Report of CW-GMM based deconfounded model:
              precision    recall  f1-score   support

           1     0.9366    0.9279    0.9322       430
           2     0.9402    0.9475    0.9438       514

    accuracy                         0.9386       944
   macro avg     0.9384    0.9377    0.9380       944
weighted avg     0.9385    0.9386    0.9385       944

--------------------
Report of CB-based deconfounded model:
              precision    recall  f1-score   support

           1     0.9340    0.9209    0.9274       430
           2     0.9346    0.9455    0.9400       514

    accuracy                         0.9343       944
   macro avg     0.9343    0.9332    0.9337       944
weighted avg     0.9343    0.9343    0.9343       944

--------------------
Report of non-confounded model:
              precision    recall  f1-score   support

           1     0.9423    0.9116    0.9267       430
           2     0.9280    0.9533    0.9405 

In [None]:
# Score all four classifiers on the confounded test split. Each title and its
# report are emitted by one print call; sep="\n" puts them on separate lines.
print("Test on the confounded test set:")

y_pred_gmm_deconf_conf = deconf_gmm_clf.predict(X_test_conf)
print(
    "Report of deconfounded model using mechanism learning:",
    classification_report(Y_test_conf, y_pred_gmm_deconf_conf, digits=4),
    sep="\n",
)
print("-" * 20)

y_pred_cb_deconf_conf = deconf_cb_clf.predict(X_test_conf)
print(
    "Report of deconfounded model using CB-based method:",
    classification_report(Y_test_conf, y_pred_cb_deconf_conf, digits=4),
    sep="\n",
)
print("-" * 20)

y_pred_unconf_conf = nonconf_clf.predict(X_test_conf)
print(
    "Report of non-confounded model:",
    classification_report(Y_test_conf, y_pred_unconf_conf, digits=4),
    sep="\n",
)
print("-" * 20)

y_pred_conf_conf = conf_clf.predict(X_test_conf)
print(
    "Report of confounded model:",
    classification_report(Y_test_conf, y_pred_conf_conf, digits=4),
    sep="\n",
)

Test on the confounded test set:
Report of deconfounded model using mechanism learning:
              precision    recall  f1-score   support

           1     0.9565    0.9429    0.9496       490
           2     0.9393    0.9537    0.9464       454

    accuracy                         0.9481       944
   macro avg     0.9479    0.9483    0.9480       944
weighted avg     0.9482    0.9481    0.9481       944

--------------------
Report of deconfounded model using CB-based method:
              precision    recall  f1-score   support

           1     0.9451    0.9490    0.9470       490
           2     0.9447    0.9405    0.9426       454

    accuracy                         0.9449       944
   macro avg     0.9449    0.9448    0.9448       944
weighted avg     0.9449    0.9449    0.9449       944

--------------------
Report of non-confounded model:
              precision    recall  f1-score   support

           1     0.9521    0.9327    0.9423       490
           2     0.9289