In [1]:
import os, importlib, sys, time

In [2]:
import numpy as np
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow
from tqdm import tqdm

In [3]:
import KECENI

# load data

In [4]:
# Directory (relative path) that the feather input files are read from.
data_dir = 'data'

In [5]:
# Index of the simulated feature file to load; it is formatted with '%.3d',
# so 1 selects 'feature_001.feather'.
i_sim = 1

In [6]:
# Edge list of the network (one row per `row`/`col` pair).
data_network = pd.read_feather('%s/network.feather' % (data_dir,))
# Latent table; in the visible part of the notebook only its row count is used (as n_node).
data_latent = pd.read_feather('%s/latent.feather' % (data_dir,))
# Per-node feature table for simulation number `i_sim` (zero-padded to three digits).
data_feature = pd.read_feather('%s/feature_%.3d.feather' % (data_dir, i_sim))

In [7]:
# Display the edge list. Its `row` and `col` columns are later shifted by -1
# and used as indices into the adjacency matrix.
data_network

Unnamed: 0,row,col
0,1,5
1,1,17
2,2,3
3,2,9
4,3,2
...,...,...
3621,999,837
3622,999,930
3623,999,989
3624,999,993


In [8]:
# Display the per-node feature table (columns X1, X2, X3, p, T, m, Y in the rendered output).
data_feature

Unnamed: 0,X1,X2,X3,p,T,m,Y
0,-0.447027,-0.705126,0.266350,0.391050,1,0.688503,0.692363
1,1.219795,0.637921,1.272363,0.827075,0,-7.103306,-6.488132
2,0.790879,0.837265,0.608502,0.753678,1,-3.974438,-2.137433
3,-1.079978,-1.804343,-2.574626,0.061256,0,8.294056,9.525241
4,-0.437606,0.003347,0.423150,0.498611,0,-2.155245,-3.193287
...,...,...,...,...,...,...,...
995,0.566763,-0.501376,0.086047,0.518920,0,3.092675,4.496140
996,-1.454455,1.275978,1.752639,0.687204,1,-2.954723,-5.044276
997,-0.740551,-0.501887,1.173634,0.491400,0,-3.518165,-3.671195
998,0.143803,1.283810,0.483779,0.722259,0,-3.085424,-3.713241


In [9]:
# Node count, taken as the number of rows in the latent table.
n_node = len(data_latent)
n_node

1000

In [10]:
# Number of rows in the edge list.
# NOTE(review): the listing contains both (2, 3) and (3, 2), so each undirected
# edge presumably appears twice in this count — confirm.
len(data_network)

3626

In [33]:
# Dense boolean adjacency matrix of shape (n_node, n_node).
# The edge list's `row`/`col` ids are shifted down by one to become 0-based
# matrix indices; the matrix is assembled sparsely and then densified.
Adj = sparse.csr_matrix(
    (
        np.ones(len(data_network), dtype=bool),
        (
            data_network['row'].to_numpy() - 1,
            data_network['col'].to_numpy() - 1,
        ),
    ),
    shape=(n_node, n_node),
).toarray()

In [34]:
# Wrap the dense adjacency matrix in KECENI's graph object; it is queried
# below through G.N1(...) and handed to KECENI.Data.
G = KECENI.Graph(Adj)

In [35]:
# Extract the model inputs by column *name* rather than by position, so that a
# reordered or extended feature file cannot silently feed the wrong column
# (e.g. `m`, which sits directly next to `Y`) into the estimators below.
Ys = data_feature['Y'].values                   # outcome, one value per row
Ts = data_feature['T'].values                   # treatment indicator (0/1 in the displayed rows)
Xs = data_feature[['X1', 'X2', 'X3']].values    # covariates, shape (n_rows, 3)

In [36]:
# Bundle the outcome (Ys), treatment (Ts) and covariate (Xs) arrays together
# with the graph G into KECENI's data container.
data = KECENI.Data(Ys, Ts, Xs, G)

# counterfactual of interest

In [37]:
# 0-based index of the node passed to G.N1(...) below.
# NOTE(review): presumably the target node of the counterfactual — confirm.
i0 = 0

In [38]:
# Two hypothetical treatment vectors over all nodes:
# every entry 0 (T0s_0) versus every entry 1 (T0s_1).
T0s_0 = np.zeros(n_node, dtype=int)
T0s_1 = np.ones(n_node, dtype=int)

In [39]:
# Both hypothetical treatment vectors restricted to the indices returned by G.N1(i0).
# NOTE(review): N1 presumably returns the 1-hop neighbourhood of i0 including i0
# itself (three entries here) — confirm against KECENI.
T0s_0[G.N1(i0)], T0s_1[G.N1(i0)]

(array([0, 0, 0]), array([1, 1, 1]))

# ground truth

In [40]:
# Reference ("true") values for the two scenarios and their difference.
# NOTE(review): hard-coded; presumably known from the simulation design — confirm.
YTR_0 = -2
YTR_1 = 2
YTR_d = YTR_1 - YTR_0

In [41]:
# Display the two reference values and their difference.
YTR_0, YTR_1, YTR_d

(-2, 2, 4)

# estimation

In [42]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [43]:
# Treatment model: logistic regression of Ts on Xs with no penalty term.
# `penalty=None` requires scikit-learn >= 1.2.
pi_fit = (
    LogisticRegression(penalty=None)
    .fit(Xs, Ts)
)

In [44]:
# Fitted probability of each row's *observed* treatment: from row i of the
# predict_proba output, take column Ts[i].
# This uses the 0/1 treatment label directly as the column index, i.e. it
# relies on pi_fit.classes_ being [0, 1].
pi_hat = np.take_along_axis(pi_fit.predict_proba(Xs), Ts[:, None], axis=1)[:, 0]

In [45]:
# Preview the regression design matrix: the covariates with the treatment
# appended as the last column.
np.column_stack([Xs, Ts])

array([[-0.44702679, -0.70512624,  0.26635026,  1.        ],
       [ 1.21979523,  0.63792082,  1.27236305,  0.        ],
       [ 0.79087941,  0.83726517,  0.60850156,  1.        ],
       ...,
       [-0.74055143, -0.50188708,  1.17363408,  0.        ],
       [ 0.14380285,  1.28380978,  0.48377914,  0.        ],
       [-0.27111388,  2.19423003,  1.5742496 ,  1.        ]])

In [46]:
# Outcome model: ordinary least squares of Ys on [Xs, Ts]
# (covariates plus the treatment as the last regressor).
mu_fit = LinearRegression().fit(np.column_stack([Xs, Ts]), Ys)

In [47]:
# Outcome-model prediction at each row's observed treatment.
mu_hat = mu_fit.predict(np.column_stack([Xs, Ts]))

In [48]:
# Outcome-model predictions with the treatment column forced to 0 and to 1
# for every row, keeping the covariates as observed.
mu0_hat = mu_fit.predict(np.column_stack([Xs, np.zeros(n_node, dtype=int)]))
mu1_hat = mu_fit.predict(np.column_stack([Xs, np.ones(n_node, dtype=int)]))

In [49]:
# Doubly-robust (AIPW-style) mean for treatment level 0: the model prediction
# mu0_hat plus, on rows actually observed with T == 0, the residual weighted by
# 1 / pi_hat. On those rows mu_hat is the T=0 prediction and pi_hat is the
# fitted probability of T=0.
(mu0_hat + (Ts == 0) * (Ys - mu_hat) / pi_hat).mean()

-0.7973376630065124

In [50]:
# Doubly-robust (AIPW-style) mean for treatment level 1: the model prediction
# mu1_hat plus, on rows actually observed with T == 1, the residual weighted by
# 1 / pi_hat. On those rows mu_hat is the T=1 prediction and pi_hat is the
# fitted probability of T=1.
(mu1_hat + (Ts == 1) * (Ys - mu_hat) / pi_hat).mean()

1.1878237088895482