In [1]:
from pathlib import Path
import plotly.express as px
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pandas as pd
import numpy as np
import wandb
import torch
from torch import nn
import sys
from pyparcs.graph_builder.parsers import graph_file_parser
from pyparcs.cdag.graph_objects import Graph
sys.path.append(str(Path.cwd().parent))
import yaml
from viz_utils import load_dfiv_model, predict_dfiv_model, load_dfiv_runs, load_dfiv_result

In [2]:
# Seed for the ground-truth simulation; in the visible part of this notebook it
# is only referenced from commented-out code.
TEST_SEED = 9999

# Univariate Case

In [3]:
# Entity/project pair identifying the univariate sweep; passed to load_dfiv_runs.
entity = "jasmineqy0"
project = "dfiv_fully_random_univariate"
runs_df = load_dfiv_runs(entity, project)
runs_df

Unnamed: 0,summary,config,name,id,state
0,"{'stage 2 train loss': 19.322303771972656, 'st...",{'data_configs': {'data_name': 'fully_random_i...,fancy-sweep-300,77f6o31b,finished
1,"{'stage 1 train loss': 7.9312615394592285, 'ep...",{'data_configs': {'data_name': 'fully_random_i...,cerulean-sweep-299,ki2ro8ly,finished
2,"{'_timestamp': 1684056461.9528265, 'stage 1 va...",{'data_configs': {'data_name': 'fully_random_i...,gentle-sweep-298,maqv1nvo,finished
3,"{'_step': 1988, '_runtime': 118.16577577590942...",{'data_configs': {'data_name': 'fully_random_i...,zany-sweep-297,h1z5j8ga,finished
4,"{'_wandb': {'runtime': 105}, '_timestamp': 168...",{'data_configs': {'data_name': 'fully_random_i...,soft-sweep-296,gmqpn8ey,finished
...,...,...,...,...,...
1191,{'_wandb': {'runtime': 7}},{'data_configs': {'data_name': 'fully_random_i...,comic-sweep-5,jyd03667,finished
1192,{'_wandb': {'runtime': 7}},{'data_configs': {'data_name': 'fully_random_i...,deft-sweep-4,3ztfzwtu,finished
1193,{'_wandb': {'runtime': 8}},{'data_configs': {'data_name': 'fully_random_i...,fragrant-sweep-3,hz8uc25l,finished
1194,{'_wandb': {'runtime': 7}},{'data_configs': {'data_name': 'fully_random_i...,leafy-sweep-2,h23vce61,finished


In [4]:
num_seed = 4
dfs = []

# Partition the runs by the bootstrap seed stored in each run's config
# (seeds are numbered 1..num_seed).
for seed in range(1, num_seed + 1):
    seed_mask = runs_df['config'].apply(
        lambda cfg: cfg['data_configs']['bootstrap_seed'] == seed
    )
    seed_runs = runs_df[seed_mask]
    dfs.append(seed_runs)
    print(f'loaded {len(seed_runs)} runs for seed {seed}')
                   

loaded 299 runs for seed 1
loaded 299 runs for seed 2
loaded 300 runs for seed 3
loaded 298 runs for seed 4


In [5]:
# For every seed group, find the run with the lowest test MSE.
#   min_value_mse[k] -- lowest test loss within dfs[k]
#   min_index_mse[k] -- row position of that run within dfs[k]
min_value_mse = []
min_index_mse = []

for i in range(num_seed):
    # Runs that never logged a test loss are represented as NaN.
    test_losses = (
        dfs[i]['summary']
        .apply(lambda x: x['test loss'] if 'test loss' in x else np.nan)
        .to_numpy(dtype=float)
    )
    # nanargmin ignores NaN entries but reports the position within the
    # *unfiltered* group. Dropping the NaN rows before taking argmin would
    # shift the position and make the later `.iloc` lookup on dfs[i] pick
    # the wrong run.
    min_index = int(np.nanargmin(test_losses))
    min_value = float(test_losses[min_index])
    min_value_mse.append(min_value)
    min_index_mse.append(min_index)
    print(f'min test mse for seed {i+1} with index {min_index}: {min_value_mse[-1]}')

# Seed group (0-based) holding the overall best run, and that run's position.
best_seed_idx = min_value_mse.index(min(min_value_mse))
best_run_idx = min_index_mse[best_seed_idx]

min test mse for seed 1 with index 4: 25.780092239379883
min test mse for seed 2 with index 54: 3125.436767578125
min test mse for seed 3 with index 12: 1.5912082195281982
min test mse for seed 4 with index 236: 2.4377970695495605


In [6]:
# Seed group whose best run has the lowest test MSE; seeds are 1-based while
# the list positions are 0-based.
lowest_mse = min(min_value_mse)
best_seed_idx = min_value_mse.index(lowest_mse)
best_seed = 1 + best_seed_idx
best_group_df = dfs[best_seed_idx]
print(f'best seed: {best_seed}, best run index: {min_index_mse[best_seed_idx]}')

best seed: 3, best run index: 12


In [7]:
# Keep the selected run as a one-row frame, then read its name and id.
best_run = best_group_df.iloc[[best_run_idx]]
best_run_row = best_run.iloc[0]
best_run_name = best_run_row['name']
best_run_id = best_run_row['id']
print(f'best run name: {best_run_name}, best run id: {best_run_id}')

best run name: pleasant-sweep-288, best run id: b5f9lhal


In [8]:
# "<entity>/<project>/<run id>" path of the selected run.
run_path = f'{entity}/{project}/{best_run_id}'

In [9]:
# Directory holding the simulation files that belong to the selected seed.
sim_dir = f'/home/stud/shen/test/MA/DeepFeatureIV/final_configs/dfiv_fully_random_univariate_{best_seed}/0'
config_path = f'{sim_dir}/config.yml'
var_info_path = f'{sim_dir}/var_info.yml'

# Hand both file locations to the model loader via its data-config options.
simulation_info = {'config_path': config_path, 'var_info_path': var_info_path}
dfiv_model = load_dfiv_model(
    run_path,
    data_config_ops={'simulation_info': simulation_info},
)

In [10]:
# Fetch the stored result of the selected run and unpack the entries used below.
dfiv_result = load_dfiv_result(run_path)

treatment, covariate, prediction, target, oos_loss = (
    dfiv_result[key]
    for key in ('treatment', 'covariate', 'prediction', 'target', 'oos_loss')
)

print(f'out of sample loss: {oos_loss}')

out of sample loss: 1.5912082195281982


## CATE

In [11]:
# Number of evenly spaced treatment values in the evaluation grid.
num_points = 50

In [12]:
# Mean of the covariate cf_0.
# cf_0: lognormal(mu_=0.2269316931421712,sigma_=1.897494554367658)
# For X ~ LogNormal(mu, sigma): E[X] = exp(mu + sigma^2 / 2).
# (numpy is already imported in the first cell, so it is not re-imported here.)
mu = 0.2269316931421712
sigma = 1.897494554367658

mean_covariate = np.exp(mu + 0.5 * sigma**2)

In [None]:
# Range of treatment values used for the evaluation grid.
# Lower bound: smallest observed treatment, clipped at 0.
min_treatment = np.min(treatment, axis=0)
min_treatment = 0 if min_treatment < 0 else min_treatment
# Upper bound: the median (not the maximum) of the observed treatments.
# NOTE(review): confirm that capping the grid at the median is intended.
max_treatment = np.median(treatment, axis=0)
print(f'min treatment: {min_treatment}, max treatment (median of observed): {max_treatment.item()}')

In [None]:
# Hold the covariate at its mean for every grid point and sweep the treatment
# over num_points evenly spaced values, then query the model on that grid.
test_covariate = np.tile(mean_covariate, reps=(num_points, 1))
test_treatment = np.linspace(start=min_treatment, stop=max_treatment, num=num_points)
estimated_structural = predict_dfiv_model(dfiv_model, test_treatment, test_covariate)

In [None]:
# Read the variable metadata file that accompanies the simulation config.
with open(var_info_path) as var_info_file:
    var_info = yaml.safe_load(var_info_file)

# Node names: fixed treatment name, first observed 'cf' entry, first observed 'ot' entry.
observed_vars = var_info['observed']
ts_name = 'ts_0'
cf_name = observed_vars['cf'][0]
# directly retrieving ot is the structural, ot + noise_ot is the outcome
ot_name = observed_vars['ot'][0]

In [None]:
# Ground-truth curve evaluated on the treatment grid.
# NOTE(review): `expon` is imported here but not used in the visible notebook.
from scipy.stats import expon
# Rate built from the covariate mean and the treatment grid: intercept, two
# main effects and a covariate-by-treatment interaction term.
lamb = 1.6129142609724316 + 0.09 * mean_covariate + 1.65 * test_treatment + 1.63 * mean_covariate * test_treatment
# NOTE(review): `scale` is never read in the visible notebook.
scale = 1 / lamb
# NOTE(review): this overwrites the rate computed above with a constant, so the
# covariate/treatment-dependent value has no effect on `true_structural`.
# Confirm which of the two definitions is the intended ground truth.
lamb = 0.5
# lamb * exp(-lamb * t) evaluated at every grid point t.
true_structural = lamb * np.exp(-lamb * test_treatment)

In [None]:
# nodes, edges = graph_file_parser(config_path)
# g = Graph(nodes=nodes, edges=edges)
# interventions = [{ts_name: test_treatment[i], cf_name: test_covariate[i]} for i in range(num_points)]

# # get the initial random state
# start_random_state = np.random.get_state()
# np.random.seed(TEST_SEED)

# true_structural = []
# for intervention in interventions:
#     true_structural.append(g.do(size=1, interventions=intervention)[ot_name].to_numpy())
# true_structural = np.vstack(true_structural).squeeze()

# # reset the random state
# np.random.set_state(start_random_state)

In [None]:
# Squared differences between the two curves, each divided by num_points and
# then summed.
# NOTE(review): assumes true_structural and estimated_structural have the same
# shape; if one were (n, 1) and the other (n,), the subtraction would broadcast
# to (n, n) and inflate the sum -- TODO confirm the shape returned by
# predict_dfiv_model.
np.sum((true_structural - estimated_structural) ** 2 /  num_points)

In [None]:
# Compare the ground-truth curve with the DFIV estimate over the treatment grid.
fig, ax = plt.subplots()
# Drop singleton dimensions so the grid can be used directly as x-coordinates.
test_treatment = test_treatment.squeeze()

ax.scatter(test_treatment, true_structural, color='red', label='Truth')
ax.scatter(test_treatment, estimated_structural, color='blue', label='DFIV')
ax.legend()

# The x-axis shows the treatment grid, so it is labelled as such.
ax.set_xlabel('Treatment')
ax.set_ylabel('CATE')
ax.set_ylim([0, 5])
# plt.savefig(f'{img_dir}/CATE hst2.pdf')
plt.show()

# Increased Influence of Interaction

## 5x

In [None]:
# Entity/project pair identifying the 5x-interaction sweep; passed to load_dfiv_runs.
entity = "jasmineqy0"
project = "dfiv_fully_random_interaction_5x"
runs_df = load_dfiv_runs(entity, project)
runs_df

In [None]:
num_seed = 4
dfs = []

# Partition the runs by the bootstrap seed stored in each run's config
# (seeds are numbered 1..num_seed).
for seed in range(1, num_seed + 1):
    seed_mask = runs_df['config'].apply(
        lambda cfg: cfg['data_configs']['bootstrap_seed'] == seed
    )
    seed_runs = runs_df[seed_mask]
    dfs.append(seed_runs)
    print(f'loaded {len(seed_runs)} runs for seed {seed}')
                   

In [None]:
# For every seed group, find the run with the lowest test MSE.
#   min_value_mse[k] -- lowest test loss within dfs[k]
#   min_index_mse[k] -- row position of that run within dfs[k]
min_value_mse = []
min_index_mse = []

for i in range(num_seed):
    # Runs that never logged a test loss are represented as NaN.
    test_losses = (
        dfs[i]['summary']
        .apply(lambda x: x['test loss'] if 'test loss' in x else np.nan)
        .to_numpy(dtype=float)
    )
    # nanargmin ignores NaN entries but reports the position within the
    # *unfiltered* group. Dropping the NaN rows before taking argmin would
    # shift the position and make the later `.iloc` lookup on dfs[i] pick
    # the wrong run.
    min_index = int(np.nanargmin(test_losses))
    min_value = float(test_losses[min_index])
    min_value_mse.append(min_value)
    min_index_mse.append(min_index)
    print(f'min test mse for seed {i+1} with index {min_index}: {min_value_mse[-1]}')

# Seed group (0-based) holding the overall best run, and that run's position.
best_seed_idx = min_value_mse.index(min(min_value_mse))
best_run_idx = min_index_mse[best_seed_idx]

In [None]:
# Seed group whose best run has the lowest test MSE; seeds are 1-based while
# the list positions are 0-based.
lowest_mse = min(min_value_mse)
best_seed_idx = min_value_mse.index(lowest_mse)
best_seed = 1 + best_seed_idx
best_group_df = dfs[best_seed_idx]
print(f'best seed: {best_seed}, best run index: {min_index_mse[best_seed_idx]}')

In [None]:
# Keep the selected run as a one-row frame, then read its name and id.
best_run = best_group_df.iloc[[best_run_idx]]
best_run_row = best_run.iloc[0]
best_run_name = best_run_row['name']
best_run_id = best_run_row['id']
print(f'best run name: {best_run_name}, best run id: {best_run_id}')

In [None]:
# "<entity>/<project>/<run id>" path of the selected run.
run_path = f'{entity}/{project}/{best_run_id}'

## 10x

In [None]:
# Entity/project pair identifying the 10x-interaction sweep; passed to load_dfiv_runs.
entity = "jasmineqy0"
project = "dfiv_fully_random_interaction_10x"
runs_df = load_dfiv_runs(entity, project)
runs_df

In [None]:
num_seed = 4
dfs = []

# Partition the runs by the bootstrap seed stored in each run's config
# (seeds are numbered 1..num_seed).
for seed in range(1, num_seed + 1):
    seed_mask = runs_df['config'].apply(
        lambda cfg: cfg['data_configs']['bootstrap_seed'] == seed
    )
    seed_runs = runs_df[seed_mask]
    dfs.append(seed_runs)
    print(f'loaded {len(seed_runs)} runs for seed {seed}')
                   

In [None]:
# For every seed group, find the run with the lowest test MSE.
#   min_value_mse[k] -- lowest test loss within dfs[k]
#   min_index_mse[k] -- row position of that run within dfs[k]
min_value_mse = []
min_index_mse = []

for i in range(num_seed):
    # Runs that never logged a test loss are represented as NaN.
    test_losses = (
        dfs[i]['summary']
        .apply(lambda x: x['test loss'] if 'test loss' in x else np.nan)
        .to_numpy(dtype=float)
    )
    # nanargmin ignores NaN entries but reports the position within the
    # *unfiltered* group. Dropping the NaN rows before taking argmin would
    # shift the position and make the later `.iloc` lookup on dfs[i] pick
    # the wrong run.
    min_index = int(np.nanargmin(test_losses))
    min_value = float(test_losses[min_index])
    min_value_mse.append(min_value)
    min_index_mse.append(min_index)
    print(f'min test mse for seed {i+1} with index {min_index}: {min_value_mse[-1]}')

# Seed group (0-based) holding the overall best run, and that run's position.
best_seed_idx = min_value_mse.index(min(min_value_mse))
best_run_idx = min_index_mse[best_seed_idx]

In [None]:
# Seed group whose best run has the lowest test MSE; seeds are 1-based while
# the list positions are 0-based.
lowest_mse = min(min_value_mse)
best_seed_idx = min_value_mse.index(lowest_mse)
best_seed = 1 + best_seed_idx
best_group_df = dfs[best_seed_idx]
print(f'best seed: {best_seed}, best run index: {min_index_mse[best_seed_idx]}')

In [None]:
# Keep the selected run as a one-row frame, then read its name and id.
best_run = best_group_df.iloc[[best_run_idx]]
best_run_row = best_run.iloc[0]
best_run_name = best_run_row['name']
best_run_id = best_run_row['id']
print(f'best run name: {best_run_name}, best run id: {best_run_id}')

In [None]:
# "<entity>/<project>/<run id>" path of the selected run.
run_path = f'{entity}/{project}/{best_run_id}'

# Multivariate Case