## Transforming data into CSV and parents format

In [1]:
import os
import numpy as np
import sys
import random
random.seed(0)
np.random.seed(0)

import pandas as pd
sys.path.append('../')

# Names of the datasets to process. Each name is used as a sub-folder that is
# expected to contain `gen_data.npy` and `graph.npy`.
data_names = ['pm25', 'medical', 'traffic']

def matrix_graph_to_parents(matrix_graph: np.ndarray) -> dict[int, list[tuple[int, int]]]:
    """
    Convert an adjacency matrix into a dictionary of lagged parents.

    Row ``i`` of the matrix is scanned for entries equal to 1; every column
    index ``j`` found there is recorded as a parent of node ``i``. Each parent
    is stored together with a fixed lag of -1.

    Args:
        matrix_graph (np.ndarray): The adjacency matrix representing the graph.
            One dictionary entry is produced per row.

    Returns:
        dict[int, list[tuple[int, int]]]: A dictionary where keys are node
        indices and values are lists of ``(parent_index, lag)`` tuples, with
        ``lag`` always equal to -1. Nodes without parents map to an empty list.
    """
    return {
        node: [(parent, -1) for parent in np.where(matrix_graph[node] == 1)[0].tolist()]
        for node in range(matrix_graph.shape[0])
    }

SAMPLE_NUM = 480 # All the datasets have 480 samples
for data_name in data_names:
    data = np.load('./' + data_name + '/gen_data.npy')
    # Drop the first 20 time steps and keep only the first half of the last axis.
    # (Original author's note: the second half holds the residuals and, for some
    # reason, the first 20 values are random.)
    data = data[:, 20:, :data.shape[2] // 2]
    matrix_graph = np.load('./' + data_name + '/graph.npy')

    # Later cells copy per-sample files for range(SAMPLE_NUM), so a mismatch
    # would silently produce files for samples that do not exist (or miss some).
    assert data.shape[0] == SAMPLE_NUM, (
        f'{data_name}: expected {SAMPLE_NUM} samples, found {data.shape[0]}'
    )

    print(f"Data Name: {data_name}")
    print(f'Shape of Graph H: {matrix_graph.shape}')
    print(f'Shape of Time-series Data: {data.shape} (Sample_num, Time_step, Node_num)')

    os.makedirs(f'./data_{data_name}', exist_ok=True)

    # The same matrix graph is used for all samples, so build its textual
    # representation once instead of once per sample.
    node_parents_text = str(matrix_graph_to_parents(matrix_graph))

    for i in range(data.shape[0]): # Iterate over samples
        with open(f'./data_{data_name}/{i}_node_parents.txt', 'w') as f:
            f.write(node_parents_text)

        current_data = data[i, :, :]  # Select the i-th sample

        # One headerless CSV per sample: rows are time steps, columns are nodes.
        pd.DataFrame(current_data).to_csv(f'./data_{data_name}/{i}_data.csv', index=False, header=False)


Data Name: pm25
Shape of Graph H: (36, 36)
Shape of Time-series Data: (480, 20, 36) (Sample_num, Time_step, Node_num)
Data Name: medical
Shape of Graph H: (20, 20)
Shape of Time-series Data: (480, 20, 20) (Sample_num, Time_step, Node_num)
Data Name: traffic
Shape of Graph H: (20, 20)
Shape of Time-series Data: (480, 20, 20) (Sample_num, Time_step, Node_num)


## Find the groups we are going to use

In [5]:
import shutil
import os

from group_causation.groups_extraction import GeneticCausalGroupsExtractor


# One slot per dataset; each slot is filled in by extract_and_save_groups.
datasets_groups = dict.fromkeys(data_names)


def extract_and_save_groups(data_name):
    """
    Extract variable groups for one dataset and store them for every sample.

    The groups are computed from the first sample only (``0_data.csv``),
    recorded in ``datasets_groups[data_name]``, printed, written to
    ``0_groups.txt`` and then copied to ``{i}_groups.txt`` for the remaining
    ``SAMPLE_NUM - 1`` samples.

    Args:
        data_name: Name of the dataset; selects the ``./data_<data_name>`` folder.
    """
    data = pd.read_csv(f'./data_{data_name}/0_data.csv', header=None).values

    # With more than 30 variables, also consider the harmonic variance score
    # (with a small weight); otherwise use the explainability score alone.
    if data.shape[1] > 30:
        extractor_options = {'scores': ['harmonic_variance_explained', 'explainability_score'],
                             'scores_weights': [0.1, 1.0]}
    else:
        extractor_options = {'scores': ['explainability_score'],
                             'scores_weights': [1.0]}

    groups = GeneticCausalGroupsExtractor(data, **extractor_options).extract_groups()
    datasets_groups[data_name] = groups

    print(data_name, 'dataset obtained the groups:', groups)

    first_groups_file = f'./data_{data_name}/0_groups.txt'
    with open(first_groups_file, 'w') as f:
        f.write(str(groups))

    # Every other sample reuses the groups found on sample 0.
    for i in range(1, SAMPLE_NUM):
        shutil.copyfile(first_groups_file, f'./data_{data_name}/{i}_groups.txt')


for data_name in data_names:
    extract_and_save_groups(data_name)

pm25 dataset obtained the groups: [{0, 4}, {35}, {2}, {3, 11, 7}, {5, 6}, {8, 9}, {10, 26, 22}, {12, 15, 16, 19, 23}, {17, 13, 21, 25}, {1, 18, 34, 14}, {24, 20, 28}, {27, 31}, {33, 29}, {30}, {32}]
medical dataset obtained the groups: [{0, 11, 5}, {2, 3, 6, 7, 13, 18}, {4, 8, 9, 10, 19}, {1, 12, 14, 15, 16, 17}]
traffic dataset obtained the groups: [{0, 1, 3, 5, 7, 8, 9, 10, 14, 16}, {4, 6, 11, 12, 13, 17, 18, 19}, {2, 15}]


## Convert node-level parents to group-level parents

In [None]:
def find_index_with_element(groups, x):
    """Return the position of the first group that contains ``x``, or None if no group does."""
    matching_positions = (position for position, members in enumerate(groups) if x in members)
    return next(matching_positions, None)

import ast  # local to this cell: safe parsing of the saved parents dictionaries

for data_name, groups in datasets_groups.items():
    if groups is None:
        # datasets_groups starts with None placeholders; they are only replaced
        # once the group extraction has been executed for that dataset.
        raise RuntimeError(f'No groups available for dataset {data_name!r}; '
                           'run the group extraction first.')

    with open(f'./data_{data_name}/0_node_parents.txt', 'r') as f:
        # The file holds the str() of a dict of (parent, lag) tuples.
        # literal_eval only accepts Python literals, unlike eval(), which would
        # execute arbitrary code found in the file.
        node_parents = ast.literal_eval(f.read())

    # A group inherits, as parents, the groups of the parents of each of its nodes.
    group_parents = {}
    for son_group_idx, son_group in enumerate(groups):
        group_parents[son_group_idx] = []
        for son_node in son_group:
            for parent_node, lag in node_parents[son_node]:
                parent_group_idx = find_index_with_element(groups, parent_node)
                if parent_group_idx is None:
                    # Writing (None, lag) would silently corrupt the ground truth
                    # used by the benchmark, so fail loudly instead.
                    raise ValueError(f'{data_name}: parent node {parent_node!r} of node '
                                     f'{son_node!r} does not belong to any group')
                # Store the same tuple that is checked for duplicates, so the lag
                # read from the node-level file is preserved.
                group_link = (parent_group_idx, lag)
                if group_link not in group_parents[son_group_idx]:
                    group_parents[son_group_idx].append(group_link)

    with open(f'./data_{data_name}/0_parents.txt', 'w') as f:
        f.write(str(group_parents))
    # The graph and the groups are shared by all samples, so reuse the first file.
    for i in range(1, SAMPLE_NUM):
        shutil.copyfile(f'./data_{data_name}/0_parents.txt', f'./data_{data_name}/{i}_parents.txt')

In [14]:
# Quick manual check of the lookup helper.
# NOTE(review): `groups` is whatever the previous loop left bound on its last
# iteration, so the displayed value depends on cell execution order.
find_index_with_element(groups, 11)

1

## Perform the benchmark for each of the datasets

In [None]:
from matplotlib import pyplot as plt

from group_causation.benchmark import BenchmarkGroupCausalDiscovery

from group_causation.utils import static_parameters
from group_causation.group_causal_discovery import DimensionReductionGroupCausalDiscovery
from group_causation.group_causal_discovery import MicroLevelGroupCausalDiscovery
from group_causation.group_causal_discovery import HybridGroupCausalDiscovery

# Maps a benchmark label to the causal-discovery class to run. Several labels
# share a class; those are told apart by their entry in `algorithms_parameters`.
algorithms = {
    'group-embedding': HybridGroupCausalDiscovery,
    'subgroups': HybridGroupCausalDiscovery,
    'pca+pcmci': DimensionReductionGroupCausalDiscovery,
    'pca+dynotears': DimensionReductionGroupCausalDiscovery,
    'micro-level': MicroLevelGroupCausalDiscovery,
}
# Parameters per label (same keys as `algorithms`). This dictionary is handed to
# the parameters generator selected in `execute_benchmark`.
algorithms_parameters = {
    # Same class as 'pca+dynotears'; differs in the node-level algorithm (pcmci).
    'pca+pcmci': {'dimensionality_reduction': 'pca', 'node_causal_discovery_alg': 'pcmci',
                            'node_causal_discovery_params': {'min_lag': 1, 'max_lag': 3, 'pc_alpha': 0.05}},
    
    # Same class as 'pca+pcmci'; differs in the node-level algorithm (dynotears).
    'pca+dynotears': {'dimensionality_reduction': 'pca', 'node_causal_discovery_alg': 'dynotears',
                            'node_causal_discovery_params': {'max_lag': 3, 'lambda_w': 0.05, 'lambda_a': 0.05}},
    
    # No dimensionality reduction settings for this one.
    'micro-level': {'node_causal_discovery_alg': 'pcmci',
                            'node_causal_discovery_params': {'min_lag': 1, 'max_lag': 3, 'pc_alpha': 0.05}},
    
    # 'group-embedding' and 'subgroups' differ only in 'groups_division_method'.
    'group-embedding': {'dimensionality_reduction': 'pca', 
               'dimensionality_reduction_params': {'explained_variance_threshold': 0.3,
                                                   'groups_division_method': 'group_embedding'},
                'node_causal_discovery_alg': 'pcmci',
                'node_causal_discovery_params': {'min_lag': 1, 'max_lag': 3, 'pc_alpha': 0.05},
                'verbose': 0},
    
    'subgroups': {'dimensionality_reduction': 'pca', 
               'dimensionality_reduction_params': {'explained_variance_threshold': 0.3,
                                                   'groups_division_method': 'subgroups'},
                'node_causal_discovery_alg': 'pcmci',
                'node_causal_discovery_params': {'min_lag': 1, 'max_lag': 3, 'pc_alpha': 0.05},
                'verbose': 0},
}

# Left empty: the benchmark is run with generate_toy_data=False, i.e. on the
# datasets already stored on disk.
data_generation_options = {}

# Option name -> (parameters generator, extra keyword arguments for it).
benchmark_options = {
    'static_parameters': (static_parameters, {}),
}

# Key of `benchmark_options` used by `execute_benchmark`.
chosen_option = 'static_parameters'


def execute_benchmark(data_name):
    """
    Run the group causal discovery benchmark on the stored datasets of ``data_name``.

    Datasets are read from the ``data_<data_name>`` folder (no toy data is
    generated) and ``results_<data_name>`` is given as the results folder.
    The parameter combinations come from the generator registered under
    ``chosen_option`` in ``benchmark_options``.

    Args:
        data_name: Name of the dataset to benchmark.

    Returns:
        tuple: ``(results, benchmark)`` - the value returned by
        ``benchmark_causal_discovery`` and the benchmark object that produced it.
    """
    benchmark = BenchmarkGroupCausalDiscovery()

    build_parameters, extra_kwargs = benchmark_options[chosen_option]
    run_settings = dict(
        algorithms=algorithms,
        parameters_iterator=build_parameters(data_generation_options,
                                             algorithms_parameters,
                                             **extra_kwargs),
        datasets_folder=f'data_{data_name}',
        generate_toy_data=False,
        results_folder=f'results_{data_name}',
        n_executions=5,
        verbose=1,
    )

    return benchmark.benchmark_causal_discovery(**run_settings), benchmark

In [None]:
from group_causation.benchmark import BenchmarkGroupCausalDiscovery

# Figure styling: default style, LaTeX text rendering and a serif font.
plt.style.use('default')
plt.rcParams.update({'text.usetex': True, 'font.family': 'serif'})

# Run the benchmark for every dataset; `results` and `benchmark` keep the
# values of the last dataset processed.
data_names = ['pm25', 'medical', 'traffic']
for data_name in data_names:
    print('Executing benchmark of', data_name)
    results, benchmark = execute_benchmark(data_name)

# Plot graphs
data_names = ['pm25', 'medical', 'traffic']
for data_name in data_names:
    results_folder = f'results_{data_name}'
    benchmark = BenchmarkGroupCausalDiscovery()
    # benchmark.plot_particular_result(results_folder, results_folder + '/summary',
    #                                 scores=[f'{score}_summary' for score in \
    #                                                 ['shd', 'f1', 'precision', 'recall']],
    #                                 dataset_iteration_to_plot=0)

Executing benchmark of pm25

--------------------------------------------------
[34m Datasets have been loaded. [0m
[32m Executing algorithm group-embedding [0m


  0%|          | 1/203 [00:01<06:33,  1.95s/it]

predicted_parents={0: [], 1: [], 2: [(1, -1)], 3: [(1, -1)], 4: [], 5: [(1, -1)], 6: [(1, -1)], 7: [(1, -1)], 8: [(1, -1)], 9: [], 10: [(4, -1)], 11: [], 12: [(9, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  1%|          | 2/203 [00:04<07:51,  2.34s/it]

predicted_parents={0: [(2, -1)], 1: [], 2: [(1, -3)], 3: [(2, -1), (6, -1)], 4: [(6, -1)], 5: [(7, -1), (2, -1)], 6: [(10, -3)], 7: [], 8: [], 9: [(7, -1)], 10: [(10, -1)], 11: [(1, -1)], 12: []}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  1%|▏         | 3/203 [00:07<07:56,  2.38s/it]

predicted_parents={0: [(0, -1)], 1: [(12, -1)], 2: [(7, -1), (3, -1)], 3: [(6, -2), (3, -2)], 4: [(7, -1), (1, -1)], 5: [(8, -1), (0, -1)], 6: [(7, -1), (4, -1)], 7: [(4, -2), (3, -1)], 8: [(3, -1)], 9: [], 10: [(10, -1)], 11: [(0, -3), (9, -1)], 12: []}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  2%|▏         | 4/203 [00:09<08:09,  2.46s/it]

predicted_parents={0: [(10, -2), (7, -1)], 1: [], 2: [(8, -1)], 3: [(1, -1)], 4: [], 5: [(7, -1), (3, -1), (0, -2)], 6: [(10, -1), (0, -1)], 7: [(0, -1)], 8: [(9, -3)], 9: [(9, -1)], 10: [(10, -2)], 11: [(8, -1)], 12: [(4, -1), (9, -3), (4, -2)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  2%|▏         | 5/203 [00:11<07:51,  2.38s/it]

predicted_parents={0: [(11, -2), (12, -3)], 1: [(2, -1), (9, -1)], 2: [(4, -2), (9, -1)], 3: [(4, -3)], 4: [], 5: [(7, -1)], 6: [(7, -1)], 7: [(7, -1)], 8: [(8, -1)], 9: [(6, -3), (4, -3), (12, -1)], 10: [(2, -2)], 11: [(11, -1)], 12: [(12, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  3%|▎         | 6/203 [00:13<07:17,  2.22s/it]

predicted_parents={0: [(2, -2), (8, -2), (11, -1), (10, -3)], 1: [(5, -3)], 2: [(10, -1), (1, -3)], 3: [(2, -1), (11, -1)], 4: [(3, -1)], 5: [(4, -2), (3, -3)], 6: [(3, -1)], 7: [(3, -1), (9, -3)], 8: [(10, -1)], 9: [], 10: [(10, -1)], 11: [(8, -2), (9, -1)], 12: [(10, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  3%|▎         | 7/203 [00:15<06:56,  2.13s/it]

predicted_parents={0: [(0, -1)], 1: [], 2: [], 3: [(4, -1), (2, -1)], 4: [(2, -1), (9, -3)], 5: [(5, -1)], 6: [], 7: [(8, -2)], 8: [(10, -2)], 9: [], 10: [], 11: [], 12: [(12, -3)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  4%|▍         | 8/203 [00:17<07:01,  2.16s/it]

predicted_parents={0: [(8, -2), (6, -2)], 1: [], 2: [(10, -2), (1, -2)], 3: [(4, -1), (1, -1)], 4: [(4, -2)], 5: [(4, -1)], 6: [(4, -1), (9, -1)], 7: [(4, -1), (6, -2)], 8: [(4, -1), (3, -1)], 9: [(1, -1)], 10: [(10, -1)], 11: [(11, -1)], 12: [(12, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  4%|▍         | 9/203 [00:20<07:10,  2.22s/it]

predicted_parents={0: [(6, -1)], 1: [], 2: [(1, -1)], 3: [(3, -1)], 4: [(4, -1)], 5: [(11, -3)], 6: [], 7: [(10, -1), (2, -1)], 8: [(8, -1)], 9: [(5, -2)], 10: [(5, -3)], 11: [], 12: []}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  5%|▍         | 10/203 [00:22<06:42,  2.08s/it]

predicted_parents={0: [], 1: [(2, -3), (2, -1)], 2: [(2, -1)], 3: [], 4: [], 5: [], 6: [(8, -1)], 7: [(7, -1)], 8: [], 9: [(9, -1)], 10: [(2, -2)], 11: [], 12: [(2, -1), (9, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  5%|▌         | 11/203 [00:23<06:17,  1.97s/it]

predicted_parents={0: [], 1: [(6, -2)], 2: [(10, -3)], 3: [], 4: [(6, -2)], 5: [], 6: [], 7: [], 8: [(7, -1), (6, -2)], 9: [(2, -3)], 10: [(12, -3)], 11: [(7, -1), (7, -2)], 12: [(1, -1)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  6%|▌         | 12/203 [00:25<06:07,  1.92s/it]

predicted_parents={0: [(2, -1), (0, -1)], 1: [(10, -2)], 2: [(2, -1)], 3: [(5, -1)], 4: [(12, -3), (12, -2)], 5: [(5, -1)], 6: [(5, -1)], 7: [], 8: [(12, -2)], 9: [], 10: [(0, -1)], 11: [(1, -2)], 12: [(12, -3)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  6%|▋         | 13/203 [00:28<06:37,  2.09s/it]

predicted_parents={0: [(9, -1)], 1: [(9, -1)], 2: [(2, -1)], 3: [(10, -1)], 4: [(4, -1), (8, -2)], 5: [(4, -1)], 6: [(6, -1), (0, -2)], 7: [(4, -1), (5, -2), (12, -2)], 8: [(2, -1), (9, -1)], 9: [(8, -1), (9, -1)], 10: [(9, -1)], 11: [(9, -1)], 12: [(3, -2)]}
causal_dataset.parents_dict={0: [(None, -1)], 1: [(None, -1)], 2: [(None, -1)], 3: [(None, -1)], 4: [(None, -1)], 5: [(None, -1)], 6: [(None, -1)], 7: [(None, -1)], 8: [(None, -1)], 9: [(None, -1)], 10: [(None, -1)], 11: [(None, -1)], 12: [(None, -1)]}


  6%|▋         | 13/203 [00:29<07:06,  2.25s/it]Process MemTimer-14:
Traceback (most recent call last):
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/site-packages/memory_profiler.py", line 262, in run
    stop = self.pipe.poll(self.interval)
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/multiprocessing/connection.py", line 424, in _poll
    r = wait([self], timeout)
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/home/joaquin/miniconda3/envs/causal-inference/lib/python3.9/selectors.py", line 416, in select
    fd_event_list = self._selector.po

UnboundLocalError: local variable 'predicted_parents' referenced before assignment