In [1]:
import ast
import os

import pandas as pd

data_name = 'UltraProcessed_Food'

# Every later cell reads from / writes to this folder; create it up front so
# the export below does not fail when the folder is missing (fresh checkout).
os.makedirs(f'data_{data_name}', exist_ok=True)

# The first CSV column is loaded as the index; the export drops both the index
# and the header row, leaving only the remaining column values.
data = pd.read_csv(f'{data_name}/DatasetUF.csv', index_col=0)
data.to_csv(f'data_{data_name}/0_data.csv', index=False, header=False)


def adjacency_to_parents_dict(filename):
    """Read a tab-separated adjacency matrix and return each node's parents.

    Row ``i`` of the file describes node ``i``: a ``1`` in column ``j`` adds
    ``(j, -1)`` to the parents of ``i``. Every node is also given itself as a
    parent, so ``(i, -1)`` is always the first entry of its list. The second
    tuple element is always ``-1``; the notebook later unpacks it as the lag
    when aggregating node parents into group parents.

    Blank lines (e.g. a trailing empty line) are ignored, and a ``1`` on the
    diagonal does not add the self-parent a second time.

    Args:
        filename: Path of the text file holding the matrix, one row per line
            with tab-separated integer entries.

    Returns:
        dict mapping the row index to a list of ``(parent_index, -1)`` tuples.

    Raises:
        ValueError: If a field of a non-blank line is not an integer.
    """
    with open(filename, 'r') as f:
        # Drop blank lines before enumerating so row indices stay contiguous
        # and an empty line never reaches int().
        rows = [line.strip() for line in f if line.strip()]

    parents_dict = {}
    for child_index, row in enumerate(rows):
        values = [int(field) for field in row.split('\t')]
        # The self-parent is unconditional, so skip the diagonal below to
        # avoid listing it twice.
        node_parents = [(child_index, -1)]
        for parent_index, value in enumerate(values):
            if value == 1 and parent_index != child_index:
                node_parents.append((parent_index, -1))
        parents_dict[child_index] = node_parents
    return parents_dict

# Derive the node-level parents from the ground-truth matrix and store their
# text representation next to the exported data.
parents = adjacency_to_parents_dict(f'{data_name}/UFGroundTruth.txt')
node_parents_path = f'data_{data_name}/0_node_parents.txt'
with open(node_parents_path, 'w') as out_file:
    out_file.write(f'{parents}')

In [2]:
from group_causation.groups_extraction import GeneticCausalGroupsExtractor


def extract_and_save_groups(data_name):
    """Extract variable groups for a dataset, print them and save them to disk.

    Loads ``./data_{data_name}/0_data.csv`` (read without a header row) as an
    array, runs ``GeneticCausalGroupsExtractor`` scored only by
    ``'explainability_score'`` with weight 1.0, prints the resulting groups
    and writes their ``str()`` form to ``./data_{data_name}/0_groups.txt``.

    Args:
        data_name: Dataset name used to build the ``data_{data_name}`` folder.
    """
    values = pd.read_csv(f'./data_{data_name}/0_data.csv', header=None).values

    # An alternative scoring was tried and left disabled by the author:
    # scores=['harmonic_variance_explained', 'explainability_score'] with
    # scores_weights=[0.01, 1.0].
    extractor = GeneticCausalGroupsExtractor(
        values,
        scores=['explainability_score'],
        scores_weights=[1.0],
    )
    groups = extractor.extract_groups()

    print(data_name, 'dataset obtained the groups:', groups)

    groups_path = f'./data_{data_name}/0_groups.txt'
    with open(groups_path, 'w') as out_file:
        out_file.write(str(groups))
    

# extract_and_save_groups(data_name)

## Convert node-level parents to group-level parents

In [3]:
def find_index_with_element(groups, x):
    """Return the position of the first group in ``groups`` that contains ``x``.

    Args:
        groups: Iterable of containers supporting the ``in`` operator.
        x: Element to look for.

    Returns:
        The index of the first matching group, or ``None`` when no group
        contains ``x``.
    """
    matching_positions = (
        position for position, members in enumerate(groups) if x in members
    )
    return next(matching_positions, None)

# Both files hold the str() of Python containers written earlier in this
# notebook. Parse them as literals rather than executing their contents with
# eval(); literal_eval raises ValueError/SyntaxError on anything that is not a
# plain literal.
with open(f'./data_{data_name}/0_groups.txt', 'r') as f:
    groups = ast.literal_eval(f.read())
with open(f'./data_{data_name}/0_node_parents.txt', 'r') as f:
    node_parents = ast.literal_eval(f.read())

# Lift node-level links to group level: a group gets the link (P, lag) when
# one of its nodes has, at that lag, a parent node belonging to group P.
group_parents = {}
for son_group_idx, son_group in enumerate(groups):
    group_parents[son_group_idx] = []
    for son_node in son_group:
        for parent_node, lag in node_parents[son_node]:
            parent_group_idx = find_index_with_element(groups, parent_node)
            if parent_group_idx is None:
                print(f'Error: parent node {parent_node} not found in any group')
                continue
            # Store the same tuple that is used for the duplicate check, so the
            # node's lag is preserved instead of being replaced by a fixed -1.
            group_link = (parent_group_idx, lag)
            if group_link not in group_parents[son_group_idx]:
                group_parents[son_group_idx].append(group_link)

with open(f'./data_{data_name}/0_parents.txt', 'w') as f:
    f.write(str(group_parents))

## Perform the Benchmark

In [4]:
from matplotlib import pyplot as plt

from group_causation.benchmark import BenchmarkGroupCausalDiscovery

from group_causation.utils import static_parameters
from group_causation.group_causal_discovery import DimensionReductionGroupCausalDiscovery
from group_causation.group_causal_discovery import MicroLevelGroupCausalDiscovery
from group_causation.group_causal_discovery import HybridGroupCausalDiscovery

# Benchmark label -> group causal-discovery class evaluated under that label.
algorithms = {
    'group-embedding': HybridGroupCausalDiscovery,
    'subgroups': HybridGroupCausalDiscovery,
    'pca+pcmci': DimensionReductionGroupCausalDiscovery,
    'pca+dynotears': DimensionReductionGroupCausalDiscovery,
    'micro-level': MicroLevelGroupCausalDiscovery,
}


def _pcmci_params():
    """Return a fresh copy of the PCMCI settings shared by the PCMCI entries."""
    return {'min_lag': 1, 'max_lag': 3, 'pc_alpha': 0.05}


def _hybrid_params(groups_division_method):
    """Return the hybrid-algorithm settings; only the division method varies."""
    return {
        'dimensionality_reduction': 'pca',
        'dimensionality_reduction_params': {
            'explained_variance_threshold': 0.7,
            'groups_division_method': groups_division_method,
        },
        'node_causal_discovery_alg': 'pcmci',
        'node_causal_discovery_params': _pcmci_params(),
        'verbose': 0,
    }


# Benchmark label -> keyword settings for the class registered in `algorithms`.
algorithms_parameters = {
    'pca+pcmci': {
        'dimensionality_reduction': 'pca',
        'node_causal_discovery_alg': 'pcmci',
        'node_causal_discovery_params': _pcmci_params(),
    },
    'pca+dynotears': {
        'dimensionality_reduction': 'pca',
        'node_causal_discovery_alg': 'dynotears',
        'node_causal_discovery_params': {
            'min_lag': 1, 'max_lag': 3, 'lambda_w': 0.05, 'lambda_a': 0.05,
        },
    },
    'micro-level': {
        'node_causal_discovery_alg': 'pcmci',
        'node_causal_discovery_params': _pcmci_params(),
    },
    'group-embedding': _hybrid_params('group_embedding'),
    'subgroups': _hybrid_params('subgroups'),
}

# No data-generation options are set (the benchmark is run with
# generate_toy_data=False).
data_generation_options = {}

# Option name -> (parameters generator, extra keyword arguments for it).
benchmark_options = {
    'static_parameters': (static_parameters, {}),
}

chosen_option = 'static_parameters'


def execute_benchmark(data_name):
    """Benchmark the configured algorithms on the dataset named ``data_name``.

    Builds the parameters iterator from the option selected by
    ``chosen_option`` and calls ``benchmark_causal_discovery`` with
    ``data_{data_name}`` as the datasets folder and ``results_{data_name}`` as
    the results folder, without generating toy data.

    Args:
        data_name: Dataset name used to build both folder names.

    Returns:
        Tuple ``(results, benchmark)``: the value returned by
        ``benchmark_causal_discovery`` and the benchmark object itself.
    """
    benchmark = BenchmarkGroupCausalDiscovery()

    make_parameters, extra_kwargs = benchmark_options[chosen_option]
    parameters_iterator = make_parameters(
        data_generation_options, algorithms_parameters, **extra_kwargs
    )

    results = benchmark.benchmark_causal_discovery(
        algorithms=algorithms,
        parameters_iterator=parameters_iterator,
        datasets_folder=f'data_{data_name}',
        generate_toy_data=False,
        results_folder=f'results_{data_name}',
        n_executions=5,
        verbose=1,
    )
    return results, benchmark

In [5]:
# Run the benchmark. The returned (results, benchmark) tuple is the cell's last
# expression and is not assigned, so it is echoed as the cell output.
execute_benchmark(data_name)


--------------------------------------------------
[34m Datasets have been loaded. [0m
[32m Executing algorithm group-embedding [0m


100%|██████████| 1/1 [00:25<00:00, 25.84s/it]


[32m Executing algorithm subgroups [0m


100%|██████████| 1/1 [00:55<00:00, 55.99s/it]


[32m Executing algorithm pca+pcmci [0m


100%|██████████| 1/1 [00:03<00:00,  3.37s/it]


[32m Executing algorithm pca+dynotears [0m


100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


[32m Executing algorithm micro-level [0m


100%|██████████| 1/1 [01:32<00:00, 92.67s/it]


({'group-embedding': [{'time': 25.81352472305298,
    'memory': 339.49696,
    'TP': 12,
    'FP': 23,
    'FN': 2,
    'precision': 0.34285714285714286,
    'recall': 0.8571428571428571,
    'f1': 0.4897959183673469,
    'shd': 25,
    'TP_summary': 13,
    'FP_summary': 2,
    'FN_summary': 1,
    'precision_summary': 0.8666666666666667,
    'recall_summary': 0.9285714285714286,
    'f1_summary': 0.896551724137931,
    'shd_summary': 3,
    'dataset_iteration': 0}],
  'subgroups': [{'time': 55.907198429107666,
    'memory': 346.73049599999996,
    'TP': 12,
    'FP': 28,
    'FN': 2,
    'precision': 0.3,
    'recall': 0.8571428571428571,
    'f1': 0.4444444444444444,
    'shd': 30,
    'TP_summary': 13,
    'FP_summary': 2,
    'FN_summary': 1,
    'precision_summary': 0.8666666666666667,
    'recall_summary': 0.9285714285714286,
    'f1_summary': 0.896551724137931,
    'shd_summary': 3,
    'dataset_iteration': 0}],
  'pca+pcmci': [{'time': 3.3376758098602295,
    'memory': 347.054

In [6]:
# Collect the per-algorithm result CSVs from ONE directory listing, so each
# frame is paired with the name parsed from its own file. The name is the
# second '_'-separated field of the file name, cut at its first '.'.
results_dir = f'results_{data_name}'
result_files = [file for file in os.listdir(results_dir) if file.endswith('.csv')]

# The name is attached with .assign() rather than by rebinding `algorithms`:
# that variable holds the label -> class dict the benchmark cell passes to
# benchmark_causal_discovery, and overwriting it with a list of strings would
# break any re-run of that cell.
results = pd.concat(
    [
        pd.read_csv(f'{results_dir}/{file}').assign(
            algorithm=file.split('_')[1].split('.')[0]
        )
        for file in result_files
    ],
    ignore_index=True,
)

# Keep only the summary-level metrics and the runtime for the LaTeX table.
results = results[['algorithm', 'precision_summary', 'recall_summary', 'f1_summary', 'shd_summary', 'time']]
print(results.to_latex(float_format="%.3f"))

\begin{tabular}{llrrrrr}
\toprule
 & algorithm & precision_summary & recall_summary & f1_summary & shd_summary & time \\
\midrule
0 & micro-level & 0.875 & 1.000 & 0.933 & 2 & 92.645 \\
1 & subgroups & 0.867 & 0.929 & 0.897 & 3 & 55.907 \\
2 & group-embedding & 0.867 & 0.929 & 0.897 & 3 & 25.814 \\
3 & pca+pcmci & 0.917 & 0.786 & 0.846 & 4 & 3.338 \\
4 & pca+dynotears & 0.000 & 0.000 & 0.000 & 14 & 1.061 \\
\bottomrule
\end{tabular}

