## Libraries

In [1]:
from scipy.sparse import csr_matrix  # Used to represent large datasets compactly
import numpy as np                   # Linear Algebra.  Arrays
import os                            # File Operations
import pandas as pd                  # Dataframes
import pickle                        # File storage

## Scoring metrics
These are the scoring metrics used by the competition.  `g_score` is computed for an individual dataset.  `rank_score` combines the G-scores of all the datasets together.

In [2]:
def g_score(true_positives, false_positives, false_negatives):
    """Return the G-score for a single dataset.

    Computed as ``max(0, TP - FP) / (TP + FN + 1e-7)``.

    Args:
        true_positives: Number of correctly predicted items (TP).
        false_positives: Number of wrongly predicted items (FP).
        false_negatives: Number of missed items (FN).

    Returns:
        The score as a float; 0.0 whenever FP >= TP.
    """
    epsilon = 1e-7  # Keeps the denominator non-zero when TP + FN == 0
    net_hits = max(0, true_positives - false_positives)
    denominator = true_positives + false_negatives + epsilon
    return net_hits / denominator

def rank_score(g_scores):
    """Combine the per-dataset G-scores into one overall rank score.

    Adds up ``1 ** g`` for every ``g`` in ``g_scores``.

    NOTE(review): ``1 ** g`` evaluates to 1 for any real ``g``, so as written
    the result is simply the number of scores supplied. The base of the
    exponent is probably not meant to be 1 -- confirm against the competition's
    scoring rules before relying on this value.

    Args:
        g_scores: Iterable of per-dataset G-scores.

    Returns:
        The accumulated sum (0 for an empty iterable).
    """
    total = 0
    for score in g_scores:
        total = total + 1 ** score
    return total

# Dataset Conversion
Function used to unpack datasets from their compact form.  Specify the dataset number (0 - 3) and the slice number (0 - 1023).

In [3]:
def convert_sparse_to_dense(dataset_num, slice_num,
                            base_path='/kaggle/input/causal-structure-learning-from-event-sequences/'):
    """Unpack one sparse dataset slice into a dense DataFrame.

    Reads three files below ``base_path``:

    * ``CSL Sparse Datasets/dataset_<d>/subfolder_<s // 256>/dataset_<d>_<s>.npz``
      -- CSR components (``data``, ``indices``, ``indptr``, ``shape``).
    * the matching ``..._causal.pkl`` -- a sequence of causal arrays.
    * ``dataset_<d>_alarm_id_mapping.pkl`` -- maps a matrix row index to an
      alarm id.

    Each matrix row is interpreted as 1024 consecutive blocks of 256 columns
    (column ``t * 256 + dev`` belongs to time stamp ``t`` and device ``dev``).
    A non-zero entry marks the device as involved (1), otherwise 0.

    Args:
        dataset_num: Dataset number used to build the file names.
        slice_num: Slice number used to build the file names; the containing
            subfolder is ``slice_num // 256``.
        base_path: Root directory of the competition data. Defaults to the
            Kaggle input location; a trailing slash is optional.

    Returns:
        pandas.DataFrame indexed by alarm id, with columns ``0..1023`` (each
        cell a length-256 integer array of 0/1 flags) plus a ``causal_data``
        column holding ``causal_data[alarm_id]`` as an int8 array.

    Raises:
        IndexError: If the matrix has rows but fewer than 1024 * 256 columns.
    """
    num_time_stamps = 1024     # 2 ** 10 time stamps per alarm row
    num_devices = 256          # 2 ** 8 devices per time stamp
    files_per_subfolder = 256  # Slices are grouped 256 to a subfolder
    num_columns = num_time_stamps * num_devices

    # Build the file locations
    slice_dir = os.path.join(
        base_path,
        'CSL Sparse Datasets',
        f'dataset_{dataset_num}',
        f'subfolder_{slice_num // files_per_subfolder}',
    )
    slice_stem = f'dataset_{dataset_num}_{slice_num}'
    data_path = os.path.join(slice_dir, f'{slice_stem}.npz')
    causal_path = os.path.join(slice_dir, f'{slice_stem}_causal.pkl')
    mapping_path = os.path.join(base_path, f'dataset_{dataset_num}_alarm_id_mapping.pkl')

    # Load sparse data; the context manager closes the underlying .npz archive
    with np.load(data_path) as loaded_data:
        sparse_mat = csr_matrix(
            (loaded_data['data'], loaded_data['indices'], loaded_data['indptr']),
            shape=tuple(loaded_data['shape']),
        )

    # Load causal data and mapping
    # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
    with open(causal_path, 'rb') as f:
        causal_data = pickle.load(f)

    causal_data = [np.array(arr, dtype=np.int8) for arr in causal_data]  # Convert to NumPy int8 arrays

    with open(mapping_path, 'rb') as f:
        alarm_id_mapping = pickle.load(f)

    # Fail early with a clear message instead of part-way through the rows
    if sparse_mat.shape[0] and sparse_mat.shape[1] < num_columns:
        raise IndexError(
            f'sparse matrix has {sparse_mat.shape[1]} columns; '
            f'expected at least {num_columns}'
        )

    # Build the dictionary holding the dense data, one entry per alarm id
    data_dict = {}
    for alarm_id_idx in range(sparse_mat.shape[0]):
        alarm_id = alarm_id_mapping[alarm_id_idx]  # Retrieve actual alarm_id from the mapping
        row = sparse_mat.getrow(alarm_id_idx).toarray().ravel()

        # Vectorised equivalent of testing every (time_stamp, device) cell:
        # 1 where the entry is non-zero, reshaped to (time_stamp, device).
        involved = (row[:num_columns] != 0).astype(int).reshape(num_time_stamps, num_devices)

        # Keep a list of 1-D arrays so every DataFrame cell holds one device array
        data_dict[alarm_id] = list(involved)

    # Convert to a DataFrame
    final_df = pd.DataFrame.from_dict(data_dict, orient='index', columns=range(num_time_stamps))

    # Add causal_data column to DataFrame
    # NOTE(review): the lookup uses the alarm id (the index label) as a position
    # in causal_data -- confirm alarm ids are valid positions there.
    final_df['causal_data'] = final_df.index.map(lambda idx: causal_data[idx])

    return final_df

# Implementation

In [4]:
# Unpack slice 0 of dataset 0 into its dense DataFrame form
dataset_num, slice_num = 0, 0
ds_0_0 = convert_sparse_to_dense(dataset_num=dataset_num, slice_num=slice_num)