# Readme  
This code is intended as a base on which to build models.  You should include the data from the Kaggle dataset [Causal Structure Learning from Event Sequences](https://www.kaggle.com/datasets/lukemiller1987/causal-structure-learning-from-event-sequences).
## Libraries
This code base was implemented in Python 3.10.12.  If there is a mismatch, please run:  
  
```
conda create --name myenv python=3.10.12
conda activate myenv
conda install jupyter
jupyter notebook
```

### Install Specific Package Versions
- Uses `pip` to install specific versions of the following Python packages:
    - `scipy`: version 1.11.2
    - `numpy`: version 1.23.5
    - `pandas`: version 2.0.3
    - `scikit-learn` (imported as `sklearn`): version 1.2.2
## Import and Version Check
Imports the installed packages and performs a version check using `assert` statements.
## Check Python Version
Checks if the Python version starts with '3.10.12'.
## Import Standard Libraries
Imports `os`, `pickle`, `json`, `zipfile`, `collections`, and `multiprocessing`, whose versions are tied to the Python version.

Each `assert` statement checks whether the current package or Python version matches the expected version. If it does not, an `AssertionError` is raised whose message shows the expected and current versions.

In [7]:
# Install specific versions
!pip install scipy==1.11.2 numpy==1.23.5 pandas==2.0.3 

# Import specific versions
import scipy
assert scipy.__version__ == '1.11.2', f'Expected scipy version 1.11.2, got {scipy.__version__}'

import numpy as np
assert np.__version__ == '1.23.5', f'Expected numpy version 1.23.5, got {np.__version__}'

import pandas as pd
assert pd.__version__ == '2.0.3', f'Expected pandas version 2.0.3, got {pd.__version__}'

import sys
assert sys.version.startswith('3.10.12'), f'Expected Python version 3.10.12, got {sys.version}'

import sklearn
assert sklearn.__version__ == '1.2.2', f'Expected sklearn 1.2.2, got {sklearn.__version__}'

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# standard libraries and their versions are tied to Python version.
from multiprocessing import Pool
import os
import json
import zipfile
from collections import defaultdict
import pickle




## Function Descriptions

### `create_datasets_core_logic(args)`
This function is the core logic for creating datasets. It receives a tuple `args` containing the dataset index (`idx`), alarm data (`alarm`), and causal data (`causal`).

#### Parameters:
- `idx`: Index of the dataset.
- `alarm`: DataFrame containing alarm data.
- `causal`: List containing causal information.

#### Process:
1. `unique_alarm_ids`: Extracts unique alarm IDs.
2. `dataset_folder`: Creates a new folder for each dataset.
3. Iterates through time windows, creating subfolders and populating data.
4. `sparse_matrix`: Constructs a sparse matrix from the data.
5. Saves the sparse matrix and causal data to disk.

### `create_datasets(dataset_list)`
Manages the parallel execution of `create_datasets_core_logic` for multiple datasets.

#### Parameters:
- `dataset_list`: List of tuples, each containing alarm data and causal information for a dataset.

#### Process:
1. Uses a pool of worker processes (`multiprocessing.Pool`) to execute `create_datasets_core_logic` in parallel if `pool=True`.
2. Zips the generated datasets into a single `.zip` file.

### Important Variables:
- `n_alarms`: Number of unique alarms.
- `data, row_indices, col_indices`: Lists to construct the sparse matrix.
- `subfolder`: Subfolders to group files.

### File Outputs:
1. Pickle files for causal data (`*_causal.pkl`).
2. Compressed `.npz` files for the sparse matrices.
3. A zipped file `Datasets.zip` containing all datasets.

### Additional Notes:
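- `2**10` is used both as the number of time windows and as the number of timestamps per window; `2**8` is the number of device-ID slots reserved per timestamp (column index = `(t % 2**10) * 2**8 + device_id`).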
- Multiprocessing is optional and can be toggled by setting the `pool` variable.


In [None]:
def create_datasets_core_logic(args):
    """Write the per-window sparse alarm matrices and causal data of one dataset.

    For each of the 2**10 time windows a sparse matrix of shape
    (n_alarms, 2**10 * 2**8) is built. Every timestamp ``t`` in
    ``[start_timestamp, end_timestamp)`` of an alarm row that falls into the
    window contributes 1 at row ``index of alarm_id`` and column
    ``(t % 2**10) * 2**8 + device_id``; repeated (row, column) pairs are summed
    by ``csr_matrix``. Timestamps outside ``[0, 2**10 * 2**10)`` are ignored.

    Args:
        args: Tuple ``(idx, alarm, causal)`` where ``idx`` is the dataset
            index used in folder and file names, ``alarm`` is a DataFrame with
            the columns ``alarm_id``, ``device_id``, ``start_timestamp`` and
            ``end_timestamp``, and ``causal`` is indexable by alarm ID.

    Side effects:
        Creates ``dataset_{idx}/subfolder_{k}`` folders in the current working
        directory and writes, per window, ``dataset_{idx}_{window}_causal.pkl``
        and ``dataset_{idx}_{window}.npz`` (CSR components ``data``,
        ``indices``, ``indptr``, ``shape``).
    """
    idx, alarm, causal = args

    window_size = 2**10  # timestamps covered by one window
    n_windows = 2**10    # number of windows written per dataset
    n_devices = 2**8     # column slots reserved per timestamp
    # NOTE(review): device_id values outside [0, 2**8) would collide with
    # another timestamp's columns or overflow the matrix width -- confirm
    # that the input data respects this range.

    # np.unique returns the IDs sorted, so matrix row i refers to the same
    # alarm as index i in the mapping saved by generate_alarm_id_mapping,
    # which sorts the unique IDs as well. `alarm_rows` is the matrix row of
    # every DataFrame row.
    unique_alarm_ids, alarm_rows = np.unique(
        alarm['alarm_id'].to_numpy(), return_inverse=True
    )
    n_alarms = len(unique_alarm_ids)

    # Pull the columns out once instead of re-filtering the DataFrame and
    # iterating its rows for every window.
    starts = alarm['start_timestamp'].to_numpy(dtype=np.int64)
    ends = alarm['end_timestamp'].to_numpy(dtype=np.int64)
    devices = alarm['device_id'].to_numpy(dtype=np.int64)

    # The causal payload is identical for every window of the dataset.
    causal_rows = [causal[alarm_id] for alarm_id in unique_alarm_ids]

    # Create dataset folders
    dataset_folder = f"dataset_{idx}"
    os.makedirs(dataset_folder, exist_ok=True)

    for window in range(n_windows):
        # One subfolder per group of 256 consecutive windows
        subfolder = os.path.join(dataset_folder, f"subfolder_{window // 256}")
        os.makedirs(subfolder, exist_ok=True)

        # Clip every alarm interval to this window; rows that do not overlap
        # the window end up with a non-positive length and are dropped.
        window_start = window * window_size
        seg_start = np.maximum(starts, window_start)
        seg_len = np.minimum(ends, window_start + window_size) - seg_start
        active = seg_len > 0
        seg_start = seg_start[active]
        seg_len = seg_len[active]
        total = int(seg_len.sum())

        # Expand each clipped interval into its individual timestamps:
        # position within the interval = global position - interval offset.
        seg_offset = np.cumsum(seg_len) - seg_len
        within = np.arange(total) - np.repeat(seg_offset, seg_len)
        timestamps = np.repeat(seg_start, seg_len) + within

        row_indices = np.repeat(alarm_rows[active], seg_len)
        col_indices = (timestamps % window_size) * n_devices + np.repeat(devices[active], seg_len)
        data = np.ones(total, dtype=int)

        sparse_matrix = csr_matrix(
            (data, (row_indices, col_indices)),
            shape=(n_alarms, window_size * n_devices),
        )

        with open(f"{subfolder}/dataset_{idx}_{window}_causal.pkl", 'wb') as f:
            pickle.dump(causal_rows, f)

        np.savez(
            f"{subfolder}/dataset_{idx}_{window}.npz",
            data=sparse_matrix.data,
            indices=sparse_matrix.indices,
            indptr=sparse_matrix.indptr,
            shape=sparse_matrix.shape,
        )

def create_datasets(dataset_list, pool=True):
    """Create every dataset in ``dataset_list`` and zip the generated files.

    Args:
        dataset_list: Iterable of ``(alarm, causal)`` pairs; the position of a
            pair in the iterable becomes its dataset index.
        pool: When true (the default, matching the previous hard-coded
            behaviour) the datasets are processed in parallel by a
            ``multiprocessing.Pool``; otherwise they are processed one after
            another in the current process.

    Side effects:
        Runs ``create_datasets_core_logic`` for each dataset and then writes
        ``Datasets.zip`` in the current working directory, containing the
        ``.npz`` and ``.pkl`` files found under the ``dataset_{idx}`` folders.
    """
    jobs = [(idx, alarm, causal) for idx, (alarm, causal) in enumerate(dataset_list)]

    if pool:
        # A distinct name keeps the `pool` flag from being shadowed.
        with Pool() as workers:
            workers.map(create_datasets_core_logic, jobs)
    else:
        for job in jobs:
            create_datasets_core_logic(job)

    # Zip the dataset folders only, so unrelated .npz/.pkl files that happen
    # to live in the working directory are not swept into the archive.
    with zipfile.ZipFile('Datasets.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        for idx in range(len(jobs)):
            for root, _dirs, files in os.walk(f"dataset_{idx}"):
                for file_ in files:
                    if file_.endswith(('.npz', '.pkl')):
                        zipf.write(os.path.join(root, file_))

## Function Description: `generate_alarm_id_mapping(dataset_list)`

### Purpose:
This function generates a mapping of unique alarm IDs for each dataset in the given `dataset_list`.

#### Parameters:
- `dataset_list`: List of tuples, each containing alarm data and causal information for a dataset.

#### Process:
1. Iterates through each dataset in the `dataset_list`.
2. `dataset_folder`: Creates a folder for each dataset if it doesn't already exist.
3. `unique_alarm_ids`: Fetches the unique alarm IDs from the alarm data and sorts them.
4. `mapping`: Generates a dictionary that maps index to alarm ID.
5. Saves this mapping as a pickle file (`alarm_id_mapping.pkl`) in the dataset folder.

### Important Variables:
- `unique_alarm_ids`: Sorted list of unique alarm IDs from the alarm data.
- `mapping`: Dictionary containing the index-to-alarm ID mapping.

### File Outputs:
- Pickle file (`alarm_id_mapping.pkl`) containing the index-to-alarm ID mapping for each dataset.

### Additional Notes:
- This function is typically run before creating datasets to ensure that alarm IDs are consistently mapped across different files and operations.


In [None]:
def generate_alarm_id_mapping(dataset_list):
    """Save an index-to-alarm-ID mapping for every dataset.

    For the dataset at position ``idx`` of ``dataset_list`` the unique values
    of the ``alarm_id`` column are sorted, numbered from 0, and the resulting
    ``{index: alarm_id}`` dictionary is pickled to
    ``dataset_{idx}/alarm_id_mapping.pkl`` (the folder is created if needed).

    Args:
        dataset_list: Iterable of ``(alarm, causal)`` pairs; only the alarm
            DataFrame of each pair is used.
    """
    for idx, entry in enumerate(dataset_list):
        alarm, _unused = entry

        # Index -> alarm ID, numbered in ascending order of the alarm IDs
        ordered_ids = sorted(alarm['alarm_id'].unique())
        mapping = dict(enumerate(ordered_ids))

        # Make sure the target folder exists before writing into it
        dataset_folder = f"dataset_{idx}"
        os.makedirs(dataset_folder, exist_ok=True)

        with open(f"{dataset_folder}/alarm_id_mapping.pkl", 'wb') as out_file:
            pickle.dump(mapping, out_file)



# Definition of source files / creation of dataset list


In [None]:
# Folder of the Kaggle input that holds one sub-folder per dataset.
_DATASETS_DIR = '/kaggle/input/causal-structure-learning-from-event-sequences/datasets'

# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on files from a trusted source.

# Dataset 1
alarm1 = pd.read_csv(f'{_DATASETS_DIR}/dataset_1/alarm.csv')
causal1 = np.load(f'{_DATASETS_DIR}/dataset_1/causal_prior.npy', allow_pickle=True)
rca1 = pd.read_csv(f'{_DATASETS_DIR}/dataset_1/rca_prior.csv')
topology1 = np.load(f'{_DATASETS_DIR}/dataset_1/topology.npy', allow_pickle=True)

# Dataset 2
alarm2 = pd.read_csv(f'{_DATASETS_DIR}/dataset_2/alarm.csv')
causal2 = np.load(f'{_DATASETS_DIR}/dataset_2/causal_prior.npy', allow_pickle=True)
rca2 = pd.read_csv(f'{_DATASETS_DIR}/dataset_2/rca_prior.csv')
topology2 = np.load(f'{_DATASETS_DIR}/dataset_2/topology.npy', allow_pickle=True)

# Dataset 3
alarm3 = pd.read_csv(f'{_DATASETS_DIR}/dataset_3/alarm.csv')
causal3 = np.load(f'{_DATASETS_DIR}/dataset_3/causal_prior.npy', allow_pickle=True)
rca3 = pd.read_csv(f'{_DATASETS_DIR}/dataset_3/rca_prior.csv')
topology3 = np.load(f'{_DATASETS_DIR}/dataset_3/topology.npy', allow_pickle=True)

# Dataset 4 (only the alarm and causal-prior files are loaded for it)
alarm4 = pd.read_csv(f'{_DATASETS_DIR}/dataset_4/alarm.csv')
causal4 = np.load(f'{_DATASETS_DIR}/dataset_4/causal_prior.npy', allow_pickle=True)

# (alarm, causal) pairs in dataset order; the list position is the index that
# generate_alarm_id_mapping and create_datasets use for output folder names.
dataset_list = [
    (alarm1, causal1),
    (alarm2, causal2),
    (alarm3, causal3),
    (alarm4, causal4),
]

# run the functions

In [None]:
# Guard the entry point: multiprocessing start methods that re-import the main
# module (spawn / forkserver) would otherwise re-run these calls in every
# worker process. In a notebook __name__ is "__main__", so the calls still run.
if __name__ == "__main__":
    # Save the index -> alarm ID mappings first, then build and zip the datasets.
    generate_alarm_id_mapping(dataset_list)
    create_datasets(dataset_list)
