In [1]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from multiprocessing import Pool
import os
import json
import zipfile

import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score




In [2]:
def g_score(true_positives, false_positives, false_negatives):
    """Return the clipped score (TP - FP) / (TP + FN), never below zero.

    The numerator is ``true_positives - false_positives`` floored at 0; the
    denominator is ``true_positives + false_negatives`` plus a small epsilon
    so that an all-zero input divides safely instead of raising.
    """
    epsilon = 1e-7  # keeps the denominator non-zero
    surplus = true_positives - false_positives
    numerator = surplus if surplus > 0 else 0
    denominator = true_positives + false_negatives + epsilon
    return numerator / denominator

def rank_score(g_scores):
    """Sum ``1 ** g`` over every value in ``g_scores``.

    NOTE(review): the base is the constant 1, so each term equals 1 and the
    result is just the number of scores. A different base was presumably
    intended -- confirm against the metric definition before relying on this.
    """
    total = 0
    for score in g_scores:
        total += 1 ** score
    return total


In [3]:
class GCNLayer:
    """Single graph-convolution layer computing ``relu(A @ X @ W)``.

    Attributes:
        weights: the layer's weight matrix ``W``.
        grads: gradient of the loss with respect to ``weights``; it has the
            same shape as ``weights`` once ``backward`` has run.
    """

    def __init__(self, input_dim, output_dim):
        # `initialize_weights` is not defined in this class; it must be
        # available in the enclosing namespace when a layer is constructed.
        self.weights = initialize_weights(input_dim, output_dim)
        self.grads = np.zeros_like(self.weights)
        # Intermediates saved by forward() for use in backward().
        self._cache = None

    def relu(self, x):
        """Element-wise ``max(0, x)``."""
        return np.maximum(0, x)

    def sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``.

        Evaluated as ``exp(-log(1 + exp(-x)))`` via ``logaddexp`` so that large
        negative inputs do not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -x))

    def forward(self, A, X):
        """Return ``relu(A @ X @ weights)`` and remember what backward() needs.

        Args:
            A: matrix multiplied on the left of ``X`` (the graph operator).
            X: input features.
        """
        aggregated = A @ X
        pre_activation = aggregated @ self.weights
        self._cache = (A, aggregated, pre_activation)
        return self.relu(pre_activation)

    def backward(self, d_loss):
        """Back-propagate through the layer using the chain rule.

        Args:
            d_loss: gradient of the loss with respect to this layer's output.

        Returns:
            Gradient of the loss with respect to the layer input ``X``, ready
            to be passed to the previous layer's ``backward``.

        Raises:
            RuntimeError: if called before ``forward``.
        """
        cache = getattr(self, "_cache", None)
        if cache is None:
            raise RuntimeError("forward() must be called before backward()")
        A, aggregated, pre_activation = cache

        # ReLU passes gradient only where its input was positive.
        d_pre = d_loss * (pre_activation > 0)
        # out = relu((A @ X) @ W)  =>  dW = (A @ X)^T @ d_pre
        self.grads = aggregated.T @ d_pre
        # ... and dX = A^T @ d_pre @ W^T
        return A.T @ (d_pre @ self.weights.T)

class GCN:
    """Two GCNLayer instances applied back to back."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        # input_dim -> hidden_dim -> output_dim
        self.gcn1 = GCNLayer(input_dim, hidden_dim)
        self.gcn2 = GCNLayer(hidden_dim, output_dim)

    def forward(self, A, X):
        """Run both layers with the same ``A``; keep the activations on self."""
        hidden = self.gcn1.forward(A, X)
        self.H1 = hidden
        output = self.gcn2.forward(A, hidden)
        self.H2 = output
        return output

    def backward(self, d_loss):
        """Feed ``d_loss`` to layer 2, then layer 2's result to layer 1."""
        # Layers are visited in reverse order of the forward pass.
        self.gcn1.backward(self.gcn2.backward(d_loss))

    def update_weights(self, learning_rate):
        """In-place gradient-descent step on both layers' weights."""
        for layer in (self.gcn1, self.gcn2):
            layer.weights -= learning_rate * layer.grads


In [4]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from multiprocessing import Pool
import zipfile
import pickle

def create_datasets_core_logic(args, n_windows=2**10, window_len=2**10, n_device_slots=2**8):
    """Write one sparse occupancy matrix (and causal list) per time window.

    For every window ``w`` in ``range(n_windows)`` this builds a matrix with
    one row per distinct ``alarm_id`` (in order of first appearance) and
    ``window_len * n_device_slots`` columns. Each timestamp ``t`` in
    ``range(start_timestamp, end_timestamp)`` of an event that falls in window
    ``w`` adds 1 at column ``(t % window_len) * n_device_slots + device_id``.
    Repeated (row, column) pairs are summed when the CSR matrix is built.

    Two files are written to the current directory per window:
    ``dataset_{idx}_{window}.npz`` (CSR ``data``/``indices``/``indptr``/``shape``)
    and ``dataset_{idx}_{window}_causal.pkl`` (``causal[alarm_id]`` for each
    distinct alarm id, identical for every window).

    NOTE(review): timestamps at or beyond ``n_windows * window_len`` are
    silently ignored, and ``device_id`` values >= ``n_device_slots`` spill into
    a neighbouring time slot's columns -- confirm both limits suit the data.

    Args:
        args: tuple ``(idx, alarm, causal)``; packed into one argument so the
            function can be used with ``Pool.map``. ``alarm`` is a DataFrame
            with columns ``alarm_id``, ``device_id``, ``start_timestamp`` and
            ``end_timestamp``; ``causal`` is indexed by alarm id.
        n_windows: number of consecutive windows to emit (default 1024).
        window_len: number of timestamps per window (default 1024).
        n_device_slots: column slots reserved per timestamp (default 256).
    """
    idx, alarm, causal = args
    unique_alarm_ids = alarm['alarm_id'].unique()
    n_alarms = len(unique_alarm_ids)
    n_cols = window_len * n_device_slots

    # Matrix row of each alarm id = its position in order of first appearance.
    row_of_alarm = {alarm_id: pos for pos, alarm_id in enumerate(unique_alarm_ids)}

    # Column arrays are pulled out once so each window filters them vectorised
    # instead of re-walking every timestamp of every event.
    alarm_ids = alarm['alarm_id'].to_numpy()
    devices = alarm['device_id'].to_numpy()
    starts = alarm['start_timestamp'].to_numpy()
    ends = alarm['end_timestamp'].to_numpy()

    # The causal payload does not depend on the window; build it once.
    causal_rows = [causal[alarm_id] for alarm_id in unique_alarm_ids]

    for window in range(n_windows):
        lo = window * window_len
        hi = lo + window_len
        row_indices, col_indices = [], []

        # Only events whose [start, end) range intersects [lo, hi) contribute.
        for i in np.flatnonzero((starts < hi) & (ends > lo)):
            row = row_of_alarm.get(alarm_ids[i])
            if row is None:
                # e.g. a NaN alarm id, which never compares equal to itself
                continue
            device = devices[i]
            # For t in [lo, hi), t % window_len == t - lo.
            offsets = range(max(starts[i], lo) - lo, min(ends[i], hi) - lo)
            row_indices.extend([row] * len(offsets))
            col_indices.extend(offset * n_device_slots + device for offset in offsets)

        data = [1] * len(row_indices)
        sparse_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(n_alarms, n_cols))

        with open(f"dataset_{idx}_{window}_causal.pkl", 'wb') as f:
            pickle.dump(causal_rows, f)

        np.savez(f"dataset_{idx}_{window}.npz", data=sparse_matrix.data, indices=sparse_matrix.indices, indptr=sparse_matrix.indptr, shape=sparse_matrix.shape)

def create_datasets(dataset_list, use_pool=True):
    """Generate the per-window dataset files for every dataset and zip them.

    Each ``(alarm, causal)`` pair in ``dataset_list`` is handed, together with
    its position ``idx``, to ``create_datasets_core_logic``. Afterwards the
    files ``dataset_{idx}_{window}.npz`` and ``dataset_{idx}_{window}_causal.pkl``
    for ``window`` in ``range(2**10)`` are added to ``Datasets.zip`` in the
    current directory.

    Args:
        dataset_list: iterable of ``(alarm, causal)`` pairs.
        use_pool: when true (the default, matching the previous hard-coded
            behaviour) the datasets are processed in a ``multiprocessing.Pool``;
            when false they are processed one after another in this process.
    """
    # Materialise once so one-shot iterables work and both phases see the
    # same set of dataset indices.
    jobs = [(idx, alarm, causal) for idx, (alarm, causal) in enumerate(dataset_list)]

    if use_pool:
        # Named `workers` so the pool no longer shadows the on/off flag.
        with Pool() as workers:
            workers.map(create_datasets_core_logic, jobs)
    else:
        for job in jobs:
            create_datasets_core_logic(job)

    with zipfile.ZipFile('Datasets.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        for idx in range(len(jobs)):
            for window in range(2**10):
                zipf.write(f"dataset_{idx}_{window}.npz")
                zipf.write(f"dataset_{idx}_{window}_causal.pkl")


In [5]:
# Root directory of the competition data; every file below lives under
# DATA_ROOT/dataset_<k>/.
DATA_ROOT = '/kaggle/input/causal-structure-learning-from-event-sequences/datasets'


def _dataset_file(k, filename):
    """Return the path of `filename` inside dataset number `k`."""
    return f'{DATA_ROOT}/dataset_{k}/{filename}'


# Alarm event tables and causal priors exist for datasets 1-4.
# NOTE(review): allow_pickle=True lets np.load execute pickled objects; only
# keep it for trusted input files.
alarm1, alarm2, alarm3, alarm4 = (
    pd.read_csv(_dataset_file(k, 'alarm.csv')) for k in (1, 2, 3, 4)
)
causal1, causal2, causal3, causal4 = (
    np.load(_dataset_file(k, 'causal_prior.npy'), allow_pickle=True) for k in (1, 2, 3, 4)
)

# RCA priors and topology are loaded for datasets 1-3 only.
rca1, rca2, rca3 = (
    pd.read_csv(_dataset_file(k, 'rca_prior.csv')) for k in (1, 2, 3)
)
topology1, topology2, topology3 = (
    np.load(_dataset_file(k, 'topology.npy'), allow_pickle=True) for k in (1, 2, 3)
)

dataset_list = [(alarm1, causal1), (alarm2, causal2), (alarm3, causal3), (alarm4, causal4)]

In [6]:
# Build the per-window dataset files for every (alarm, causal) pair in
# dataset_list and bundle them into Datasets.zip in the working directory.
create_datasets(dataset_list)

In [7]:

# Pool the distinct timestamps, alarm ids and device ids of all datasets.
times_list, alarms_list, devices_list = [], [], []
for alarm, _ in dataset_list:
    times_list += list(alarm['start_timestamp'].unique()) + list(alarm['end_timestamp'].unique())
    alarms_list += list(alarm['alarm_id'].unique())
    devices_list += list(alarm['device_id'].unique())

# Number of distinct values across the whole collection.
times = len(set(times_list))
alarms = len(set(alarms_list))
devices = len(set(devices_list))
for overall_count in (times, alarms, devices):
    print(overall_count)
print()

# Distinct alarm ids per dataset.
for alarm in (alarm1, alarm2, alarm3, alarm4):
    print(len(alarm['alarm_id'].unique()))
print()

# Distinct device ids per dataset.
for alarm in (alarm1, alarm2, alarm3, alarm4):
    print(len(alarm['device_id'].unique()))
print()

# Distinct timestamps (start or end) per dataset.
for alarm in (alarm1, alarm2, alarm3, alarm4):
    stamps = list(alarm['start_timestamp'].unique()) + list(alarm['end_timestamp'].unique())
    print(len(set(stamps)))
# Generate the dataset
###new_dataset = create_dataset(dataset_list)

616313
49
100

39
49
31
30

35
42
39
100

203619
147963
241340
223427


In [8]:
# Distinct alarm ids of dataset 1, in ascending order.
alarm1_ids = alarm1['alarm_id'].unique()
print(sorted(alarm1_ids))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]


In [9]:
# Largest end_timestamp in dataset 4.
alarm4_end_times = list(alarm4['end_timestamp'])
print(max(alarm4_end_times))

2392489


In [10]:
# Number of distinct start_timestamp values in dataset 4.
alarm4_starts = alarm4['start_timestamp'].unique()
print(len(alarm4_starts))

117367


In [11]:
# Display the alarm table read from dataset_4/alarm.csv.
alarm4


Unnamed: 0,alarm_id,device_id,start_timestamp,end_timestamp
0,27,68,3,18067
1,16,58,3,18071
2,16,2,10,18082
3,27,58,16,18086
4,27,2,24,18088
...,...,...,...,...
126798,21,24,2332760,2333009
126799,27,17,2332762,2350823
126800,22,52,2332770,2332897
126801,20,33,2332789,2332821


In [12]:
# Per-event duration (end_timestamp - start_timestamp) for each dataset,
# kept as one-column DataFrames.
ala1, ala2, ala3, ala4 = (
    pd.DataFrame(frame['end_timestamp'] - frame['start_timestamp'])
    for frame in (alarm1, alarm2, alarm3, alarm4)
)

# Summary statistics of the durations, one dataset after another.
for durations in (ala1, ala2, ala3, ala4):
    print(durations.describe())

                   0
count  141853.000000
mean       58.240643
std        40.794314
min         0.000000
25%        31.000000
50%        47.000000
75%        84.000000
max       512.000000
                   0
count  132688.000000
mean       31.529957
std        28.788882
min         0.000000
25%        11.000000
50%        25.000000
75%        41.000000
max       703.000000
                   0
count  198962.000000
mean       61.086424
std        35.611934
min         0.000000
25%        38.000000
50%        49.000000
75%        84.000000
max       680.000000
                   0
count  126803.000000
mean     1598.319464
std      8763.418705
min         0.000000
25%        64.000000
50%       134.000000
75%       598.000000
max    250949.000000
