In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np

## Load Clusters and Alarms

In [8]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters_filtered.parquet")

In [2]:
mob = pd.read_parquet("alarms datasets/mob/20230101-20240101_inpas_mob_preprocess__an__last_event__last_event__ext1.parquet")

In [3]:
adsl = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__adsl__last_event__last_event__ext1.parquet")

In [4]:
ptn = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__ptn__last_event__last_event__ext1.parquet")

In [5]:
sdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__sdh__last_event__last_event__ext1.parquet")

In [6]:
pdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__pdh__last_event__last_event__ext1.parquet")

In [7]:
mob = mob[["alarm_id", "last_occurrence"]]

In [8]:
adsl = adsl[["alarm_id", "last_occurrence"]]
ptn = ptn[["alarm_id", "last_occurrence"]]
sdh = sdh[["alarm_id", "last_occurrence"]]
pdh = pdh[["alarm_id", "last_occurrence"]]


In [9]:
alarms_with_last_occurrence = pd.concat([mob, adsl, ptn, sdh, pdh])

In [10]:
alarms_with_last_occurrence.to_parquet("alarms datasets/alarms_with_last_occurrence.parquet")

## Clusters Statistics

In [11]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters_filtered.parquet")
alarms_with_last_occurrence = pd.read_parquet("alarms datasets/alarms_with_last_occurrence.parquet")

In [12]:
clusters_filtered = clusters[["alarm_id", "cluster_id2", "first_occurrence"]]

In [13]:
clusters_with_last_occurrence = clusters_filtered.merge(alarms_with_last_occurrence, on="alarm_id", how="left")

In [14]:
clusters_with_last_occurrence["first_occurrence"] = pd.to_datetime(clusters_with_last_occurrence["first_occurrence"])

In [15]:
clusters_with_last_occurrence_count_null = clusters_with_last_occurrence[clusters_with_last_occurrence["last_occurrence"].isnull()]

In [16]:
clusters_with_last_occurrence = clusters_with_last_occurrence.dropna(subset=["last_occurrence"])

In [17]:
clusters_with_last_occurrence.to_parquet("real-time clusters/clusters_with_last_occurrence.parquet")

### Clusters Size

Number of alarms within a cluster

In [16]:
clusters_with_last_occurrence = pd.read_parquet("real-time clusters/clusters_with_last_occurrence.parquet")

In [17]:
clusters_grouped_by_id = clusters_with_last_occurrence.groupby(["cluster_id2"])

In [18]:
size_df = clusters_grouped_by_id.size().reset_index(name='size')

In [19]:
size_df.to_parquet("real-time clusters/size_df.parquet")

### Clusters Lifespan

Time difference from the last last_occurrence and the first first_occurrence of alarms 

In [20]:

def calculate_lifespan(group):
    return group['last_occurrence'].max() - group['first_occurrence'].min()

tqdm.pandas()


lifespan_df = clusters_grouped_by_id.apply(calculate_lifespan, include_groups=False).reset_index(name='lifespan')


KeyboardInterrupt: 

In [None]:
lifespan_df.to_parquet("real-time clusters/lifespan_df.parquet")

### Clusters Delta First Occurrence

Distance between the last available first_occurrence and the first

In [None]:
def calculate_lifespan(group):
    return group['first_occurrence'].max() - group['first_occurrence'].min()


tqdm.pandas()


delta_df = clusters_grouped_by_id.progress_apply(calculate_lifespan).reset_index(name='delta first occurrence')

In [None]:
delta_df.to_parquet("real-time clusters/delta_df.parquet")

In [None]:
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [None]:
delta_df.sort_values(by="delta first occurrence", ascending=False)

## Merge Statistics

In [None]:
size_df = pd.read_parquet("real-time clusters/size_df.parquet")
lifespan_df = pd.read_parquet("real-time clusters/lifespan_df.parquet")
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [None]:
clusters_statistics = size_df.merge(lifespan_df, on=["cluster_id", "cluster_id2"], how="left").merge(delta_df, on=["cluster_id", "cluster_id2"], how="left")

In [None]:
clusters_statistics

In [None]:
clusters_statistics.to_parquet("real-time clusters/clusters_statistics.parquet")