In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

## Load Clusters and Alarms

In [None]:
# Load the real-time clusters table from parquet.
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")

In [None]:
# Load the "mob" alarms dataset from parquet.
mob = pd.read_parquet("alarms datasets/mob/20230101-20240101_inpas_mob_preprocess__an__last_event__last_event__ext1.parquet")

In [None]:
# Load the "adsl" alarms dataset (stored under the "tx" folder) from parquet.
adsl = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__adsl__last_event__last_event__ext1.parquet")

In [None]:
# Load the "ptn" alarms dataset (stored under the "tx" folder) from parquet.
ptn = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__ptn__last_event__last_event__ext1.parquet")

In [None]:
# Load the "sdh" alarms dataset (stored under the "tx" folder) from parquet.
sdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__sdh__last_event__last_event__ext1.parquet")

In [None]:
# Load the "pdh" alarms dataset (stored under the "tx" folder) from parquet.
pdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__pdh__last_event__last_event__ext1.parquet")

In [None]:
# Keep only the alarm identifier and its last_occurrence value.
# NOTE(review): this rebinds `mob`, so the other columns of the loaded frame
# (e.g. "mob_slogan", which a later cell selects from `mob`) are no longer
# reachable through this name.
mob = mob.loc[:, ["alarm_id", "last_occurrence"]]

In [None]:
# Keep only the alarm identifier and its last_occurrence value for each dataset.
# NOTE(review): these names are rebound, so the other columns of the loaded
# frames (e.g. "std_probable_cause_no", which a later cell selects from them)
# are no longer reachable through these names.
adsl = adsl.loc[:, ["alarm_id", "last_occurrence"]]
ptn = ptn.loc[:, ["alarm_id", "last_occurrence"]]
sdh = sdh.loc[:, ["alarm_id", "last_occurrence"]]
pdh = pdh.loc[:, ["alarm_id", "last_occurrence"]]


In [None]:
# Stack the five alarm tables vertically; each input keeps its own index labels
# because ignore_index is not set.
alarms_with_last_occurrence = pd.concat([mob, adsl, ptn, sdh, pdh])

In [None]:
# Persist the combined (alarm_id, last_occurrence) table.
alarms_with_last_occurrence.to_parquet("alarms datasets/alarms_with_last_occurrence.parquet")

## Clusters Statistics

In [None]:
# Reload both inputs from disk (presumably so this section can be run without
# repeating the loading cells above).
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")
alarms_with_last_occurrence = pd.read_parquet("alarms datasets/alarms_with_last_occurrence.parquet")

In [None]:
# Keep the alarm id, the two cluster-key columns and the first_occurrence value.
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2", "first_occurrence"]]

In [None]:
# Attach last_occurrence to every clustered alarm. The left join keeps cluster
# rows whose alarm_id has no match (their last_occurrence is missing) and
# repeats a cluster row once per matching alarm row.
clusters_with_last_occurrence = pd.merge(
    clusters_filtered,
    alarms_with_last_occurrence,
    how="left",
    on="alarm_id",
)

In [None]:
# Convert first_occurrence to pandas datetimes so it can be subtracted from
# last_occurrence later (assumes last_occurrence is already datetime — TODO confirm).
clusters_with_last_occurrence["first_occurrence"] = pd.to_datetime(clusters_with_last_occurrence["first_occurrence"])

In [None]:
# Discard the rows that have no last_occurrence value (for instance the cluster
# rows left unmatched by the left join).
clusters_with_last_occurrence = clusters_with_last_occurrence.loc[
    clusters_with_last_occurrence["last_occurrence"].notna()
]

In [None]:
# Persist the clustered alarms enriched with last_occurrence.
clusters_with_last_occurrence.to_parquet("real-time clusters/clusters_with_last_occurrence.parquet")

### Clusters Size

Number of alarms within a cluster

In [None]:
# Reload the enriched clusters table saved by the previous section.
clusters_with_last_occurrence = pd.read_parquet("real-time clusters/clusters_with_last_occurrence.parquet")

In [None]:
# Group the alarm rows by the (cluster_id, cluster_id2) pair; this groupby
# object is reused by the size, lifespan and delta computations below.
clusters_grouped_by_id = clusters_with_last_occurrence.groupby(["cluster_id", "cluster_id2"])

In [None]:
# One row per (cluster_id, cluster_id2) with the number of rows of that group
# stored in the 'size' column.
size_df = clusters_grouped_by_id.size().reset_index(name='size')

In [None]:
# Persist the per-cluster sizes.
size_df.to_parquet("real-time clusters/size_df.parquet")

### Clusters Lifespan

Time difference between the latest `last_occurrence` and the earliest `first_occurrence` among the alarms of a cluster.

In [None]:

def calculate_lifespan(group):
    """Return the lifespan of one cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cluster; must provide the 'first_occurrence' and
        'last_occurrence' columns.

    Returns
    -------
    The largest 'last_occurrence' minus the smallest 'first_occurrence'.
    """
    earliest_start = group['first_occurrence'].min()
    latest_end = group['last_occurrence'].max()
    return latest_end - earliest_start

# Per-cluster lifespan: latest last_occurrence minus earliest first_occurrence.
# The built-in groupby reductions give the same value per (cluster_id,
# cluster_id2) as applying calculate_lifespan to every group, but without one
# Python call per cluster and without depending on the `include_groups`
# keyword of GroupBy.apply. The subtraction aligns on the group keys.
latest_last_occurrence = clusters_grouped_by_id["last_occurrence"].max()
earliest_first_occurrence = clusters_grouped_by_id["first_occurrence"].min()

lifespan_df = (latest_last_occurrence - earliest_first_occurrence).reset_index(name='lifespan')


In [None]:
# Persist the per-cluster lifespans.
lifespan_df.to_parquet("real-time clusters/lifespan_df.parquet")

### Clusters Delta First Occurrence

Time difference between the latest and the earliest `first_occurrence` among the alarms of a cluster.

In [None]:


def calculate_lifespan(group):
    """Return the spread of 'first_occurrence' within one cluster.

    NOTE(review): this rebinds the name `calculate_lifespan`, which an earlier
    cell defines with a different meaning (last_occurrence max minus
    first_occurrence min); after this cell runs the earlier definition is
    shadowed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cluster; must provide the 'first_occurrence' column.

    Returns
    -------
    The largest 'first_occurrence' minus the smallest 'first_occurrence'.
    """
    first_seen = group['first_occurrence']
    return first_seen.max() - first_seen.min()


# Per-cluster spread of first_occurrence: latest minus earliest value.
# Built-in groupby reductions replace the per-group progress_apply call: the
# result per (cluster_id, cluster_id2) is the same, but no Python function is
# invoked for every cluster and the GroupBy.apply deprecation about operating
# on the grouping columns is not triggered.
first_occurrence_by_cluster = clusters_grouped_by_id["first_occurrence"]

delta_df = (
    first_occurrence_by_cluster.max() - first_occurrence_by_cluster.min()
).reset_index(name='delta first occurrence')

In [None]:
# Persist the per-cluster first_occurrence deltas.
delta_df.to_parquet("real-time clusters/delta_df.parquet")

In [None]:
# Reload the deltas from the file written just above.
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [None]:
# Display the clusters ordered from the largest to the smallest delta
# (inspection only; the sorted frame is not stored).
delta_df.sort_values(by="delta first occurrence", ascending=False)

## Merge Statistics

In [None]:
# Reload the three per-cluster statistics written by the previous sections.
size_df = pd.read_parquet("real-time clusters/size_df.parquet")
lifespan_df = pd.read_parquet("real-time clusters/lifespan_df.parquet")
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [None]:
# Join the three statistics on the cluster key. Both joins are left joins, so
# every cluster of size_df is kept even if it has no lifespan or delta row.
clusters_statistics = pd.merge(
    pd.merge(size_df, lifespan_df, on=["cluster_id", "cluster_id2"], how="left"),
    delta_df,
    on=["cluster_id", "cluster_id2"],
    how="left",
)

In [None]:
# Display the merged statistics table (inspection only).
clusters_statistics

In [None]:
# Persist the merged per-cluster statistics.
clusters_statistics.to_parquet("real-time clusters/clusters_statistics.parquet")

## Extract Unique Slogans from Alarms


In [None]:
# The in-memory mob/adsl/ptn/sdh/pdh frames were narrowed to
# ["alarm_id", "last_occurrence"] earlier in the notebook, so selecting the
# slogan columns from them raises KeyError on a top-to-bottom run. Read the two
# needed columns straight from the source parquet files instead.
mob_slogan = pd.read_parquet(
    "alarms datasets/mob/20230101-20240101_inpas_mob_preprocess__an__last_event__last_event__ext1.parquet",
    columns=["alarm_id", "mob_slogan"],
)
adsl_slogan = pd.read_parquet(
    "alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__adsl__last_event__last_event__ext1.parquet",
    columns=["alarm_id", "std_probable_cause_no"],
)
ptn_slogan = pd.read_parquet(
    "alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__ptn__last_event__last_event__ext1.parquet",
    columns=["alarm_id", "std_probable_cause_no"],
)
sdh_slogan = pd.read_parquet(
    "alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__sdh__last_event__last_event__ext1.parquet",
    columns=["alarm_id", "std_probable_cause_no"],
)
pdh_slogan = pd.read_parquet(
    "alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__pdh__last_event__last_event__ext1.parquet",
    columns=["alarm_id", "std_probable_cause_no"],
)


In [None]:
# Give the slogan column the same name ("slogan") in every dataset so the
# frames can be stacked. The renamed frame is rebound instead of using
# inplace=True: an in-place rename on a frame obtained by column selection
# triggers SettingWithCopyWarning, and rebinding keeps the cell safe to re-run.
mob_slogan = mob_slogan.rename(columns={"mob_slogan": "slogan"})
adsl_slogan = adsl_slogan.rename(columns={"std_probable_cause_no": "slogan"})
ptn_slogan = ptn_slogan.rename(columns={"std_probable_cause_no": "slogan"})
sdh_slogan = sdh_slogan.rename(columns={"std_probable_cause_no": "slogan"})
pdh_slogan = pdh_slogan.rename(columns={"std_probable_cause_no": "slogan"})



In [None]:
# Stack the five (alarm_id, slogan) tables vertically.
alarms_slogan = pd.concat([mob_slogan, adsl_slogan, ptn_slogan, sdh_slogan, pdh_slogan])

In [None]:
# Persist the combined (alarm_id, slogan) table.
alarms_slogan.to_parquet("alarms datasets/alarms_slogan.parquet")

In [None]:
# Reload the combined slogan table from the file written just above.
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [None]:
# Drop the alarm identifier, leaving the remaining column(s) only.
# NOTE(review): the next code cell reloads alarms_slogan from parquet, so this
# result is overwritten before anything uses it.
alarms_slogan = alarms_slogan.drop(columns="alarm_id")

## Associate each Alarm of the Cluster Dataset with its Slogan

In [None]:
# Reload the full (alarm_id, slogan) table, including the alarm_id column.
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [None]:
# Group the slogan rows by alarm_id (used by the next cell to count rows per alarm).
grouped_alarms = alarms_slogan.groupby("alarm_id")

In [None]:
# Display the number of rows per alarm_id, largest first, which reveals any
# alarm_id that appears more than once (inspection only).
grouped_alarms.size().sort_values(ascending=False)

In [None]:
# Keep the alarm id and the two cluster-key columns.
# NOTE(review): this rebinds clusters_filtered, which an earlier cell defines
# with an additional "first_occurrence" column.
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2"]]

In [None]:
# Attach the slogan to every clustered alarm. The left join keeps cluster rows
# without a matching alarm_id (their slogan is missing) and repeats a cluster
# row once per matching slogan row.
clusters_with_slogans = clusters_filtered.merge(
    alarms_slogan,
    how='left',
    on='alarm_id',
)

In [None]:
# Persist the clustered alarms enriched with their slogan.
clusters_with_slogans.to_parquet("real-time clusters/clusters_with_slogans.parquet")

## Create a Bitmap Clusters/Slogan

In [None]:

# Reload the clustered alarms with slogans saved by the previous section.
clusters_with_slogans = pd.read_parquet("real-time clusters/clusters_with_slogans.parquet")

In [None]:
# Number of rows in every (cluster_id, cluster_id2) group.
grouped_clusters = clusters_with_slogans.groupby(["cluster_id", "cluster_id2"])
group_sizes = grouped_clusters.size()

# Keys of the groups that contain more than one row.
multi_row_indices = group_sizes.loc[group_sizes.gt(1)].index

# Keep only the rows of the original DataFrame whose (cluster_id, cluster_id2)
# key belongs to one of those multi-row groups.
filtered_clusters = clusters_with_slogans.loc[
    clusters_with_slogans.set_index(["cluster_id", "cluster_id2"]).index.isin(multi_row_indices)
]

In [None]:
# Rebind grouped_clusters to a groupby over the multi-row clusters only.
grouped_clusters = filtered_clusters.groupby(["cluster_id", "cluster_id2"])

In [None]:
# Distinct slogan values among the multi-row clusters; calculate_presence reads
# this module-level name to build its columns.
unique_slogans = filtered_clusters['slogan'].unique()

In [None]:

def calculate_presence(group):
    """Compute the presence flag of every slogan for one cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cluster; must provide the 'slogan' column.

    Returns
    -------
    pandas.Series
        Indexed by the values of the module-level `unique_slogans` (in that
        order) with 1 where the slogan occurs in `group` and 0 otherwise. A
        slogan of `group` that is not in `unique_slogans` is appended with
        value 1.
    """
    flags = dict.fromkeys(unique_slogans, 0)
    flags.update(dict.fromkeys(group['slogan'], 1))
    return pd.Series(flags)

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Apply the function to each group and build a new DataFrame from the results:
# one row per (cluster_id, cluster_id2), one 0/1 column per slogan.
result_df = grouped_clusters.progress_apply(calculate_presence)


In [None]:
# Persist the bitmap of the clusters that have more than one row.
result_df.to_parquet("real-time clusters/result_df_gt_2.parquet")

### Bitmap for clusters with one alarm inside

In [None]:
# Complement of the multi-row filter: keep the rows whose
# (cluster_id, cluster_id2) key is not among multi_row_indices.
clusters_with_one_alarm = clusters_with_slogans.loc[
    ~clusters_with_slogans.set_index(["cluster_id", "cluster_id2"]).index.isin(multi_row_indices)
]

In [None]:
# Rebind unique_slogans to the distinct slogans of the single-row clusters;
# calculate_presence reads this module-level name at call time.
unique_slogans = clusters_with_one_alarm['slogan'].unique()

In [None]:
# Split the single-alarm clusters into 4 consecutive chunks of rows.
# The row positions are split rather than the DataFrame itself, because passing
# a DataFrame to np.array_split goes through DataFrame.swapaxes, which pandas
# has deprecated; the resulting chunks hold the same rows in the same order.
parts = [
    clusters_with_one_alarm.iloc[chunk_positions]
    for chunk_positions in np.array_split(np.arange(len(clusters_with_one_alarm)), 4)
]

# parts[0] ... parts[3] are now the 4 chunks of the DataFrame.

In [None]:
# Group the last chunk (index 3) by cluster key.
# NOTE(review): the chunk index is hard-coded; presumably this cell was re-run
# by hand with indices 0-2 to produce the unique_df_pt1..pt3 files that are
# read later in the notebook — confirm.
grouped_clusters_with_one_alarm = parts[3].groupby(["cluster_id", "cluster_id2"])

In [None]:

def calculate_presence(group):
    """Compute the presence flag of every slogan for one cluster.

    The function reads the module-level `unique_slogans` when it is called.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single cluster; must provide the 'slogan' column.

    Returns
    -------
    pandas.Series
        Indexed by the values of `unique_slogans` (in that order) with 1 where
        the slogan occurs in `group` and 0 otherwise. A slogan of `group` that
        is not in `unique_slogans` is appended with value 1.
    """
    slogan_flags = dict.fromkeys(unique_slogans, 0)
    for seen_slogan in group['slogan']:
        slogan_flags[seen_slogan] = 1
    return pd.Series(data=slogan_flags)

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Build the bitmap of every chunk in `parts` and persist each one to its own
# file, so that all the unique_df_pt<N>.parquet files read later in the
# notebook are produced by a single top-to-bottom run. Previously only the
# hard-coded chunk parts[3] was processed, and the other files had to be
# produced by editing the index and re-running by hand.
for part_number, part in enumerate(parts, start=1):
    unique_df = part.groupby(["cluster_id", "cluster_id2"]).progress_apply(calculate_presence)
    unique_df.to_parquet(f"real-time clusters/unique_df_pt{part_number}.parquet")

# After the loop `unique_df` holds the bitmap of the last chunk, which is what
# this cell produced before.





In [None]:
# Persist the bitmap currently held in unique_df as part 4.
unique_df.to_parquet("real-time clusters/unique_df_pt4.parquet")

## Manage Bitmap

In [None]:
# Load the bitmap of the clusters that have more than one row.
clusters_bitmap_gt_2 = pd.read_parquet("real-time clusters/result_df_gt_2.parquet")

In [None]:
# Load part 1 of the single-alarm bitmap.
unique_df_pt1 = pd.read_parquet("real-time clusters/unique_df_pt1.parquet")

In [None]:
# Load part 2 of the single-alarm bitmap.
unique_df_pt2 = pd.read_parquet("real-time clusters/unique_df_pt2.parquet")

In [None]:
# Load part 3 of the single-alarm bitmap.
unique_df_pt3 = pd.read_parquet("real-time clusters/unique_df_pt3.parquet")

In [None]:
# Load part 4 of the single-alarm bitmap.
unique_df_pt4 = pd.read_parquet("real-time clusters/unique_df_pt4.parquet")

In [None]:
# Stack the four parts back into one single-alarm bitmap.
unique_df = pd.concat([unique_df_pt1, unique_df_pt2, unique_df_pt3, unique_df_pt4])


In [None]:
# Persist the reassembled single-alarm bitmap.
unique_df.to_parquet("real-time clusters/unique_df.parquet")

In [None]:
# Slogan columns present in the multi-alarm bitmap but absent from the
# single-alarm bitmap.
missing_columns = clusters_bitmap_gt_2.columns.difference(unique_df.columns)

In [None]:
# Add those columns to the single-alarm bitmap, filled with 0.
unique_df = unique_df.reindex(columns=unique_df.columns.union(missing_columns), fill_value=0)

In [None]:
# Number of columns in the multi-alarm bitmap (inspection only).
len(clusters_bitmap_gt_2.columns)

In [None]:
# Distinct slogans over all clusters; the next cell displays their count.
unique_slogans = clusters_with_slogans['slogan'].unique()

In [None]:
# Number of distinct slogans (inspection only).
len(unique_slogans)

In [None]:
# Reverse direction: slogan columns present in the single-alarm bitmap but
# absent from the multi-alarm bitmap.
missing_columns = unique_df.columns.difference(clusters_bitmap_gt_2.columns) 

In [None]:
# Display the columns missing from the multi-alarm bitmap (inspection only).
missing_columns

In [None]:
# Add those columns to the multi-alarm bitmap, filled with 0.
clusters_bitmap_gt_2 = clusters_bitmap_gt_2.reindex(columns=clusters_bitmap_gt_2.columns.union(missing_columns), fill_value=0)

In [None]:
# Number of columns in the multi-alarm bitmap after the alignment (inspection only).
len(clusters_bitmap_gt_2.columns)

In [None]:
# Number of columns in the single-alarm bitmap, to compare with the count above
# (inspection only).
len(unique_df.columns)

In [None]:
# Put the single-alarm bitmap's columns in the same order as the multi-alarm
# bitmap's before the two are stacked.
unique_df = unique_df.reindex(columns=clusters_bitmap_gt_2.columns)

In [None]:
# Stack the multi-alarm and single-alarm bitmaps into one table.
clusters_bitmap = pd.concat([clusters_bitmap_gt_2, unique_df])

In [None]:
# frac=1 returns every row in random order; the fixed random_state makes the
# permutation reproducible.
clusters_bitmap = clusters_bitmap.sample(frac=1, random_state=42) # shuffle the rows

In [None]:
# Persist the final shuffled cluster/slogan bitmap.
clusters_bitmap.to_parquet("real-time clusters/clusters_bitmap.parquet")