In [4]:
import pandas as pd


## Load Clusters and Alarms

In [5]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")

In [3]:
mob = pd.read_parquet("alarms datasets/mob/20230101-20240101_inpas_mob_preprocess__an__last_event__last_event__ext1.parquet")

In [4]:
adsl = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__adsl__last_event__last_event__ext1.parquet")

In [5]:
ptn = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__ptn__last_event__last_event__ext1.parquet")

In [6]:
sdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__sdh__last_event__last_event__ext1.parquet")

In [7]:
pdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__pdh__last_event__last_event__ext1.parquet")

In [8]:
mob = mob[["alarm_id", "last_occurrence"]]

In [9]:
adsl = adsl[["alarm_id", "last_occurrence"]]
ptn = ptn[["alarm_id", "last_occurrence"]]
sdh = sdh[["alarm_id", "last_occurrence"]]
pdh = pdh[["alarm_id", "last_occurrence"]]


In [10]:
alarms = pd.concat([mob, adsl, ptn, sdh, pdh])


In [11]:
alarms.to_parquet("alarms datasets/alarms.parquet")

## Clusters Statistics

In [30]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")
alarms = pd.read_parquet("alarms datasets/alarms.parquet")

In [31]:
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2","first_occurrence"]]

In [35]:
clusters_complete = clusters_filtered.merge(alarms, on="alarm_id", how="left")

In [44]:
clusters_complete["first_occurrence"] = pd.to_datetime(clusters_complete["first_occurrence"])

In [54]:
clusters_complete = clusters_complete.dropna(subset=["last_occurrence"])

In [56]:
clusters_complete.to_parquet("real-time clusters/clusters_complete.parquet")

### Clusters Size

Number of alarms within a cluster

In [6]:
clusters_complete = pd.read_parquet("real-time clusters/clusters_complete.parquet")

In [7]:
clusters_grouped_by_id = clusters_complete.groupby(["cluster_id", "cluster_id2"])

In [12]:
clusters_grouped = clusters_grouped_by_id 

In [13]:
size_df = clusters_grouped_by_id.size().reset_index(name='size')

In [14]:
size_df.to_parquet("real-time clusters/size_df.parquet")

### Clusters Lifespan

Time difference from the last last_occurrence and the first first_occurrence of alarms 

In [73]:

def calculate_lifespan(group):
    return group['last_occurrence'].max() - group['first_occurrence'].min()

tqdm.pandas()


lifespan_df = clusters_grouped.apply(calculate_lifespan, include_groups=False).reset_index(name='lifespan')


In [75]:
lifespan_df.to_parquet("real-time clusters/lifespan_df.parquet")

### Clusters Delta First Occurrence

Distance between the last available first_occurrence and the first

In [76]:


def calculate_lifespan(group):
    return group['first_occurrence'].max() - group['first_occurrence'].min()


tqdm.pandas()


delta_df = clusters_grouped.progress_apply(calculate_lifespan).reset_index(name='delta first occurrence')

100%|██████████| 6369371/6369371 [17:32<00:00, 6048.92it/s]


In [77]:
delta_df

Unnamed: 0,cluster_id,cluster_id2,lifespan
0,202301010000_1,202301010000_33,0 days
1,202301010000_10,202301010000_133,0 days
2,202301010000_104,202301010000_79,0 days
3,202301010000_105,202301010000_80,0 days
4,202301010000_106,202301010000_81,0 days
...,...,...,...
6369366,202312312358_5,202312312354_5,0 days
6369367,202312312358_6,202312312354_5,0 days
6369368,202312312358_7,202312312354_5,0 days
6369369,202312312358_8,202312312354_7,0 days


In [78]:
delta_df.to_parquet("real-time clusters/delta_df.parquet")

In [3]:
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [5]:
delta_df.sort_values(by="delta first occurrence", ascending=False)

Unnamed: 0,cluster_id,cluster_id2,lifespan
5187223,202311041120_19,202311041120_6,0 days 02:07:54
4355824,202309272328_4,202309272328_3,0 days 01:53:46
3079985,202307270332_16,202307270332_10,0 days 01:34:24
5710744,202311272202_10,202311272202_3,0 days 01:32:50
399034,202302020154_50,202302020154_8,0 days 01:22:56
...,...,...,...
2419111,202306300446_37,202306300434_9,0 days 00:00:00
2419110,202306300446_36,202306300434_9,0 days 00:00:00
2419109,202306300446_35,202306300434_9,0 days 00:00:00
2419105,202306300446_31,202306300442_9,0 days 00:00:00


## Merge Statistics

In [15]:
size_df = pd.read_parquet("real-time clusters/size_df.parquet")
lifespan_df = pd.read_parquet("real-time clusters/lifespan_df.parquet")
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [18]:
clusters_statistics = size_df.merge(lifespan_df, on=["cluster_id", "cluster_id2"], how="left").merge(delta_df, on=["cluster_id", "cluster_id2"], how="left")

In [19]:
clusters_statistics

Unnamed: 0,cluster_id,cluster_id2,size,lifespan_x,lifespan_y
0,202301010000_1,202301010000_33,1,1 days 23:25:34,0 days
1,202301010000_10,202301010000_133,1,0 days 01:14:29,0 days
2,202301010000_104,202301010000_79,1,0 days 01:38:28,0 days
3,202301010000_105,202301010000_80,1,0 days 01:58:25,0 days
4,202301010000_106,202301010000_81,1,0 days 06:55:03,0 days
...,...,...,...,...,...
6369366,202312312358_5,202312312354_5,1,0 days 00:04:34,0 days
6369367,202312312358_6,202312312354_5,1,0 days 00:03:57,0 days
6369368,202312312358_7,202312312354_5,1,0 days 00:00:00,0 days
6369369,202312312358_8,202312312354_7,1,0 days 00:05:31,0 days


In [20]:
clusters_statistics.to_parquet("real-time clusters/clusters_statistics.parquet")

## Extract Unique Slogans from Alarms


In [8]:
mob_slogan = mob[["alarm_id", "mob_slogan"]]
adsl_slogan = adsl[["alarm_id", "std_probable_cause_no"]]## , "summary"]
ptn_slogan = ptn[["alarm_id", "std_probable_cause_no"]]
sdh_slogan = sdh[["alarm_id", "std_probable_cause_no"]]
pdh_slogan = pdh[["alarm_id", "std_probable_cause_no"]]


In [10]:
mob_slogan.rename(columns={"mob_slogan": "slogan"}, inplace=True)
adsl_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
ptn_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
sdh_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
pdh_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mob_slogan.rename(columns={"mob_slogan": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adsl_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptn_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

In [11]:
adsl_slogan

Unnamed: 0,alarm_id,slogan
0,COS4P;86723842,fermo totale apparato
1,COS3P;66481069,loss of signal los
2,COS4P;86723883,loss of signal los
3,COS4P;86723880,loss of signal los
4,COS4P;86723881,loss of signal los
...,...,...
9326841,EFMTX;6591efa50d1c1f0001da3661,loss of signal los
9326842,EFMTX;6591f1e60d1c1f0001da3841,allarme alimentazione
9326843,EFMTX;6591f101975f5100015a4868,apparato isolato
9326844,EFMTX;6591f092feea70000175b052,allarme alimentazione


In [12]:
alarms_slogan = pd.concat([mob_slogan, adsl_slogan, ptn_slogan, sdh_slogan, pdh_slogan])

In [13]:
alarms_slogan

Unnamed: 0,alarm_id,slogan
0,EFM;63b098b01a65d200011620f8,
1,EFM;63b0b476c623b40001a68b45,
2,EFM;63b0b5a8c623b40001a68b9b,
3,EFM;63b0b5c6c623b40001a68ba2,
4,EFM;63b0b675c623b40001a68c03,
...,...,...
805948,EFMTX;6483fbf332d3d9000193e3a2,fama no alarm since hh
805949,EFMTX;64c92bbe008e1d0001c7ff49,offline sw fama nport
805950,EFMTX;65661efc590d4500011934d1,fama no alarm since hh
805951,EFMTX;6591f1b6975f5100015a48b6,signal degraded


In [14]:
alarms_slogan.to_parquet("alarms datasets/alarms_slogan.parquet")

In [6]:
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [7]:
alarms_slogan = alarms_slogan.drop("alarm_id", axis=1)

In [8]:
alarms_slogan

Unnamed: 0,slogan
0,
1,
2,
3,
4,
...,...
805948,fama no alarm since hh
805949,offline sw fama nport
805950,fama no alarm since hh
805951,signal degraded


In [9]:
unique_slogans = alarms_slogan['slogan'].unique()

alarms_slogan_unique_slogans = pd.DataFrame({'slogan': unique_slogans})


In [10]:
alarms_slogan_unique_slogans

Unnamed: 0,slogan
0,
1,guasto sistema di alimentazione
2,allarme interruttore batteria b
3,guasto grave condizionamento
4,mancanza rete
...,...
21531,anorm rete raddriz
21532,avar diese quadro com
21533,cdz tx locale tx pal
21534,pot tx pr siae rt


In [8]:
alarms_slogan_unique_slogans.to_parquet("alarms datasets/alarms_slogan_unique_slogans.parquet")

## Associate for each Cluster a bitmap with the slogans 

In [11]:
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [12]:
alarms_slogan

Unnamed: 0,alarm_id,slogan
0,EFM;63b098b01a65d200011620f8,
1,EFM;63b0b476c623b40001a68b45,
2,EFM;63b0b5a8c623b40001a68b9b,
3,EFM;63b0b5c6c623b40001a68ba2,
4,EFM;63b0b675c623b40001a68c03,
...,...,...
805948,EFMTX;6483fbf332d3d9000193e3a2,fama no alarm since hh
805949,EFMTX;64c92bbe008e1d0001c7ff49,offline sw fama nport
805950,EFMTX;65661efc590d4500011934d1,fama no alarm since hh
805951,EFMTX;6591f1b6975f5100015a48b6,signal degraded


In [13]:
grouped_alarms = alarms_slogan.groupby("alarm_id")

In [6]:
grouped_alarms.size().sort_values(ascending=False)

alarm_id
COS1B;107016710                   1
EFMTX;644ea414adefac00017e2cdf    1
EFMTX;644ea3da4ed0ed00011c5a22    1
EFMTX;644ea4044ed0ed00011c5a40    1
EFMTX;644ea405e9081200013c14d7    1
                                 ..
EFM;64a2ac80a4a90400015bf004      1
EFM;64a2ac813a4a890001d85ae4      1
EFM;64a2ac813a4a890001d85ae5      1
EFM;64a2ac822a3b2e00012a2726      1
ROS1P;172620901                   1
Length: 49966959, dtype: int64

In [14]:
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2"]]

In [15]:
clusters_filtered

Unnamed: 0,alarm_id,cluster_id,cluster_id2
0,EFM;63af9dfe2aaf2c000125807a,202301010000_3,202301010000_1
1,EFM;63af9dfec623b40001a5c9cf,202301010000_3,202301010000_1
2,COS1P;136902468,202301010000_20,202301010000_2
3,EFM;63b0beaf2aaf2c0001258736,202301010000_9,202301010000_3
4,COS1P;136891403,202301010000_21,202301010000_4
...,...,...,...
38,EFMTX;6591efb9d7d6bc00019444d8,202312312350_16,202312312350_13
39,EFMTX;6591f046d8beb800016deaef,202312312348_17,202312312344_9
40,EFM;6591f199d4c961000177739d,202312312358_1,202312312358_13
41,EFM;6591f19aba09b0000160c08d,202312312358_1,202312312358_13


In [16]:
clusters_with_slogans = pd.merge(clusters_filtered, alarms_slogan, on='alarm_id', how='left')

In [17]:
clusters_with_slogans.to_parquet("real-time clusters/clusters_with_slogans.parquet")

In [18]:
unique_slogans = pd.read_parquet("alarms datasets/alarms_slogan_unique_slogans.parquet")

In [19]:
grouped_clusters = clusters_with_slogans.groupby(["cluster_id", "cluster_id2"])

In [20]:
cluster_slogan_bitmap = {'cluster': [], **{slogan: [] for slogan in unique_slogans}}

In [21]:
from tqdm.notebook import tqdm

# Itera sui gruppi
for name, group in tqdm(grouped_clusters):
    # Verifica la presenza di ciascuno slogan univoco nel cluster
    presence = {slogan: 0 for slogan in unique_slogans}
    for slogan in group['slogan']:
        presence[slogan] = 1
    
    # Aggiungi il cluster e la presenza degli slogan al dataframe dei risultati
    cluster_slogan_bitmap['cluster'].append(name)
    for slogan in unique_slogans:
        cluster_slogan_bitmap[slogan].append(presence[slogan])

# Crea un nuovo dataframe con i risultati
result_df = pd.DataFrame(cluster_slogan_bitmap)
result_df

KeyboardInterrupt: 

In [22]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Definisci una funzione per calcolare la presenza di ciascuno slogan
def calculate_presence(group):
    presence = {slogan: 0 for slogan in unique_slogans}
    for slogan in group['slogan']:
        presence[slogan] = 1
    return pd.Series(presence)

# Applica la funzione a ciascun gruppo e crea un nuovo dataframe con i risultati
result_df = grouped_clusters.progress_apply(calculate_presence)

print(result_df)

KeyboardInterrupt: 