In [1]:
import pandas as pd
from tqdm import tqdm

## Load Clusters and Alarms

In [2]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")

In [4]:
mob = pd.read_parquet("alarms datasets/mob/20230101-20240101_inpas_mob_preprocess__an__last_event__last_event__ext1.parquet")

In [3]:
adsl = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__adsl__last_event__last_event__ext1.parquet")

In [4]:
ptn = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__ptn__last_event__last_event__ext1.parquet")

In [5]:
sdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__sdh__last_event__last_event__ext1.parquet")

In [6]:
pdh = pd.read_parquet("alarms datasets/tx/20230101-20240101_inpas_tx_preprocess__pdh__last_event__last_event__ext1.parquet")

In [9]:
mob = mob[["alarm_id", "last_occurrence"]]

In [11]:
adsl = adsl[["alarm_id", "last_occurrence"]]
ptn = ptn[["alarm_id", "last_occurrence"]]
sdh = sdh[["alarm_id", "last_occurrence"]]
pdh = pdh[["alarm_id", "last_occurrence"]]


In [12]:
alarms = pd.concat([mob, adsl, ptn, sdh, pdh])


In [13]:
alarms.to_parquet("alarms datasets/alarms.parquet")

## Clusters Statistics

In [30]:
clusters = pd.read_parquet("real-time clusters/20230101-20240101_real_time_clusters.parquet")
alarms = pd.read_parquet("alarms datasets/alarms.parquet")

In [31]:
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2","first_occurrence"]]

In [35]:
clusters_complete = clusters_filtered.merge(alarms, on="alarm_id", how="left")

In [44]:
clusters_complete["first_occurrence"] = pd.to_datetime(clusters_complete["first_occurrence"])

In [54]:
clusters_complete = clusters_complete.dropna(subset=["last_occurrence"])

In [56]:
clusters_complete.to_parquet("real-time clusters/clusters_complete.parquet")

### Clusters Size

Number of alarms within a cluster

In [6]:
clusters_complete = pd.read_parquet("real-time clusters/clusters_complete.parquet")

In [7]:
clusters_grouped_by_id = clusters_complete.groupby(["cluster_id", "cluster_id2"])

In [12]:
clusters_grouped = clusters_grouped_by_id 

In [13]:
size_df = clusters_grouped_by_id.size().reset_index(name='size')

In [14]:
size_df.to_parquet("real-time clusters/size_df.parquet")

### Clusters Lifespan

Time difference from the last last_occurrence and the first first_occurrence of alarms 

In [73]:

def calculate_lifespan(group):
    return group['last_occurrence'].max() - group['first_occurrence'].min()

tqdm.pandas()


lifespan_df = clusters_grouped.apply(calculate_lifespan, include_groups=False).reset_index(name='lifespan')


In [75]:
lifespan_df.to_parquet("real-time clusters/lifespan_df.parquet")

### Clusters Delta First Occurrence

Distance between the last available first_occurrence and the first

In [76]:


def calculate_lifespan(group):
    return group['first_occurrence'].max() - group['first_occurrence'].min()


tqdm.pandas()


delta_df = clusters_grouped.progress_apply(calculate_lifespan).reset_index(name='delta first occurrence')

100%|██████████| 6369371/6369371 [17:32<00:00, 6048.92it/s]


In [77]:
delta_df

Unnamed: 0,cluster_id,cluster_id2,lifespan
0,202301010000_1,202301010000_33,0 days
1,202301010000_10,202301010000_133,0 days
2,202301010000_104,202301010000_79,0 days
3,202301010000_105,202301010000_80,0 days
4,202301010000_106,202301010000_81,0 days
...,...,...,...
6369366,202312312358_5,202312312354_5,0 days
6369367,202312312358_6,202312312354_5,0 days
6369368,202312312358_7,202312312354_5,0 days
6369369,202312312358_8,202312312354_7,0 days


In [78]:
delta_df.to_parquet("real-time clusters/delta_df.parquet")

In [3]:
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [5]:
delta_df.sort_values(by="delta first occurrence", ascending=False)

Unnamed: 0,cluster_id,cluster_id2,lifespan
5187223,202311041120_19,202311041120_6,0 days 02:07:54
4355824,202309272328_4,202309272328_3,0 days 01:53:46
3079985,202307270332_16,202307270332_10,0 days 01:34:24
5710744,202311272202_10,202311272202_3,0 days 01:32:50
399034,202302020154_50,202302020154_8,0 days 01:22:56
...,...,...,...
2419111,202306300446_37,202306300434_9,0 days 00:00:00
2419110,202306300446_36,202306300434_9,0 days 00:00:00
2419109,202306300446_35,202306300434_9,0 days 00:00:00
2419105,202306300446_31,202306300442_9,0 days 00:00:00


## Merge Statistics

In [15]:
size_df = pd.read_parquet("real-time clusters/size_df.parquet")
lifespan_df = pd.read_parquet("real-time clusters/lifespan_df.parquet")
delta_df = pd.read_parquet("real-time clusters/delta_df.parquet")

In [18]:
clusters_statistics = size_df.merge(lifespan_df, on=["cluster_id", "cluster_id2"], how="left").merge(delta_df, on=["cluster_id", "cluster_id2"], how="left")

In [19]:
clusters_statistics

Unnamed: 0,cluster_id,cluster_id2,size,lifespan_x,lifespan_y
0,202301010000_1,202301010000_33,1,1 days 23:25:34,0 days
1,202301010000_10,202301010000_133,1,0 days 01:14:29,0 days
2,202301010000_104,202301010000_79,1,0 days 01:38:28,0 days
3,202301010000_105,202301010000_80,1,0 days 01:58:25,0 days
4,202301010000_106,202301010000_81,1,0 days 06:55:03,0 days
...,...,...,...,...,...
6369366,202312312358_5,202312312354_5,1,0 days 00:04:34,0 days
6369367,202312312358_6,202312312354_5,1,0 days 00:03:57,0 days
6369368,202312312358_7,202312312354_5,1,0 days 00:00:00,0 days
6369369,202312312358_8,202312312354_7,1,0 days 00:05:31,0 days


In [20]:
clusters_statistics.to_parquet("real-time clusters/clusters_statistics.parquet")

## Extract Unique Slogans from Alarms


In [16]:
mob_slogan = mob[["alarm_id", "mob_slogan"]]
adsl_slogan = adsl[["alarm_id", "std_probable_cause_no"]]## , "summary"]
ptn_slogan = ptn[["alarm_id", "std_probable_cause_no"]]
sdh_slogan = sdh[["alarm_id", "std_probable_cause_no"]]
pdh_slogan = pdh[["alarm_id", "std_probable_cause_no"]]


In [20]:
mob_slogan.rename(columns={"mob_slogan": "slogan"}, inplace=True)
adsl_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
ptn_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
sdh_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
pdh_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mob_slogan.rename(columns={"mob_slogan": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adsl_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptn_slogan.rename(columns={"std_probable_cause_no": "slogan"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

In [21]:
adsl_slogan

Unnamed: 0,alarm_id,slogan
0,COS4P;86723842,fermo totale apparato
1,COS3P;66481069,loss of signal los
2,COS4P;86723883,loss of signal los
3,COS4P;86723880,loss of signal los
4,COS4P;86723881,loss of signal los
...,...,...
9326841,EFMTX;6591efa50d1c1f0001da3661,loss of signal los
9326842,EFMTX;6591f1e60d1c1f0001da3841,allarme alimentazione
9326843,EFMTX;6591f101975f5100015a4868,apparato isolato
9326844,EFMTX;6591f092feea70000175b052,allarme alimentazione


In [22]:
alarms_slogan = pd.concat([mob_slogan, adsl_slogan, ptn_slogan, sdh_slogan, pdh_slogan])

In [23]:
alarms_slogan

Unnamed: 0,alarm_id,slogan
0,EFM;63b098b01a65d200011620f8,
1,EFM;63b0b476c623b40001a68b45,
2,EFM;63b0b5a8c623b40001a68b9b,
3,EFM;63b0b5c6c623b40001a68ba2,
4,EFM;63b0b675c623b40001a68c03,
...,...,...
408772,EFMTX;65913ce9c23c110001552471,tunnel aps degraded downgrade of aps tunnel pr...
408773,EFMTX;65913e2cc23c110001552527,tunnel aps degraded downgrade of aps tunnel pr...
408774,EFMTX;6530d7a8eedb1500019c416e,opttxlow reducedtransmitteroutputpower
408775,EFMTX;658cd3bf975f510001572fd9,tunnel aps degraded downgrade of aps tunnel pr...


In [24]:
alarms_slogan.to_parquet("alarms datasets/alarms_slogan.parquet")

In [3]:
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [4]:
alarms_slogan = alarms_slogan.drop("alarm_id", axis=1)

In [5]:
alarms_slogan

Unnamed: 0,slogan
0,
1,
2,
3,
4,
...,...
408772,tunnel aps degraded downgrade of aps tunnel pr...
408773,tunnel aps degraded downgrade of aps tunnel pr...
408774,opttxlow reducedtransmitteroutputpower
408775,tunnel aps degraded downgrade of aps tunnel pr...


In [6]:
unique_slogans = alarms_slogan['slogan'].unique()

alarms_slogan_unique_slogans = pd.DataFrame({'slogan': unique_slogans})


                                                  slogan
0                                                   None
1                        guasto sistema di alimentazione
2                        allarme interruttore batteria b
3                           guasto grave condizionamento
4                                          mancanza rete
...                                                  ...
19055  lateitaq prer link bandwidth reduced due to mi...
19056                                         low bernsa
19057                  synchronization source mismatchmx
19058  normal cac kt cz qcva variazione stato operati...
19059          ptp sync packet correction field abnormal

[19060 rows x 1 columns]


In [8]:
alarms_slogan_unique_slogans.to_parquet("alarms datasets/alarms_slogan_unique_slogans.parquet")

## Associate for each Cluster a bitmap with the slogans 

In [7]:
alarms_slogan = pd.read_parquet("alarms datasets/alarms_slogan.parquet")

In [8]:
alarms_slogan

Unnamed: 0,alarm_id,slogan
0,EFM;63b098b01a65d200011620f8,
1,EFM;63b0b476c623b40001a68b45,
2,EFM;63b0b5a8c623b40001a68b9b,
3,EFM;63b0b5c6c623b40001a68ba2,
4,EFM;63b0b675c623b40001a68c03,
...,...,...
408772,EFMTX;65913ce9c23c110001552471,tunnel aps degraded downgrade of aps tunnel pr...
408773,EFMTX;65913e2cc23c110001552527,tunnel aps degraded downgrade of aps tunnel pr...
408774,EFMTX;6530d7a8eedb1500019c416e,opttxlow reducedtransmitteroutputpower
408775,EFMTX;658cd3bf975f510001572fd9,tunnel aps degraded downgrade of aps tunnel pr...


In [9]:
grouped_alarms = alarms_slogan.groupby("alarm_id")

In [11]:
grouped_alarms.size().sort_values(ascending=False)

alarm_id
EFMTX;6499ea4d343d87000126b546    2
EFMTX;64fb6e453dbe2e000111ca1f    2
EFMTX;65096d9797d34f0001d7d811    2
EFMTX;650f2fc0347786000100ae20    2
EFMTX;650f2fc1352583000180f26f    2
                                 ..
EFM;64a277c2fbeac90001bf6c4b      1
EFM;64a277c3fbeac90001bf6c4e      1
EFM;64a277c42a3b2e00012a0b15      1
EFM;64a277c5a4a90400015bc7a8      1
ROS1P;172620901                   1
Length: 49161006, dtype: int64

In [4]:
unique_alarm_ids = alarms_slogan['alarm_id'].unique()

In [6]:
len(unique_alarm_ids)

49161006

In [8]:
tot_alarms = adsl.shape[0] + ptn.shape[0] + sdh.shape[0] + pdh.shape[0]
tot_alarms

21393339

In [6]:
tot = 21393339 + mob.shape[0]
tot

49569783

In [13]:
clusters_filtered = clusters[["alarm_id", "cluster_id", "cluster_id2"]]

In [14]:
clusters_filtered

Unnamed: 0,alarm_id,cluster_id,cluster_id2
0,EFM;63af9dfe2aaf2c000125807a,202301010000_3,202301010000_1
1,EFM;63af9dfec623b40001a5c9cf,202301010000_3,202301010000_1
2,COS1P;136902468,202301010000_20,202301010000_2
3,EFM;63b0beaf2aaf2c0001258736,202301010000_9,202301010000_3
4,COS1P;136891403,202301010000_21,202301010000_4
...,...,...,...
38,EFMTX;6591efb9d7d6bc00019444d8,202312312350_16,202312312350_13
39,EFMTX;6591f046d8beb800016deaef,202312312348_17,202312312344_9
40,EFM;6591f199d4c961000177739d,202312312358_1,202312312358_13
41,EFM;6591f19aba09b0000160c08d,202312312358_1,202312312358_13


In [25]:
alarms_duplicated = alarms_slogan[alarms_slogan.duplicated(subset=["alarm_id"], keep=False)]

In [26]:
alarms_duplicated

Unnamed: 0,alarm_id,slogan
0,COS1P;136904679,allarme sincronismo
1,COS1P;136904680,scheda guasta
2,COS1P;136904397,potenza ottica anomala
3,COS1P;136898692,apparato isolato
4,COS1P;136904264,oamloc locv
...,...,...
408772,EFMTX;65913ce9c23c110001552471,tunnel aps degraded downgrade of aps tunnel pr...
408773,EFMTX;65913e2cc23c110001552527,tunnel aps degraded downgrade of aps tunnel pr...
408774,EFMTX;6530d7a8eedb1500019c416e,opttxlow reducedtransmitteroutputpower
408775,EFMTX;658cd3bf975f510001572fd9,tunnel aps degraded downgrade of aps tunnel pr...


In [28]:
find_alarm_by_id = alarms_slogan[alarms_slogan["alarm_id"] == "COS1P;136904397"]

In [29]:
find_alarm_by_id

Unnamed: 0,alarm_id,slogan
2,COS1P;136904397,potenza ottica anomala
2,COS1P;136904397,potenza ottica anomala


In [18]:
alarms_slogan = alarms_slogan.drop_duplicates()

In [19]:
alarms_slogan

Unnamed: 0,alarm_id,slogan
0,EFM;63b098b01a65d200011620f8,
1,EFM;63b0b476c623b40001a68b45,
2,EFM;63b0b5a8c623b40001a68b9b,
3,EFM;63b0b5c6c623b40001a68ba2,
4,EFM;63b0b675c623b40001a68c03,
...,...,...
11248934,EFMTX;65645b6951c2e00001864ba8,problemi wdm
11248935,EFMTX;659148d5feea700001755c78,problema su bus di comunicazione
11248936,EFMTX;6591f1c3e294c600010bf336,sef sef
11248937,EFMTX;656f3be5f9d31e0001ccb86f,degrado laser tx


In [20]:
clusters_with_slogans = pd.merge(clusters_filtered, alarms_slogan, on='alarm_id', how='left')

In [21]:
clusters_with_slogans

Unnamed: 0,alarm_id,cluster_id,cluster_id2,slogan
0,EFM;63af9dfe2aaf2c000125807a,202301010000_3,202301010000_1,
1,EFM;63af9dfec623b40001a5c9cf,202301010000_3,202301010000_1,
2,COS1P;136902468,202301010000_20,202301010000_2,
3,EFM;63b0beaf2aaf2c0001258736,202301010000_9,202301010000_3,
4,COS1P;136891403,202301010000_21,202301010000_4,apparato isolato
...,...,...,...,...
13123609,EFMTX;6591efb9d7d6bc00019444d8,202312312350_16,202312312350_13,guasto ima
13123610,EFMTX;6591f046d8beb800016deaef,202312312348_17,202312312344_9,oamloc locv
13123611,EFM;6591f199d4c961000177739d,202312312358_1,202312312358_13,
13123612,EFM;6591f19aba09b0000160c08d,202312312358_1,202312312358_13,


In [15]:
import pandas as pd

# Creiamo due dataframe di esempio
df1 = pd.DataFrame({'ID': [1, 2, 3, 4], 'valore1': ['A', 'B', 'C', 'D']})
df2 = pd.DataFrame({'ID': [2, 3, 5], 'valore2': ['X', 'Y', 'Z']})

# Effettuiamo il merge mantenendo solo gli ID della prima colonna (df1)
merged_df = pd.merge(df1, df2, on='ID', how='left')

# Visualizziamo il dataframe risultante
print(merged_df)


   ID valore1 valore2
0   1       A     NaN
1   2       B       X
2   3       C       Y
3   4       D     NaN
