In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker

In [67]:
train_df = pd.read_parquet("Autoencoders/Autoencoder/Dataset split/train_df.parquet")

In [4]:
val_df = pd.read_parquet("Autoencoders/Autoencoder/Dataset split/val_df.parquet")

In [22]:
test_df = pd.read_parquet("Autoencoders/Autoencoder/Dataset split/test_df.parquet")

In [46]:
def plot_occurrences(df, filename):
    """Creates a bar plot of the occurrences of each slogan in the DataFrame."""

    occurrences = df.sum(axis=0)
    occurrences = occurrences.sort_values(ascending=True)  # Ordina in modo ascendente per il grafico a barre orizzontali

    plt.figure(figsize=(15, 15))  # Aumenta le dimensioni del grafico

    plt.barh(occurrences.index, occurrences.values)  # Crea un grafico a barre orizzontali
    plt.ylabel("Slogan")
    plt.xlabel("Occorrenze")
    plt.title("Occorrenze di Slogan nel Dataset")

    ax = plt.gca()
    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    plt.tight_layout()  # Assicura che tutto sia visibile quando si salva l'immagine
    plt.savefig(filename)
    plt.close()

In [47]:
plot_occurrences(train_df, "train_occurrences.png")
plot_occurrences(val_df, "val_occurrences.png")
plot_occurrences(test_df, "test_occurrences.png")

In [15]:
dataset = pd.read_parquet("real-time clusters/result_df_gt_2.parquet")

In [71]:
occurrences = dataset.sum(axis=0)
occurrences = occurrences.sort_values(ascending=False)

In [72]:
occurrences

None                                                  499659
apparato isolato                                      176702
signal degraded                                       143259
loss of signal los                                     94152
apparato isolato pr                                    63635
                                                       ...  
cardinitcard initializing                                  1
vc unequipped ho vc unequipped                             1
net specific problem non inviato da agent                  1
unidentified specific problem non inviato da agent         1
crdpwroff card power off xexc xexc                         1
Length: 87, dtype: int64

In [81]:
def get_top_slogans(df, percentage):
    """Restituisce gli slogan che costituiscono una certa percentuale delle occorrenze totali."""
    occurrences = df.sum(axis=0)
    sorted_occurrences = occurrences.sort_values(ascending=False)
    cumulative_percentage = sorted_occurrences.cumsum() / sorted_occurrences.sum()
    top_slogans = cumulative_percentage[cumulative_percentage <= percentage].index
    return top_slogans

# Ottieni gli slogan per ciascun DataFrame
slogans_95_train = get_top_slogans(train_df, 0.99)
slogans_95_test = get_top_slogans(test_df, 0.99)
slogans_95_val = get_top_slogans(val_df, 0.99)

# Stampa gli slogan e il loro numero

print(len(slogans_95_train))

print(len(slogans_95_test))

print(len(slogans_95_val))

51
52
51


In [82]:
train_df_99 = train_df[slogans_95_train]
test_df_99 = test_df[slogans_95_test]
val_df_99 = val_df[slogans_95_val]


In [83]:
plot_occurrences(train_df_99, "train_occurrences_99.png")
plot_occurrences(val_df_99, "val_occurrences_99.png")
plot_occurrences(test_df_99, "test_occurrences_99.png")


In [86]:
def plot_occurrences(dfs, labels, filename):
    """Creates a bar plot of the occurrences of each slogan in the DataFrames."""

    plt.figure(figsize=(15, 15))  # Aumenta le dimensioni del grafico

    for df, label in zip(dfs, labels):
        occurrences = df.sum(axis=0)
        occurrences = occurrences.sort_values(ascending=True)  # Ordina in modo ascendente per il grafico a barre orizzontali
        plt.barh(occurrences.index, occurrences.values, alpha=0.5, label=label)  # Crea un grafico a barre orizzontali

    plt.ylabel("Slogan")
    plt.xlabel("Occorrenze")
    plt.title("Occorrenze di Slogan nel Dataset")
    plt.legend()

    ax = plt.gca()
    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    plt.tight_layout()  # Assicura che tutto sia visibile quando si salva l'immagine
    plt.savefig(filename)
    plt.close()
    
    
    
plot_occurrences([train_df, test_df, val_df], ['Train', 'Test', 'Validation'], 'occurrences.png')

In [93]:
def plot_percentage_occurrences(dfs, labels, colors, filename):
    """Crea un grafico a barre delle percentuali di occorrenze di ciascun slogan nei DataFrame."""

    plt.figure(figsize=(15, 15))  # Aumenta le dimensioni del grafico

    for df, label, color in zip(dfs, labels, colors):
        occurrences = df.sum(axis=0)
        percentages = occurrences / df.sum().sum() * 100  # Calcola le percentuali
        percentages = percentages.sort_values(ascending=True)  # Ordina in modo ascendente per il grafico a barre orizzontali
        plt.barh(percentages.index, percentages.values, color=color, alpha=0.5, label=label)  # Crea un grafico a barre orizzontali

    plt.ylabel("Slogan")
    plt.xlabel("Percentuale (%)")
    plt.title("Percentuale di Occorrenze di Slogan nel Dataset")
    plt.legend()

    ax = plt.gca()
    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    plt.tight_layout()  # Assicura che tutto sia visibile quando si salva l'immagine
    plt.savefig(filename)
    plt.close()

# Chiamata alla funzione con i colori specificati
plot_percentage_occurrences([train_df, test_df, val_df], ['Train', 'Test', 'Validation'], ['red', 'grey', 'yellow'], 'percentage_occurrences.png')

In [41]:
occurrences_train = train_df.sum(axis=0)
occurrences_train = occurrences_train.sort_values(ascending=True)
print(occurrences_train) 
print()
occurrences_test = test_df.sum(axis=0)
occurrences_test = occurrences_test.sort_values(ascending=True)
print(occurrences_test)
print()
occurrences_val = val_df.sum(axis=0)
occurrences_val = occurrences_val.sort_values(ascending=True)
occurrences_val

unidentified specific problem non inviato da agent         0.0
net specific problem non inviato da agent                  0.0
vc unequipped ho vc unequipped                             0.0
crdpwroff card power off xexc xexc                         1.0
guasto bus di comunicazione                                1.0
                                                        ...   
apparato isolato pr                                    44467.0
loss of signal los                                     65895.0
signal degraded                                       100215.0
apparato isolato                                      123784.0
None                                                  349210.0
Length: 87, dtype: float32

crdpwroff card power off xexc xexc                            0.0
oproutgoing channel optical power out of range xexc zb        0.0
guasto bus di comunicazione                                   0.0
net specific problem non inviato da agent                     0.0
cardinitcard in

crdpwroff card power off xexc xexc        0.0
guasto bus di comunicazione               0.0
serversignalfailure odu ssf               0.0
nan                                       0.0
cardinitcard initializing                 0.0
                                       ...   
apparato isolato pr                    9531.0
loss of signal los                    14181.0
signal degraded                       21618.0
apparato isolato                      26315.0
None                                  75288.0
Length: 87, dtype: float32

In [43]:
# Trova i "slogan" con occorrenza 0 nel gruppo di addestramento
zero_occurrences_train = occurrences_train.loc[occurrences_train == 0]
print("Slogan con occorrenza 0 nel gruppo di training:")
print(zero_occurrences_train)
print()

# Trova i "slogan" con occorrenza 0 nel gruppo di test
zero_occurrences_test = occurrences_test.loc[occurrences_test == 0]
print("Slogan con occorrenza 0 nel gruppo di test:")
print(zero_occurrences_test)
print()

# Trova i "slogan" con occorrenza 0 nel gruppo di validazione
zero_occurrences_val = occurrences_val.loc[occurrences_val == 0]
print("Slogan con occorrenza 0 nel gruppo di validazione:")
print(zero_occurrences_val)

Slogan con occorrenza 0 nel gruppo di training:
unidentified specific problem non inviato da agent    0.0
net specific problem non inviato da agent             0.0
vc unequipped ho vc unequipped                        0.0
dtype: float32

Slogan con occorrenza 0 nel gruppo di test:
crdpwroff card power off xexc xexc                        0.0
oproutgoing channel optical power out of range xexc zb    0.0
guasto bus di comunicazione                               0.0
net specific problem non inviato da agent                 0.0
cardinitcard initializing                                 0.0
unidentified specific problem non inviato da agent        0.0
serversignalfailure odu ssf                               0.0
dtype: float32

Slogan con occorrenza 0 nel gruppo di validazione:
crdpwroff card power off xexc xexc    0.0
guasto bus di comunicazione           0.0
serversignalfailure odu ssf           0.0
nan                                   0.0
cardinitcard initializing             0.0
vc uneq

In [45]:
# Converti le serie in DataFrame
zero_occurrences_train_df = zero_occurrences_train.to_frame('Occorrenze')
zero_occurrences_test_df = zero_occurrences_test.to_frame('Occorrenze')
zero_occurrences_val_df = zero_occurrences_val.to_frame('Occorrenze')

# Stampa i DataFrame
print("Slogan con occorrenza 0 nel gruppo di training:")
print(zero_occurrences_train_df)
print()

print("Slogan con occorrenza 0 nel gruppo di test:")
print(zero_occurrences_test_df)
print()

print("Slogan con occorrenza 0 nel gruppo di validazione:")
print(zero_occurrences_val_df)

Slogan con occorrenza 0 nel gruppo di training:
                                                    Occorrenze
unidentified specific problem non inviato da agent         0.0
net specific problem non inviato da agent                  0.0
vc unequipped ho vc unequipped                             0.0

Slogan con occorrenza 0 nel gruppo di test:
                                                    Occorrenze
crdpwroff card power off xexc xexc                         0.0
oproutgoing channel optical power out of range ...         0.0
guasto bus di comunicazione                                0.0
net specific problem non inviato da agent                  0.0
cardinitcard initializing                                  0.0
unidentified specific problem non inviato da agent         0.0
serversignalfailure odu ssf                                0.0

Slogan con occorrenza 0 nel gruppo di validazione:
                                    Occorrenze
crdpwroff card power off xexc xexc         0.0
guast

In [12]:
import pandas as pd

df = pd.read_csv("file.csv")

In [62]:
df

Unnamed: 0,slogan,f1
0,NNone,0.999973
1,alimentazione,0.982581
2,allarme alimentazione,0.994560
3,allarme interno,0.999694
4,allarme protezione,0.991708
...,...,...
82,tunnel aps outage outage of aps tunnel protect...,0.000000
83,unidentified specific problem non inviato da a...,0.000000
84,vc unequipped ho vc unequipped,0.000000
85,vc unequipped lo vc unequipped,0.996255


In [60]:
df_gt_70 = df[df['f1']]

KeyError: "None of [Index([0.9999733567237854, 0.9825814962387084, 0.9945603013038636,\n       0.9996935129165648, 0.9917080998420716, 0.9988258481025696,\n       0.9835280179977416, 0.9583213925361632, 0.7911920547485352,\n                      0.0,                0.0,                0.0,\n                      0.0,  0.999135673046112,                0.0,\n                      0.0,                0.0, 0.9435282349586488,\n        0.972577691078186,                0.0,                0.0,\n                      0.0,                1.0, 0.9798115491867064,\n                      0.0, 0.9797065258026124, 0.8508071899414062,\n        0.999204397201538,                1.0,                0.0,\n                      0.0,  0.999531090259552,                0.0,\n                      0.0,                1.0,                0.0,\n                      1.0,                0.0,                0.0,\n                      0.0,                0.0,                0.0,\n                      0.0,                0.0,                0.0,\n                      0.0,                0.0, 0.9680242538452148,\n        0.921280026435852,                0.0,                0.0,\n                      0.0,                0.0,                0.0,\n                      0.0,  0.996960461139679, 0.9519230723381042,\n                      0.0, 0.9964157938957214,                1.0,\n       0.9987399578094482, 0.9962714314460754, 0.9940476417541504,\n       0.9962547421455384, 0.9765174984931946,                0.0,\n                      0.0,                0.0,                0.0,\n                      0.0, 0.9997336864471436,                0.0,\n                      0.0,  0.996204137802124,                0.0,\n       0.9869281053543092,                0.0,                0.0,\n                      0.0,                1.0, 0.9991455674171448,\n                      0.0,                0.0,                0.0,\n                      0.0, 0.9962547421455384,                0.0],\n      dtype='float64')] are in the [columns]"

In [15]:
df_gt_70.sort_values(by='f1', ascending=False)

Unnamed: 0,slogan,f1
34,loss of pointer lop,1.0
22,fermo totale apparato,1.0
79,signal degraded,1.0
59,perdita sincornismo ptp,1.0
36,loss of signal los,1.0
28,lof,1.0
0,NNone,0.999973
70,scheda guasta,0.999734
3,allarme interno,0.999694
31,loss of frame lof,0.999531


In [21]:
df_gt_70[df_gt_70['slogan'] == "scheda guasta"]

Unnamed: 0,slogan,f1
70,scheda guasta,0.999734


In [58]:
test_df

Unnamed: 0_level_0,apparato isolato,loss of signal los,None,signal degraded,loss of frame lof,nan,alimentazione,guasto matrice,apparato disservito,fermo totale apparato,...,cardinitcard initializing,mpls tunnel rdi mpls tunnel rdi,vc unequipped ho vc unequipped,vlan megaco ko,net specific problem non inviato da agent,serversignalfailure odu ssf,unidentified specific problem non inviato da agent,replaceablemodulemissing eqpt,grave disservizio onuc,crdpwroff card power off xexc xexc
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cluster_id2=202307220448_7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202311122052_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202311291204_14,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202307071718_13,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202310240204_9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cluster_id2=202303261120_11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202309210626_15,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202311201104_12,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cluster_id2=202305200142_3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
occurrences = test_df.sum(axis=0)
occurrences_df = occurrences.to_frame("occurrences")
occurrences_df = occurrences_df.sort_values(by="occurrences", ascending=False)
occurrences_df = occurrences_df.reset_index()
occurrences_df = occurrences_df.rename(columns={'index': 'slogan'})


In [54]:
occurrences_df

Unnamed: 0,slogan,occurrences
0,,75161.0
1,apparato isolato,26603.0
2,signal degraded,21426.0
3,loss of signal los,14076.0
4,apparato isolato pr,9637.0
...,...,...
82,oproutgoing channel optical power out of range...,0.0
83,net specific problem non inviato da agent,0.0
84,serversignalfailure odu ssf,0.0
85,unidentified specific problem non inviato da a...,0.0


In [63]:
merge_df = pd.merge(occurrences_df, df, on='slogan', how='left')

In [64]:
merge_df

Unnamed: 0,slogan,occurrences,f1
0,,75161.0,
1,apparato isolato,26603.0,0.958321
2,signal degraded,21426.0,1.000000
3,loss of signal los,14076.0,1.000000
4,apparato isolato pr,9637.0,0.791192
...,...,...,...
82,oproutgoing channel optical power out of range...,0.0,0.000000
83,net specific problem non inviato da agent,0.0,0.000000
84,serversignalfailure odu ssf,0.0,0.000000
85,unidentified specific problem non inviato da a...,0.0,0.000000


In [65]:
merge_df.to_csv("merge_df.csv", index=False)

In [69]:
occurrences = train_df.sum(axis=0)
occurrences_df = occurrences.to_frame("occurrences")
occurrences_df = occurrences_df.sort_values(by="occurrences", ascending=False)
occurrences_df = occurrences_df.reset_index()
occurrences_df = occurrences_df.rename(columns={'index': 'slogan'})

In [71]:
occurrences_df.to_csv("occurrences_df_train.csv", index=False)

In [73]:
reconstructed_df = pd.read_csv("reconstructed_vectors.csv")

In [74]:
reconstructed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,1.000000e+00,9.051397e-30,1.123406e-27,2.433197e-30,4.631261e-30,2.445566e-36,4.291842e-20,7.942683e-21,2.058215e-24,1.471279e-28,...,1.945054e-32,4.420177e-29,1.237315e-30,1.498161e-25,3.675833e-32,4.212999e-31,6.866266e-30,5.697599e-35,0.0,6.801446e-34
1,1.000000e+00,1.000000e+00,1.640754e-21,1.000000e+00,1.568455e-33,2.896935e-35,7.439582e-27,2.824645e-24,5.801255e-14,3.179671e-35,...,2.935475e-31,1.475744e-33,5.224314e-31,1.240620e-29,1.088540e-34,2.108758e-34,9.042415e-32,3.766106e-39,0.0,1.155950e-33
2,2.595910e-23,1.804179e-33,1.000000e+00,2.629219e-29,5.819213e-31,3.177938e-34,1.189297e-24,0.000000e+00,0.000000e+00,0.000000e+00,...,1.132129e-30,5.624606e-28,1.718801e-28,2.761516e-26,1.002700e-31,7.276367e-28,4.217083e-31,1.421045e-34,0.0,3.786172e-31
3,3.181205e-24,1.000000e+00,1.657441e-24,1.954888e-24,1.234153e-36,3.782613e-35,3.237829e-24,2.457375e-22,0.000000e+00,0.000000e+00,...,6.919275e-37,3.982809e-32,9.172017e-35,1.238528e-31,5.093636e-35,7.192501e-35,2.127655e-37,2.470618e-38,0.0,3.093904e-35
4,2.409711e-19,1.000000e+00,1.000000e+00,1.000000e+00,3.262707e-35,4.562767e-37,9.503212e-28,0.000000e+00,0.000000e+00,0.000000e+00,...,9.337212e-32,1.174158e-34,1.774502e-31,5.018751e-32,1.439398e-34,5.798840e-34,6.957738e-36,0.000000e+00,0.0,4.818292e-35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64763,1.000000e+00,9.051397e-30,1.123406e-27,2.433197e-30,4.631261e-30,2.445566e-36,4.291842e-20,7.942683e-21,2.058215e-24,1.471279e-28,...,1.945054e-32,4.420177e-29,1.237315e-30,1.498161e-25,3.675833e-32,4.212999e-31,6.866266e-30,5.697599e-35,0.0,6.801446e-34
64764,1.000000e+00,7.503569e-34,1.000000e+00,7.294332e-32,2.241556e-31,4.658265e-37,1.248477e-22,0.000000e+00,1.393822e-24,1.708253e-33,...,3.365319e-32,1.138691e-29,2.651437e-29,1.432557e-26,2.914531e-32,1.578234e-30,1.472328e-29,2.317970e-35,0.0,5.208408e-34
64765,1.941274e-24,1.617136e-20,6.214958e-20,1.000000e+00,5.488552e-28,1.035001e-28,5.911723e-19,7.130444e-17,0.000000e+00,0.000000e+00,...,2.273196e-29,2.452470e-28,3.050180e-29,3.833264e-28,1.224584e-33,1.010905e-29,1.704686e-33,4.030830e-34,0.0,7.924250e-29
64766,1.000000e+00,9.051397e-30,1.123406e-27,2.433197e-30,4.631261e-30,2.445566e-36,4.291842e-20,7.942683e-21,2.058215e-24,1.471279e-28,...,1.945054e-32,4.420177e-29,1.237315e-30,1.498161e-25,3.675833e-32,4.212999e-31,6.866266e-30,5.697599e-35,0.0,6.801446e-34
