In [88]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate

### Load Clusters


In [89]:
# Which clustering result is mined by the cells below.
DATASET = "ne_id_ne_address"  # ne_id_loc_name, ne_id_ne_address
# Cluster-id column used to group rows into baskets.
AGGREGATION_FIELD = "cluster_id2"  # cluster_id, cluster_id2, cluster_id3
# Name (and recipe) of the item-label column built from the slogan.
CONCATENATION = "slogan_alarm_group"  # "slogan", "slogan_network", "slogan_alarm_group"
# Passed to fpgrowth as min_support.
MIN_SUPPORT = 0.005

# Metric and threshold passed to association_rules.
ASSOCIATION_RULES_METRIC = "confidence"  # confidence, lift
MIN_THRESHOLD = 0.01
# Number of rules shown by print_first_N_rules.
NUM_ASSOCIATION_RULES = 30

In [90]:
# Load the clusters of the "ne_id_loc_name" aggregation from a parquet file.
# NOTE(review): the path is relative to the current working directory; the date range in
# the file name (20240601-20240828) presumably is the observation window — confirm.
clusters_ne_id_loc_name = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

In [91]:
# Load the clusters of the "ne_id_ne_address_first_three_octets" aggregation from a parquet file.
# NOTE(review): the path is relative to the current working directory; the date range in
# the file name (20240601-20240828) presumably is the observation window — confirm.
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

### Filter clusters with at least 2 devices inside


In [92]:
# The pruning is skipped for the "cluster_id" and "cluster_id3" aggregations; for every
# other AGGREGATION_FIELD both frames keep only the "cluster_id2" clusters that contain
# at least two distinct ne_id values.
if AGGREGATION_FIELD not in ("cluster_id", "cluster_id3"):
    # loc_name clusters
    valid_clusters = (
        clusters_ne_id_loc_name.groupby("cluster_id2")["ne_id"].nunique() >= 2
    )
    clusters_ne_id_loc_name = clusters_ne_id_loc_name[
        clusters_ne_id_loc_name["cluster_id2"].isin(
            list(valid_clusters[valid_clusters].index)
        )
    ]

    # ne_address clusters (valid_clusters is deliberately re-bound here)
    valid_clusters = (
        clusters_ne_id_ne_address_first_three_octets.groupby("cluster_id2")[
            "ne_id"
        ].nunique()
        >= 2
    )
    clusters_ne_id_ne_address_first_three_octets = (
        clusters_ne_id_ne_address_first_three_octets[
            clusters_ne_id_ne_address_first_three_octets["cluster_id2"].isin(
                list(valid_clusters[valid_clusters].index)
            )
        ]
    )

### Filter only important columns


In [93]:
# Both cluster tables are reduced to the same set of columns: the three cluster-id
# levels plus the alarm attributes used to build the item labels.
_columns_to_keep = [
    "cluster_id",
    "cluster_id2",
    "cluster_id3",
    "ne_type",
    "probable_cause",
    "alarm_group",
    "network",
]

clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name[list(_columns_to_keep)]

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets[list(_columns_to_keep)]
)

## Statistics


In [94]:
# Share of placeholder / missing values in the loc_name cluster table.
# Every percentage is relative to the number of rows of the filtered frame.
total_rows = len(clusters_ne_id_loc_name_filtered)

# ne_type: rows holding the "n/d" placeholder
filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100

# ne_type: missing values
filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].isnull()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

# probable_cause: missing values
filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["probable_cause"].isnull()
]
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100

# alarm_group: missing values
filtered_rows_alarm_group_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["alarm_group"].isnull()
]
num_filtered_rows_alarm_group_na = filtered_rows_alarm_group_na.shape[0]
percentage_alarm_group_na = (num_filtered_rows_alarm_group_na / total_rows) * 100

# network: missing values
filtered_rows_network_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["network"].isnull()
]
num_filtered_rows_network_na = filtered_rows_network_na.shape[0]
percentage_network_na = (num_filtered_rows_network_na / total_rows) * 100

# network: rows holding the 'rdg_others' value
filtered_rows_network_rdg_others = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["network"].eq("rdg_others")
]
num_filtered_rows_network_rdg_others = filtered_rows_network_rdg_others.shape[0]
percentage_network_rdg_others = (
    num_filtered_rows_network_rdg_others / total_rows
) * 100

# Report
print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%")
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 94.91%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 0.00%
Percentuale di righe con network uguale a 'rdg_others': 83.84%


In [95]:
# Share of placeholder / missing values in the ne_address (first three octets) cluster
# table. Every percentage is relative to the number of rows of the filtered frame.
total_rows = len(clusters_ne_id_ne_address_first_three_octets_filtered)

# ne_type: rows holding the "n/d" placeholder
filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100

# ne_type: missing values
filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].isnull()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

# probable_cause: missing values
filtered_rows_probable_cause_na = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["probable_cause"].isnull()
    ]
)
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100

# alarm_group: missing values
filtered_rows_alarm_group_na = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["alarm_group"].isnull()
    ]
)
num_filtered_rows_alarm_group_na = filtered_rows_alarm_group_na.shape[0]
percentage_alarm_group_na = (num_filtered_rows_alarm_group_na / total_rows) * 100

# network: missing values
filtered_rows_network_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["network"].isnull()
]
num_filtered_rows_network_na = filtered_rows_network_na.shape[0]
percentage_network_na = (num_filtered_rows_network_na / total_rows) * 100

# network: rows holding the 'rdg_others' value
filtered_rows_network_rdg_others = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["network"].eq(
            "rdg_others"
        )
    ]
)
num_filtered_rows_network_rdg_others = filtered_rows_network_rdg_others.shape[0]
percentage_network_rdg_others = (
    num_filtered_rows_network_rdg_others / total_rows
) * 100

# Report
print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%")
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 77.06%
Percentuale di righe con ne_type uguale a NA: 0.14%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 8.64%
Percentuale di righe con network uguale a 'rdg_others': 69.86%


### Create new column based on the Aggregation


In [96]:
# "probable_cause" is called "slogan" from here on. rename() returns new frames, so the
# column assignments below do not touch the unfiltered tables.
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(
    columns={"probable_cause": "slogan"}
)

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.rename(
        columns={"probable_cause": "slogan"}
    )
)

# Column appended (after "_") to the slogan to form the item label; None means the
# item label is the slogan itself.
if CONCATENATION == "slogan":
    _suffix_column = None
elif CONCATENATION == "slogan_network":
    _suffix_column = "network"
elif CONCATENATION == "slogan_alarm_group":
    _suffix_column = "alarm_group"
else:
    raise ValueError(
        "Valore di CONCATENATION non valido. Deve essere 'slogan', 'slogan_network' o 'slogan_alarm_group'."
    )

# Both frames get the item label under the SAME column name (CONCATENATION), which is
# the column the basket/filter cells read. The previous "slogan" branch wrote a stray
# "slogan_netype" column on the ne_address frame instead.
for _frame in (
    clusters_ne_id_loc_name_filtered,
    clusters_ne_id_ne_address_first_three_octets_filtered,
):
    if _suffix_column is None:
        _frame[CONCATENATION] = _frame["slogan"]
    else:
        # A missing value on either side yields a missing item label.
        _frame[CONCATENATION] = _frame["slogan"] + "_" + _frame[_suffix_column]

### Drop rows with network == NA


In [97]:
# Only the "slogan_network" labels depend on the network column, so rows without a
# network value are removed (from the ne_address table only) just in that mode.
if CONCATENATION == "slogan_network":
    _has_network = clusters_ne_id_ne_address_first_three_octets_filtered[
        "network"
    ].notna()
    clusters_ne_id_ne_address_first_three_octets_filtered = (
        clusters_ne_id_ne_address_first_three_octets_filtered[_has_network]
    )

## Preprocessing and FP-Growth


In [98]:
def create_baskets(data, aggregation_field, item_column=None):
    """Turn a table of alarms into an encoded transaction table.

    Rows of ``data`` are grouped by ``aggregation_field``; the values of
    ``item_column`` found in each group form one transaction (duplicates are kept
    in the list handed to the encoder).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``aggregation_field`` and ``item_column``.
    aggregation_field : str
        Column whose values identify the groups (baskets).
    item_column : str, optional
        Column holding the item labels. Defaults to the notebook-level
        ``CONCATENATION`` setting, read at call time.

    Returns
    -------
    pandas.DataFrame
        The array produced by ``TransactionEncoder`` with one column per entry of
        ``columns_``. The group ids are not kept as index.
    """
    if item_column is None:
        item_column = CONCATENATION

    baskets = data.groupby(aggregation_field)[item_column].apply(list)

    encoder = TransactionEncoder()
    encoded = encoder.fit(baskets).transform(baskets)
    return pd.DataFrame(encoded, columns=encoder.columns_)

In [99]:
def print_frequent_itemsets(frequent_itemsets):
    """Render the frequent itemsets as a grid table.

    WARNING: despite its name this function does not print; it returns the table
    text. It also rewrites ``frequent_itemsets["itemsets"]`` IN PLACE, replacing
    every collection of items with a single ``", "``-joined string. Later cells of
    this notebook (substring search, ``split(", ")``) rely on that conversion, so it
    is kept on purpose.

    Values that are already strings are left untouched, so calling the function
    twice on the same frame no longer corrupts the itemsets (joining a string
    would insert ``", "`` between its characters).

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Frame with an ``itemsets`` column; modified in place as described above.

    Returns
    -------
    str
        Table in tabulate's ``grid`` format, with a positional row index.
    """
    frequent_itemsets["itemsets"] = frequent_itemsets["itemsets"].apply(
        lambda items: items if isinstance(items, str) else ", ".join(list(items))
    )
    table = frequent_itemsets.values.tolist()
    return tabulate(
        table, headers=frequent_itemsets.columns, tablefmt="grid", showindex=True
    )

In [100]:
def print_first_N_rules(rules, N):
    """Print the first ``N`` association rules as a table.

    Only the ``antecedents``, ``consequents``, ``support``, ``confidence`` and
    ``lift`` columns are shown. ``rules`` itself is not modified.

    Parameters
    ----------
    rules : pandas.DataFrame
        Association rules; antecedents/consequents may be collections of items or
        already ``", "``-joined strings (both are displayed correctly).
    N : int
        Number of leading rows to show.
    """

    def _as_text(items):
        # Joining an already-joined string would put ", " between its characters.
        return items if isinstance(items, str) else ", ".join(list(items))

    # Explicit copy: the columns are overwritten below, and assigning on a slice of
    # ``rules`` triggers pandas' SettingWithCopyWarning.
    rules_df = rules[
        ["antecedents", "consequents", "support", "confidence", "lift"]
    ].head(N).copy()

    rules_df["antecedents"] = rules_df["antecedents"].apply(_as_text)
    rules_df["consequents"] = rules_df["consequents"].apply(_as_text)

    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers="keys", tablefmt="pretty", showindex=True))

### ITEMSETS AND ASSOCIATION RULES


In [101]:
# Select the cluster table named by DATASET.
_frames_by_dataset = {
    "ne_id_loc_name": clusters_ne_id_loc_name_filtered,
    "ne_id_ne_address": clusters_ne_id_ne_address_first_three_octets_filtered,
}
if DATASET not in _frames_by_dataset:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = _frames_by_dataset[DATASET]

# Clusters -> baskets -> frequent itemsets (FP-Growth), most frequent first.
basket_df = create_baskets(data, AGGREGATION_FIELD)
frequent_itemsets = fpgrowth(
    basket_df, min_support=MIN_SUPPORT, use_colnames=True
).sort_values(by="support", ascending=False)

# Display copy restricted to itemsets with more than one item. frequent_itemsets
# itself is left as returned by fpgrowth for the association-rules cell.
frequent_itemsets_copy = frequent_itemsets.copy()
_has_several_items = frequent_itemsets_copy["itemsets"].apply(lambda x: len(x) > 1)
frequent_itemsets_copy = frequent_itemsets_copy[_has_several_items]

# Support distribution over ALL frequent itemsets (single items included).
support_distribution = frequent_itemsets["support"].describe()
print("\nDistribuzione dei supporti:")
print(support_distribution)
print("\n")

print("AGGREGATION: " + AGGREGATION_FIELD + "\n")
# NOTE: print_frequent_itemsets converts frequent_itemsets_copy["itemsets"] in place.
itemsets_table = print_frequent_itemsets(frequent_itemsets_copy)
print(itemsets_table)

# Optional export:
# file_name = f"frequent_itemsets_{AGGREGATION_FIELD}_{CONCATENATION}_{DATASET}.xlsx"
# frequent_itemsets_copy.to_excel(file_name, index=False)


Distribuzione dei supporti:
count    374.000000
mean       0.021045
std        0.044462
min        0.005056
25%        0.007235
50%        0.009790
75%        0.016864
max        0.481549
Name: support, dtype: float64


AGGREGATION: cluster_id2

+-----+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                                                                                                                                                 |
|   0 | 0.267994   | dns-node-down-alarms_NodeDown, dns-nodeunmanagable-alarms_NodeUnmanagable                                                   

### Print Association Rules


In [102]:
# Derive the association rules from the (unformatted) frequent itemsets.
rules = association_rules(
    frequent_itemsets,
    metric=ASSOCIATION_RULES_METRIC,
    min_threshold=MIN_THRESHOLD,
)

print("AGGREGATION: " + AGGREGATION_FIELD + "\n")
print_first_N_rules(rules, NUM_ASSOCIATION_RULES)

# From here on antecedents/consequents are plain ", "-joined strings; the search and
# lookup cells below work on that textual form.
for _side in ("antecedents", "consequents"):
    rules[_side] = rules[_side].apply(lambda x: ", ".join(list(x)))

# Optional export:
# file_name = f"association_rules-{AGGREGATION_FIELD}-{CONCATENATION}-{DATASET}.xlsx"
# rules.to_excel(file_name, index=False)

AGGREGATION: cluster_id2


Association Rules:
+----+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+---------------------+---------------------+--------------------+
|    |                                  antecedents                                   |                                  consequents                                   |       support       |     confidence      |        lift        |
+----+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+---------------------+---------------------+--------------------+
| 0  |                         dns-node-down-alarms_NodeDown                          |                   dns-nodeunmanagable-alarms_NodeUnmanagable                   | 0.26799354491662186 | 0.9529456771231829  | 1.9789164586371728 |
| 1  |            

### Association Rules Sorted by Metrics


In [103]:
# Column used to order the rules in this cell; sort_rules accepts only these three values.
METRIC = "confidence"  # support, confidence, lift


def sort_rules(rules, metric="confidence", ascending=False):
    """Return ``rules`` ordered by one metric column.

    Parameters
    ----------
    rules : pandas.DataFrame
        Association rules containing the ``metric`` column.
    metric : str
        One of ``"support"``, ``"confidence"`` or ``"lift"``.
    ascending : bool
        Sort direction; the default puts the highest values first.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the three supported names.
    """
    allowed_metrics = ("support", "confidence", "lift")
    if metric in allowed_metrics:
        return rules.sort_values(by=metric, ascending=ascending)
    raise ValueError("La metrica deve essere 'support', 'confidence' o 'lift'.")


# Order every rule by METRIC (highest first) and show the full table.
sorted_rules = sort_rules(rules, metric=METRIC)

print("\nRegole ordinate per: " + METRIC)

sorted_rules_df = pd.DataFrame(sorted_rules)
sorted_rules_table = tabulate(
    sorted_rules_df, headers="keys", tablefmt="pretty", showindex=True
)
print(sorted_rules_table)


Regole ordinate per: confidence
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+-----------------------+----------------------+---------------------+-------------------------+--------------------+------------------------+
|      |                                                                                                                        antecedents                                                                                                                        |     

## Exploration Functionalities


### Filter Itemsets containing a given string

In [104]:
# Text searched (with Python's `in`) inside each entry of the "itemsets" column.
FIELD = "down"


def filter_itemsets_by_string(itemsets, search_string):
    """Keep the rows whose ``itemsets`` entry contains ``search_string``.

    The check is Python's ``in`` operator applied to each entry: a substring match
    when the entries are ``", "``-joined strings (their form after
    ``print_frequent_itemsets`` ran), an exact-member match when they are
    collections.
    """

    def contains_search_string(entry):
        return search_string in entry

    keep = itemsets["itemsets"].apply(contains_search_string)
    return itemsets[keep]


# Multi-item itemsets whose text contains FIELD.
filtered_itemsets = filter_itemsets_by_string(
    itemsets=frequent_itemsets_copy, search_string=FIELD
)

print(f"Itemsets che contengono '{FIELD}':")

filtered_itemsets_table = tabulate(
    filtered_itemsets, headers="keys", tablefmt="pretty"
)
print(filtered_itemsets_table)

Itemsets che contengono 'down':
+-----+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |        support        |                                                                                                                                         itemsets                                                                                                                                         |
+-----+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 26  |  0.267993544916

### Filter Association Rules containing a given string

In [105]:
def search_in_rules(rules, field, search_in):
    """Keep the rules whose antecedents and/or consequents contain ``field``.

    ``field in value`` is evaluated on each entry of the chosen column(s), i.e. a
    substring match when the entries are strings.

    Parameters
    ----------
    rules : pandas.DataFrame
        Rules with ``antecedents`` and ``consequents`` columns.
    field : str
        Text to look for.
    search_in : str
        ``"antecedents"``, ``"consequents"`` or ``"both"`` (match in either side).

    Raises
    ------
    ValueError
        If ``search_in`` is none of the three accepted values.
    """

    def contains_field(column):
        return rules[column].apply(lambda x: field in x)

    if search_in == "both":
        return rules[contains_field("antecedents") | contains_field("consequents")]
    if search_in in ("antecedents", "consequents"):
        return rules[contains_field(search_in)]
    raise ValueError(
        "search_in deve essere 'antecedents', 'consequents', o 'both'."
    )

In [106]:
FIELD = "island"  # text to look for
SEARCH_IN = "consequents"  # antecedents, consequents, both
METRIC = "confidence"  # support, confidence, lift

# Rules mentioning FIELD on the chosen side.
searched_rules = search_in_rules(rules, FIELD, SEARCH_IN)

# Optional: show the matches ordered by METRIC instead.
# sorted_rules = sort_rules(searched_rules, metric=METRIC)
# print("\nRegole ordinate per: " + METRIC)
# print_first_N_rules(sorted_rules, 1000)

print(f"Regole con <{FIELD}> negli {SEARCH_IN}:")
searched_rules_table = tabulate(
    searched_rules, headers="keys", tablefmt="pretty", showindex=True
)
print(searched_rules_table)

Regole con <island> negli consequents:
+------+----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------+----------------------+----------------------+----------------------+--------------------+------------------------+--------------------+----------------------+
|      |                                                        antecedents                                                         |                                                           consequents                                                            |  antecedent support  |  consequent support  |       support        |      confidence      |        lift        |        leverage        |     conviction     |    zhangs_metric     |
+------+-----------------------------------------------------------

### Given an Association Rule, filter itemsets that contain all the elements of the rule


In [107]:
def find_rule_by_index(index, rules):
    """Fetch one rule by position and list all the items it mentions.

    Parameters
    ----------
    index : int
        Positional index (``iloc``) of the rule.
    rules : pandas.DataFrame
        Rules whose ``antecedents`` / ``consequents`` are comma-joined strings.

    Returns
    -------
    tuple
        ``(antecedent, consequent, combined_rule)``: the two original strings and
        the list of stripped item names, antecedent items first.
    """
    row = rules.iloc[index]
    antecedent, consequent = row["antecedents"], row["consequents"]

    combined_rule = [
        item.strip()
        for side in (antecedent, consequent)
        for item in side.split(",")
    ]
    return antecedent, consequent, combined_rule



def find_itemsets_with_rule(itemsets, rule):
    """Return the itemsets (in their original order) that contain every element of ``rule``.

    Each itemset is any iterable of hashable values; extra elements in it (such as
    a leading index or support value) do not prevent a match.
    """
    required = set(rule)

    matching = []
    for itemset in itemsets:
        if required <= set(itemset):
            matching.append(itemset)
    return matching



def format_itemsets(itemsets):
    """Flatten an itemsets frame into ``[position, first_column, item, item, ...]`` rows.

    ``position`` is the 0-based row position (not the frame's index label), the
    second element is the row's first column (the support in this notebook), and
    the remaining elements come from splitting the second column on ``", "``.
    """
    formatted = []
    for position, row in enumerate(itemsets.values.tolist()):
        first_value, joined_items = row[0], row[1]
        items = [item.strip() for item in joined_items.split(", ")]
        formatted.append([position, first_value] + items)
    return formatted

In [108]:
# RULE = [
#     "dns-snmplinkup-alarms",
#     "dns-node-down-alarms",
#     "dns-ciscofruinserted-alarms",
#     "dns-snmpcoldstart-alarms",
#     "dns-nodeunmanagable-alarms",
# ]

RULE_INDEX = 842  # position of the rule in `rules` (iloc)

antecedent, consequent, RULE = find_rule_by_index(RULE_INDEX, rules)

print(f"Regola di associazione scelta: {antecedent} => {consequent}")
print("\n")

# Multi-item itemsets that contain every item of the chosen rule.
itemsets = frequent_itemsets_copy
formatted_itemsets = format_itemsets(itemsets)
itemsets_with_rule = find_itemsets_with_rule(formatted_itemsets, RULE)

# Re-join the items so each row is [position, support, "item, item, ..."].
itemsets_with_rule_transformed = []
for row in itemsets_with_rule:
    itemsets_with_rule_transformed.append([row[0], row[1], ", ".join(row[2:])])

# Only two headers for three columns: the leading position column stays unnamed.
headers = ["support", "itemsets"]
print(tabulate(itemsets_with_rule_transformed, headers=headers, tablefmt="grid"))

Regola di associazione scelta: dns-link-down-alarms_InterfaceDown, dns-link-down-alarms_AggregatorLinkDown, dns-link-down-alarms_SNMPLinkDown => dns-node-down-alarms_NodeDown, dns-nodeunmanagable-alarms_NodeUnmanagable


+-----+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                                                                                                                                                 |
| 119 | 0.012695   | dns-link-down-alarms_InterfaceDown, dns-node-down-alarms_NodeDown, dns-link-down-alarms_AggregatorLinkDown, dns-link-down-alarms_SNMPLinkDown, dns-nod

### Given an itemset, filter the clusters that contain the alarms


In [109]:
def find_itemset_by_index(index, itemsets):
    """Return the items of the itemset at position ``index`` as a list.

    The ``itemsets`` column is expected to hold ``", "``-joined strings; the row is
    selected by position (``iloc``), not by index label.
    """
    joined_items = itemsets.iloc[index]["itemsets"]
    return joined_items.split(", ")

In [110]:
def get_concatenation_by_cluster(clusters):
    """Build a frame with one row per cluster and the distinct item labels it contains.

    ``clusters`` is iterated as ``(cluster_id, cluster_data)`` pairs (e.g. a pandas
    groupby). The result has two columns named after the notebook-level
    ``AGGREGATION_FIELD`` and ``CONCATENATION`` settings; the second holds a list of
    the de-duplicated ``CONCATENATION`` values of the cluster (in no particular
    order, since it goes through a set).
    """
    records = [
        {
            AGGREGATION_FIELD: cluster_id,
            CONCATENATION: list(set(cluster_data[CONCATENATION].tolist())),
        }
        for cluster_id, cluster_data in clusters
    ]
    return pd.DataFrame(records)

In [111]:
def filter_cluster_ids_by_itemset(cluster_slogans_df, itemset):
    """Return the ids of the clusters whose item labels include every element of ``itemset``.

    ``cluster_slogans_df`` is read through the notebook-level ``CONCATENATION``
    (labels per cluster) and ``AGGREGATION_FIELD`` (cluster id) column names; the
    result is the ``AGGREGATION_FIELD`` column of the matching rows.
    """

    def contains_whole_itemset(labels):
        return set(itemset).issubset(set(labels))

    matches = cluster_slogans_df[CONCATENATION].apply(contains_whole_itemset)
    return cluster_slogans_df[matches][AGGREGATION_FIELD]

In [112]:
ITEMSET_INDEX = 2  # position of the itemset in frequent_itemsets_copy (iloc)

ITEMSET = find_itemset_by_index(ITEMSET_INDEX, frequent_itemsets_copy)

print(f"ITEMSET scelto: {ITEMSET}")

# Select the cluster table named by DATASET, reduced to the columns used below.
_frames_by_dataset = {
    "ne_id_loc_name": clusters_ne_id_loc_name_filtered,
    "ne_id_ne_address": clusters_ne_id_ne_address_first_three_octets_filtered,
}
if DATASET not in _frames_by_dataset:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = _frames_by_dataset[DATASET][[AGGREGATION_FIELD, "slogan", CONCATENATION]]

# Group the rows by cluster
clusters = data.groupby(AGGREGATION_FIELD)

# One row per cluster with its distinct item labels
cluster_slogans_df = get_concatenation_by_cluster(clusters)

# Keep the clusters that contain the whole ITEMSET
filtered_cluster_ids = filter_cluster_ids_by_itemset(cluster_slogans_df, ITEMSET)

_num_filtered = len(filtered_cluster_ids)
_num_clusters = len(clusters)

print(f"Filtered Cluster IDs (aggregation_field = {AGGREGATION_FIELD}):")
print(filtered_cluster_ids.tolist())
print("Number of filtered clusters: " + str(_num_filtered))
print("Total number of clusters: " + str(_num_clusters))
print(
    "Percentage of clusters filtered: "
    + str(round((_num_filtered / _num_clusters) * 100, 2))
    + "%"
)

CLUSTER_ID_TO_VIEW = 3629

# Show the rows of one matching cluster, if the chosen id is among the matches.
if CLUSTER_ID_TO_VIEW not in filtered_cluster_ids.values:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )
else:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))

ITEMSET scelto: ['dns-link-down-alarms_InterfaceDown', 'dns-link-down-alarms_SNMPLinkDown']
Filtered Cluster IDs (aggregation_field = cluster_id2):
[113, 114, 115, 434, 2222, 2228, 2234, 2249, 6267, 6459, 6460, 6461, 6471, 6473, 6474, 6475, 6526, 6529, 6530, 6532, 6534, 6536, 6541, 6595, 6602, 6614, 6618, 6622, 6624, 6627, 6628, 6630, 6632, 6638, 6770, 6772, 6776, 6778, 7107, 7159, 7161, 7163, 7165, 7166, 7175, 7181, 7182, 7201, 7203, 7205, 7207, 7208, 7211, 7220, 7350, 7351, 7778, 7791, 7796, 7809, 7818, 7824, 7832, 7847, 8259, 8264, 8266, 8744, 8758, 8798, 8804, 8817, 8819, 8823, 8825, 8827, 9373, 9378, 10210, 10212, 10214, 10216, 10218, 10220, 10222, 10223, 10225, 10230, 10231, 10232, 10233, 10235, 10236, 10237, 10238, 10503, 10505, 10507, 10716, 10717, 10720, 10722, 10726, 10727, 10752, 10754, 10761, 10763, 10974, 10983, 10984, 10985, 10988, 10995, 10997, 11006, 11015, 11167, 11355, 11356, 11358, 11359, 11362, 11363, 11365, 11371, 11373, 11374, 11408, 11410, 11506, 11507, 11508, 11

### Given a list of alarms, filter the clusters that contain all of them ("in") or that do not contain all of them ("not_in")


In [113]:
ITEMSET = [
    "dns-rrgnosecondary-alarms_RrgNoSecondary",
    "dns-rrgnoprimary-alarms_RrgNoPrimary",
]
MODE = "in"  # in,  not_in

print(f"ITEMSET scelto: {ITEMSET}")
print(f"MODE: {MODE}")

# Select the cluster table named by DATASET, reduced to the columns used below.
_frames_by_dataset = {
    "ne_id_loc_name": clusters_ne_id_loc_name_filtered,
    "ne_id_ne_address": clusters_ne_id_ne_address_first_three_octets_filtered,
}
if DATASET not in _frames_by_dataset:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = _frames_by_dataset[DATASET][[AGGREGATION_FIELD, "slogan", CONCATENATION]]

# Group the rows by cluster
clusters = data.groupby(AGGREGATION_FIELD)

# One row per cluster with its distinct item labels
cluster_concatenation_df = get_concatenation_by_cluster(clusters)

if MODE == "in":
    # clusters containing every alarm of ITEMSET
    filtered_cluster_ids = filter_cluster_ids_by_itemset(
        cluster_concatenation_df, ITEMSET
    )
elif MODE == "not_in":
    # clusters missing at least one alarm of ITEMSET
    _lacks_some_alarm = cluster_concatenation_df[CONCATENATION].apply(
        lambda x: not all(item in x for item in ITEMSET)
    )
    filtered_cluster_ids = cluster_concatenation_df[_lacks_some_alarm][
        AGGREGATION_FIELD
    ]
else:
    raise ValueError("MODE deve essere 'in', 'not_in'.")

_num_filtered = len(filtered_cluster_ids)
_num_clusters = len(clusters)

print("Filtered Cluster IDs:")
print(filtered_cluster_ids.tolist())
print("Number of filtered clusters: " + str(_num_filtered))
print("Total number of clusters: " + str(_num_clusters))
print(
    "Percentage of clusters filtered: "
    + str(round((_num_filtered / _num_clusters) * 100, 2))
    + "%"
)

CLUSTER_ID_TO_VIEW = 1254

print("\n")

# Show the rows of one matching cluster, if the chosen id is among the matches.
if CLUSTER_ID_TO_VIEW not in filtered_cluster_ids.values:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )
else:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))

ITEMSET scelto: ['dns-rrgnosecondary-alarms_RrgNoSecondary', 'dns-rrgnoprimary-alarms_RrgNoPrimary']
MODE: in
Filtered Cluster IDs:
[228, 229, 329, 332, 333, 336, 337, 392, 398, 407, 784, 786, 2241, 6242, 6251, 6520, 7716, 7758, 7759, 7763, 7764, 7765, 7766, 7768, 7769, 7773, 7774, 7778, 7783, 7787, 7789, 7797, 7801, 7803, 7805, 7809, 7818, 7819, 7821, 7823, 7830, 7831, 7838, 7845, 7848, 7849, 7920, 7922, 8003, 8149, 8151, 8153, 8156, 8159, 8162, 8164, 9377, 10499, 11038, 11040, 11041, 11537, 11538, 12592, 12749, 12753, 13055, 13063, 13845, 13891, 13904, 13928, 13931, 13934, 13941, 13952, 13964, 15800, 15806, 15810, 15816, 15818, 15825, 15827, 15832, 15886, 15897, 15898, 15900, 15907, 15917, 15921, 16921, 16922, 17388, 17398, 17663, 18178, 18182, 18184, 18329, 18421, 18444, 18456, 18546, 18624, 18626, 18680, 18974, 19085, 19336, 19340, 19342, 19373, 19377, 19382, 19383, 19391, 19392, 19405, 19436, 19451, 19453, 19458, 19469, 19495, 19497, 19504, 19513, 19517, 19537, 19539, 19544, 19546

###


### Query with inclusion and exclusion conditions for alarms (e.g., all clusters that contain A and do NOT contain B)


In [114]:
def filter_clusters(clusters, inclusion_list, exclusion_list):
    """Select the clusters that hold all required items and none of the excluded ones.

    Parameters
    ----------
    clusters : iterable of (cluster_id, cluster_items)
        ``cluster_items`` is an iterable of hashable items.
    inclusion_list : iterable
        Items that must all be present in a cluster.
    exclusion_list : iterable
        Items of which none may be present in a cluster.

    Returns
    -------
    list
        Ids of the matching clusters, in input order.
    """
    required = set(inclusion_list)
    forbidden = set(exclusion_list)

    return [
        cluster_id
        for cluster_id, cluster_items in clusters
        if required.issubset(cluster_items) and forbidden.isdisjoint(cluster_items)
    ]

In [115]:
# Alarms a cluster must contain (all of them) / must not contain (any of them).
inclusion_list = ["dns-nodeunmanagable-alarms_NodeUnmanagable", "dns-rrgnosecondary-alarms_RrgNoSecondary"]
exclusion_list = ["dns-snmpcoldstart-alarms_SNMPColdStart"]


if DATASET == "ne_id_loc_name":
    data = clusters_ne_id_loc_name_filtered[
        [AGGREGATION_FIELD, "slogan", CONCATENATION]
    ]
elif DATASET == "ne_id_ne_address":
    data = clusters_ne_id_ne_address_first_three_octets_filtered[
        [AGGREGATION_FIELD, "slogan", CONCATENATION]
    ]
else:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )

# One (cluster_id, set of item labels) pair per cluster. Iterating the grouped column
# replaces DataFrameGroupBy.apply on a frame that still contains the grouping column,
# which recent pandas versions warn about.
grouped_data = [
    (cluster_id, set(labels))
    for cluster_id, labels in data.groupby(AGGREGATION_FIELD)[CONCATENATION]
]


filtered_cluster_ids = filter_clusters(grouped_data, inclusion_list, exclusion_list)

# The total is taken from this cell's own grouping. It used to read `clusters`, a
# variable left over from an earlier cell, which breaks on a fresh kernel and may
# describe a different dataset.
total_clusters = len(grouped_data)

print("Filtered Cluster IDs:")
print(filtered_cluster_ids)
print("Number of filtered clusters: " + str(len(filtered_cluster_ids)))
print("Total number of clusters: " + str(total_clusters))
print(
    "Percentage of clusters filtered: "
    + str(round((len(filtered_cluster_ids) / total_clusters) * 100, 2))
    + "%"
)

CLUSTER_ID_TO_VIEW = 516
print("\n")


# Show the rows of one matching cluster, if the chosen id is among the matches.
if CLUSTER_ID_TO_VIEW in filtered_cluster_ids:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid", showindex=False))
else:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )

Filtered Cluster IDs:
[228, 329, 332, 333, 337, 392, 398, 400, 774, 782, 784, 786, 5097, 5098, 5107, 5957, 5966, 5970, 6007, 6242, 6251, 6520, 7139, 7150, 7716, 7758, 7759, 7763, 7764, 7765, 7766, 7768, 7773, 7774, 7778, 7783, 7787, 7789, 7797, 7801, 7803, 7805, 7809, 7845, 7848, 7849, 7920, 7922, 7991, 7995, 8003, 8149, 8151, 8153, 8156, 8162, 8264, 9377, 11038, 11040, 11041, 11537, 11538, 12592, 12658, 12660, 12664, 12666, 12668, 12669, 12671, 12673, 12675, 12677, 12679, 12681, 12683, 12685, 12687, 12689, 12691, 12693, 12695, 12697, 12699, 12701, 12705, 12707, 12709, 12711, 12713, 12714, 12716, 12718, 12720, 12721, 12732, 12734, 12735, 12737, 12739, 12741, 12743, 12745, 12747, 12748, 12749, 12750, 12753, 12754, 13049, 13050, 13053, 13055, 13765, 13786, 13845, 13849, 13855, 13891, 13904, 13924, 13928, 13931, 13934, 13941, 13952, 13964, 14463, 15800, 15806, 15810, 15816, 15818, 15825, 15827, 15832, 15887, 15897, 15898, 15900, 15907, 15917, 15921, 16574, 16576, 16921, 16922, 17388, 1739

  .apply(lambda x: (x.name, set(x[CONCATENATION])))


### Find Cluster by ID


In [119]:
# Select the cluster table named by DATASET, reduced to the columns shown below.
_frames_by_dataset = {
    "ne_id_loc_name": clusters_ne_id_loc_name_filtered,
    "ne_id_ne_address": clusters_ne_id_ne_address_first_three_octets_filtered,
}
if DATASET not in _frames_by_dataset:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = _frames_by_dataset[DATASET][[AGGREGATION_FIELD, "slogan", CONCATENATION]]

CLUSTER_ID_TO_VIEW = 333
print("\n")

# All rows (alarms) belonging to the requested cluster.
_belongs_to_cluster = data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW
sample_cluster = data[_belongs_to_cluster]
print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))




cluster_id2: 333
+-------+---------------+-------------------------------+--------------------------------------------------+
|       |   cluster_id2 | slogan                        | slogan_alarm_group                               |
| 26294 |           333 | dns-nodeunmanagable-alarms    | dns-nodeunmanagable-alarms_NodeUnmanagable       |
+-------+---------------+-------------------------------+--------------------------------------------------+
| 26271 |           333 | dns-node-down-alarms          | dns-node-down-alarms_NodeDown                    |
+-------+---------------+-------------------------------+--------------------------------------------------+
| 26281 |           333 | dns-rrgnoprimary-alarms       | dns-rrgnoprimary-alarms_RrgNoPrimary             |
+-------+---------------+-------------------------------+--------------------------------------------------+
| 26275 |           333 | dns-rrgnosecondary-alarms     | dns-rrgnosecondary-alarms_RrgNoSecondary         |
