In [133]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate

### Load Clusters


In [134]:
# Which cluster table is analysed: "ne_id_loc_name" or "ne_id_ne_address".
DATASET = "ne_id_loc_name"  # ne_id_ne_address
# Column whose values identify a cluster; rows are grouped by it to build baskets.
AGGREGATION_FIELD = "cluster_id2"  # cluster_id2, cluster_id3
# Column that provides the basket items: "slogan", "slogan_network" or
# "slogan_alarm_group" (any other value is rejected further down).
CONCATENATION = "slogan"  # "slogan_network", "slogan_alarm_group"
# Minimum support passed to fpgrowth.
MIN_SUPPORT = 0.005

# Metric and threshold passed to association_rules.
ASSOCIATION_RULES_METRIC = "confidence"  # confidence, lift
MIN_THRESHOLD = 0.01
# Maximum number of rules shown by print_first_N_rules.
NUM_ASSOCIATION_RULES = 20000

In [135]:
# Cluster table used when DATASET == "ne_id_loc_name".
# The path is relative, so the file is looked up in the current working directory.
clusters_ne_id_loc_name = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

In [136]:
# Cluster table used when DATASET == "ne_id_ne_address".
# The path is relative, so the file is looked up in the current working directory.
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

### Filter clusters with at least 2 devices inside


In [137]:
# Keep only the clusters (keyed by "cluster_id2") that involve at least two
# distinct devices (ne_id). The filter is skipped when aggregating by
# "cluster_id" or "cluster_id3".
if AGGREGATION_FIELD not in ("cluster_id", "cluster_id3"):
    # Boolean Series indexed by cluster_id2: True when the cluster has >= 2 devices.
    valid_clusters = (
        clusters_ne_id_loc_name.groupby("cluster_id2")["ne_id"].nunique().ge(2)
    )
    clusters_ne_id_loc_name = clusters_ne_id_loc_name[
        clusters_ne_id_loc_name["cluster_id2"].isin(
            valid_clusters[valid_clusters].index.tolist()
        )
    ]

    # Same filter for the ne_address-based table.
    valid_clusters = (
        clusters_ne_id_ne_address_first_three_octets.groupby("cluster_id2")["ne_id"]
        .nunique()
        .ge(2)
    )
    clusters_ne_id_ne_address_first_three_octets = (
        clusters_ne_id_ne_address_first_three_octets[
            clusters_ne_id_ne_address_first_three_octets["cluster_id2"].isin(
                valid_clusters[valid_clusters].index.tolist()
            )
        ]
    )

### Filter only important columns


In [138]:
# Columns kept for the rest of the analysis; both tables use the same subset.
SELECTED_COLUMNS = [
    "cluster_id",
    "cluster_id2",
    "cluster_id3",
    "ne_type",
    "probable_cause",
    "alarm_group",
    "network",
    "first_occurrence",
    "alarm_id",
]

clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name[SELECTED_COLUMNS]

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets[SELECTED_COLUMNS]
)

## Statistics


In [139]:
# Data-quality summary for clusters_ne_id_loc_name_filtered: for a few key
# columns, the share of rows (in percent of total_rows) holding a placeholder
# or a missing value.
# NOTE: every percentage divides by total_rows, so an empty frame raises
# ZeroDivisionError.
total_rows = len(clusters_ne_id_loc_name_filtered)

# Statistics for the ne_type field: literal "n/d" placeholder and missing values
filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["ne_type"] == "n/d"
]
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100

filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

# Statistics for the probable_cause field (missing values)
filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["probable_cause"].isna()
]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100

# Statistics for the alarm_group field (missing values)
filtered_rows_alarm_group_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["alarm_group"].isna()
]
num_filtered_rows_alarm_group_na = len(filtered_rows_alarm_group_na)
percentage_alarm_group_na = (num_filtered_rows_alarm_group_na / total_rows) * 100

# Statistics for the network field (missing values)

filtered_rows_network_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["network"].isna()
]
num_filtered_rows_network_na = len(filtered_rows_network_na)
percentage_network_na = (num_filtered_rows_network_na / total_rows) * 100

# Statistics for the network field with value 'rdg_others'
filtered_rows_network_rdg_others = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["network"] == "rdg_others"
]
num_filtered_rows_network_rdg_others = len(filtered_rows_network_rdg_others)
percentage_network_rdg_others = (
    num_filtered_rows_network_rdg_others / total_rows
) * 100

# Print the statistics (percentages formatted with two decimals)
print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%")
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 94.91%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 0.00%
Percentuale di righe con network uguale a 'rdg_others': 83.84%


In [140]:
# Data-quality summary for clusters_ne_id_ne_address_first_three_octets_filtered:
# share of rows (in percent of total_rows) holding a placeholder or a missing
# value in a few key columns.
# NOTE: this cell reuses the variable names of the previous statistics cell
# (total_rows, filtered_rows_*, num_filtered_rows_*, percentage_*), so after it
# runs those names describe this table. An empty frame raises ZeroDivisionError.
total_rows = len(clusters_ne_id_ne_address_first_three_octets_filtered)

# Statistics for the ne_type field: literal "n/d" placeholder and missing values
filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"] == "n/d"
]
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100

filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

# Statistics for the probable_cause field (missing values)
filtered_rows_probable_cause_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["probable_cause"].isna()
]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100

# Statistics for the alarm_group field (missing values)
filtered_rows_alarm_group_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["alarm_group"].isna()
]
num_filtered_rows_alarm_group_na = len(filtered_rows_alarm_group_na)
percentage_alarm_group_na = (num_filtered_rows_alarm_group_na / total_rows) * 100

# Statistics for the network field (missing values)
filtered_rows_network_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["network"].isna()
]
num_filtered_rows_network_na = len(filtered_rows_network_na)
percentage_network_na = (num_filtered_rows_network_na / total_rows) * 100

# Statistics for the network field with value 'rdg_others'
filtered_rows_network_rdg_others = (
    clusters_ne_id_ne_address_first_three_octets_filtered[
        clusters_ne_id_ne_address_first_three_octets_filtered["network"] == "rdg_others"
    ]
)
num_filtered_rows_network_rdg_others = len(filtered_rows_network_rdg_others)
percentage_network_rdg_others = (
    num_filtered_rows_network_rdg_others / total_rows
) * 100

# Print the statistics (percentages formatted with two decimals)
print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%")
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 77.06%
Percentuale di righe con ne_type uguale a NA: 0.14%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 8.64%
Percentuale di righe con network uguale a 'rdg_others': 69.86%


### Create new column based on the Aggregation


In [141]:
# "probable_cause" is exposed as "slogan" in both tables; it is the base item.
_SLOGAN_RENAME = {"probable_cause": "slogan"}
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(
    columns=_SLOGAN_RENAME
)
clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.rename(
        columns=_SLOGAN_RENAME
    )
)

# For the composite settings the item is "<slogan>_<other column>"; this maps
# each composite CONCATENATION value to the column appended to the slogan.
_SUFFIX_SOURCE = {
    "slogan_network": "network",
    "slogan_alarm_group": "alarm_group",
}

if CONCATENATION in _SUFFIX_SOURCE:
    _suffix_column = _SUFFIX_SOURCE[CONCATENATION]
    for _frame in (
        clusters_ne_id_loc_name_filtered,
        clusters_ne_id_ne_address_first_three_octets_filtered,
    ):
        _frame[CONCATENATION] = _frame["slogan"] + "_" + _frame[_suffix_column]
elif CONCATENATION != "slogan":
    # Plain "slogan" needs no extra column; anything else is a configuration error.
    raise ValueError(
        "Valore di CONCATENATION non valido. Deve essere 'slogan', 'slogan_network' o 'slogan_alarm_group'."
    )

### Drop rows with network == NA


In [142]:
# Only the "slogan_network" item needs a network value, so rows of the
# ne_address-based table without one are discarded in that configuration.
if CONCATENATION == "slogan_network":
    _network_known = clusters_ne_id_ne_address_first_three_octets_filtered[
        "network"
    ].notna()
    clusters_ne_id_ne_address_first_three_octets_filtered = (
        clusters_ne_id_ne_address_first_three_octets_filtered.loc[_network_known]
    )

## Preprocessing and FP-Growth


In [143]:
def create_baskets(data, aggregation_field):
    """Build the one-hot basket matrix consumed by the frequent-pattern miner.

    Rows of ``data`` are grouped by ``aggregation_field`` and, for each group,
    the values of the column named by the module-level ``CONCATENATION``
    setting are collected into one list (a "basket"). The baskets are then
    encoded with mlxtend's ``TransactionEncoder``.

    Returns a DataFrame with one row per group (default integer index, the
    group keys are not kept) and one column per item known to the encoder.
    """
    transactions = data.groupby(aggregation_field)[CONCATENATION].apply(list)

    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded = encoder.transform(transactions)

    return pd.DataFrame(encoded, columns=encoder.columns_)

In [144]:
def print_frequent_itemsets(frequent_itemsets):
    """Render a frequent-itemsets frame as a grid-formatted text table.

    Despite its name the function does not print: it returns the table as a
    string and the caller prints it.

    Side effect (kept on purpose): the "itemsets" column of the frame passed
    in is overwritten in place with ", "-joined strings. Later cells of this
    notebook rely on that representation (they call ``.split(", ")`` on the
    values of the same frame).

    Values that are already strings are left untouched, so calling the
    function more than once on the same frame is safe; previously a second
    call would have re-joined each string character by character.

    Parameters:
        frequent_itemsets: DataFrame with an "itemsets" column holding either
            iterables of item names or already-joined strings.

    Returns:
        str: the table produced by ``tabulate`` (grid format, row numbers shown).
    """

    def _join_items(items):
        # Already formatted by a previous call: keep as is.
        if isinstance(items, str):
            return items
        return ", ".join(list(items))

    frequent_itemsets["itemsets"] = frequent_itemsets["itemsets"].apply(_join_items)
    table = frequent_itemsets.values.tolist()
    return tabulate(
        table, headers=frequent_itemsets.columns, tablefmt="grid", showindex=True
    )

In [145]:
def print_first_N_rules(rules, N):
    """Print the first ``N`` association rules as a text table.

    Only the "antecedents", "consequents", "support", "confidence" and "lift"
    columns are shown. Antecedents and consequents are rendered as
    ", "-joined strings.

    The selection is copied before it is modified, so ``rules`` is never
    written to and pandas does not emit a chained-assignment warning. Sides
    that are already strings (as ``rules`` is after the notebook converts it)
    are printed unchanged instead of being split into characters.

    Parameters:
        rules: DataFrame of association rules containing the five columns
            listed above.
        N: maximum number of rows to print (``DataFrame.head`` semantics).
    """

    def _join_items(items):
        if isinstance(items, str):
            return items
        return ", ".join(list(items))

    rules_df = (
        rules[["antecedents", "consequents", "support", "confidence", "lift"]]
        .head(N)
        .copy()
    )
    for side in ("antecedents", "consequents"):
        rules_df[side] = rules_df[side].apply(_join_items)

    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers="keys", tablefmt="pretty", showindex=True))

### ITEMSETS AND ASSOCIATION RULES


In [146]:
# Resolve the cluster table selected by DATASET (fail fast on unknown values).
if DATASET not in ("ne_id_loc_name", "ne_id_ne_address"):
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = (
    clusters_ne_id_loc_name_filtered
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered
)

# Baskets -> frequent itemsets (FP-Growth), ordered by decreasing support.
basket_df = create_baskets(data, AGGREGATION_FIELD)
frequent_itemsets = fpgrowth(
    basket_df, min_support=MIN_SUPPORT, use_colnames=True
).sort_values(by="support", ascending=False)

# Working copy restricted to itemsets with more than one item. The full
# `frequent_itemsets` frame is left untouched for rule generation.
frequent_itemsets_copy = frequent_itemsets.copy()
_multi_item_mask = frequent_itemsets_copy["itemsets"].apply(lambda x: len(x) > 1)
frequent_itemsets_copy = frequent_itemsets_copy[_multi_item_mask]

# Support distribution over all frequent itemsets (single items included).
support_distribution = frequent_itemsets["support"].describe()
print("\nDistribuzione dei supporti:", support_distribution, "\n", sep="\n")

print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
# print_frequent_itemsets also converts the "itemsets" column of
# frequent_itemsets_copy to ", "-joined strings.
print(print_frequent_itemsets(frequent_itemsets_copy))


# Optional Excel export (disabled):
# file_name = f"frequent_itemsets_{AGGREGATION_FIELD}_{CONCATENATION}_{DATASET}.xlsx"

# frequent_itemsets_copy.to_excel(file_name, index=False)


Distribuzione dei supporti:
count    248.000000
mean       0.028515
std        0.073055
min        0.005014
25%        0.006432
50%        0.010819
75%        0.019429
max        0.749835
Name: support, dtype: float64


AGGREGATION: cluster_id2

+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                 |
|   0 | 0.390949   | dns-node-down-alarms, dns-nodeunmanagable-alarms                                                                                                         |
+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|   1 | 0.265734   | dns-link-down-alarms, dns-no

### Print Association Rules


In [147]:
# Derive association rules from all frequent itemsets.
rules = association_rules(
    frequent_itemsets,
    metric=ASSOCIATION_RULES_METRIC,
    min_threshold=MIN_THRESHOLD,
)

print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
print_first_N_rules(rules, NUM_ASSOCIATION_RULES)


# From here on both sides of every rule are plain ", "-joined strings; the
# exploration cells below split them on commas.
for _side in ("antecedents", "consequents"):
    rules[_side] = rules[_side].apply(lambda x: ", ".join(list(x)))


# Optional Excel export (disabled):
# file_name = f"association_rules-{AGGREGATION_FIELD}-{CONCATENATION}-{DATASET}.xlsx"

# rules.to_excel(file_name, index=False)

AGGREGATION: cluster_id2


Association Rules:
+------+------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+-----------------------+----------------------+---------------------+
|      |                                                            antecedents                                                             |                                                            consequents                                                             |        support        |      confidence      |        lift         |
+------+------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------

### Association Rules Sorted by Metrics


In [148]:
# Metric used to order the rules displayed below.
METRIC = "confidence"  # support, confidence, lift


def sort_rules(rules, metric="confidence", ascending=False):
    """Return ``rules`` ordered by one rule metric.

    Parameters:
        rules: DataFrame of association rules.
        metric: column to sort by; must be "support", "confidence" or "lift".
        ascending: sort direction, descending by default.

    Returns:
        A new DataFrame sorted by ``metric``.

    Raises:
        ValueError: if ``metric`` is not one of the three supported names.
    """
    allowed_metrics = ("support", "confidence", "lift")
    if metric in allowed_metrics:
        return rules.sort_values(by=metric, ascending=ascending)
    raise ValueError("La metrica deve essere 'support', 'confidence' o 'lift'.")


# Order every rule by METRIC (descending) and show the full table.
sorted_rules = sort_rules(rules, metric=METRIC)

print(f"\nRegole ordinate per: {METRIC}")

sorted_rules_df = pd.DataFrame(sorted_rules)
print(tabulate(sorted_rules_df, headers="keys", tablefmt="pretty", showindex=True))


Regole ordinate per: confidence
+------+------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+-----------------------+----------------------+---------------------+-------------------------+--------------------+------------------------+
|      |                                                            antecedents                                                             |                                                            consequents                                                             |  antecedent support   |  consequent support   |        support        |      confidence      |        lift         |        leverage         |     conviction     |     zhangs_metric      |
+------+-------------------------------

## Exploration Functionalities


### Filter Itemsets containing a given string

In [149]:
# Text searched for in the frequent itemsets.
FIELD = "dns-node-down-alarms"


def filter_itemsets_by_string(itemsets, search_string):
    """Keep the rows of ``itemsets`` whose "itemsets" value contains ``search_string``.

    Containment is evaluated with Python's ``in`` operator on each cell: a
    substring match when the cell is a string, a membership test when it is a
    collection of items.
    """

    def _mentions(entry):
        return search_string in entry

    keep = itemsets["itemsets"].apply(_mentions)
    return itemsets[keep]


# frequent_itemsets_copy holds ", "-joined strings in "itemsets" once
# print_frequent_itemsets has been called on it, so the lookup below is a
# substring match on that text.
filtered_itemsets = filter_itemsets_by_string(frequent_itemsets_copy, FIELD)


print(f"Itemsets che contengono '{FIELD}':")


# The row labels shown are the original index labels of the itemsets frame.
print(tabulate(filtered_itemsets, headers="keys", tablefmt="pretty"))

Itemsets che contengono 'dns-node-down-alarms':
+-----+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |        support        |                                                                         itemsets                                                                         |
+-----+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
| 54  |  0.3909486739675419   |                                                     dns-node-down-alarms, dns-nodeunmanagable-alarms                                                     |
| 55  |  0.2555746140651801   |                                                        dns-node-down-alarms, dns-link-down-alarms                                                        |
| 56  |  0.231297

### Filter Association Rules containing a given string

In [150]:
def search_in_rules(rules, field, search_in):
    """Keep the rules whose antecedents or consequents contain ``field``.

    Parameters:
        rules: DataFrame of association rules.
        field: value looked up with the ``in`` operator in each cell.
        search_in: side of the rule to inspect, "antecedents" or "consequents".

    Returns:
        The matching rows of ``rules``.

    Raises:
        ValueError: if ``search_in`` is neither "antecedents" nor "consequents".
    """
    if search_in not in ("antecedents", "consequents"):
        raise ValueError(
            "search_in deve essere 'antecedents', 'consequents'."
        )

    matches = rules[search_in].apply(lambda side: field in side)
    return rules[matches]

In [151]:
# Search parameters: the text to look for and the side of the rule to inspect.
FIELD = "dns-node-down-alarms"
SEARCH_IN = "antecedents"  # antecedents, consequents
METRIC = "confidence"  # support, confidence, lift

searched_rules = search_in_rules(rules, field=FIELD, search_in=SEARCH_IN)


# Optional: order the matches by METRIC before printing (disabled).
# sorted_rules = sort_rules(searched_rules, metric=METRIC)
# print("\nRegole ordinate per: " + METRIC)
# print_first_N_rules(sorted_rules, 1000)

print(f"Regole con <{FIELD}> negli {SEARCH_IN}:")
print(tabulate(searched_rules, headers="keys", tablefmt="pretty", showindex=True))

Regole con <dns-node-down-alarms> negli antecedents:
+------+------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+-----------------------+----------------------+---------------------+-------------------------+--------------------+-----------------------+
|      |                                                            antecedents                                                             |                                                            consequents                                                             |  antecedent support   |  consequent support   |        support        |      confidence      |        lift         |        leverage         |     conviction     |     zhangs_metric     |
+------+-------------

### Given an Association Rule, filter itemsets that contain all the elements of the rule


In [152]:
def find_rule_by_index(index, rules):
    """Fetch one rule by position and list all the items it involves.

    Parameters:
        index: positional row index into ``rules`` (``iloc`` semantics).
        rules: DataFrame whose "antecedents" and "consequents" cells are
            comma-separated strings.

    Returns:
        A tuple ``(antecedent, consequent, combined_rule)`` where the first
        two are the raw strings of the row and ``combined_rule`` is the list
        of whitespace-stripped items of the antecedent followed by those of
        the consequent.
    """
    row = rules.iloc[index]
    antecedent, consequent = row["antecedents"], row["consequents"]

    combined_rule = [
        token.strip()
        for side in (antecedent, consequent)
        for token in side.split(",")
    ]
    return antecedent, consequent, combined_rule



def find_itemsets_with_rule(itemsets, rule):
    """Return the entries of ``itemsets`` that contain every element of ``rule``.

    Each entry is an iterable of hashable values; it is kept when the set of
    its values is a superset of ``set(rule)``. The original order is preserved.
    """
    required = set(rule)

    matching = []
    for candidate in itemsets:
        if required <= set(candidate):
            matching.append(candidate)
    return matching



def format_itemsets(itemsets):
    """Flatten an itemsets frame into ``[position, first column, item, item, ...]`` rows.

    ``itemsets`` is read row by row: the first column is copied as is and the
    second column, a ", "-separated string, is split into individual stripped
    items. ``position`` is the 0-based row position, not the DataFrame index
    label.
    """
    rows = []
    for position, record in enumerate(itemsets.values.tolist()):
        first_value, joined_items = record[0], record[1]
        pieces = [piece.strip() for piece in joined_items.split(", ")]
        rows.append([position, first_value, *pieces])
    return rows

In [153]:
# A rule can also be given by hand as a list of items, e.g.:
# RULE = [
#     "dns-snmplinkup-alarms",
#     "dns-node-down-alarms",
#     "dns-ciscofruinserted-alarms",
#     "dns-snmpcoldstart-alarms",
#     "dns-nodeunmanagable-alarms",
# ]

# Positional index of the rule to inspect in `rules`.
RULE_INDEX = 9

antecedent, consequent, RULE = find_rule_by_index(RULE_INDEX, rules)

print(f"Regola di associazione scelta: {antecedent} => {consequent}", "\n", sep="\n")

# Itemsets (with more than one item) that contain every item of the rule.
itemsets = frequent_itemsets_copy
formatted_itemsets = format_itemsets(itemsets)
itemsets_with_rule = find_itemsets_with_rule(formatted_itemsets, RULE)

# Re-join the items of each match so that a row is [position, support, "a, b, ..."].
itemsets_with_rule_transformed = [
    [position, support, ", ".join(items)]
    for position, support, *items in itemsets_with_rule
]
headers = ["support", "itemsets"]
print(tabulate(itemsets_with_rule_transformed, headers=headers, tablefmt="grid"))

Regola di associazione scelta: dns-node-down-alarms => dns-link-down-alarms, dns-nodeunmanagable-alarms


+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                 |
|   3 | 0.231297   | dns-node-down-alarms, dns-link-down-alarms, dns-nodeunmanagable-alarms                                                                                   |
+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|  30 | 0.0317984  | dns-node-down-alarms, dns-link-down-alarms, dns-nodeunmanagable-alarms, dns-snmplinkup-alarms                                                            |
+-----+-------

### Given an itemset, filter the clusters that contain the alarms


In [154]:
def find_itemset_by_index(index, itemsets):
    """Return the items of the itemset at positional row ``index`` as a list.

    The "itemsets" cell of that row must be a ", "-separated string; it is
    split on that separator.
    """
    return itemsets.iloc[index]["itemsets"].split(", ")

In [155]:
def get_concatenation_by_cluster(clusters):
    """Collect the distinct items of every cluster into a DataFrame.

    ``clusters`` is an iterable of ``(cluster_id, frame)`` pairs, such as a
    pandas GroupBy. For each pair the distinct values of the column named by
    the module-level ``CONCATENATION`` setting are gathered (in no particular
    order). Pairs whose frame lacks that column are reported with a message
    and skipped.

    Returns a DataFrame with two columns, named after ``AGGREGATION_FIELD``
    (the cluster id) and ``CONCATENATION`` (the list of distinct items).
    """
    records = []
    for cluster_id, cluster_data in clusters:
        if CONCATENATION not in cluster_data.columns:
            print(f"Cluster {cluster_id} non ha il campo {CONCATENATION}")
            continue
        distinct_items = list(set(cluster_data[CONCATENATION].tolist()))
        records.append({AGGREGATION_FIELD: cluster_id, CONCATENATION: distinct_items})
    return pd.DataFrame(records)

In [156]:
def filter_cluster_ids_by_itemset(cluster_slogans_df, itemset):
    """Return the ids of the clusters that contain every item of ``itemset``.

    ``cluster_slogans_df`` must have the columns named by the module-level
    ``CONCATENATION`` (iterable of items per cluster) and ``AGGREGATION_FIELD``
    (cluster id) settings. The result is the ``AGGREGATION_FIELD`` column
    restricted to the matching rows.
    """

    def _covers_itemset(cluster_items):
        return set(itemset).issubset(set(cluster_items))

    matches = cluster_slogans_df[CONCATENATION].apply(_covers_itemset)
    return cluster_slogans_df.loc[matches, AGGREGATION_FIELD]

In [157]:
# Positional row index (iloc) of the itemset to inspect in frequent_itemsets_copy.
ITEMSET_INDEX = 2

# Relies on frequent_itemsets_copy["itemsets"] holding ", "-joined strings.
ITEMSET = find_itemset_by_index(ITEMSET_INDEX, frequent_itemsets_copy)

print(f"ITEMSET scelto: {ITEMSET}")


# Columns shown when a cluster is displayed.
common_columns = [AGGREGATION_FIELD, CONCATENATION, "first_occurrence", "alarm_id"]

# With a composite item column, the plain slogan is shown next to the cluster id as well.
if CONCATENATION != "slogan":
    common_columns.insert(1, "slogan")

# Select the data according to the value of DATASET
if DATASET == "ne_id_loc_name":
    data = clusters_ne_id_loc_name_filtered[common_columns]
elif DATASET == "ne_id_ne_address":
    data = clusters_ne_id_ne_address_first_three_octets_filtered[common_columns]
else:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )

# Group the data by cluster
clusters = data.groupby(AGGREGATION_FIELD)


# Get every cluster together with its distinct items
cluster_slogans_df = get_concatenation_by_cluster(clusters)


# Keep the clusters that contain every item of ITEMSET
filtered_cluster_ids = filter_cluster_ids_by_itemset(cluster_slogans_df, ITEMSET)


print("Filtered Cluster IDs (aggregation_field = {}):".format(AGGREGATION_FIELD))
print(filtered_cluster_ids.tolist())
print("Number of filtered clusters: " + str(len(filtered_cluster_ids)))
# len() of the GroupBy is the number of groups, i.e. of clusters.
print("Total number of clusters: " + str(len(clusters)))
print(
    "Percentage of clusters filtered: "
    + str(round((len(filtered_cluster_ids) / len(clusters)) * 100, 2))
    + "%"
)

# Cluster whose rows are displayed, provided it is among the filtered ones.
CLUSTER_ID_TO_VIEW = 229


if CLUSTER_ID_TO_VIEW in filtered_cluster_ids.values:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))
else:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )

ITEMSET scelto: ['dns-node-down-alarms', 'dns-link-down-alarms']
Filtered Cluster IDs (aggregation_field = cluster_id2):
[4, 67, 68, 282, 283, 285, 287, 290, 293, 294, 301, 305, 307, 309, 310, 311, 314, 315, 316, 317, 318, 324, 325, 327, 328, 403, 404, 406, 437, 482, 516, 517, 631, 632, 649, 679, 681, 685, 686, 688, 700, 704, 708, 713, 728, 730, 737, 744, 748, 750, 752, 756, 758, 848, 852, 888, 898, 901, 902, 922, 1007, 1010, 1020, 1022, 1024, 1025, 1029, 1030, 1036, 1168, 1170, 1254, 1256, 1514, 1521, 1530, 1535, 1537, 1540, 1541, 1543, 1545, 1548, 1641, 1832, 1833, 1834, 1836, 1837, 1838, 1839, 1841, 1842, 1844, 1845, 1849, 1852, 1855, 1856, 1858, 1864, 1866, 1871, 1898, 1929, 2057, 2061, 2165, 2167, 2170, 2171, 2174, 2207, 2327, 2333, 2338, 2343, 2344, 2566, 2569, 2615, 2616, 2617, 2618, 2619, 2621, 2623, 2626, 2628, 2631, 2632, 2633, 2635, 2885, 2886, 3044, 3046, 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3057, 3060, 3061, 3062, 3064, 3065, 3066, 3068, 3069, 3070, 3073, 

### Given a list of alarms, filter the clusters that contain all of those alarms (`in`) or that do not contain all of them (`not_in`)


In [158]:
# Alarms to look for.
# NOTE(review): "link-down-alarms" differs from the "dns-link-down-alarms" item
# used elsewhere in this notebook -- confirm it is intentional. An item that
# never occurs makes every cluster match in "not_in" mode and none in "in" mode.
ITEMSET = [
    "dns-node-down-alarms",
    "link-down-alarms",
]
# "in": clusters containing every item of ITEMSET.
# "not_in": the complement, i.e. clusters missing at least one item of ITEMSET.
MODE = "not_in"  # in,  not_in


print(f"ITEMSET scelto: {ITEMSET}")
print(f"MODE: {MODE}")



# Columns shown when a cluster is displayed.
common_columns = [AGGREGATION_FIELD, CONCATENATION, "first_occurrence", "alarm_id"]

# With a composite item column, the plain slogan is shown next to the cluster id as well.
if CONCATENATION != "slogan":
    common_columns.insert(1, "slogan")

# Select the data according to the value of DATASET
if DATASET == "ne_id_loc_name":
    data = clusters_ne_id_loc_name_filtered[common_columns]
elif DATASET == "ne_id_ne_address":
    data = clusters_ne_id_ne_address_first_three_octets_filtered[common_columns]
else:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )


# Group the data by cluster
clusters = data.groupby(AGGREGATION_FIELD)

# Get every cluster together with its distinct items
cluster_concatenation_df = get_concatenation_by_cluster(clusters)



if MODE == "not_in":

    # Keep a cluster as soon as one item of ITEMSET is missing from it.
    filtered_cluster_ids = cluster_concatenation_df[
        cluster_concatenation_df[CONCATENATION].apply(
            lambda x: not all(item in x for item in ITEMSET)
        )

    ][AGGREGATION_FIELD]
elif MODE == "in":
    filtered_cluster_ids = filter_cluster_ids_by_itemset(
        cluster_concatenation_df, ITEMSET
    )
else:
    raise ValueError("MODE deve essere 'in', 'not_in'.")



print("Filtered Cluster IDs:")
print(filtered_cluster_ids.tolist())
print("Number of filtered clusters: " + str(len(filtered_cluster_ids)))
# len() of the GroupBy is the number of groups, i.e. of clusters.
print("Total number of clusters: " + str(len(clusters)))
print(
    "Percentage of clusters filtered: "
    + str(round((len(filtered_cluster_ids) / len(clusters)) * 100, 2))
    + "%"
)


# Cluster whose rows are displayed, provided it is among the filtered ones.
CLUSTER_ID_TO_VIEW = 13


print("\n")



if CLUSTER_ID_TO_VIEW in filtered_cluster_ids.values:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))
else:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )

ITEMSET scelto: ['dns-node-down-alarms', 'link-down-alarms']
MODE: not_in
Filtered Cluster IDs:
[2, 3, 4, 6, 67, 68, 69, 135, 136, 137, 139, 140, 141, 144, 148, 223, 230, 231, 234, 236, 237, 238, 241, 242, 244, 245, 246, 247, 249, 251, 252, 253, 254, 255, 263, 270, 275, 279, 282, 283, 285, 287, 290, 292, 293, 294, 295, 298, 300, 301, 305, 306, 307, 309, 310, 311, 314, 315, 316, 317, 318, 321, 322, 324, 325, 327, 328, 338, 339, 343, 345, 347, 348, 350, 351, 353, 359, 361, 362, 364, 365, 369, 372, 373, 374, 376, 378, 380, 381, 383, 384, 385, 387, 388, 389, 392, 394, 399, 400, 401, 402, 403, 404, 405, 406, 409, 410, 412, 414, 415, 416, 417, 418, 420, 421, 422, 423, 429, 430, 433, 437, 461, 470, 474, 482, 516, 517, 560, 561, 567, 576, 627, 631, 632, 637, 638, 639, 640, 643, 649, 679, 681, 685, 686, 688, 700, 704, 708, 713, 721, 728, 730, 737, 744, 748, 750, 752, 756, 758, 759, 760, 761, 762, 764, 768, 769, 773, 776, 778, 779, 780, 781, 783, 785, 786, 788, 789, 804, 848, 852, 888, 890, 893,

### Query with inclusion and exclusion conditions for alarms (e.g., all clusters that contain A and do NOT contain B)


In [159]:
def filter_clusters(clusters, inclusion_list, exclusion_list):
    """Select the clusters that hold all required items and none of the excluded ones.

    Parameters:
        clusters: iterable of ``(cluster_id, cluster_items)`` pairs, where
            ``cluster_items`` is an iterable of items.
        inclusion_list: items that must all be present in a cluster.
        exclusion_list: items that must all be absent from a cluster.

    Returns:
        list: ids of the matching clusters, in input order.
    """
    required = set(inclusion_list)
    forbidden = set(exclusion_list)

    return [
        cluster_id
        for cluster_id, cluster_items in clusters
        if required.issubset(cluster_items) and forbidden.isdisjoint(cluster_items)
    ]

In [160]:
# Items every selected cluster must contain / must not contain.
inclusion_list = ["dns-nodeunmanagable-alarms", "dns-rrgnosecondary-alarms"]
exclusion_list = ["dns-snmpcoldstart-alarms"]


# Columns shown when a cluster is displayed.
common_columns = [AGGREGATION_FIELD, CONCATENATION, "first_occurrence", "alarm_id"]

# With a composite item column, the plain slogan is shown next to the cluster id as well.
if CONCATENATION != "slogan":
    common_columns.insert(1, "slogan")

# Select the data according to the value of DATASET
if DATASET == "ne_id_loc_name":
    data = clusters_ne_id_loc_name_filtered[common_columns]
elif DATASET == "ne_id_ne_address":
    data = clusters_ne_id_ne_address_first_three_octets_filtered[common_columns]
else:
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )

# One (cluster id, set of distinct items) pair per cluster. Iterating the
# GroupBy directly avoids GroupBy.apply operating on the grouping column,
# which pandas has deprecated.
grouped_data = [
    (cluster_id, set(cluster_rows[CONCATENATION]))
    for cluster_id, cluster_rows in data.groupby(AGGREGATION_FIELD)
]


filtered_cluster_ids = filter_clusters(grouped_data, inclusion_list, exclusion_list)

# Totals come from this cell's own grouping, not from a `clusters` variable
# left over from a previously executed cell.
total_clusters = len(grouped_data)
# Guard against an empty table so the summary can still be printed.
filtered_percentage = (
    round((len(filtered_cluster_ids) / total_clusters) * 100, 2)
    if total_clusters
    else 0.0
)

print("Filtered Cluster IDs:")
print(filtered_cluster_ids)
print("Number of filtered clusters: " + str(len(filtered_cluster_ids)))
print("Total number of clusters: " + str(total_clusters))
print("Percentage of clusters filtered: " + str(filtered_percentage) + "%")

# Cluster whose rows are displayed, provided it is among the filtered ones.
CLUSTER_ID_TO_VIEW = 228
print("\n")


if CLUSTER_ID_TO_VIEW in filtered_cluster_ids:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid", showindex=False))
else:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )

Filtered Cluster IDs:
[516, 517, 1254, 1256, 1555, 1590, 1875, 1876, 1881, 1882, 1885, 1886, 1887, 1889, 1890, 1891, 1892, 1895, 1896, 1897, 2208, 2274, 2276, 2278, 2279, 2625, 2886, 3026, 3060, 3070, 3629, 3696, 3697, 4065, 4066, 4208, 4217, 4482, 4483, 5145, 5146, 5149, 5151, 5531, 5757, 5761, 5933, 6239, 6240, 6509, 6690, 6693, 6694, 7178, 7179, 7180, 7196, 7242, 8226, 8234, 8283, 8285, 8286, 8298, 8320, 8328, 8331, 8332, 8333, 8334, 8336, 8341, 8355, 8362, 8366, 8368, 8373, 8375, 9410, 9460, 9462, 9465, 9470, 9476, 9478, 9479, 9480, 9485, 9489, 9492, 9494, 9495, 9497, 9499, 9503, 9507, 9509, 9516, 9530, 9532, 9547, 9551, 9552, 9555, 9582, 9608, 9814, 11128, 11160, 11163, 11370, 11378, 11384, 11386, 11392, 11400, 11456, 11466, 11468, 11480, 11519, 11526, 11528, 11533, 11539, 11543, 11707, 11708, 11709, 12101, 12204, 12436, 12437, 12630, 12638, 12904, 13464, 13530, 13631, 14357, 14686, 14687, 14698, 15344, 15495, 15591, 15594, 15596, 15597, 15670, 15682, 15691, 15747, 15796, 15800, 1

  .apply(lambda x: (x.name, set(x[CONCATENATION])))


### Given an Itemset, find all the rules associated with that itemset

In [161]:
def concatenate_antecedents_consequents(row):
    """List every item of a rule: antecedent items first, then consequent items.

    ``row`` is any mapping-like object (for instance a DataFrame row) whose
    "antecedents" and "consequents" values are comma-separated strings. Each
    item is stripped of surrounding whitespace.
    """
    sides = (row["antecedents"], row["consequents"])
    return [item.strip() for side in sides for item in side.split(",")]

In [162]:
def filter_rules_by_itemset(rules_df, itemset):
    """Keep the rules whose antecedent and consequent items all belong to ``itemset``.

    The caller's ``rules_df`` is left untouched: the helper column is added to
    a copy, so re-running the cell never accumulates state on the shared
    rules table.

    Args:
        rules_df: DataFrame of association rules with "antecedents" and
            "consequents" columns in the format expected by
            ``concatenate_antecedents_consequents``.
        itemset: Iterable of items a rule is allowed to mention.

    Returns:
        A new DataFrame holding the matching rows of ``rules_df`` plus a
        "concatenated" column (list of the rule's antecedent and consequent
        items).
    """
    allowed_items = set(itemset)

    # Build the per-rule item lists explicitly instead of DataFrame.apply, so
    # an empty rules table yields an empty column rather than an empty frame.
    concatenated = pd.Series(
        [concatenate_antecedents_consequents(rule) for _, rule in rules_df.iterrows()],
        index=rules_df.index,
        dtype=object,
    )

    # assign() returns a copy, so the input frame does not gain the column.
    annotated_rules = rules_df.assign(concatenated=concatenated)

    keep = pd.Series(
        [set(items).issubset(allowed_items) for items in concatenated],
        index=rules_df.index,
        dtype=bool,
    )
    return annotated_rules[keep]


In [163]:
ITEMSET_INDEX = 1

# Look up the itemset at the chosen index among the frequent itemsets.
ITEMSET = find_itemset_by_index(ITEMSET_INDEX, frequent_itemsets_copy)
print(f"ITEMSET scelto: {ITEMSET}")

# Columns to keep; "slogan" goes right after the aggregation field whenever
# the concatenation column is some other field.
common_columns = [AGGREGATION_FIELD]
if CONCATENATION != "slogan":
    common_columns.append("slogan")
common_columns.extend([CONCATENATION, "first_occurrence", "alarm_id"])

# Select the data according to DATASET, rejecting unknown values first.
if DATASET not in ("ne_id_loc_name", "ne_id_ne_address"):
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = (
    clusters_ne_id_loc_name_filtered[common_columns]
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered[common_columns]
)

# Filter the association rules down to the chosen itemset and keep only the
# rule sides and their metrics for display.
filtered_rules_df = filter_rules_by_itemset(rules, ITEMSET)[
    ["antecedents", "consequents", "support", "confidence", "lift"]
]

print(tabulate(filtered_rules_df, headers="keys", tablefmt="pretty", showindex=True))


ITEMSET scelto: ['dns-link-down-alarms', 'dns-nodeunmanagable-alarms']
+---+----------------------------+----------------------------+---------------------+---------------------+--------------------+
|   |        antecedents         |        consequents         |       support       |     confidence      |        lift        |
+---+----------------------------+----------------------------+---------------------+---------------------+--------------------+
| 2 |    dns-link-down-alarms    | dns-nodeunmanagable-alarms | 0.26573426573426573 | 0.5741163055872291  | 0.7656567798778127 |
| 3 | dns-nodeunmanagable-alarms |    dns-link-down-alarms    | 0.26573426573426573 | 0.35439028682034135 | 0.7656567798778127 |
+---+----------------------------+----------------------------+---------------------+---------------------+--------------------+


### Find Cluster by ID

In [164]:
# Columns to keep; "slogan" goes right after the aggregation field whenever
# the concatenation column is some other field.
common_columns = [AGGREGATION_FIELD]
if CONCATENATION != "slogan":
    common_columns.append("slogan")
common_columns.extend([CONCATENATION, "first_occurrence", "alarm_id"])

# Select the data according to DATASET, rejecting unknown values first.
if DATASET not in ("ne_id_loc_name", "ne_id_ne_address"):
    raise ValueError(
        "Valore di DATASET non valido. Deve essere 'ne_id_loc_name' o 'ne_id_ne_address'."
    )
data = (
    clusters_ne_id_loc_name_filtered[common_columns]
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered[common_columns]
)


CLUSTER_ID_TO_VIEW = 333
print("\n")


# Rows whose aggregation field equals the requested cluster ID.
sample_cluster = data.loc[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))




cluster_id2: 333
+---------------+----------+--------------------+------------+
| cluster_id2   | slogan   | first_occurrence   | alarm_id   |
+---------------+----------+--------------------+------------+
