In [1267]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate

### Load Clusters


In [1268]:
# Which cluster dataset to mine: "ne_id_loc_name" selects the loc_name clusters;
# any other value (e.g. "ne_id_ne_address") selects the ne_address
# first-three-octets clusters.
DATASET = "ne_id_loc_name"  # ne_id_ne_address
# Column whose values define one basket/transaction each.
# "cluster_id" additionally skips the ">= 2 devices" filter.
AGGREGATION_FIELD = "cluster_id2"  # cluster_id, cluster_id2, cluster_id3
# Name of the item column built for the baskets; it also selects how it is built.
CONCATENATION = "slogan_network"  # "slogan", "slogan_network", "slogan_alarm_group"
# Minimum support passed to fpgrowth.
MIN_SUPPORT = 0.005

# Metric and threshold passed to association_rules.
ASSOCIATION_RULES_METRIC = "confidence"  # confidence, lift
MIN_THRESHOLD = 0.01
# Number of rules shown in the first association-rule table.
NUM_ASSOCIATION_RULES = 30

In [1269]:
# Load the clusters for the ne_id / loc_name aggregation.
# NOTE(review): the path is relative to the notebook's working directory — confirm
# where the parquet file is expected to live.
clusters_ne_id_loc_name = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

In [1270]:
# Load the clusters for the ne_id / ne_address (first three octets) aggregation.
# NOTE(review): the path is relative to the notebook's working directory — confirm
# where the parquet file is expected to live.
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

### Filter clusters with at least 2 devices inside


In [1271]:
# Keep only the clusters that involve at least two distinct devices (ne_id).
# Nothing is filtered when aggregating on "cluster_id".
# NOTE(review): the filter always groups on "cluster_id2", even when
# AGGREGATION_FIELD is "cluster_id3" — confirm this is intended.
if AGGREGATION_FIELD != "cluster_id":
    # ne_id_loc_name dataset
    valid_clusters = (
        clusters_ne_id_loc_name.groupby("cluster_id2")["ne_id"].nunique() >= 2
    )
    clusters_ne_id_loc_name = clusters_ne_id_loc_name[
        clusters_ne_id_loc_name["cluster_id2"].isin(
            valid_clusters[valid_clusters].index.tolist()
        )
    ]

    # ne_id_ne_address (first three octets) dataset
    valid_clusters = (
        clusters_ne_id_ne_address_first_three_octets.groupby("cluster_id2")[
            "ne_id"
        ].nunique()
        >= 2
    )
    clusters_ne_id_ne_address_first_three_octets = (
        clusters_ne_id_ne_address_first_three_octets[
            clusters_ne_id_ne_address_first_three_octets["cluster_id2"].isin(
                valid_clusters[valid_clusters].index.tolist()
            )
        ]
    )

### Filter only important columns


In [1272]:
# Project both datasets onto the columns used by the rest of the analysis:
# the three cluster identifiers plus the alarm descriptors.
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name.loc[
    :,
    [
        "cluster_id",
        "cluster_id2",
        "cluster_id3",
        "ne_type",
        "probable_cause",
        "alarm_group",
        "network",
    ],
]

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets.loc[
        :,
        [
            "cluster_id",
            "cluster_id2",
            "cluster_id3",
            "ne_type",
            "probable_cause",
            "alarm_group",
            "network",
        ],
    ]
)

## Statistics


In [1273]:
# Data-quality report for the ne_id_loc_name clusters: for each field, the
# percentage of rows holding a given value ("n/d", "rdg_others") or NA.
total_rows = clusters_ne_id_loc_name_filtered.shape[0]

# ne_type: rows equal to the literal "n/d"
filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = num_filtered_rows_ne_type_nd / total_rows * 100

# ne_type: missing values
filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = num_filtered_rows_ne_type_na / total_rows * 100

# probable_cause: missing values
filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["probable_cause"].isna()
]
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = num_filtered_rows_probable_cause_na / total_rows * 100

# alarm_group: missing values
filtered_rows_alarm_group_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["alarm_group"].isna()
]
num_filtered_rows_alarm_group_na = filtered_rows_alarm_group_na.shape[0]
percentage_alarm_group_na = num_filtered_rows_alarm_group_na / total_rows * 100

# network: missing values
filtered_rows_network_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["network"].isna()
]
num_filtered_rows_network_na = filtered_rows_network_na.shape[0]
percentage_network_na = num_filtered_rows_network_na / total_rows * 100

# network: rows equal to 'rdg_others'
filtered_rows_network_rdg_others = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["network"].eq("rdg_others")
]
num_filtered_rows_network_rdg_others = filtered_rows_network_rdg_others.shape[0]
percentage_network_rdg_others = num_filtered_rows_network_rdg_others / total_rows * 100

# Print the report
print(
    f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%"
)
print(
    f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%"
)
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(
    f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%"
)
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 94.91%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 0.00%
Percentuale di righe con network uguale a 'rdg_others': 83.84%


In [1274]:
# Data-quality report for the ne_id_ne_address (first three octets) clusters:
# for each field, the percentage of rows holding a given value or NA.
total_rows = clusters_ne_id_ne_address_first_three_octets_filtered.shape[0]

# ne_type: rows equal to the literal "n/d"
filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = num_filtered_rows_ne_type_nd / total_rows * 100

# ne_type: missing values
filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = num_filtered_rows_ne_type_na / total_rows * 100

# probable_cause: missing values
filtered_rows_probable_cause_na = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["probable_cause"].isna()
    ]
)
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = num_filtered_rows_probable_cause_na / total_rows * 100

# alarm_group: missing values
filtered_rows_alarm_group_na = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["alarm_group"].isna()
    ]
)
num_filtered_rows_alarm_group_na = filtered_rows_alarm_group_na.shape[0]
percentage_alarm_group_na = num_filtered_rows_alarm_group_na / total_rows * 100

# network: missing values
filtered_rows_network_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["network"].isna()
]
num_filtered_rows_network_na = filtered_rows_network_na.shape[0]
percentage_network_na = num_filtered_rows_network_na / total_rows * 100

# network: rows equal to 'rdg_others'
filtered_rows_network_rdg_others = (
    clusters_ne_id_ne_address_first_three_octets_filtered.loc[
        clusters_ne_id_ne_address_first_three_octets_filtered["network"].eq(
            "rdg_others"
        )
    ]
)
num_filtered_rows_network_rdg_others = filtered_rows_network_rdg_others.shape[0]
percentage_network_rdg_others = num_filtered_rows_network_rdg_others / total_rows * 100

# Print the report
print(
    f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%"
)
print(
    f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%"
)
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)
print(
    f"Percentuale di righe con alarm_group uguale a NA: {percentage_alarm_group_na:.2f}%"
)
print(
    f"Percentuale di righe con network uguale a NA: {percentage_network_na:.2f}%"
)
print(
    f"Percentuale di righe con network uguale a 'rdg_others': {percentage_network_rdg_others:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 77.06%
Percentuale di righe con ne_type uguale a NA: 0.14%
Percentuale di righe con probable_cause uguale a NA: 0.00%
Percentuale di righe con alarm_group uguale a NA: 0.00%
Percentuale di righe con network uguale a NA: 8.64%
Percentuale di righe con network uguale a 'rdg_others': 69.86%


### Create new column based on the Aggregation


In [1275]:
# Build the item column named by CONCATENATION on both cluster datasets.
# From here on "probable_cause" is exposed as "slogan".
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(
    columns={"probable_cause": "slogan"}
)

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.rename(
        columns={"probable_cause": "slogan"}
    )
)

# Pick the column appended to the slogan. None means the slogan is used as-is.
if CONCATENATION == "slogan":
    suffix_column = None
elif CONCATENATION == "slogan_network":
    suffix_column = "network"
elif CONCATENATION == "slogan_alarm_group":
    suffix_column = "alarm_group"
else:
    raise ValueError(
        "Valore di CONCATENATION non valido. Deve essere 'slogan', 'slogan_network' o 'slogan_alarm_group'."
    )

# Both datasets get the same column name (CONCATENATION), which is the one the
# basket-building code reads. Previously the "slogan" case wrote an unused
# "slogan_netype" column on the ne_address dataset instead; in that case the
# "slogan" column already exists on both frames, so nothing has to be added.
if suffix_column is not None:
    clusters_ne_id_loc_name_filtered[CONCATENATION] = (
        clusters_ne_id_loc_name_filtered["slogan"]
        + "_"
        + clusters_ne_id_loc_name_filtered[suffix_column]
    )
    clusters_ne_id_ne_address_first_three_octets_filtered[CONCATENATION] = (
        clusters_ne_id_ne_address_first_three_octets_filtered["slogan"]
        + "_"
        + clusters_ne_id_ne_address_first_three_octets_filtered[suffix_column]
    )

### Drop rows with network == NA


In [1276]:
# When items are built from the network, rows without a network cannot form a
# valid item: keep only the ne_address rows whose network is present.
if CONCATENATION == "slogan_network":
    clusters_ne_id_ne_address_first_three_octets_filtered = (
        clusters_ne_id_ne_address_first_three_octets_filtered[
            clusters_ne_id_ne_address_first_three_octets_filtered["network"].notna()
        ]
    )

## Preprocessing and FP-Growth


In [1277]:
def create_baskets(data, aggregation_field, item_field=None):
    """One-hot encode the items of each group into a transaction table.

    Args:
        data: DataFrame holding one row per alarm.
        aggregation_field: Column whose values identify a basket (one basket
            per distinct value).
        item_field: Column providing the items of each basket. Defaults to the
            notebook-level ``CONCATENATION`` setting, resolved at call time.

    Returns:
        DataFrame with one row per basket and one column per distinct item, as
        produced by ``TransactionEncoder``.
    """
    if item_field is None:
        item_field = CONCATENATION

    baskets = data.groupby(aggregation_field)[item_field].apply(list)
    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)

    return basket_df

In [1278]:
def print_frequent_itemsets(frequent_itemsets):
    """Render the frequent itemsets as a grid table.

    Side effect (relied upon by later cells): the ``itemsets`` column of the
    frame passed in is rewritten in place, each itemset becoming a single
    ``", "``-joined string.

    The conversion is idempotent: values that are already strings are left
    untouched, so calling the function twice on the same frame no longer
    splits the text into characters. Items are sorted so the rendered text
    does not depend on set iteration order.

    Args:
        frequent_itemsets: DataFrame with an ``itemsets`` column holding
            collections of item names (or already-joined strings).

    Returns:
        The table as a string, with a positional index column.
    """

    def _as_text(itemset):
        if isinstance(itemset, str):
            return itemset
        return ", ".join(sorted(itemset))

    frequent_itemsets["itemsets"] = frequent_itemsets["itemsets"].apply(_as_text)
    table = frequent_itemsets.values.tolist()
    return tabulate(
        table, headers=frequent_itemsets.columns, tablefmt="grid", showindex=True
    )

In [1279]:
def print_first_N_rules(rules, N):
    """Print the first ``N`` association rules as a table.

    Only the antecedents, consequents, support, confidence and lift columns
    are shown. The formatting is done on a private copy, so ``rules`` is never
    modified and no chained-assignment warning is raised. Items are sorted so
    the text does not depend on set iteration order; values that are already
    strings are printed unchanged.

    Args:
        rules: DataFrame of association rules.
        N: Maximum number of rules to print, taken from the top of ``rules``.
    """

    def _as_text(items):
        if isinstance(items, str):
            return items
        return ", ".join(sorted(items))

    rules_df = (
        rules[["antecedents", "consequents", "support", "confidence", "lift"]]
        .head(N)
        .copy()
    )

    for side in ("antecedents", "consequents"):
        rules_df[side] = rules_df[side].apply(_as_text)

    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers="keys", tablefmt="pretty", showindex=True))

### ITEMSETS AND ASSOCIATION RULES


In [1280]:
# Select the dataset to mine according to the DATASET setting.
data = (
    clusters_ne_id_loc_name_filtered
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered
)

# One basket per cluster, then FP-Growth, ordered by decreasing support.
basket_df = create_baskets(data, AGGREGATION_FIELD)
frequent_itemsets = fpgrowth(
    basket_df, min_support=MIN_SUPPORT, use_colnames=True
).sort_values(by="support", ascending=False)

# Separate table holding only the itemsets with more than one item.
frequent_itemsets_copy = frequent_itemsets.copy()
frequent_itemsets_copy = frequent_itemsets_copy[
    frequent_itemsets_copy["itemsets"].apply(lambda members: len(members) > 1)
]

# Summary statistics of the supports of all frequent itemsets.
support_distribution = frequent_itemsets["support"].describe()
print("\nDistribuzione dei supporti:")
print(support_distribution)
print("\n")

# print_frequent_itemsets rewrites the "itemsets" column of
# frequent_itemsets_copy as ", "-joined strings.
print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
print(print_frequent_itemsets(frequent_itemsets_copy))


Distribuzione dei supporti:
count    237.000000
mean       0.028214
std        0.071207
min        0.005014
25%        0.006333
50%        0.010424
75%        0.018340
max        0.739939
Name: support, dtype: float64


AGGREGATION: cluster_id2

+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                                                                                   |
|   0 | 0.381845   | dns-node-down-alarms_rdg_others, dns-nodeunmanagable-alarms_rdg_others                                                                                                                                                     |
+-----+------------+-------

### Print Association Rules


In [1281]:
# Derive the association rules from all frequent itemsets (single items
# included), using the metric and threshold chosen in the configuration.
rules = association_rules(
    frequent_itemsets,
    metric=ASSOCIATION_RULES_METRIC,
    min_threshold=MIN_THRESHOLD,
)

print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
print_first_N_rules(rules, NUM_ASSOCIATION_RULES)

AGGREGATION: cluster_id2


Association Rules:
+----+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------+----------------------+---------------------+--------------------+
|    |                                 antecedents                                 |                                 consequents                                 |       support        |     confidence      |        lift        |
+----+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------+----------------------+---------------------+--------------------+
| 0  |                       dns-node-down-alarms_rdg_others                       |                    dns-nodeunmanagable-alarms_rdg_others                    |  0.3818445705238158  | 0.9152435167615434  | 1.236917013825916  |
| 1  |                    dns-nodeunma

### Association Rules Sorted by Metrics

In [1282]:
METRIC = "confidence"  # support, confidence, lift


# Order the rules by one specific metric (support, confidence or lift)
def sort_rules(rules, metric="confidence", ascending=False):
    """Return ``rules`` sorted by ``metric``.

    Args:
        rules: DataFrame of association rules.
        metric: One of "support", "confidence" or "lift".
        ascending: Sort direction; descending by default.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    supported_metrics = ("support", "confidence", "lift")
    if metric in supported_metrics:
        return rules.sort_values(by=metric, ascending=ascending)
    raise ValueError("La metrica deve essere 'support', 'confidence' o 'lift'.")


# Show up to 1000 rules, ordered by the metric selected above.
sorted_rules = sort_rules(rules, metric=METRIC)

print(f"\nRegole ordinate per: {METRIC}")
print_first_N_rules(sorted_rules, 1000)


Regole ordinate per: confidence

Association Rules:
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------------------+--------------------+
|      |                                                                                        antecedents                                                                                        |                                                                                       consequents                                                                                       |        support        |     confidence      |        lift        |
+------+---------------------------------------

### Filter Itemsets containing a given string


In [1283]:
FIELD = "link-down"


def filter_itemsets_by_string(itemsets, search_string):
    """Keep the itemsets in which ``search_string`` occurs.

    The ``itemsets`` column may hold either ``", "``-joined strings (as left
    by ``print_frequent_itemsets``) or raw collections of item names. In both
    cases the match is a substring match against the item names, so the result
    no longer depends on whether the table has already been printed.

    Args:
        itemsets: DataFrame with an ``itemsets`` column.
        search_string: Text to look for inside the item names.

    Returns:
        The rows of ``itemsets`` whose itemset mentions ``search_string``.
    """

    def _mentions(itemset):
        if isinstance(itemset, str):
            return search_string in itemset
        return any(search_string in item for item in itemset)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = itemsets["itemsets"].apply(_mentions).astype(bool)
    return itemsets[mask]



# Multi-item itemsets that mention FIELD, printed with their original index.
filtered_itemsets = filter_itemsets_by_string(frequent_itemsets_copy, FIELD)

print(f"Itemsets che contengono '{FIELD}':")
print(
    tabulate(
        filtered_itemsets,
        headers="keys",
        tablefmt="pretty",
    )
)

Itemsets che contengono 'link-down':
+-----+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |        support        |                                                                                                          itemsets                                                                                                          |
+-----+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 26  |  0.2564982187623697   |                                                                           dns-nodeunmanagable-alarms_rdg_others, dns-link-down-alarms_rdg_others                            

## Find a string in the Association Rules


In [1284]:
FIELD = "dns-snmplinkup-alarms_n/d"
SEARCH_IN = "antecedents"  # antecedents, consequents, both
METRIC = "confidence"  # support, confidence, lift


# Keep the rules in which a given item appears among the antecedents, the
# consequents, or either of them
def search_in_rules(rules, field, search_in):
    """Return the rules that contain ``field`` on the requested side.

    Args:
        rules: DataFrame with ``antecedents`` and ``consequents`` columns.
        field: Item to look for (tested with ``in`` against each cell).
        search_in: "antecedents", "consequents" or "both".

    Raises:
        ValueError: If ``search_in`` is none of the accepted values.
    """
    if search_in == "both":
        sides = ["antecedents", "consequents"]
    elif search_in == "antecedents":
        sides = ["antecedents"]
    elif search_in == "consequents":
        sides = ["consequents"]
    else:
        raise ValueError(
            "search_in deve essere 'antecedents', 'consequents', o 'both'."
        )

    mask = None
    for side in sides:
        hits = rules[side].apply(lambda members: field in members)
        mask = hits if mask is None else (mask | hits)
    return rules[mask]


# Rules that mention FIELD on the side selected by SEARCH_IN.
searched_rules = search_in_rules(rules, field=FIELD, search_in=SEARCH_IN)

# Optional: order the matches by METRIC before printing.
# sorted_rules = sort_rules(searched_rules, metric=METRIC)
# print("\nRegole ordinate per: " + METRIC)
# print_first_N_rules(sorted_rules, 1000)

print(f"Regole con <{FIELD}> negli {SEARCH_IN}:")
print_first_N_rules(searched_rules, 1000)

Regole con <dns-snmplinkup-alarms_n/d> negli antecedents:

Association Rules:
+-------------+-------------+---------+------------+------+
| antecedents | consequents | support | confidence | lift |
+-------------+-------------+---------+------------+------+
+-------------+-------------+---------+------------+------+


## Exploration Functionalities


### Data una association rule, filtriamo tutti gli itemset che contengono tutti gli elementi della regola


In [1285]:
# Look up a rule by its position in the rules table
def find_rule_by_index(index, rules):
    """Return the antecedent, consequent and combined item lists of one rule.

    Args:
        index: Position of the rule in ``rules`` (used with ``iloc``).
        rules: DataFrame with ``antecedents`` and ``consequents`` columns.

    Returns:
        Tuple ``(antecedent, consequent, combined)`` where ``combined`` is the
        antecedent items followed by the consequent items.
    """
    row = rules.iloc[index]
    antecedent, consequent = (
        list(row[side]) for side in ("antecedents", "consequents")
    )
    return antecedent, consequent, antecedent + consequent


# Keep the itemsets that contain every item of a rule
def find_itemsets_with_rule(itemsets, rule):
    """Return the entries of ``itemsets`` that include all items of ``rule``.

    Args:
        itemsets: Iterable of item sequences.
        rule: Iterable of items that must all be present.

    Returns:
        List of the matching entries, in their original order.
    """
    required = set(rule)
    return list(filter(lambda candidate: required <= set(candidate), itemsets))


def format_itemsets(itemsets):
    """Flatten an itemsets table into ``[position, support, item, ...]`` rows.

    The first two columns of ``itemsets`` are read as support and itemset.
    The itemset may be a ``", "``-joined string (as left by
    ``print_frequent_itemsets``) or a raw collection of item names; raw
    collections are emitted in sorted order instead of raising on ``.split``.

    Args:
        itemsets: DataFrame whose first column is the support and whose
            second column is the itemset.

    Returns:
        List of lists: the positional index of the row, its support, then one
        element per item.
    """
    separated_itemsets_list = []
    for position, row in enumerate(itemsets.values.tolist()):
        members = row[1]
        if isinstance(members, str):
            items = [item.strip() for item in members.split(", ")]
        else:
            items = sorted(members)
        separated_itemsets_list.append([position, row[0], *items])
    return separated_itemsets_list

In [1286]:
# RULE can also be set by hand as a list of items, e.g.:
# RULE = [
#     "dns-snmplinkup-alarms",
#     "dns-node-down-alarms",
#     "dns-ciscofruinserted-alarms",
#     "dns-snmpcoldstart-alarms",
#     "dns-nodeunmanagable-alarms",
# ]

antecedent, consequent, RULE = find_rule_by_index(0, rules)

# Show the chosen rule as "antecedent => consequent", followed by blank lines
print(f"Regola di associazione scelta: {antecedent} => {consequent}\n\n")

# Multi-item itemsets that contain every item of the rule
itemsets = frequent_itemsets_copy
formatted_itemsets = format_itemsets(itemsets)
itemsets_with_rule = find_itemsets_with_rule(formatted_itemsets, RULE)

# Re-join the items of each match so every itemset fits in one table cell
itemsets_with_rule_transformed = [
    [position, support, ", ".join(items)]
    for position, support, *items in itemsets_with_rule
]
headers = ["support", "itemsets"]
print(
    tabulate(
        itemsets_with_rule_transformed,
        headers=headers,
        tablefmt="grid",
    )
)

Regola di associazione scelta: ['dns-node-down-alarms_rdg_others'] => ['dns-nodeunmanagable-alarms_rdg_others']


+-----+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     |    support | itemsets                                                                                                                                                                                                                   |
|   0 | 0.381845   | dns-node-down-alarms_rdg_others, dns-nodeunmanagable-alarms_rdg_others                                                                                                                                                     |
+-----+------------+--------------------------------------------------------------------------------------------------------------------------------------------

### Dato un itemset, filtriamo tutti i cluster che contengono quegli allarmi


In [1287]:
# Look up an itemset by position and return its items as a list
def find_itemset_by_index(index, itemsets):
    """Return the items of the itemset at position ``index``.

    The ``itemsets`` column may hold a ``", "``-joined string (as left by
    ``print_frequent_itemsets``) or a raw collection of item names; raw
    collections are returned as a sorted list instead of raising on ``.split``.

    Args:
        index: Position of the row in ``itemsets`` (used with ``iloc``).
        itemsets: DataFrame with an ``itemsets`` column.

    Returns:
        List of item names.
    """
    members = itemsets.iloc[index]["itemsets"]
    if isinstance(members, str):
        return members.split(", ")
    return sorted(members)

In [1288]:
def get_concatenation_by_cluster(clusters):
    """Collect the distinct items of every cluster.

    Args:
        clusters: Iterable of ``(cluster_id, cluster_data)`` pairs, such as a
            pandas groupby object; ``cluster_data`` must expose the
            ``CONCATENATION`` column.

    Returns:
        DataFrame with one row per cluster: the cluster id under
        ``AGGREGATION_FIELD`` and the list of distinct items under
        ``CONCATENATION`` (in no particular order).
    """
    records = [
        {
            AGGREGATION_FIELD: cluster_key,
            CONCATENATION: list(set(group[CONCATENATION].tolist())),
        }
        for cluster_key, group in clusters
    ]
    return pd.DataFrame(records)

In [1289]:
# Select the cluster ids whose items include the whole ITEMSET
def filter_cluster_ids_by_itemset(cluster_slogans_df, itemset):
    """Return the ids of the clusters that contain every item of ``itemset``.

    Args:
        cluster_slogans_df: DataFrame with the cluster id under
            ``AGGREGATION_FIELD`` and a collection of items under
            ``CONCATENATION``.
        itemset: Items that must all be present in a cluster.

    Returns:
        Series with the ``AGGREGATION_FIELD`` values of the matching clusters.
    """
    required = set(itemset)
    contains_all = cluster_slogans_df[CONCATENATION].apply(
        lambda members: required <= set(members)
    )
    return cluster_slogans_df[contains_all][AGGREGATION_FIELD]

In [1290]:
# Choose the itemset to inspect by its position in the multi-item table.
ITEMSET_INDEX = 0
ITEMSET = find_itemset_by_index(ITEMSET_INDEX, frequent_itemsets_copy)

print(f"ITEMSET scelto: {ITEMSET}")

# Keep only the columns needed to inspect a cluster, from the selected dataset.
data = (
    clusters_ne_id_loc_name_filtered
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered
)[[AGGREGATION_FIELD, "slogan", CONCATENATION]]

# Group the rows by cluster and collect the distinct items of each cluster.
clusters = data.groupby(AGGREGATION_FIELD)
cluster_slogans_df = get_concatenation_by_cluster(clusters)

# Clusters that contain the whole ITEMSET.
filtered_cluster_ids = filter_cluster_ids_by_itemset(cluster_slogans_df, ITEMSET)

print("Filtered Cluster IDs:")
print(filtered_cluster_ids.tolist())
print(f"Number of filtered clusters: {len(filtered_cluster_ids)}")
print(f"Total number of clusters: {len(clusters)}")
print(
    f"Percentage of clusters filtered: {round((len(filtered_cluster_ids) / len(clusters)) * 100, 2)}%"
)

# Cluster to display in full, provided it is among the matches.
CLUSTER_ID_TO_VIEW = 283

if CLUSTER_ID_TO_VIEW not in filtered_cluster_ids.values:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )
else:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))

ITEMSET scelto: ['dns-node-down-alarms_rdg_others', 'dns-nodeunmanagable-alarms_rdg_others']
Filtered Cluster IDs:
[4, 67, 68, 69, 148, 223, 279, 282, 283, 285, 287, 290, 293, 294, 298, 301, 305, 307, 309, 310, 311, 314, 315, 316, 317, 318, 324, 325, 327, 328, 338, 383, 384, 385, 403, 404, 406, 410, 430, 437, 482, 516, 627, 631, 632, 640, 649, 679, 681, 685, 686, 688, 700, 704, 708, 713, 721, 728, 730, 737, 744, 748, 752, 758, 762, 779, 888, 904, 919, 922, 1005, 1007, 1010, 1020, 1022, 1024, 1025, 1029, 1030, 1036, 1039, 1077, 1078, 1131, 1168, 1170, 1172, 1173, 1181, 1465, 1514, 1521, 1530, 1533, 1535, 1537, 1540, 1541, 1543, 1545, 1548, 1555, 1568, 1569, 1571, 1590, 1802, 1832, 1833, 1834, 1836, 1837, 1838, 1839, 1841, 1842, 1844, 1845, 1848, 1849, 1852, 1855, 1856, 1858, 1864, 1866, 1868, 1871, 1875, 1876, 1879, 1880, 1881, 1882, 1883, 1885, 1886, 1887, 1889, 1890, 1891, 1892, 1895, 1896, 1897, 1924, 1929, 2057, 2061, 2120, 2146, 2160, 2162, 2165, 2167, 2170, 2171, 2174, 2207, 2264,

### Dato un elenco generico di allarmi, dobbiamo poter ricavare tutti i cluster che contengono quegli allarmi

In [1291]:
# Free-form list of alarms to look for.
ITEMSET = ["dns-link-down-alarms_rdg_others"]
print(f"ITEMSET scelto: {ITEMSET}")

# "not_in" keeps the clusters containing none of the alarms;
# any other value keeps the clusters containing all of them.
MODE = "not_in"  # in, not_in
print(f"MODE: {MODE}")

# Keep only the columns needed to inspect a cluster, from the selected dataset.
data = (
    clusters_ne_id_loc_name_filtered
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered
)[[AGGREGATION_FIELD, "slogan", CONCATENATION]]

# Group the rows by cluster and collect the distinct items of each cluster.
clusters = data.groupby(AGGREGATION_FIELD)
cluster_concatenation_df = get_concatenation_by_cluster(clusters)

if MODE != "not_in":
    filtered_cluster_ids = filter_cluster_ids_by_itemset(
        cluster_concatenation_df, ITEMSET
    )
else:
    # A cluster qualifies when every alarm of ITEMSET is absent from it.
    filtered_cluster_ids = cluster_concatenation_df[
        cluster_concatenation_df[CONCATENATION].apply(
            lambda members: all(item not in members for item in ITEMSET)
        )
    ][AGGREGATION_FIELD]

print("Filtered Cluster IDs:")
print(filtered_cluster_ids.tolist())
print(f"Number of filtered clusters: {len(filtered_cluster_ids)}")
print(f"Total number of clusters: {len(clusters)}")
print(
    f"Percentage of clusters filtered: {round((len(filtered_cluster_ids) / len(clusters)) * 100, 2)}%"
)

# Cluster to display in full, provided it is among the matches.
CLUSTER_ID_TO_VIEW = 2
print("\n")

if CLUSTER_ID_TO_VIEW not in filtered_cluster_ids.values:
    print(
        f"{AGGREGATION_FIELD} {CLUSTER_ID_TO_VIEW} non trovato nella lista dei cluster filtrati."
    )
else:
    sample_cluster = data[data[AGGREGATION_FIELD] == CLUSTER_ID_TO_VIEW]
    print(f"{AGGREGATION_FIELD}: {CLUSTER_ID_TO_VIEW}")
    print(tabulate(sample_cluster, headers="keys", tablefmt="grid"))

ITEMSET scelto: ['dns-link-down-alarms_rdg_others']
MODE: not_in
Filtered Cluster IDs:
[69, 135, 136, 137, 139, 140, 141, 144, 148, 223, 230, 231, 234, 236, 237, 238, 241, 242, 244, 245, 246, 247, 249, 251, 252, 253, 254, 255, 263, 270, 275, 279, 295, 298, 300, 306, 338, 339, 343, 345, 347, 348, 350, 351, 353, 359, 361, 362, 364, 365, 369, 372, 373, 374, 376, 378, 380, 381, 383, 384, 385, 387, 388, 389, 392, 394, 399, 400, 401, 409, 410, 412, 414, 415, 416, 417, 418, 420, 421, 422, 423, 429, 430, 461, 470, 474, 560, 561, 567, 576, 627, 637, 638, 639, 640, 643, 721, 759, 760, 761, 762, 764, 768, 769, 773, 776, 778, 779, 780, 781, 783, 785, 786, 788, 789, 893, 904, 918, 919, 1005, 1039, 1040, 1042, 1045, 1046, 1047, 1048, 1049, 1051, 1053, 1054, 1055, 1057, 1059, 1060, 1063, 1064, 1065, 1077, 1078, 1086, 1087, 1089, 1092, 1093, 1099, 1100, 1101, 1124, 1130, 1131, 1151, 1152, 1159, 1169, 1171, 1172, 1173, 1176, 1177, 1181, 1182, 1197, 1210, 1211, 1224, 1245, 1246, 1249, 1255, 1257, 1258, 

###


### Devo poter scrivere delle query che mescolano condizioni di inclusione ed esclusione degli allarmi (es: tutti i cluster che contengono A e che NON contengono B)