In [143]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate

### Load Clusters


In [144]:
# Cluster table for the ne_id + loc_name aggregation (as indicated by the file name).
clusters_ne_id_loc_name = pd.read_parquet(
    path="20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)


In [145]:

# Cluster table for the ne_id + first-three-octets-of-ne_address aggregation
# (as indicated by the file name).
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet(
    path="20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

### Filter only important columns


In [146]:
# Project both cluster tables onto the same five columns in one pass, so the
# column list is written only once. The generator variable does not leak into
# the notebook namespace.
(
    clusters_ne_id_loc_name_filtered,
    clusters_ne_id_ne_address_first_three_octets_filtered,
) = (
    frame.loc[
        :, ["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "probable_cause"]
    ]
    for frame in (clusters_ne_id_loc_name, clusters_ne_id_ne_address_first_three_octets)
)

In [147]:
# Bare expression on the last line of the cell: shows the filtered
# first-three-octets frame as the cell output.
clusters_ne_id_ne_address_first_three_octets_filtered

Unnamed: 0,cluster_id,cluster_id2,cluster_id3,ne_type,probable_cause
48867,55805,1,1,accesspoint,link-down-alarms
29596,55806,2,2,accesspoint,link-down-alarms
23576,55807,3,3,accesspoint,link-down-alarms
20715,55808,4,4,accesspoint,link-down-alarms
39779,55809,5,5,accesspoint,link-down-alarms
...,...,...,...,...,...
6667,39978,52678,52885,n/d,dns-link-down-alarms
6666,39978,52678,52885,n/d,dns-link-down-alarms
6677,39978,52678,52885,n/d,dns-snmplinkup-alarms
6670,39978,52678,52885,n/d,dns-link-down-alarms


## Statistics


In [148]:
# Data-quality percentages for the loc_name clustering: rows whose ne_type is
# the literal "n/d", rows with a missing ne_type, rows with a missing
# probable_cause. Every percentage is relative to the full row count.
total_rows = clusters_ne_id_loc_name_filtered.shape[0]

# ne_type equal to the placeholder string "n/d"
filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = num_filtered_rows_ne_type_nd / total_rows * 100

# ne_type missing
filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["ne_type"].isnull()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = num_filtered_rows_ne_type_na / total_rows * 100

# probable_cause missing
filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered.loc[
    clusters_ne_id_loc_name_filtered["probable_cause"].isnull()
]
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = num_filtered_rows_probable_cause_na / total_rows * 100

print(
    f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%",
    f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%",
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%",
    sep="\n",
)

Percentuale di righe con ne_type uguale a 'n/d': 94.10%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%


In [149]:
# Data-quality percentages for the first-three-octets clustering: rows whose
# ne_type is the literal "n/d", rows with a missing ne_type, rows with a
# missing probable_cause. Every percentage is relative to the full row count.
total_rows = clusters_ne_id_ne_address_first_three_octets_filtered.shape[0]

# ne_type equal to the placeholder string "n/d"
filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].eq("n/d")
]
num_filtered_rows_ne_type_nd = filtered_rows_ne_type_nd.shape[0]
percentage_ne_type_nd = num_filtered_rows_ne_type_nd / total_rows * 100

# ne_type missing
filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].isnull()
]
num_filtered_rows_ne_type_na = filtered_rows_ne_type_na.shape[0]
percentage_ne_type_na = num_filtered_rows_ne_type_na / total_rows * 100

# probable_cause missing
filtered_rows_probable_cause_na = clusters_ne_id_ne_address_first_three_octets_filtered.loc[
    clusters_ne_id_ne_address_first_three_octets_filtered["probable_cause"].isnull()
]
num_filtered_rows_probable_cause_na = filtered_rows_probable_cause_na.shape[0]
percentage_probable_cause_na = num_filtered_rows_probable_cause_na / total_rows * 100

print(
    f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%",
    f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%",
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%",
    sep="\n",
)

Percentuale di righe con ne_type uguale a 'n/d': 83.43%
Percentuale di righe con ne_type uguale a NA: 0.36%
Percentuale di righe con probable_cause uguale a NA: 0.00%


### Create new column "slogan_netype"


In [150]:
# For each cluster table: rename probable_cause -> slogan, then derive the item
# label "<slogan>_<ne_type>". The label is missing wherever ne_type is missing.
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(
    columns={"probable_cause": "slogan"}
).assign(slogan_netype=lambda frame: frame["slogan"] + "_" + frame["ne_type"])

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.rename(
        columns={"probable_cause": "slogan"}
    ).assign(slogan_netype=lambda frame: frame["slogan"] + "_" + frame["ne_type"])
)

### Drop rows with ne_type = NA


In [151]:
# Keep only the rows of the first-three-octets table whose ne_type is present.
clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered[
        clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].notna()
    ]
)

## Preprocessing and FP-Growth


In [152]:
def create_baskets(data, aggregation_field, item_field="slogan_netype"):
    """Build a one-hot basket matrix with one row per group of ``aggregation_field``.

    Rows of ``data`` are grouped by ``aggregation_field``; the ``item_field``
    values of each group form one transaction, which is then one-hot encoded
    with ``TransactionEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``aggregation_field`` and ``item_field``.
    aggregation_field : str
        Column whose distinct values define the baskets.
    item_field : str, default "slogan_netype"
        Column holding the item labels.

    Returns
    -------
    pandas.DataFrame
        One row per basket and one column per distinct item, as produced by
        the encoder.
    """
    # A missing item (e.g. a label built from a missing ne_type) would end up
    # inside a basket next to string items and break the encoder, which has to
    # order the distinct items. Drop such rows before grouping.
    usable_rows = data.dropna(subset=[item_field])

    baskets = usable_rows.groupby(aggregation_field)[item_field].apply(list)
    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)

    return basket_df

In [153]:
def print_frequent_itemsets(frequent_itemsets):
    """Render a frequent-itemsets frame as a grid table and return the text.

    Side effect: the ``itemsets`` column of the frame that is passed in is
    overwritten in place, each itemset becoming a single ``", "``-joined
    string. Pass a copy if the original collections are still needed.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Frame with an ``itemsets`` column whose cells are iterables of strings.

    Returns
    -------
    str
        The value returned by ``tabulate`` with ``tablefmt="grid"``.
    """
    joined_itemsets = frequent_itemsets["itemsets"].apply(
        lambda items: ", ".join(list(items))
    )
    # Deliberately written back onto the caller's frame (see docstring).
    frequent_itemsets["itemsets"] = joined_itemsets

    rows = frequent_itemsets.values.tolist()
    return tabulate(rows, headers=frequent_itemsets.columns, tablefmt="grid")

In [154]:
def print_first_N_rules(rules, N):
    """Print the first ``N`` association rules as a table.

    Only the antecedents, consequents, support, confidence and lift columns
    are shown; antecedents and consequents are rendered as ``", "``-joined
    strings. ``rules`` itself is not modified.

    Parameters
    ----------
    rules : pandas.DataFrame
        Frame containing at least the five columns listed above.
    N : int
        Number of leading rows to print.
    """

    def _as_text(items):
        return ", ".join(list(items))

    shown_columns = ["antecedents", "consequents", "support", "confidence", "lift"]
    top_rules = rules[shown_columns].head(N)
    top_rules = top_rules.assign(
        antecedents=top_rules["antecedents"].apply(_as_text),
        consequents=top_rules["consequents"].apply(_as_text),
    )

    print("\nAssociation Rules:")
    print(tabulate(top_rules, headers="keys", tablefmt="pretty", showindex=False))

### ITEMSETS AND ASSOCIATION RULES


In [155]:
# Which cluster table to mine: "ne_id_loc_name" selects the loc_name table;
# any other value (e.g. "ne_id_ne_address") selects the first-three-octets one.
DATASET = "ne_id_loc_name"  # ne_id_ne_address
# Column whose values define the baskets (one basket per distinct value).
AGGREGATION_FIELD = "cluster_id2"  # cluster_id2, cluster_id3
# Passed to fpgrowth as min_support.
MIN_SUPPORT = 0.005

# Passed to association_rules as metric / min_threshold.
ASSOCIATION_RULES_METRIC = "confidence"  # confidence, lift
MIN_THRESHOLD = 0.01
# Number of rules shown when the association rules are first printed.
NUM_ASSOCIATION_RULES = 20

In [156]:
# Pick the cluster table selected by DATASET.
data = (
    clusters_ne_id_loc_name_filtered
    if DATASET == "ne_id_loc_name"
    else clusters_ne_id_ne_address_first_three_octets_filtered
)

# One-hot baskets -> frequent itemsets (FP-Growth), ordered by decreasing support.
basket_df = create_baskets(data, AGGREGATION_FIELD)
frequent_itemsets = fpgrowth(
    basket_df, min_support=MIN_SUPPORT, use_colnames=True
).sort_values(by="support", ascending=False)

# Work on a copy for display: print_frequent_itemsets rewrites the "itemsets"
# column, while `frequent_itemsets` itself must stay untouched.
# Only itemsets with more than one item are kept for the printed table.
frequent_itemsets_copy = frequent_itemsets.copy()
frequent_itemsets_copy = frequent_itemsets_copy[
    frequent_itemsets_copy["itemsets"].apply(lambda itemset: len(itemset) > 1)
]

# Summary statistics of the support values over ALL frequent itemsets.
support_distribution = frequent_itemsets["support"].describe()
print("\nDistribuzione dei supporti:", support_distribution, "\n", sep="\n")

print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
print(print_frequent_itemsets(frequent_itemsets_copy))



Distribuzione dei supporti:
count    53.000000
mean      0.035968
std       0.083185
min       0.005061
25%       0.006931
50%       0.009704
75%       0.024073
max       0.512136
Name: support, dtype: float64


AGGREGATION: cluster_id2

+------------+----------------------------------------------------------------------------------------------------------------------+
|    support | itemsets                                                                                                             |
| 0.15049    | dns-node-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                                                             |
+------------+----------------------------------------------------------------------------------------------------------------------+
| 0.0470239  | dns-link-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                                                             |
+------------+-----------------------------------------------------------------------------

### Filter Itemsets containing a given string

In [157]:

# Text to look for inside the itemsets.
FIELD = "link-down"

def filter_itemsets_by_string(itemsets, search_string):
    """Return the rows whose itemset mentions ``search_string``.

    The ``itemsets`` column may hold either already-joined strings or
    collections of item names (e.g. frozensets). In both cases the match is a
    substring match: a row is kept when ``search_string`` occurs in the joined
    string, or inside at least one item of the collection.

    Parameters
    ----------
    itemsets : pandas.DataFrame
        Frame with an ``itemsets`` column.
    search_string : str
        Text to search for.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index and columns.
    """

    def _mentions(value):
        if isinstance(value, str):
            return search_string in value
        # Collection of items: substring match against each item, so a partial
        # name such as "link-down" matches whether or not the column was joined.
        return any(search_string in str(item) for item in value)

    # astype(bool) keeps the mask boolean even for an empty frame, so the
    # result is always a row filter and never a column selection.
    mask = itemsets["itemsets"].apply(_mentions).astype(bool)
    return itemsets[mask]


# Select the itemsets that mention FIELD and print them as a table.
filtered_itemsets = filter_itemsets_by_string(
    itemsets=frequent_itemsets_copy, search_string=FIELD
)

print(f"Itemsets che contengono '{FIELD}':")
print(tabulate(tabular_data=filtered_itemsets, headers="keys", tablefmt="pretty"))



Itemsets che contengono 'link-down':
+----+-----------------------+---------------------------------------------------------------------------------------------------------------+
|    |        support        |                                                   itemsets                                                    |
+----+-----------------------+---------------------------------------------------------------------------------------------------------------+
| 20 |  0.04702387501375289  |                           dns-link-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                            |
| 22 |  0.04438332049730443  |                              dns-node-down-alarms_n/d, dns-link-down-alarms_n/d                               |
| 23 |  0.04004841016613489  |              dns-node-down-alarms_n/d, dns-link-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d               |
| 24 |  0.02693365606777423  |                              dns-snmplinkup-alarms_n/d, dns-link-down-alar

### Print Association Rules

In [158]:
# Derive association rules from the full frequent-itemset table, keeping the
# rules whose ASSOCIATION_RULES_METRIC reaches MIN_THRESHOLD.
rules = association_rules(
    frequent_itemsets,
    metric=ASSOCIATION_RULES_METRIC,
    min_threshold=MIN_THRESHOLD,
)

print(f"AGGREGATION: {AGGREGATION_FIELD}\n")
print_first_N_rules(rules=rules, N=NUM_ASSOCIATION_RULES)

AGGREGATION: cluster_id2


Association Rules:
+----------------------------------------------------------+----------------------------------------------------------+----------------------+----------------------+--------------------+
|                       antecedents                        |                       consequents                        |       support        |      confidence      |        lift        |
+----------------------------------------------------------+----------------------------------------------------------+----------------------+----------------------+--------------------+
|                 dns-node-down-alarms_n/d                 |              dns-nodeunmanagable-alarms_n/d              |  0.1504896028165915  |  0.8243731918997108  | 2.7672949996958454 |
|              dns-nodeunmanagable-alarms_n/d              |                 dns-node-down-alarms_n/d                 |  0.1504896028165915  |  0.5051706308169597  | 2.7672949996958454 |
|                 d

## Association Rules Sorted by Metrics

In [159]:
# Column used to order the rules in this section.
METRIC = "confidence"  # support, confidence, lift


def sort_rules(rules, metric="confidence", ascending=False):
    """Order association rules by one metric column.

    Parameters
    ----------
    rules : pandas.DataFrame
        Rules table containing the ``metric`` column.
    metric : str, default "confidence"
        One of ``"support"``, ``"confidence"`` or ``"lift"``.
    ascending : bool, default False
        Sort direction; the default puts the highest values first.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``rules``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the three supported names.
    """
    supported_metrics = ("support", "confidence", "lift")
    if metric in supported_metrics:
        return rules.sort_values(by=metric, ascending=ascending)
    raise ValueError("La metrica deve essere 'support', 'confidence' o 'lift'.")


# Order every rule by METRIC (highest first) and print up to 1000 of them.
sorted_rules = sort_rules(rules=rules, metric=METRIC)

print(f"\nRegole ordinate per: {METRIC}")
print_first_N_rules(rules=sorted_rules, N=1000)


Regole ordinate per: confidence

Association Rules:
+--------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+-----------------------+----------------------+--------------------+
|                                        antecedents                                         |                                        consequents                                         |        support        |      confidence      |        lift        |
+--------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+-----------------------+----------------------+--------------------+
|       dns-snmplinkup-alarms_n/d, dns-node-down-alarms_n/d, dns-link-down-alarms_n/d        |                               dns-nodeunmanagable-alarms_n/d                        

## Find a string in the Association Rules

In [160]:
# Item to look for in the rules (matched as a whole item, not as a substring).
FIELD = 'dns-snmplinkup-alarms_n/d'
# Side of the rule to inspect.
SEARCH_IN  = 'antecedents'  # antecedents, consequents, both
# Not read by any active statement in this cell.
METRIC = 'confidence'  # support, confidence, lift



def search_in_rules(rules, field, search_in):
    """Keep the rules in which ``field`` appears on the requested side.

    Membership is tested with ``field in <cell>``, so for set-like cells
    ``field`` must equal a whole item.

    Parameters
    ----------
    rules : pandas.DataFrame
        Rules table with ``antecedents`` and ``consequents`` columns.
    field : str
        Item to look for.
    search_in : str
        ``"antecedents"``, ``"consequents"`` or ``"both"`` (either side).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``rules``.

    Raises
    ------
    ValueError
        If ``search_in`` is not one of the three accepted values.
    """

    def _mentions(column):
        return rules[column].apply(lambda items: field in items)

    if search_in == 'both':
        keep = _mentions('antecedents') | _mentions('consequents')
    elif search_in in ('antecedents', 'consequents'):
        keep = _mentions(search_in)
    else:
        raise ValueError("search_in deve essere 'antecedents', 'consequents', o 'both'.")
    return rules[keep]



# Rules that contain FIELD on the side selected by SEARCH_IN.
searched_rules = search_in_rules(rules=rules, field=FIELD, search_in=SEARCH_IN)

# Optional: order the matches by METRIC before printing (currently disabled).
# sorted_rules = sort_rules(searched_rules, metric=METRIC)
# print("\nRegole ordinate per: " + METRIC)
# print_first_N_rules(sorted_rules, 1000)

print(f"Regole con <{FIELD}> negli {SEARCH_IN}:")
print_first_N_rules(rules=searched_rules, N=1000)



Regole con <dns-snmplinkup-alarms_n/d> negli antecedents:

Association Rules:
+-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+----------------------+---------------------+--------------------+
|                                     antecedents                                     |                                    consequents                                     |       support        |     confidence      |        lift        |
+-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+----------------------+---------------------+--------------------+
|                              dns-snmplinkup-alarms_n/d                              |                              dns-link-down-alarms_n/d                              | 0.02693365606777423  | 0.3926852