In [212]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate


### Load Clusters

In [213]:
clusters_ne_id_loc_name = pd.read_parquet("20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet")
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet("20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet")

### Filter only important columns

In [214]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name[["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "probable_cause"]]
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets[["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "probable_cause"]]

In [215]:
clusters_ne_id_ne_address_first_three_octets_filtered

Unnamed: 0,cluster_id,cluster_id2,cluster_id3,ne_type,probable_cause
48867,55805,1,1,accesspoint,link-down-alarms
29596,55806,2,2,accesspoint,link-down-alarms
23576,55807,3,3,accesspoint,link-down-alarms
20715,55808,4,4,accesspoint,link-down-alarms
39779,55809,5,5,accesspoint,link-down-alarms
...,...,...,...,...,...
6667,39978,52678,52885,n/d,dns-link-down-alarms
6666,39978,52678,52885,n/d,dns-link-down-alarms
6677,39978,52678,52885,n/d,dns-snmplinkup-alarms
6670,39978,52678,52885,n/d,dns-link-down-alarms


## Statistics

In [216]:
total_rows = len(clusters_ne_id_loc_name_filtered)


filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered[clusters_ne_id_loc_name_filtered['ne_type'] == 'n/d']
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100


filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered[clusters_ne_id_loc_name_filtered['ne_type'].isna()]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100


filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered[clusters_ne_id_loc_name_filtered['probable_cause'].isna()]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100


print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%")

Percentuale di righe con ne_type uguale a 'n/d': 94.10%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%


In [217]:
total_rows = len(clusters_ne_id_ne_address_first_three_octets_filtered)


filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered[clusters_ne_id_ne_address_first_three_octets_filtered['ne_type'] == 'n/d']
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100


filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered[clusters_ne_id_ne_address_first_three_octets_filtered['ne_type'].isna()]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

filtered_rows_probable_cause_na = clusters_ne_id_ne_address_first_three_octets_filtered[clusters_ne_id_ne_address_first_three_octets_filtered['probable_cause'].isna()]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100


print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%")

Percentuale di righe con ne_type uguale a 'n/d': 83.43%
Percentuale di righe con ne_type uguale a NA: 0.36%
Percentuale di righe con probable_cause uguale a NA: 0.00%


### Create new column "slogan_netype"

In [218]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(columns={"probable_cause": "slogan"})
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets_filtered.rename(columns={"probable_cause": "slogan"})

clusters_ne_id_loc_name_filtered["slogan_netype"] = clusters_ne_id_loc_name_filtered["slogan"] + "_" + clusters_ne_id_loc_name_filtered["ne_type"]
clusters_ne_id_ne_address_first_three_octets_filtered["slogan_netype"] = clusters_ne_id_ne_address_first_three_octets_filtered["slogan"] + "_" + clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"]

### Drop columns with ne_type = NA

In [219]:
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets_filtered.dropna(subset=['ne_type'])

## Preprocessing and FP-Growth

In [220]:
def create_baskets(data,aggregation_field):
    baskets = data.groupby(aggregation_field)['slogan_netype'].apply(list)
    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)
    
    return basket_df

In [221]:
def print_frequent_itemsets(frequent_itemsets):
    frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))
    table = frequent_itemsets.values.tolist()
    return tabulate(table, headers=frequent_itemsets.columns, tablefmt='grid')

In [222]:
def print_first20_rules(rules):
    rules_df = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(20)
    
    rules_df['antecedents'] = rules_df['antecedents'].apply(lambda x: ', '.join(list(x)))
    rules_df['consequents'] = rules_df['consequents'].apply(lambda x: ', '.join(list(x)))
    
    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers='keys', tablefmt='pretty', showindex=False))

### Cluster ne_id_loc_name LVL 1-2-3

In [223]:
basket_df_lvl1 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id")
basket_df_lvl2 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id2")
basket_df_lvl3 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id3")

frequent_itemsets_lvl1 = fpgrowth(basket_df_lvl1, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl2 = fpgrowth(basket_df_lvl2, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl3 = fpgrowth(basket_df_lvl3, min_support=0.005, use_colnames=True)


frequent_itemsets_lvl1 = frequent_itemsets_lvl1.sort_values(by='support', ascending=False)
frequent_itemsets_lvl2 = frequent_itemsets_lvl2.sort_values(by='support', ascending=False)
frequent_itemsets_lvl3 = frequent_itemsets_lvl3.sort_values(by='support', ascending=False)


rules_lvl1 = association_rules(frequent_itemsets_lvl1, metric="confidence", min_threshold=0.01)
rules_lvl2 = association_rules(frequent_itemsets_lvl2, metric="confidence", min_threshold=0.01)
rules_lvl3 = association_rules(frequent_itemsets_lvl3, metric="confidence", min_threshold=0.01)

frequent_itemsets_lvl1 = frequent_itemsets_lvl1[frequent_itemsets_lvl1['itemsets'].apply(lambda x: len(x) > 1)]
frequent_itemsets_lvl2 = frequent_itemsets_lvl2[frequent_itemsets_lvl2['itemsets'].apply(lambda x: len(x) > 1)]
frequent_itemsets_lvl3 = frequent_itemsets_lvl3[frequent_itemsets_lvl3['itemsets'].apply(lambda x: len(x) > 1)]

print("LVL 1")
print(print_frequent_itemsets(frequent_itemsets_lvl1.head(20)))
print("\n")
print("LVL 2")
print(print_frequent_itemsets(frequent_itemsets_lvl2.head(20)))
print("\n")
print("LVL 3")
print(print_frequent_itemsets(frequent_itemsets_lvl3.head(20)))


LVL 1
+------------+-----------------------------------------------------------------------------------------+
|    support | itemsets                                                                                |
| 0.137755   | dns-node-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                                |
+------------+-----------------------------------------------------------------------------------------+
| 0.0248598  | dns-islandgroupdown-alarms_n/d, dns-nodeunmanagable-alarms_n/d                          |
+------------+-----------------------------------------------------------------------------------------+
| 0.0180146  | dns-link-down-alarms_n/d, dns-snmplinkup-alarms_n/d                                     |
+------------+-----------------------------------------------------------------------------------------+
| 0.0152598  | dns-snmplinkup-alarms_n/d, dns-snmpcoldstart-alarms_n/d                                 |
+------------+-----------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))


In [224]:
print("LVL 1")
print_first20_rules(rules_lvl1)
print("\n")
print("LVL 2")
print_first20_rules(rules_lvl2)
print("\n")
print("LVL 3")
print_first20_rules(rules_lvl3)

LVL 1

Association Rules:
+--------------------------------+--------------------------------+----------------------+----------------------+---------------------+
|          antecedents           |          consequents           |       support        |      confidence      |        lift         |
+--------------------------------+--------------------------------+----------------------+----------------------+---------------------+
|    dns-node-down-alarms_n/d    | dns-nodeunmanagable-alarms_n/d |  0.1377554427674636  |  0.8178213896322727  |  2.328037163319928  |
| dns-nodeunmanagable-alarms_n/d |    dns-node-down-alarms_n/d    |  0.1377554427674636  |  0.3921391568841785  |  2.328037163319928  |
| dns-islandgroupdown-alarms_n/d | dns-nodeunmanagable-alarms_n/d | 0.024859756911980765 |  0.6805301645338209  | 1.9372194636622657  |
| dns-nodeunmanagable-alarms_n/d | dns-islandgroupdown-alarms_n/d | 0.024859756911980765 | 0.07076659854569649  | 1.9372194636622655  |
|    dns-link-down-ala

### Cluster ne_id_ne_address_first_three_octets LVL 1-2-3

In [225]:
basket_df_lvl1 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id")
basket_df_lvl2 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id2")
basket_df_lvl3 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id3")

frequent_itemsets_lvl1 = fpgrowth(basket_df_lvl1, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl2 = fpgrowth(basket_df_lvl2, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl3 = fpgrowth(basket_df_lvl3, min_support=0.005, use_colnames=True)


frequent_itemsets_lvl1 = frequent_itemsets_lvl1.sort_values(by='support', ascending=False)
frequent_itemsets_lvl2 = frequent_itemsets_lvl2.sort_values(by='support', ascending=False)
frequent_itemsets_lvl3 = frequent_itemsets_lvl3.sort_values(by='support', ascending=False)


rules_lvl1 = association_rules(frequent_itemsets_lvl1, metric="confidence", min_threshold=0.01)
rules_lvl2 = association_rules(frequent_itemsets_lvl2, metric="confidence", min_threshold=0.01)
rules_lvl3 = association_rules(frequent_itemsets_lvl3, metric="confidence", min_threshold=0.01)

frequent_itemsets_lvl1 = frequent_itemsets_lvl1[frequent_itemsets_lvl1['itemsets'].apply(lambda x: len(x) > 1)]
frequent_itemsets_lvl2 = frequent_itemsets_lvl2[frequent_itemsets_lvl2['itemsets'].apply(lambda x: len(x) > 1)]
frequent_itemsets_lvl3 = frequent_itemsets_lvl3[frequent_itemsets_lvl3['itemsets'].apply(lambda x: len(x) > 1)]

print("LVL 1")
print(print_frequent_itemsets(frequent_itemsets_lvl1))
print("\n")
print("LVL 2")
print(print_frequent_itemsets(frequent_itemsets_lvl2))
print("\n")
print("LVL 3")
print(print_frequent_itemsets(frequent_itemsets_lvl3))


LVL 1
+------------+----------------------------------------------------------------+
|    support | itemsets                                                       |
| 0.107957   | dns-node-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d       |
+------------+----------------------------------------------------------------+
| 0.0195082  | dns-islandgroupdown-alarms_n/d, dns-nodeunmanagable-alarms_n/d |
+------------+----------------------------------------------------------------+
| 0.0141496  | dns-link-down-alarms_n/d, dns-snmplinkup-alarms_n/d            |
+------------+----------------------------------------------------------------+
| 0.0119748  | dns-snmplinkup-alarms_n/d, dns-snmpcoldstart-alarms_n/d        |
+------------+----------------------------------------------------------------+
| 0.00791332 | dns-rrgnosecondary-alarms_n/d, dns-rrgnoprimary-alarms_n/d     |
+------------+----------------------------------------------------------------+
| 0.00780851 | dns-snmplinkup-alar

In [226]:
print("LVL 1")
print_first20_rules(rules_lvl1)
print("\n")
print("LVL 2")
print_first20_rules(rules_lvl2)
print("\n")
print("LVL 3")
print_first20_rules(rules_lvl3)

LVL 1

Association Rules:
+--------------------------------+--------------------------------+----------------------+----------------------+---------------------+
|          antecedents           |          consequents           |       support        |      confidence      |        lift         |
+--------------------------------+--------------------------------+----------------------+----------------------+---------------------+
|    dns-node-down-alarms_n/d    | dns-nodeunmanagable-alarms_n/d |  0.1079565553473869  |  0.8177036816512852  | 2.9683662565108744  |
| dns-nodeunmanagable-alarms_n/d |    dns-node-down-alarms_n/d    |  0.1079565553473869  | 0.39189574812137357  | 2.9683662565108744  |
| dns-islandgroupdown-alarms_n/d | dns-nodeunmanagable-alarms_n/d | 0.019508168800031443 |  0.6805301645338209  | 2.4704092965077975  |
| dns-nodeunmanagable-alarms_n/d | dns-islandgroupdown-alarms_n/d | 0.019508168800031443 | 0.07081708361076762  |  2.470409296507797  |
|    dns-link-down-ala