In [2]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate

### Load Clusters


In [3]:
clusters_ne_id_loc_name = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

clusters_ne_id_ne_address_first_three_octets = pd.read_parquet(
    "20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet"
)

### Filter only important columns


In [4]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name[
    ["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "probable_cause"]
]

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets[
        ["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "probable_cause"]
    ]
)

In [5]:
clusters_ne_id_ne_address_first_three_octets_filtered

Unnamed: 0,cluster_id,cluster_id2,cluster_id3,ne_type,probable_cause
48867,55805,1,1,accesspoint,link-down-alarms
29596,55806,2,2,accesspoint,link-down-alarms
23576,55807,3,3,accesspoint,link-down-alarms
20715,55808,4,4,accesspoint,link-down-alarms
39779,55809,5,5,accesspoint,link-down-alarms
...,...,...,...,...,...
6667,39978,52678,52885,n/d,dns-link-down-alarms
6666,39978,52678,52885,n/d,dns-link-down-alarms
6677,39978,52678,52885,n/d,dns-snmplinkup-alarms
6670,39978,52678,52885,n/d,dns-link-down-alarms


## Statistics


In [7]:
total_rows = len(clusters_ne_id_loc_name_filtered)


filtered_rows_ne_type_nd = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["ne_type"] == "n/d"
]
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100


filtered_rows_ne_type_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100


filtered_rows_probable_cause_na = clusters_ne_id_loc_name_filtered[
    clusters_ne_id_loc_name_filtered["probable_cause"].isna()
]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100


print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 94.10%
Percentuale di righe con ne_type uguale a NA: 0.00%
Percentuale di righe con probable_cause uguale a NA: 0.00%


In [8]:
total_rows = len(clusters_ne_id_ne_address_first_three_octets_filtered)


filtered_rows_ne_type_nd = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"] == "n/d"
]
num_filtered_rows_ne_type_nd = len(filtered_rows_ne_type_nd)
percentage_ne_type_nd = (num_filtered_rows_ne_type_nd / total_rows) * 100


filtered_rows_ne_type_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"].isna()
]
num_filtered_rows_ne_type_na = len(filtered_rows_ne_type_na)
percentage_ne_type_na = (num_filtered_rows_ne_type_na / total_rows) * 100

filtered_rows_probable_cause_na = clusters_ne_id_ne_address_first_three_octets_filtered[
    clusters_ne_id_ne_address_first_three_octets_filtered["probable_cause"].isna()
]
num_filtered_rows_probable_cause_na = len(filtered_rows_probable_cause_na)
percentage_probable_cause_na = (num_filtered_rows_probable_cause_na / total_rows) * 100


print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type_nd:.2f}%")
print(f"Percentuale di righe con ne_type uguale a NA: {percentage_ne_type_na:.2f}%")
print(
    f"Percentuale di righe con probable_cause uguale a NA: {percentage_probable_cause_na:.2f}%"
)

Percentuale di righe con ne_type uguale a 'n/d': 83.43%
Percentuale di righe con ne_type uguale a NA: 0.36%
Percentuale di righe con probable_cause uguale a NA: 0.00%


### Create new column "slogan_netype"


In [9]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(
    columns={"probable_cause": "slogan"}
)

clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.rename(
        columns={"probable_cause": "slogan"}
    )
)


clusters_ne_id_loc_name_filtered["slogan_netype"] = (
    clusters_ne_id_loc_name_filtered["slogan"]
    + "_"
    + clusters_ne_id_loc_name_filtered["ne_type"]

)
clusters_ne_id_ne_address_first_three_octets_filtered["slogan_netype"] = (
    clusters_ne_id_ne_address_first_three_octets_filtered["slogan"]
    + "_"
    + clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"]
)

### Drop columns with ne_type = NA


In [10]:
clusters_ne_id_ne_address_first_three_octets_filtered = (
    clusters_ne_id_ne_address_first_three_octets_filtered.dropna(subset=["ne_type"])
)

## Preprocessing and FP-Growth


In [11]:
def create_baskets(data, aggregation_field):
    baskets = data.groupby(aggregation_field)["slogan_netype"].apply(list)
    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)

    return basket_df

In [12]:
def print_frequent_itemsets(frequent_itemsets):
    frequent_itemsets["itemsets"] = frequent_itemsets["itemsets"].apply(
        lambda x: ", ".join(list(x))
    )
    table = frequent_itemsets.values.tolist()
    return tabulate(table, headers=frequent_itemsets.columns, tablefmt="grid")

In [18]:
def print_first_N_rules(rules, N):
    rules_df = rules[
        ["antecedents", "consequents", "support", "confidence", "lift"]
    ].head(N)

    rules_df["antecedents"] = rules_df["antecedents"].apply(
        lambda x: ", ".join(list(x))
    )
    rules_df["consequents"] = rules_df["consequents"].apply(
        lambda x: ", ".join(list(x))
    )

    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers="keys", tablefmt="pretty", showindex=False))

### ITEMSETS AND ASSOCIATION RULES


In [24]:
DATASET = "ne_id_loc_name"  # ne_id_ne_address
AGGREGATION_FIELD = "cluster_id2"  # cluster_id2, cluster_id3
MIN_SUPPORT = 0.005

ASSOCIATION_RULES_METRIC = "confidence"  # confidence, lift, leverage, conviction
MIN_THRESHOLD = 0.01
NUM_ASSOCIATION_RULES = 20

In [25]:
if DATASET == "ne_id_loc_name":
    data = clusters_ne_id_loc_name_filtered
else:
    data = clusters_ne_id_ne_address_first_three_octets_filtered



# Create baskets from dataframe of clusters
basket_df = create_baskets(data, AGGREGATION_FIELD)
# Find frequent itemsets with fpgrowth
frequent_itemsets = fpgrowth(basket_df, min_support=MIN_SUPPORT, use_colnames=True)
# Sort itemsets by support
frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)



# Filter out itemsets with only one item
frequent_itemsets_copy = frequent_itemsets.copy()
frequent_itemsets_copy = frequent_itemsets_copy[

    frequent_itemsets_copy["itemsets"].apply(lambda x: len(x) > 1)

]



print("AGGREGATION: " + AGGREGATION_FIELD + "\n")
print(print_frequent_itemsets(frequent_itemsets_copy.head(20)))

AGGREGATION: cluster_id2

+------------+--------------------------------------------------------------------------------------------+
|    support | itemsets                                                                                   |
| 0.15049    | dns-node-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                                   |
+------------+--------------------------------------------------------------------------------------------+
| 0.0470239  | dns-link-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d                                   |
+------------+--------------------------------------------------------------------------------------------+
| 0.0443833  | dns-node-down-alarms_n/d, dns-link-down-alarms_n/d                                         |
+------------+--------------------------------------------------------------------------------------------+
| 0.0400484  | dns-node-down-alarms_n/d, dns-link-down-alarms_n/d, dns-nodeunmanagable-alarms_n/d         |
+-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))


In [26]:
rules = association_rules(
    frequent_itemsets, metric=ASSOCIATION_RULES_METRIC, min_threshold=MIN_THRESHOLD
)

print("AGGREGATION: " + AGGREGATION_FIELD + "\n")
print_first_N_rules(rules, NUM_ASSOCIATION_RULES)

AGGREGATION: cluster_id2


Association Rules:
+----------------------------------------------------------+----------------------------------------------------------+----------------------+----------------------+--------------------+
|                       antecedents                        |                       consequents                        |       support        |      confidence      |        lift        |
+----------------------------------------------------------+----------------------------------------------------------+----------------------+----------------------+--------------------+
|                 dns-node-down-alarms_n/d                 |              dns-nodeunmanagable-alarms_n/d              |  0.1504896028165915  |  0.8243731918997108  | 2.7672949996958454 |
|              dns-nodeunmanagable-alarms_n/d              |                 dns-node-down-alarms_n/d                 |  0.1504896028165915  |  0.5051706308169597  | 2.7672949996958454 |
|                 d