In [73]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from tabulate import tabulate


### Load Clusters

In [74]:
clusters_ne_id_loc_name = pd.read_parquet("20240601_20240828_clusters_rdg_all_ne_id_loc_name_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet")
clusters_ne_id_ne_address_first_three_octets = pd.read_parquet("20240601_20240828_clusters_rdg_all_ne_id_ne_address_first_three_octets_aggregation_column_first_occurrence_5min_5min_5min_delta.parquet")

### Filter only important columns

In [75]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name[["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "std_probable_cause_no"]]
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets[["cluster_id", "cluster_id2", "cluster_id3", "ne_type", "std_probable_cause_no"]]

In [76]:
clusters_ne_id_ne_address_first_three_octets_filtered

Unnamed: 0,cluster_id,cluster_id2,cluster_id3,ne_type,std_probable_cause_no
48867,55805,1,1,accesspoint,
29596,55806,2,2,accesspoint,
23576,55807,3,3,accesspoint,
20715,55808,4,4,accesspoint,
39779,55809,5,5,accesspoint,
...,...,...,...,...,...
6667,39978,52678,52885,n/d,link-down
6666,39978,52678,52885,n/d,link-down
6677,39978,52678,52885,n/d,snmplinkup
6670,39978,52678,52885,n/d,link-down


## Statistics

In [77]:

total_rows = len(clusters_ne_id_loc_name_filtered)
filtered_rows_ne_type = clusters_ne_id_loc_name_filtered[clusters_ne_id_loc_name_filtered['ne_type'] == 'n/d']
num_filtered_rows_ne_type = len(filtered_rows_ne_type)
percentage_ne_type = (num_filtered_rows_ne_type / total_rows) * 100
filtered_rows_std_probable_cause_no = clusters_ne_id_loc_name_filtered[clusters_ne_id_loc_name_filtered['std_probable_cause_no'].isna()]
num_filtered_rows_std_probable_cause_no = len(filtered_rows_std_probable_cause_no)
percentage_std_probable_cause_no = (num_filtered_rows_std_probable_cause_no / total_rows) * 100

# Stampare i risultati
print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type:.2f}%")
print(f"Percentuale di righe con std_probable_cause_no uguale a NaN/Null/NA: {percentage_std_probable_cause_no:.2f}%")

Percentuale di righe con ne_type uguale a 'n/d': 94.10%
Percentuale di righe con std_probable_cause_no uguale a NaN/Null/NA: 0.00%


In [78]:

total_rows = len(clusters_ne_id_ne_address_first_three_octets_filtered)
filtered_rows_ne_type = clusters_ne_id_ne_address_first_three_octets_filtered[clusters_ne_id_ne_address_first_three_octets_filtered['ne_type'] == 'n/d']
num_filtered_rows_ne_type = len(filtered_rows_ne_type)
percentage_ne_type = (num_filtered_rows_ne_type / total_rows) * 100
filtered_rows_std_probable_cause_no = clusters_ne_id_ne_address_first_three_octets_filtered[clusters_ne_id_ne_address_first_three_octets_filtered['std_probable_cause_no'].isna()]
num_filtered_rows_std_probable_cause_no = len(filtered_rows_std_probable_cause_no)
percentage_std_probable_cause_no = (num_filtered_rows_std_probable_cause_no / total_rows) * 100

print(f"Percentuale di righe con ne_type uguale a 'n/d': {percentage_ne_type:.2f}%")
print(f"Percentuale di righe con std_probable_cause_no uguale a NaN/Null/NA: {percentage_std_probable_cause_no:.2f}%")

Percentuale di righe con ne_type uguale a 'n/d': 83.43%
Percentuale di righe con std_probable_cause_no uguale a NaN/Null/NA: 10.98%


### Create new column "slogan_netype"

In [79]:
clusters_ne_id_loc_name_filtered = clusters_ne_id_loc_name_filtered.rename(columns={"std_probable_cause_no": "slogan"})
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets_filtered.rename(columns={"std_probable_cause_no": "slogan"})

clusters_ne_id_loc_name_filtered["slogan_netype"] = clusters_ne_id_loc_name_filtered["slogan"] + "_" + clusters_ne_id_loc_name_filtered["ne_type"]
clusters_ne_id_ne_address_first_three_octets_filtered["slogan_netype"] = clusters_ne_id_ne_address_first_three_octets_filtered["slogan"] + "_" + clusters_ne_id_ne_address_first_three_octets_filtered["ne_type"]

In [80]:
clusters_ne_id_ne_address_first_three_octets_filtered = clusters_ne_id_ne_address_first_three_octets_filtered.dropna(subset=['slogan_netype'])

## Preprocessing and FP-Growth

In [81]:
def create_baskets(data,aggregation_field):
    baskets = data.groupby(aggregation_field)['slogan_netype'].apply(list)
    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)
    
    return basket_df

In [82]:
def print_frequent_itemsets(frequent_itemsets):
    frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))
    table = frequent_itemsets.values.tolist()
    return tabulate(table, headers=frequent_itemsets.columns, tablefmt='grid')

In [83]:
def print_first20_rules(rules):
    rules_df = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(20)
    
    rules_df['antecedents'] = rules_df['antecedents'].apply(lambda x: ', '.join(list(x)))
    rules_df['consequents'] = rules_df['consequents'].apply(lambda x: ', '.join(list(x)))
    
    print("\nAssociation Rules:")
    print(tabulate(rules_df, headers='keys', tablefmt='pretty', showindex=False))

### Cluster ne_id_loc_name LVL 1-2-3

In [84]:
basket_df_lvl1 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id")
basket_df_lvl2 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id2")
basket_df_lvl3 = create_baskets(clusters_ne_id_loc_name_filtered, "cluster_id3")

frequent_itemsets_lvl1 = fpgrowth(basket_df_lvl1, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl2 = fpgrowth(basket_df_lvl2, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl3 = fpgrowth(basket_df_lvl3, min_support=0.005, use_colnames=True)

# frequent_itemsets_lvl1 = frequent_itemsets_lvl1[frequent_itemsets_lvl1['itemsets'].apply(lambda x: len(x) > 1)]
# frequent_itemsets_lvl2 = frequent_itemsets_lvl2[frequent_itemsets_lvl2['itemsets'].apply(lambda x: len(x) > 1)]
# frequent_itemsets_lvl3 = frequent_itemsets_lvl3[frequent_itemsets_lvl3['itemsets'].apply(lambda x: len(x) > 1)]

frequent_itemsets_lvl1 = frequent_itemsets_lvl1.sort_values(by='support', ascending=False)
frequent_itemsets_lvl2 = frequent_itemsets_lvl2.sort_values(by='support', ascending=False)
frequent_itemsets_lvl3 = frequent_itemsets_lvl3.sort_values(by='support', ascending=False)


rules_lvl1 = association_rules(frequent_itemsets_lvl1, metric="confidence", min_threshold=0.1)
rules_lvl2 = association_rules(frequent_itemsets_lvl2, metric="confidence", min_threshold=0.1)
rules_lvl3 = association_rules(frequent_itemsets_lvl3, metric="confidence", min_threshold=0.1)

print("LVL 1")
print(print_frequent_itemsets(frequent_itemsets_lvl1.head(20)))
print("\n")
print("LVL 2")
print(print_frequent_itemsets(frequent_itemsets_lvl2.head(20)))
print("\n")
print("LVL 3")
print(print_frequent_itemsets(frequent_itemsets_lvl3.head(20)))


LVL 1
+------------+-------------------------------------------+
|    support | itemsets                                  |
| 0.42876    | link-down_n/d                             |
+------------+-------------------------------------------+
| 0.351292   | nodeunmanagable_n/d                       |
+------------+-------------------------------------------+
| 0.168442   | node-down_n/d                             |
+------------+-------------------------------------------+
| 0.137755   | node-down_n/d, nodeunmanagable_n/d        |
+------------+-------------------------------------------+
| 0.0588854  | snmplinkup_n/d                            |
+------------+-------------------------------------------+
| 0.03653    | islandgroupdown_n/d                       |
+------------+-------------------------------------------+
| 0.0248598  | islandgroupdown_n/d, nodeunmanagable_n/d  |
+------------+-------------------------------------------+
| 0.0242921  | snmpcoldstart_n/d                  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: ', '.join(list(x)))


In [85]:
print("LVL 1")
print_first20_rules(rules_lvl1)
print("\n")
print("LVL 2")
print_first20_rules(rules_lvl2)
print("\n")
print("LVL 3")
print_first20_rules(rules_lvl3)

LVL 1

Association Rules:
+----------------------------------------+---------------------+-----------------------+---------------------+--------------------+
|              antecedents               |     consequents     |        support        |     confidence      |        lift        |
+----------------------------------------+---------------------+-----------------------+---------------------+--------------------+
|             node-down_n/d              | nodeunmanagable_n/d |  0.1377554427674636   | 0.8178213896322727  | 2.328037163319928  |
|          nodeunmanagable_n/d           |    node-down_n/d    |  0.1377554427674636   | 0.3921391568841785  | 2.328037163319928  |
|          islandgroupdown_n/d           | nodeunmanagable_n/d | 0.024859756911980765  | 0.6805301645338209  | 1.9372194636622657 |
|             snmplinkup_n/d             |    link-down_n/d    | 0.018014558568184853  |  0.305925715905869  | 0.7135129737898808 |
|           snmpcoldstart_n/d            |   snmpl

### Cluster ne_id_ne_address_first_three_octets LVL 1-2-3

In [86]:
basket_df_lvl1 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id")
basket_df_lvl2 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id2")
basket_df_lvl3 = create_baskets(clusters_ne_id_ne_address_first_three_octets_filtered, "cluster_id3")

frequent_itemsets_lvl1 = fpgrowth(basket_df_lvl1, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl2 = fpgrowth(basket_df_lvl2, min_support=0.005, use_colnames=True)
frequent_itemsets_lvl3 = fpgrowth(basket_df_lvl3, min_support=0.005, use_colnames=True)


# frequent_itemsets_lvl1 = frequent_itemsets_lvl1[frequent_itemsets_lvl1['itemsets'].apply(lambda x: len(x) > 1)]
# frequent_itemsets_lvl2 = frequent_itemsets_lvl2[frequent_itemsets_lvl2['itemsets'].apply(lambda x: len(x) > 1)]
# frequent_itemsets_lvl3 = frequent_itemsets_lvl3[frequent_itemsets_lvl3['itemsets'].apply(lambda x: len(x) > 1)]

frequent_itemsets_lvl1 = frequent_itemsets_lvl1.sort_values(by='support', ascending=False)
frequent_itemsets_lvl2 = frequent_itemsets_lvl2.sort_values(by='support', ascending=False)
frequent_itemsets_lvl3 = frequent_itemsets_lvl3.sort_values(by='support', ascending=False)


# Calcolare le Regole di Associazione
rules_lvl1 = association_rules(frequent_itemsets_lvl1, metric="confidence", min_threshold=0.1)
rules_lvl2 = association_rules(frequent_itemsets_lvl2, metric="confidence", min_threshold=0.1)
rules_lvl3 = association_rules(frequent_itemsets_lvl3, metric="confidence", min_threshold=0.1)

print("LVL 1")
print(print_frequent_itemsets(frequent_itemsets_lvl1))
print("\n")
print("LVL 2")
print(print_frequent_itemsets(frequent_itemsets_lvl2))
print("\n")
print("LVL 3")
print(print_frequent_itemsets(frequent_itemsets_lvl3))


LVL 1
+------------+--------------------------------------------------------+
|    support | itemsets                                               |
| 0.428924   | link-down_n/d                                          |
+------------+--------------------------------------------------------+
| 0.351054   | nodeunmanagable_n/d                                    |
+------------+--------------------------------------------------------+
| 0.168247   | node-down_n/d                                          |
+------------+--------------------------------------------------------+
| 0.137576   | node-down_n/d, nodeunmanagable_n/d                     |
+------------+--------------------------------------------------------+
| 0.0589542  | snmplinkup_n/d                                         |
+------------+--------------------------------------------------------+
| 0.0365312  | islandgroupdown_n/d                                    |
+------------+--------------------------------------------

In [87]:
print("LVL 1")
print_first20_rules(rules_lvl1)
print("\n")
print("LVL 2")
print_first20_rules(rules_lvl2)
print("\n")
print("LVL 3")
print_first20_rules(rules_lvl3)

LVL 1

Association Rules:
+----------------------------------------+---------------------+----------------------+---------------------+---------------------+
|              antecedents               |     consequents     |       support        |     confidence      |        lift         |
+----------------------------------------+---------------------+----------------------+---------------------+---------------------+
|             node-down_n/d              | nodeunmanagable_n/d | 0.13757638494673924  |  0.817703681651285  | 2.3292849000676337  |
|          nodeunmanagable_n/d           |    node-down_n/d    | 0.13757638494673924  | 0.39189574812137357 |  2.329284900067634  |
|          islandgroupdown_n/d           | nodeunmanagable_n/d | 0.024860587037098876 | 0.6805301645338209  |  1.938536748529852  |
|             snmplinkup_n/d             |    link-down_n/d    | 0.018031856279426987 | 0.3058623619371283  | 0.7130914871881028  |
|           snmpcoldstart_n/d            |   snmpl