In [1]:
import pickle 
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import pandas as pd

import os

In [2]:
import logging
# Root-logger setup for the whole notebook: DEBUG and above, timestamped lines.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)

In [3]:
# Directory holding the pickled rule files that the rest of the notebook loads.
RULE_FOLDER = "rules_saved_clean"

assert os.path.exists(RULE_FOLDER), f"Folder {RULE_FOLDER} does not exist"

# os.listdir returns entries in arbitrary order. Sorting makes `rules[0]` and
# the order of the pairs built from this list reproducible across runs and
# machines; it also places "...maf_method=clustering" before
# "...maf_method=random", which the ('_kmeans', '_random') merge suffixes
# used further down rely on.
rules = sorted(os.listdir(RULE_FOLDER))
logging.info(f"Found {len(rules)} rules")

28-Apr-24 00:38:58 [INFO] Found 25 rules


In [4]:
def get_pairs(rules):
    """Pair each ``maf_method`` file name with the other names sharing its prefix.

    For every name containing ``'maf_method'``, the text before the first
    ``'maf_method'`` is used as a prefix; every *other* name starting with
    that prefix forms a ``(rule1, rule2)`` pair with it. A pair is skipped
    when its reversed form has already been recorded, so two matching names
    produce one pair, ordered by which of them comes first in ``rules``.

    Args:
        rules: iterable of file-name strings (it is iterated repeatedly, so
            it must be re-iterable, e.g. a list).

    Returns:
        list of ``(rule1, rule2)`` tuples in discovery order.
    """
    pairs = []
    # Mirrors `pairs` so the reversed-pair check is a constant-time lookup
    # instead of a scan of the growing list.
    seen = set()
    for rule1 in rules:
        if 'maf_method' not in rule1:
            continue
        # Loop-invariant for the inner loop, so compute it once per rule1.
        prefix = rule1.split('maf_method')[0]
        for rule2 in rules:
            if rule1 == rule2 or (rule2, rule1) in seen:
                continue
            if rule2.startswith(prefix):
                pair = (rule1, rule2)
                pairs.append(pair)
                seen.add(pair)
    return pairs

def extract_res(rules_info):
    """Flatten a loaded rule-set mapping into plain column lists.

    Args:
        rules_info: mapping with a ``"preds"`` entry (the rules) and a
            ``"masses"`` entry (one indexable item per rule, read at
            positions 0, 1 and 2).

    Returns:
        dict with keys ``"rule"`` (``str`` of each rule), ``"mass_first"``,
        ``"mass_second"`` and ``"uncertainty"`` (positions 0, 1 and 2 of each
        masses item, converted to ``float``).
    """
    predicates = rules_info["preds"]
    masses = rules_info["masses"]

    def column(position):
        # One component of every masses item, as a plain Python float.
        return [float(entry[position]) for entry in masses]

    return {
        "rule": [str(predicate) for predicate in predicates],
        "mass_first": column(0),
        "mass_second": column(1),
        "uncertainty": column(2),
    }

In [10]:
# Allowed experiment names (file names without extension): every dataset below
# combined with both maf methods, "clustering" first and "random" second.
lst = [
    f"dataset={dataset}, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method={method}"
    for dataset in ("Brain Tumor", "breast-cancer-wisconsin", "gaussian_df", "rectangle_df", "uniform_df")
    for method in ("clustering", "random")
]
len(lst)

10

In [11]:
# Build the candidate (rule1, rule2) file-name pairs from the directory listing.
pairs = get_pairs(rules)

In [21]:

def filter_pairs(pairs, allowed_lst):
    """Keep the pairs whose two file names are both allowed.

    Each file name is compared without its final extension, so
    ``"name.dsb"`` matches an allowed entry ``"name"``.

    Args:
        pairs: iterable of ``(first, second)`` file-name tuples.
        allowed_lst: collection of allowed names, written without extension.

    Returns:
        list of the ``(first, second)`` tuples from ``pairs`` for which both
        names are in ``allowed_lst``, in their original order.
    """
    def stem(file_name):
        # splitext removes only the last extension. The previous
        # split(".")[0] cut at the first dot, so a name with an inner dot
        # (e.g. "kmeans_wine.csv, ... .dsb") could never match.
        return os.path.splitext(file_name)[0]

    return [
        (first, second)
        for first, second in pairs
        if stem(first) in allowed_lst and stem(second) in allowed_lst
    ]


# Keep only the pairs whose two files are both named in `lst`, then display them.
pairs_new = filter_pairs(pairs,lst)
pairs_new

[('dataset=Brain Tumor, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=clustering.dsb',
  'dataset=Brain Tumor, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=random.dsb'),
 ('dataset=breast-cancer-wisconsin, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=clustering.dsb',
  'dataset=breast-cancer-wisconsin, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=random.dsb'),
 ('dataset=gaussian_df, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=clustering.dsb',
  'dataset=gaussian_df, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=random.dsb'),
 ('dataset=rectangle_df, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=clustering.dsb',
  'dataset=rectangle_df, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=random.dsb'),
 ('dataset=uniform_df, lab

In [24]:


# Merged comparison frame per pair, keyed by the first file name of the pair.
dfs = {}

# Only the second filtered pair is processed here; widen the list to cover more.
for r1, r2 in [pairs_new[1]]:
    print(r1, r2, sep="\n")
    # NOTE: pickle can execute arbitrary code while loading, so only open rule
    # files from a trusted source. The `with` blocks close the handles that
    # bare open(...) calls would leave dangling.
    with open(os.path.join(RULE_FOLDER, r1), "rb") as r1_file:
        r1_info = pickle.load(r1_file)
    with open(os.path.join(RULE_FOLDER, r2), "rb") as r2_file:
        r2_info = pickle.load(r2_file)

    r1_df = pd.DataFrame(extract_res(r1_info))
    r2_df = pd.DataFrame(extract_res(r2_info))

    # Join on the rule text; r1's columns get the '_kmeans' suffix and r2's
    # the '_random' suffix, so the pair order decides which label each gets.
    merged = pd.merge(r1_df, r2_df, on="rule", suffixes=('_kmeans', '_random'))
    # merged.to_csv(f"rules_{r1}_{r2}.csv", index=False)
    print(merged)
    print(f"Num rules {len(merged)}")
    fig = px.bar(merged, x='rule', y=['uncertainty_kmeans', 'uncertainty_random'],
        title='Uncertainty comparison between kmeans and random',
        barmode='group')  # put bars next to each other
    # fig.write_html(f"{r1}_{r2}.html")
    dfs[r1] = merged
    fig.show()

dataset=breast-cancer-wisconsin, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=clustering.dsb
dataset=breast-cancer-wisconsin, label_for_dist=labels, clust=kmeans breaks=3, add_mult_rules=False, maf_method=random.dsb
                                 rule  mass_first_kmeans  mass_second_kmeans  \
0             clump_thickness < 2.569           0.725134            0.111726   
1     2.569 < clump_thickness < 4.454           0.638576            0.183734   
2     4.454 < clump_thickness < 6.339           0.568323            0.221567   
3             clump_thickness > 6.339           0.074992            0.590052   
4             size_uniformity < 1.066           0.820435            0.020243   
5     1.066 < size_uniformity < 3.100           0.369722            0.334375   
6     3.100 < size_uniformity < 5.134           0.059759            0.662165   
7             size_uniformity > 5.134           0.140512            0.534107   
8            shape_uniformity 

In [55]:
# Hand-picked index labels of the rows to plot.
# NOTE(review): 18 appears twice, so that row is selected (and drawn) twice —
# confirm whether a different label was intended.
inds = [3, 31, 18, 7, 15, 34, 18]
# Adds the column to `merged` in place, so every other reference to this frame
# sees it too. `merged` is assigned in more than one cell of this notebook, so
# which frame is modified depends on execution order.
merged["ratio"] = merged["mass_second_kmeans"] / merged["mass_first_kmeans"]
m = merged.sort_values(by="uncertainty_kmeans", ascending=True)
# .loc selects by index label, which sort_values keeps, so `inds` refers to the
# original row labels rather than positions in the sorted frame.
px.scatter(m.loc[inds], x="uncertainty_kmeans", y="ratio", hover_data=["rule"])

In [60]:
# Show the rows labelled 7 and 3 without the *_random columns.
m.loc[[7, 3]].drop(
    columns=["mass_first_random", "mass_second_random", "uncertainty_random"]
)

Unnamed: 0,rule,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,ratio
7,size_uniformity > 5.134,0.140512,0.534107,0.325381,3.801164
3,clump_thickness > 6.339,0.074992,0.590052,0.334956,7.868188


In [7]:
# Load the first listed rule file.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
# The context manager closes the handle that a bare open(...) would leak.
with open(os.path.join(RULE_FOLDER, rules[0]), "rb") as rule_file:
    rules_info = pickle.load(rule_file)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# File names of the two rule sets compared below; the loading cell resolves
# them against RULE_FOLDER. name_1 is the maf_method=random run and name_2 the
# maf_method=kmeans run.
name_1 = "kmeans_wine.csv, 3 single breaks, maf_method=random.dsb"
name_2 = "kmeans_wine.csv, 3 single breaks, maf_method=kmeans.dsb"

In [13]:
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
# Context managers close the handles that bare open(...) calls would leak.
with open(os.path.join(RULE_FOLDER, name_1), "rb") as random_file:
    rules_info_random = pickle.load(random_file)
with open(os.path.join(RULE_FOLDER, name_2), "rb") as kmeans_file:
    rules_info_kmeans = pickle.load(kmeans_file)

In [30]:
def extract_res(rules_info):
    """Convert a loaded rule-set mapping into a dict of equal-purpose columns.

    NOTE(review): this repeats the `extract_res` defined earlier in the
    notebook; whichever cell ran last is the definition in effect.

    Args:
        rules_info: mapping providing ``"preds"`` (the rules) and
            ``"masses"`` (one indexable item per rule, read at positions
            0, 1 and 2).

    Returns:
        dict with ``"rule"`` (each rule as ``str``) followed by
        ``"mass_first"``, ``"mass_second"`` and ``"uncertainty"``, holding
        positions 0, 1 and 2 of every masses item as ``float``.
    """
    preds = rules_info["preds"]
    triples = rules_info["masses"]

    res = {"rule": list(map(str, preds))}
    # Position in each masses item -> output column name.
    for slot, key in enumerate(("mass_first", "mass_second", "uncertainty")):
        res[key] = [float(triple[slot]) for triple in triples]
    return res

In [31]:
# Flatten both loaded rule sets into column dicts.
# NOTE(review): the name `random` would shadow the standard-library module of
# the same name if it is ever imported in this notebook.
kmeans = extract_res(rules_info_kmeans)
random = extract_res(rules_info_random)

In [41]:
kmeans_df = pd.DataFrame(kmeans)
random_df = pd.DataFrame(random)

# Join the two frames on the rule text; overlapping column names are told
# apart by the '_kmeans' / '_random' suffixes.
merged = kmeans_df.merge(
    random_df,
    on="rule",
    suffixes=('_kmeans', '_random'),
)
merged.head(1)

Unnamed: 0,rule,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random
0,fixed acidity < 6.350,0.231988,0.532523,0.23549,0.019433,0.287232,0.693334


In [44]:
# Grouped bar chart of the two uncertainty columns, one group per rule.
px.bar(
    merged,
    x='rule',
    y=['uncertainty_kmeans', 'uncertainty_random'],
    title='Uncertainty comparison between kmeans and random',
    barmode='group',  # put bars next to each other
)