In [16]:
import ast
import logging
import os
import pickle
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

from config import *

# Configure the root logger once for the notebook: DEBUG level, each message
# prefixed with a timestamp (day-month-year hour:minute:second) and its level name.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s", 
                    datefmt="%d-%b-%y %H:%M:%S")

In [49]:
class ReportGenerator:
    """Collect the learned rules and the result metrics of one dataset's experiments.

    On construction the instance

    * loads every pickled rule file found in ``<rules_folder>/<dataset>`` into a
      DataFrame (``rules_dict``: experiment name -> DataFrame) and merges them on
      the rule text into ``rules_df``;
    * reads ``results_file`` and keeps the last ``num_exps`` rows whose ``name``
      matches one of the rule files (``res_df``), adding the ``exp_name`` and
      ``initial_loss`` columns.
    """

    # Per-rule value columns produced by `extract_rule_info` (everything except "rule").
    _MASS_COLUMNS = ("mass_first", "mass_second", "uncertainty")

    def __init__(self, dataset: str, num_exps: int = 6,
                 rules_folder: str = RULE_FOLDER,
                 results_file: str = "experiments.csv",
                 res_df_cols_ignore: list = ["breaks", "mult_rules", "confusion_matrix", "label_for_dist"]):
        """Load the rules and the results of `dataset`.

        Args:
            dataset: Name of the dataset; must be an entry of `rules_folder`.
            num_exps: Number of experiments expected, both as rule files and as
                result rows.
            rules_folder: Folder holding one sub-folder of pickled rule files
                per dataset.
            results_file: CSV file with one row per experiment run.
            res_df_cols_ignore: Columns of `results_file` that are not read.

        Raises:
            AssertionError: If a path is missing, the number of experiments is
                not `num_exps`, an experiment name cannot be inferred from a
                rule file name, or the experiments found in the rules and in
                the results differ.
        """
        assert os.path.exists(rules_folder), f"Rules folder `{rules_folder}` does not exist"
        assert os.path.exists(results_file), f"Results csv file `{results_file}` does not exist"

        self.dataset = dataset
        self.dataset_lst = os.listdir(rules_folder)
        assert self.dataset in self.dataset_lst, \
            f"No rules folder found for dataset `{dataset}` in folder `{rules_folder}`, found {self.dataset_lst}"

        # ----------------------- Load rules ----------------------- #
        dataset_folder = os.path.join(rules_folder, self.dataset)
        self.rules_results_lst = os.listdir(dataset_folder)

        assert len(self.rules_results_lst) == num_exps, \
            f"Expected {num_exps} experiments, found list -> {self.rules_results_lst}"
        # The part of each file name before the first "." is what gets matched
        # against the `name` column of the results csv below.
        self.exp_full_names = [i.split(".")[0] for i in self.rules_results_lst]

        self.rules_dict = {}
        for exp_rule in self.rules_results_lst:
            exp_name = self.get_experiment_name(exp_rule)
            # Fail early instead of silently storing the rules under a `None` key.
            assert exp_name is not None, f"Cannot infer the experiment name from rule file `{exp_rule}`"

            rule_file_name = os.path.join(dataset_folder, exp_rule)
            # NOTE: unpickling can execute arbitrary code; only load trusted rule files.
            with open(rule_file_name, "rb") as rule_file:
                rule_pickle = pickle.load(rule_file)
            self.rules_dict[exp_name] = pd.DataFrame(self.extract_rule_info(rule_pickle))

        self.rules_df = self.convert_rules_dict_to_df()
        self.exp_names = list(self.rules_dict.keys())

        logging.info(f"Rules for dataset `{dataset}` loaded, found experiments {self.exp_names}")

        # ----------------------- Load df results ----------------------- #
        # Copy so the (shared) default list is never aliased by the instance.
        self.res_df_cols_ignore = list(res_df_cols_ignore)
        self.res_df = pd.read_csv(results_file, usecols=lambda x: x not in self.res_df_cols_ignore)
        # Keep only the most recent `num_exps` matching rows; `.copy()` makes the
        # column assignments below act on an independent frame.
        matching_rows = self.res_df["name"].isin(self.exp_full_names)
        self.res_df = self.res_df[matching_rows].tail(num_exps).copy()

        assert len(self.res_df) == num_exps, f"Expected {num_exps} experiments, found {len(self.res_df)}"

        self.res_df["exp_name"] = self.res_df.apply(self.get_experiment_name_for_df, axis=1)
        assert set(self.res_df["exp_name"]) == set(self.exp_names), \
            f"Most likely missing some experiments. Found {set(self.res_df['exp_name'])}, expected {set(self.exp_names)}"

        # `all_losses` holds a stringified sequence; parse it as a literal
        # (no arbitrary code execution) and keep its first element.
        self.res_df["initial_loss"] = self.res_df["all_losses"].apply(lambda x: ast.literal_eval(x)[0])
        logging.info(f"Results for dataset `{dataset}` loaded")

    def get_bar_plots_res(self, ys=["training_time", "accuracy", "f1", "min_loss", "initial_loss"],
                             show=False):
        """Build a grouped bar chart of result metrics, one group per experiment.

        Args:
            ys: Columns of `res_df` to plot as bars.
            show: If True, also display the figure via `fig.show()`.

        Returns:
            The figure created by `plotly.express.bar`.
        """
        # `barmode='group'` places the bars of one experiment next to each other.
        fig = px.bar(self.res_df, x="exp_name", y=list(ys),
                     title=f"Metrics for dataset `{self.dataset}`",
                     barmode='group')
        if show:
            fig.show()
        return fig

    def convert_rules_dict_to_df(self):
        """Merge the per-experiment rule DataFrames of `rules_dict` on the "rule" column.

        The value columns of every experiment end up suffixed with
        ``_<experiment name>``. The stored DataFrames are left untouched.

        Returns:
            The merged DataFrame (default `pd.merge` join on "rule").

        Raises:
            ValueError: If `rules_dict` is empty.
        """
        # https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
        if not self.rules_dict:
            raise ValueError("No rules loaded, nothing to merge")

        dfs_with_suffixes = [(self.rules_dict[exp_name], f"_{exp_name}") for exp_name in self.rules_dict]
        first_df, first_df_name = dfs_with_suffixes[0]
        the_rest_dfs = dfs_with_suffixes[1:]

        # Overlapping columns of the right frame get its suffix; the left ones stay as they are.
        merge_one = lambda x, y, sfx: pd.merge(x, y, on=['rule'], suffixes=("", sfx))

        merged = reduce(lambda left, right: merge_one(left, *right), the_rest_dfs, first_df)

        # The first frame's columns were never suffixed by the merges above.
        # Rename into a new frame: with a single experiment `merged` IS the
        # stored frame, and an in-place rename would alter `rules_dict`.
        first_df_columns = {col: f"{col}{first_df_name}" for col in self._MASS_COLUMNS}
        return merged.rename(columns=first_df_columns)

    @staticmethod
    def extract_rule_info(rules_info):
        """Flatten a loaded rules object into column lists.

        Args:
            rules_info: Mapping with the keys "preds" and "masses"; every item
                of "masses" is indexed at positions 0, 1 and 2.

        Returns:
            Dict with the keys "rule" (string form of each item of "preds"),
            "mass_first", "mass_second" and "uncertainty" (floats taken from
            positions 0, 1 and 2 of each item of "masses").
        """
        rules = rules_info["preds"]
        mafs = rules_info["masses"]

        rule_names = [str(i) for i in rules]
        mass_first = [float(i[0]) for i in mafs]
        mass_second = [float(i[1]) for i in mafs]
        uncertainty = [float(i[2]) for i in mafs]

        res = {"rule": rule_names, "mass_first": mass_first, "mass_second": mass_second, "uncertainty": uncertainty}
        return res

    @staticmethod
    def get_experiment_name(file_name):
        """Infer the short experiment name from a rule file name.

        The checks are ordered: the `*_no_clustering` names are tested before
        the plain "clustering" marker because they contain it as a substring.

        Returns:
            One of "uniform", "random", "means_no_clustering",
            "density_no_clustering", "kmeans", "dbscan", or None when no
            known marker is found in `file_name`.
        """
        if "uniform" in file_name:
            return "uniform"
        elif "random" in file_name:
            return "random"
        elif "means_no_clustering" in file_name:
            return "means_no_clustering"
        elif "density_no_clustering" in file_name:
            return "density_no_clustering"
        elif "clustering" in file_name:
            if "kmeans" in file_name:
                return "kmeans"
            elif "dbscan" in file_name:
                return "dbscan"
        return None

    @staticmethod
    def get_experiment_name_for_df(df):
        """Return the experiment name of one results row.

        Args:
            df: A single row of the results frame (this method is applied
                with `axis=1`).

        Returns:
            The "MAF method" value, unless it is "clustering", in which case
            the "clustering_alg" value is returned.
        """
        if df["MAF method"] != "clustering":
            return df["MAF method"]
        return df["clustering_alg"]

In [50]:
# Build the report for the "Brain Tumor" dataset; the constructor asserts that
# exactly six rule files and six matching result rows are found.
R = ReportGenerator("Brain Tumor", num_exps=6)

04-May-24 00:50:40 [INFO] Rules for dataset `Brain Tumor` loaded, found experiments ['dbscan', 'density_no_clustering', 'kmeans', 'means_no_clustering', 'random', 'uniform']
04-May-24 00:50:40 [INFO] Results for dataset `Brain Tumor` loaded


In [51]:
# Peek at the first of the filtered result rows (includes the derived
# `exp_name` and `initial_loss` columns).
R.res_df.head(1)

Unnamed: 0,datetime,name,MAF method,dataset,accuracy,f1,training_time,epochs,min_loss,all_losses,clustering_alg,exp_name,initial_loss
96,04-05-2024 00:24:15,"dataset=Brain Tumor, label_for_dist=labels, cl...",clustering,Brain Tumor,0.976667,0.951049,11.600965,55,0.011857,"[0.033411476761102676, 0.030747447162866592, 0...",kmeans,kmeans,0.033411


In [53]:
# Grouped bar chart of the result metrics per experiment; the returned figure
# is the cell's last expression.
R.get_bar_plots_res(show=False)

In [23]:
from pandas import option_context

# Print the full (untruncated) experiment names next to the derived labels.
# The results frame is stored on the instance as `res_df`; `df_res` does not
# exist on ReportGenerator and fails on a fresh kernel.
with option_context('display.max_colwidth', 400):
    print(R.res_df[["name", "clustering_alg", "exp_name"]])




                                                                                                                               name  \
96                  dataset=Brain Tumor, label_for_dist=labels, clust=kmeans, breaks=3, add_mult_rules=False, maf_method=clustering   
97                  dataset=Brain Tumor, label_for_dist=labels, clust=dbscan, breaks=3, add_mult_rules=False, maf_method=clustering   
98     dataset=Brain Tumor, label_for_dist=labels, clust=means_no_clustering, breaks=3, add_mult_rules=False, maf_method=clustering   
99   dataset=Brain Tumor, label_for_dist=labels, clust=density_no_clustering, breaks=3, add_mult_rules=False, maf_method=clustering   
100                       dataset=Brain Tumor, label_for_dist=labels, clust=None, breaks=3, add_mult_rules=False, maf_method=random   
101                      dataset=Brain Tumor, label_for_dist=labels, clust=None, breaks=3, add_mult_rules=False, maf_method=uniform   

            clustering_alg               exp_name  
96

In [82]:
# NOTE(review): `merged` is not assigned in any cell visible here (the only
# `merged` in view is a local inside ReportGenerator.convert_rules_dict_to_df)
# — confirm it is defined elsewhere, or this cell fails on a fresh kernel.
merged.head(1)

Unnamed: 0,rule,mass_first,mass_second,uncertainty,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random,mass_first_uniform,mass_second_uniform,uncertainty_uniform
0,x < 0.321,0.0,0.806145,0.193855,0.0,0.834516,0.165484,0.0,0.717049,0.282951,0.0,0.758178,0.241822


In [83]:
# NOTE(review): `merged_manual` is not assigned in any cell visible here —
# confirm it is defined elsewhere, or this cell fails on a fresh kernel.
merged_manual.head(1)

Unnamed: 0,rule,mass_first,mass_second,uncertainty,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random,mass_first_uniform,mass_second_uniform,uncertainty_uniform
0,x < 0.321,0.0,0.806145,0.193855,0.0,0.834516,0.165484,0.0,0.717049,0.282951,0.0,0.758178,0.241822
