In [39]:
import os
import pickle 
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

from config import *

from functools import reduce 

import logging
# Configure the root logger once for the whole notebook: every record from
# DEBUG upwards is emitted as "<day-month-year time> [LEVEL] message".
# NOTE(review): DEBUG on the root logger may also surface debug records from
# imported libraries - confirm this verbosity is intended.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s", 
                    datefmt="%d-%b-%y %H:%M:%S")

In [112]:
class ReportGenerator:
    """Gather the learned rules and the logged metrics of one dataset's experiments.

    On construction the object:

    * loads every pickled rules file found in ``<rules_folder>/<dataset>`` into a
      per-experiment DataFrame (``rules_dict``) and joins them on the rule text
      (``rules_df``);
    * reads ``results_file`` and keeps the last ``num_exps`` rows whose ``name``
      matches one of the rules files (``df_res``), adding an ``exp_name`` column.

    Attributes set by ``__init__``:
        dataset, dataset_lst, rules_results_lst, exp_full_names, rules_dict,
        rules_df, exp_names, df_res.
    """

    def __init__(self, dataset: str, num_exps: int = 4,
                 rules_folder: str = RULE_FOLDER, 
                 results_file: str = "experiments.csv"):
        """Load the rules and the results rows for ``dataset``.

        Args:
            dataset: Name of the sub-folder of ``rules_folder`` holding the
                pickled rules of this dataset.
            num_exps: Exact number of rules files (and of results rows) expected.
            rules_folder: Folder containing one sub-folder per dataset. Defaults
                to ``RULE_FOLDER``, which is not defined in this cell
                (presumably provided by ``from config import *``).
            results_file: CSV file with one row per logged experiment; it must
                have the columns ``name``, ``MAF method`` and ``clustering_alg``.

        Raises:
            AssertionError: If a path is missing, the dataset folder is absent,
                the number of rules files or results rows differs from
                ``num_exps``, a rules file name matches no known experiment
                type, or two rules files map to the same experiment type.
        """
        assert os.path.exists(rules_folder), f"Rules folder `{rules_folder}` does not exist"
        assert os.path.exists(results_file), f"Results csv file `{results_file}` does not exist"

        self.dataset = dataset
        self.dataset_lst = os.listdir(rules_folder)
        assert self.dataset in self.dataset_lst, \
            f"No rules folder found for dataset `{dataset}` in folder `{rules_folder}`, found {self.dataset_lst}"

        # ----------------------- Load rules ----------------------- #
        dataset_folder = os.path.join(rules_folder, self.dataset)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # experiment order (and the column order of `rules_df`) reproducible.
        self.rules_results_lst = sorted(os.listdir(dataset_folder))

        assert len(self.rules_results_lst) == num_exps, \
            f"Expected {num_exps} experiments, found list -> {self.rules_results_lst}"
        # NOTE(review): everything after the FIRST dot is dropped, so a file name
        # that itself contains a dot (e.g. a float parameter) would be truncated -
        # confirm experiment names never contain dots.
        self.exp_full_names = [i.split(".")[0] for i in self.rules_results_lst]

        self.rules_dict = {}
        for exp_rule in self.rules_results_lst:
            exp_name = self.get_experiment_name(exp_rule)
            # Fail loudly instead of storing rules under a `None` key or
            # silently overwriting an experiment that was already loaded.
            assert exp_name is not None, \
                f"Cannot infer the experiment type from rules file `{exp_rule}`"
            assert exp_name not in self.rules_dict, \
                f"More than one rules file maps to experiment `{exp_name}`: {self.rules_results_lst}"

            rule_file_name = os.path.join(dataset_folder, exp_rule)
            # NOTE: unpickling can execute arbitrary code; only load rules files
            # produced by trusted runs.
            with open(rule_file_name, "rb") as rule_file:
                rule_pickle = pickle.load(rule_file)
            self.rules_dict[exp_name] = pd.DataFrame(self.extract_rule_info(rule_pickle))

        self.rules_df = self.convert_rules_dict_to_df()
        self.exp_names = list(self.rules_dict.keys())

        logging.info(f"Rules for dataset `{dataset}` loaded, found experiments {self.exp_names}")

        # ----------------------- Load results ----------------------- #
        all_results = pd.read_csv(results_file)
        matching = all_results[all_results["name"].isin(self.exp_full_names)]
        # Keep only the most recent rows; `.copy()` makes `df_res` an independent
        # frame so adding a column below does not write into a slice.
        self.df_res = matching.tail(num_exps).copy()

        assert len(self.df_res) == num_exps, f"Expected {num_exps} experiments, found {len(self.df_res)}"

        self.df_res["exp_name"] = self.df_res.apply(self.get_experiment_name_for_df, axis=1)

        logging.info(f"Results for dataset `{dataset}` loaded") 

    def convert_rules_dict_to_df(self) -> pd.DataFrame:
        """Join the per-experiment rule tables into one wide DataFrame.

        Every column except ``rule`` is suffixed with ``_<exp_name>`` and the
        tables are inner-joined on ``rule``, in the iteration order of
        ``self.rules_dict``. The frames stored in ``self.rules_dict`` are left
        untouched.

        Returns:
            A DataFrame with a ``rule`` column followed by the suffixed columns
            of each experiment; a frame with only an empty ``rule`` column when
            ``self.rules_dict`` is empty.
        """
        if not self.rules_dict:
            return pd.DataFrame(columns=["rule"])

        # Suffix every experiment's value columns up front (on a renamed copy),
        # so the merge never has to disambiguate clashing column names.
        suffixed_frames = []
        for exp_name, rules in self.rules_dict.items():
            new_names = {col: f"{col}_{exp_name}" for col in rules.columns if col != "rule"}
            suffixed_frames.append(rules.rename(columns=new_names))

        return reduce(lambda left, right: pd.merge(left, right, on=["rule"]), suffixed_frames)

    @staticmethod
    def extract_rule_info(rules_info) -> dict:
        """Flatten a loaded rules object into column lists.

        Args:
            rules_info: Mapping with a ``"preds"`` entry (iterable of rule
                objects, converted with ``str``) and a ``"masses"`` entry
                (iterable whose items expose at least three numeric values by
                index).

        Returns:
            A dict of equally ordered lists under the keys ``rule``,
            ``mass_first``, ``mass_second`` and ``uncertainty`` (taken from
            indices 0, 1 and 2 of every ``masses`` item, as floats).
        """
        rules = rules_info["preds"]
        mafs = rules_info["masses"]

        return {
            "rule": [str(rule) for rule in rules],
            "mass_first": [float(maf[0]) for maf in mafs],
            "mass_second": [float(maf[1]) for maf in mafs],
            "uncertainty": [float(maf[2]) for maf in mafs],
        }

    @staticmethod
    def get_experiment_name(file_name):
        """Infer the short experiment name from a rules file name.

        Checks, in order, for the substrings ``uniform``, ``random`` and
        ``clustering``; for ``clustering`` the name is ``kmeans`` or ``dbscan``
        depending on which of those substrings is present.

        Returns:
            ``"uniform"``, ``"random"``, ``"kmeans"`` or ``"dbscan"``; ``None``
            when the file name matches none of them.
        """
        if "uniform" in file_name:
            return "uniform"
        if "random" in file_name:
            return "random"
        if "clustering" in file_name:
            if "kmeans" in file_name:
                return "kmeans"
            if "dbscan" in file_name:
                return "dbscan"
        # Explicit so that callers can detect an unrecognised file name.
        return None

    @staticmethod
    def get_experiment_name_for_df(df):
        """Return the short experiment name for one results row.

        Args:
            df: A single row of the results table (this function is applied
                with ``axis=1``), exposing ``MAF method`` and ``clustering_alg``.

        Returns:
            The ``MAF method`` value, unless it is ``"clustering"``, in which
            case the ``clustering_alg`` value is returned.
        """
        if df["MAF method"] != "clustering":
            return df["MAF method"]
        return df["clustering_alg"]

In [113]:
# Build the report for the `gaussian_df` dataset, using the default number of
# experiments, rules folder and results file.
R = ReportGenerator("gaussian_df")

03-May-24 22:11:20 [INFO] Rules for dataset `gaussian_df` loaded, found experiments ['dbscan', 'kmeans', 'random', 'uniform']
03-May-24 22:11:20 [INFO] Results for dataset `gaussian_df` loaded


In [114]:
# Display the results rows kept for this dataset's experiments, including the
# derived `exp_name` column.
R.df_res




Unnamed: 0,datetime,name,MAF method,dataset,breaks,mult_rules,accuracy,f1,confusion_matrix,training_time,epochs,min_loss,all_losses,clustering_alg,label_for_dist,exp_name
63,03-05-2024 21:04:46,"dataset=gaussian_df, label_for_dist=labels, cl...",clustering,gaussian_df,3,False,0.986667,0.9875,[[69 2]\n [ 0 79]],13.159925,102,0.017054,"[0.09687478095293045, 0.09325835853815079, 0.0...",kmeans,labels,kmeans
64,03-05-2024 21:05:01,"dataset=gaussian_df, label_for_dist=labels, cl...",clustering,gaussian_df,3,False,0.986667,0.9875,[[69 2]\n [ 0 79]],13.650317,105,0.018348,"[0.10890117287635803, 0.10498002171516418, 0.1...",dbscan,labels,dbscan
65,03-05-2024 21:05:32,"dataset=gaussian_df, label_for_dist=labels, cl...",random,gaussian_df,3,False,0.986667,0.9875,[[69 2]\n [ 0 79]],31.065621,260,0.023018,"[0.2417849451303482, 0.23679682612419128, 0.23...",,labels,random
66,03-05-2024 21:05:52,"dataset=gaussian_df, label_for_dist=labels, cl...",uniform,gaussian_df,3,False,0.986667,0.9875,[[69 2]\n [ 0 79]],19.516701,190,0.023795,"[0.25, 0.2432374805212021, 0.23658305406570435...",,labels,uniform


In [82]:
# NOTE(review): `merged` is not defined in any cell visible here, and this
# cell's execution count (82) predates the class cell (112) - presumably
# leftover kernel state; confirm it survives Restart & Run All or remove it.
merged.head(1)

Unnamed: 0,rule,mass_first,mass_second,uncertainty,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random,mass_first_uniform,mass_second_uniform,uncertainty_uniform
0,x < 0.321,0.0,0.806145,0.193855,0.0,0.834516,0.165484,0.0,0.717049,0.282951,0.0,0.758178,0.241822


In [83]:
# NOTE(review): `merged_manual` is not defined in any cell visible here, and
# this cell's execution count (83) predates the class cell (112) - presumably
# leftover kernel state; confirm it survives Restart & Run All or remove it.
merged_manual.head(1)

Unnamed: 0,rule,mass_first,mass_second,uncertainty,mass_first_kmeans,mass_second_kmeans,uncertainty_kmeans,mass_first_random,mass_second_random,uncertainty_random,mass_first_uniform,mass_second_uniform,uncertainty_uniform
0,x < 0.321,0.0,0.806145,0.193855,0.0,0.834516,0.165484,0.0,0.717049,0.282951,0.0,0.758178,0.241822
