# In [None]:
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import random

# Seed Python's built-in RNG.
# NOTE(review): nothing in the visible code draws from `random`, and NumPy's
# RNG is not seeded here -- confirm whether this seed is still needed.
random.seed(42)

# Column labels of the metafeature table; reused as the column labels of the
# feature-importance frame built in get_data_pairwise_classification_single.
feature_names = pd.read_csv('../processed_data/metafeatures.csv', index_col=0).columns
# Row labels of the performance table; reused as the row labels (one per fold)
# of the feature-importance frame.
dataset_names = pd.read_csv('../processed_data/HAMMING LOSS example based/regression/performance.csv', index_col=0).index
# Domain name -> dataset names belonging to that domain. Each domain becomes one
# subplot row in generate_plot. Names that do not occur in the plotted frame's
# index are silently dropped by the `index.intersection` lookups.
# NOTE(review): "GNEGATIVEPSEACC" ends in "ACC" while the sibling entries end in
# "AAC" (e.g. "GPOSITIVEPSEAAC"), and "VIRUS_PSEAAC" has an underscore the others
# lack -- confirm these spellings match the index of performance.csv.
domains = {
    "Text": ["ARABIC200", "BIBTEX", "ENRON", "FOODTRUCK", "LANGLOG", "MEDICAL", "NG20", "OHSUMED", "REUTERSK500", "SCENE", "STACKEX_CHESS", "STACKEX_CS", "STACKEX_PHILOSOPHY", "TMC2007_500", "YELP", "DELICIOUS", "SLASHDOT"],
    "Bioinformatics": ["GENBASE", "GNEGATIVEGO", "GNEGATIVEPSEACC", "GPOSITIVEGO", "GPOSITIVEPSEAAC", "HUMANGO", "HUMANPSEAAC", "PLANTGO", "PLANTPSEAAC", "PROTEINS_HUMAN", "PROTEINS_PLANT", "PROTEINS_VIRUS", "VIRUSGO", "VIRUS_PSEAAC", "YEAST"],
    "Multimedia": ["BIRDS", "CAL500", "COREL5K", "EMOTIONS", "FLAGS"],
    "Medical": ["CHD_49", "ABPM"],
    "Chemistry": ["WATER_QUALITY"]
}

def get_heights(domains, df_fimps, row_height=30):
    """Return one subplot height per domain, in the iteration order of ``domains``.

    Args:
        domains: Mapping of domain name -> iterable of dataset names.
        df_fimps: DataFrame indexed by dataset name.
        row_height: Height contributed by each matching row of ``df_fimps``.

    Returns:
        A list with, for every domain, the number of ``df_fimps`` rows whose
        index label belongs to that domain, multiplied by ``row_height``.
        A domain with no matching rows yields 0.
    """
    heights = []
    for members in domains.values():
        # Only dataset names that actually occur in the frame count.
        present = df_fimps.index.intersection(members)
        n_rows = len(df_fimps.loc[present])
        heights.append(n_rows * row_height)
    return heights

def generate_plot(df_fimps, domains):
    """Display one heatmap per domain, stacked vertically, on a shared colour scale.

    Args:
        df_fimps: DataFrame whose index labels are dataset names; its columns
            are drawn along the x axis and its values are the heatmap cells.
        domains: Mapping of domain name -> list of dataset names. Each domain
            becomes one subplot row holding the rows of ``df_fimps`` whose
            index label appears in that list.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Computed once: the same heights size the individual rows and the figure.
    heights = get_heights(domains, df_fimps)

    # Create subplot figure with variable row heights
    fig = make_subplots(
        rows=len(domains),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.04,
        subplot_titles=list(domains.keys()),
        row_heights=heights,
    )

    # Global range so that a given colour means the same value in every subplot.
    zmin = df_fimps.min().min()
    zmax = df_fimps.max().max()

    # Adding heatmaps to each subplot
    for i, (domain, domain_datasets) in enumerate(domains.items(), start=1):
        domain_data = df_fimps.loc[df_fimps.index.intersection(domain_datasets)]
        fig.add_trace(
            go.Heatmap(
                z=domain_data.values,
                x=domain_data.columns,
                y=domain_data.index,
                name=domain,
                # All traces point at one layout-level colour axis. Without this
                # every trace draws its own colorbar and they pile up on top of
                # each other at the same position.
                coloraxis="coloraxis",
            ),
            row=i, col=1,
        )

    fig.update_layout(
        height=sum(heights),  # Total height is the sum of the per-domain heights
        width=900,
        autosize=True,
        # Same colour settings the traces used individually (reversed 'Reds'
        # over the global range), now defined once for the shared colorbar.
        coloraxis=dict(
            colorscale='Reds',
            reversescale=True,
            cmin=zmin,
            cmax=zmax,
        ),
    )

    fig.show()

import os

def get_pairwise_targets(run_id, seed, metric, selected_algo):
    """List the algorithm pairs involving ``selected_algo`` that have a performance file.

    Scans ``../{run_id}/seed_{seed}/{metric}/pairwise_classification/single``
    for entries whose name starts with ``"performance"``. Each such name is
    split on ``'_'``; the third part is taken as the first algorithm and the
    fifth part (up to its first ``'.'``) as the second algorithm.

    Args:
        run_id: Name of the results directory, relative to the parent of the
            current working directory.
        seed: Seed used in the ``seed_{seed}`` path component.
        metric: Metric name used as a path component.
        selected_algo: Algorithm name that must be one of the two pair members.

    Returns:
        A list of ``(first_algo, second_algo)`` tuples, ordered by file name.

    Raises:
        FileNotFoundError: If the folder does not exist.
        IndexError: If a ``performance*`` entry has fewer than five
            ``'_'``-separated parts.
    """
    folder = f"../{run_id}/seed_{seed}/{metric}/pairwise_classification/single"
    matching_pairs = []
    # os.listdir returns entries in arbitrary order; sort so that the result
    # (and everything derived from it) is reproducible across runs/platforms.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.startswith("performance"):
            continue
        parts = file_name.split('_')  # parse the name once instead of twice
        pair = (parts[2], parts[4].split('.')[0])
        if selected_algo in pair:
            matching_pairs.append(pair)
    return matching_pairs
    

def get_data_pairwise_classification_single(metric, seed, run_id):
    """Build a per-fold table of mean feature-importance ranks for the predicted algorithm.

    For each fold, the first algorithm listed in the ``AS-PC-SO_algo_name``
    column of ``../{run_id}/seed_{seed}/{metric}/AS.csv`` is looked up, every
    pairwise model involving that algorithm is loaded, and for the pairs whose
    stored prediction selects that algorithm the absolute SHAP values are
    ranked (rank 1 = largest absolute value). The ranks are averaged over those
    pairs to give one row per fold.

    Args:
        metric: Metric name; used as a path component.
        seed: Seed; used in the ``seed_{seed}`` path component.
        run_id: Results directory name; used as a path component.

    Returns:
        DataFrame indexed by the module-level ``dataset_names`` with the
        module-level ``feature_names`` as columns.

    Side effects:
        Prints intermediate values (portfolio, paths, ranks) to stdout.
    """
    # NOTE(review): learning_task and task_output are only referenced by the
    # commented-out code at the end of the loop body.
    learning_task = 'pairwise_classification'
    task_output = 'single'
    # Loaded only to be printed; not used in the computation below.
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted file -- fine for locally produced results only.
    algo_portfolio = np.load(f'../processed_data/{metric}/algo_portfolio.npy', allow_pickle=True)
    print(algo_portfolio)
    AS_file = f"../{run_id}/seed_{seed}/{metric}/AS.csv"
    # One entry per fold; each entry is split on ',' below, so it may hold
    # several algorithm names.
    predicted_algo = pd.read_csv(AS_file)['AS-PC-SO_algo_name']
    

    data = []
    # NOTE(review): the fold count is hard-coded. It must equal
    # len(dataset_names), otherwise the DataFrame construction at the end
    # fails on the index length.
    for fold in range(40):
        data_fold = []
        predicted_algos_fold = predicted_algo.iloc[fold].split(',')
        for predicted_algo_fold in predicted_algos_fold: 
            print(predicted_algo_fold)
            # All (algo_a, algo_b) pairs in which the predicted algorithm takes part.
            pairs = get_pairwise_targets(run_id, seed, metric, predicted_algo_fold)
            for pair in pairs:
                prediction_pair_file_path = f'../{run_id}/seed_{seed}/{metric}/pairwise_classification/single/predictions/test_predictions_fold_{fold}_{pair[0]}_vs_{pair[1]}.npy'
                print(prediction_pair_file_path)
                # First stored prediction; used below as an index into both
                # `pair` and `shap_pair` -- presumably a 0/1 class label, TODO confirm.
                predictions_pair = np.load(prediction_pair_file_path)[0]
                shap_pair_file_path = f'../{run_id}/seed_{seed}/{metric}/pairwise_classification/single/shap/test_shap_fold_{fold}_{pair[0]}_vs_{pair[1]}.npy'
                shap_pair = np.load(shap_pair_file_path)
                # Keep only the pairs that the predicted algorithm "won".
                if pair[predictions_pair] == predicted_algo_fold:
                    print(predictions_pair, pair, predicted_algo_fold)
                    # SHAP values for the predicted side of the pair -- assumes
                    # the first axis of shap_pair indexes the class, TODO confirm.
                    shap_fimps = shap_pair[predictions_pair]
                    shap_fimps_df = pd.DataFrame(shap_fimps)
                    # Magnitude only: the sign of a SHAP value is irrelevant for ranking.
                    shap_fimps_df = shap_fimps_df.abs()
                    print("before", shap_fimps_df)
                    # Rank within each row; the largest absolute value gets rank 1,
                    # ties share the lowest rank of their group.
                    shap_fimps_df = shap_fimps_df.rank(axis=1, method='min', ascending=False)
                    print("after", shap_fimps_df)
                    # Only the first row of the ranked frame is kept.
                    data_fold.append(list(shap_fimps_df.values[0]))
            # Unconditional break: only the first listed algorithm is processed.
            # NOTE(review): confirm that ignoring the remaining names is intended.
            break
        print(data_fold)
        # Average the ranks column-wise over all retained pairs of this fold.
        # NOTE(review): if no pair was retained, data_fold is empty and the row
        # appended below has no values, which will not match feature_names.
        data_fold_df = pd.DataFrame(data_fold)
        data_fold_df = data_fold_df.mean(axis=0)
        print(data_fold_df)


        data.append(data_fold_df.values)

        # Disabled earlier variant that read one SHAP file per fold and selected
        # the predicted algorithm by its position in the portfolio:
        # predicted_algo_fold_idx = list(algo_portfolio).index(predicted_algo_fold)
        # shap_algo_fold_file_path = f"../{run_id}/seed_{seed}/{metric}/{learning_task}/{task_output}/shap/test_shap_fold_{fold}.npy"
        # shap_algo_fold = np.load(shap_algo_fold_file_path)[predicted_algo_fold_idx]
        # data.append(shap_algo_fold[0])
    return pd.DataFrame(data, index=dataset_names, columns=feature_names)


# Run configuration: metric directory, seed directory and results root.
metric, seed, run_id = "AUCROC MICRO", 42, "results"

# Build the per-dataset feature-importance table, then draw it grouped by domain.
df_fimps = get_data_pairwise_classification_single(
    metric=metric,
    seed=seed,
    run_id=run_id,
)
generate_plot(df_fimps=df_fimps, domains=domains)
