In [None]:
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Column labels of the meta-feature table; used as the column labels of the
# SHAP DataFrames built further down.
feature_names = pd.read_csv(
    '../processed_data/metafeatures.csv',
    index_col=0,
).columns

# Row labels of the performance table; used as the row labels of the SHAP
# DataFrames built further down.
# NOTE(review): the index is read from the "HAMMING LOSS example based" folder
# regardless of the metric selected in the settings -- presumably the dataset
# index is the same for every metric; confirm.
dataset_names = pd.read_csv(
    '../processed_data/HAMMING LOSS example based/regression/performance.csv',
    index_col=0,
).index

# Domain name -> dataset names belonging to it. The plotting code selects rows
# with `index.intersection`, so a name that is not present in the frame's
# index is dropped without any error.
# NOTE(review): "GNEGATIVEPSEACC" ends in "...PSEACC" while the sibling entries
# end in "...PSEAAC" -- confirm it matches the label used in the index.
domains = {
    "Text": [
        "ARABIC200",
        "BIBTEX",
        "ENRON",
        "FOODTRUCK",
        "LANGLOG",
        "MEDICAL",
        "NG20",
        "OHSUMED",
        "REUTERSK500",
        "SCENE",
        "STACKEX_CHESS",
        "STACKEX_CS",
        "STACKEX_PHILOSOPHY",
        "TMC2007_500",
        "YELP",
        "DELICIOUS",
        "SLASHDOT",
    ],
    "Bioinformatics": [
        "GENBASE",
        "GNEGATIVEGO",
        "GNEGATIVEPSEACC",
        "GPOSITIVEGO",
        "GPOSITIVEPSEAAC",
        "HUMANGO",
        "HUMANPSEAAC",
        "PLANTGO",
        "PLANTPSEAAC",
        "PROTEINS_HUMAN",
        "PROTEINS_PLANT",
        "PROTEINS_VIRUS",
        "VIRUSGO",
        "VIRUS_PSEAAC",
        "YEAST",
    ],
    "Multimedia": [
        "BIRDS",
        "CAL500",
        "COREL5K",
        "EMOTIONS",
        "FLAGS",
    ],
    "Medical": [
        "CHD_49",
        "ABPM",
    ],
    "Chemistry": [
        "WATER_QUALITY",
    ],
}


def get_heights(domains, df_fimps, row_height=30):
    """Compute one subplot height per domain from its number of rows.

    Args:
        domains: Mapping of domain name -> list of dataset names.
        df_fimps: DataFrame whose index holds dataset names.
        row_height: Height allotted to every row of a domain.

    Returns:
        A list with one entry per domain, in the iteration order of
        ``domains``: the number of rows of ``df_fimps`` whose index label
        appears in the domain's list, multiplied by ``row_height``.
    """
    heights = []
    for members in domains.values():
        # Only names actually present in the frame contribute to the height.
        present = df_fimps.index.intersection(members)
        n_rows = len(df_fimps.loc[present])
        heights.append(n_rows * row_height)
    return heights

def generate_plot(df_fimps, domains):
    """Show one heatmap per domain, stacked vertically in a single figure.

    Each subplot displays the rows of ``df_fimps`` whose index label belongs
    to the corresponding domain. All heatmaps share the same colour limits,
    taken from the minimum and maximum over the whole of ``df_fimps``.

    Args:
        df_fimps: DataFrame indexed by dataset name, one column per feature.
        domains: Mapping of domain name -> list of dataset names. One subplot
            row is created per entry, in iteration order.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Computed once and reused for both the relative row heights and the
    # total figure height.
    heights = get_heights(domains, df_fimps)

    # The row count is derived from the `domains` argument itself so that it
    # always agrees with `subplot_titles` and `row_heights`, whatever mapping
    # the caller passes in.
    fig = make_subplots(
        rows=len(domains),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.04,
        subplot_titles=list(domains.keys()),
        row_heights=heights
    )

    # Global limits so the same colour means the same value in every subplot.
    # NOTE(review): the white stop sits at 0.5 of the colourscale, i.e. at the
    # midpoint between zmin and zmax, which is not necessarily the value 0 --
    # confirm whether the scale is meant to be centred on zero.
    zmin = df_fimps.min().min()
    zmax = df_fimps.max().max()

    for i, (domain, domain_datasets) in enumerate(domains.items(), start=1):
        # Names missing from the frame's index are silently skipped.
        domain_data = df_fimps.loc[df_fimps.index.intersection(domain_datasets)]
        fig.add_trace(
            go.Heatmap(
                z=domain_data.values,
                x=domain_data.columns,
                y=domain_data.index,
                colorscale=[[0, 'red'], [0.5, 'white'], [1, '#167bb6']],
                zmin=zmin,
                zmax=zmax,
                name=domain
            ),
            row=i, col=1
        )

    # Total height is the sum of the per-domain heights, so every heatmap row
    # gets roughly the same vertical space.
    fig.update_layout(
        height=sum(heights),
        width=900,
        autosize=True
    )

    fig.show()

def get_data_for_target(target_col, task_output, num_folds=40):
    """Load the per-fold test SHAP values of one target into a DataFrame.

    The file location is built from the module-level settings ``run_id``,
    ``seed``, ``metric`` and ``learning_task``, which are read when the
    function is called.

    Args:
        target_col: Index of the target. For ``task_output == 'multi'`` it
            indexes the first axis of the array stored per fold; otherwise it
            is part of the file name.
        task_output: Name of the output sub-folder. ``'multi'`` selects the
            one-file-per-fold layout; any other value selects the
            one-file-per-fold-and-target layout.
        num_folds: Number of fold files to read (folds ``0 .. num_folds - 1``).
            The resulting rows are labelled with ``dataset_names``, so this
            must equal ``len(dataset_names)``.

    Returns:
        DataFrame with one row per fold (labelled by ``dataset_names``) and
        one column per entry of ``feature_names``.
    """
    shap_dir = f"../{run_id}/seed_{seed}/{metric}/{learning_task}/{task_output}/shap"

    rows = []
    for fold in range(num_folds):
        if task_output == 'multi':
            # One file per fold holding every target; pick the target, then
            # the first entry of that target's array.
            fold_shap = np.load(f"{shap_dir}/test_shap_fold_{fold}.npy")
            rows.append(fold_shap[target_col][0])
        else:
            # One file per fold and target; take its first entry.
            fold_shap = np.load(f"{shap_dir}/test_shap_fold_{fold}_{target_col}.npy")
            rows.append(fold_shap[0])

    return pd.DataFrame(rows, index=dataset_names, columns=feature_names)


# Run configuration. These module-level names are read by
# get_data_for_target() when it builds the SHAP file paths.
run_id = "results"
seed = 42
metric = "F1 example based"
learning_task = 'regression'

# Which SHAP output to visualise.
task_output = 'multi'
target_col = 1

# Number of domains (one subplot row per domain).
num_domains = len(domains)


# Load the SHAP values of the selected target and draw the per-domain heatmaps.
df_fimps = get_data_for_target(target_col=target_col, task_output=task_output)
generate_plot(df_fimps=df_fimps, domains=domains)


In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Column labels (features) and row labels (datasets) for the SHAP DataFrames.
feature_names = pd.read_csv(
    '../processed_data/metafeatures.csv',
    index_col=0,
).columns
dataset_names = pd.read_csv(
    '../processed_data/HAMMING LOSS example based/regression/performance.csv',
    index_col=0,
).index


# Domain name -> dataset names belonging to it.
# NOTE(review): "GNEGATIVEPSEACC" ends in "...PSEACC" while the sibling entries
# end in "...PSEAAC" -- confirm it matches the label used in the index.
domains = {
    "Text": [
        "ARABIC200",
        "BIBTEX",
        "ENRON",
        "FOODTRUCK",
        "LANGLOG",
        "MEDICAL",
        "NG20",
        "OHSUMED",
        "REUTERSK500",
        "SCENE",
        "STACKEX_CHESS",
        "STACKEX_CS",
        "STACKEX_PHILOSOPHY",
        "TMC2007_500",
        "YELP",
        "DELICIOUS",
        "SLASHDOT",
    ],
    "Bioinformatics": [
        "GENBASE",
        "GNEGATIVEGO",
        "GNEGATIVEPSEACC",
        "GPOSITIVEGO",
        "GPOSITIVEPSEAAC",
        "HUMANGO",
        "HUMANPSEAAC",
        "PLANTGO",
        "PLANTPSEAAC",
        "PROTEINS_HUMAN",
        "PROTEINS_PLANT",
        "PROTEINS_VIRUS",
        "VIRUSGO",
        "VIRUS_PSEAAC",
        "YEAST",
    ],
    "Multimedia": [
        "BIRDS",
        "CAL500",
        "COREL5K",
        "EMOTIONS",
        "FLAGS",
    ],
    "Medical": [
        "CHD_49",
        "ABPM",
    ],
    "Chemistry": [
        'WATER_QUALITY',
    ],
}

# Run configuration used to build the SHAP file paths below.
# NOTE(review): these assignments rebind the same module-level names that
# get_data_for_target() reads, and `seed` here (2) differs from the value set
# in the previous cell (42) -- confirm that this is intended.
run_id = "results"
seed = 2
metric = "F1 example based"
learning_task = 'regression'
task_output = 'multi'
num_targets = 6

# Number of fold files to read (folds 0 .. num_folds - 1).
num_folds = 40

# dfs["target_<k>"] collects one SHAP row per fold for target k.
dfs = {f"target_{i}": [] for i in range(num_targets)}

for fold in range(num_folds):
    file_path = f"../{run_id}/seed_{seed}/{metric}/{learning_task}/{task_output}/shap/test_shap_fold_{fold}.npy"
    shap_dataset_instance = np.load(file_path)
    # Iterating the loaded array walks its first axis, i.e. one entry per
    # target; the first element of each entry is kept as that fold's row.
    for target, target_shap in enumerate(shap_dataset_instance):
        # setdefault keeps the loop from raising KeyError when a file holds
        # more targets than `num_targets`; the extra targets get their own list.
        dfs.setdefault(f"target_{target}", []).append(target_shap[0])

# Upper bound on the number of targets drawn, starting from target 0.
# The default of 1 shows only the first target; raise it (up to `num_targets`)
# to draw more figures.
# NOTE(review): every figure uses the same title, so figures for different
# targets cannot be told apart by their title alone.
max_figures = 1

for i in range(min(num_targets, max_figures)):
    # One row per fold (labelled with the dataset names), one column per feature.
    df = pd.DataFrame(dfs[f"target_{i}"], columns=feature_names, index=dataset_names)

    # Heatmap with a red -> white -> blue colourscale.
    # NOTE(review): no zmin/zmax/zmid is given, so the white stop (0.5) sits at
    # the midpoint of this frame's value range, not necessarily at 0 -- confirm.
    fig = go.Figure(data=go.Heatmap(
        z=df.values,    # Heatmap values
        x=df.columns,   # Column labels
        y=df.index,     # Row labels
        colorscale=[[0, 'red'], [0.5, 'white'], [1, '#10B2E2']],
    ))

    # Fixed-size layout; autosize is disabled so width and height take effect.
    fig.update_layout(
        title='Heatmap of DataFrame',
        xaxis_nticks=36,
        autosize=False,
        width=800,
        height=1200,
    )

    fig.show()