In [None]:
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Column labels of the meta-feature table; reused as the column labels of the
# SHAP frame built by get_data_regression_multi.
feature_names = pd.read_csv(
    '../processed_data/metafeatures.csv',
    index_col=0,
).columns

# Row labels of one performance table; reused as the row labels of the SHAP
# frame built by get_data_regression_multi.
dataset_names = pd.read_csv(
    '../processed_data/HAMMING LOSS example based/regression/performance.csv',
    index_col=0,
).index

# Panel title -> dataset names shown in that panel of the heatmap figure.
# Names are matched against the frame index with Index.intersection, so a name
# that is absent from the index is skipped without any error.
# NOTE(review): "GNEGATIVEPSEACC" is spelled differently from the other
# "...PSEAAC" entries -- confirm it matches the index of performance.csv.
domains = {
    "Text": [
        "ARABIC200", "BIBTEX", "ENRON", "FOODTRUCK", "LANGLOG", "MEDICAL",
        "NG20", "OHSUMED", "REUTERSK500", "SCENE", "STACKEX_CHESS",
        "STACKEX_CS", "STACKEX_PHILOSOPHY", "TMC2007_500", "YELP",
        "DELICIOUS", "SLASHDOT",
    ],
    "Bioinformatics": [
        "GENBASE", "GNEGATIVEGO", "GNEGATIVEPSEACC", "GPOSITIVEGO",
        "GPOSITIVEPSEAAC", "HUMANGO", "HUMANPSEAAC", "PLANTGO", "PLANTPSEAAC",
        "PROTEINS_HUMAN", "PROTEINS_PLANT", "PROTEINS_VIRUS", "VIRUSGO",
        "VIRUS_PSEAAC", "YEAST",
    ],
    "Multimedia": ["BIRDS", "CAL500", "COREL5K", "EMOTIONS", "FLAGS"],
    "Medical": ["CHD_49", "ABPM"],
    "Chemistry": ["WATER_QUALITY"],
}

def get_heights(domains, df_fimps, row_height=30):
    """Return one pixel height per domain, in the iteration order of ``domains``.

    A domain's height is ``row_height`` times the number of rows of
    ``df_fimps`` whose index label appears in that domain's dataset list.
    A domain with no matching rows therefore gets a height of 0.
    """
    heights = []
    for members in domains.values():
        # Only labels that actually exist in the frame are looked up.
        present = df_fimps.index.intersection(members)
        heights.append(len(df_fimps.loc[present]) * row_height)
    return heights

def generate_plot(df_fimps, domains, output_path="../figures/fimps_macro_f1_R_MO.pdf"):
    """Draw one heatmap panel per domain, display the figure and export it.

    Parameters
    ----------
    df_fimps : pandas.DataFrame
        Values to plot. Rows are picked per panel by matching the index
        against the dataset names in ``domains``; columns go on the x axis.
    domains : dict
        Maps a panel title to the list of index labels shown in that panel.
        Panels are stacked in the dict's iteration order.
    output_path : str or None, optional
        Where the figure is written with ``fig.write_image``. Defaults to the
        path this function always used, so existing calls behave as before.
        Pass a metric-specific path when plotting several metrics, otherwise
        every call overwrites the same file. ``None`` skips the export.

    Returns
    -------
    None
    """
    # Computed once: the same list drives the relative panel heights and the
    # total figure height.
    panel_heights = get_heights(domains, df_fimps)

    # Create subplot figure with variable row heights
    fig = make_subplots(
        rows=len(domains),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.04,
        subplot_titles=list(domains.keys()),
        row_heights=panel_heights,
    )

    # Global colour limits so that every panel uses the same colour scale.
    # NOTE(review): with these limits the white midpoint of the colour scale
    # corresponds to (zmin + zmax) / 2, not to 0 -- confirm that this is the
    # intended reading for signed values.
    zmin = df_fimps.min().min()
    zmax = df_fimps.max().max()

    # Adding heatmaps to each subplot
    for row, (domain, domain_datasets) in enumerate(domains.items(), start=1):
        # Dataset names missing from the frame are silently left out.
        domain_data = df_fimps.loc[df_fimps.index.intersection(domain_datasets)]
        fig.add_trace(
            go.Heatmap(
                z=domain_data.values,
                x=domain_data.columns,
                y=domain_data.index,
                colorscale=[[0, 'red'], [0.5, 'white'], [1, '#167bb6']],  # red -> white -> blue
                zmin=zmin,
                zmax=zmax,
                name=domain,
            ),
            row=row, col=1,
        )

    # Total height is the sum of the panel heights so each dataset row gets
    # the same amount of vertical space.
    fig.update_layout(
        # title='SHAP Values Heatmap by Domain',
        height=sum(panel_heights),
        width=900,
        autosize=True,
    )

    fig.show()
    if output_path is not None:
        fig.write_image(output_path)

def get_data_regression_multi(metric, seed, run_id):
    """Collect, per fold, the SHAP row of the algorithm selected for that fold.

    Parameters
    ----------
    metric : str
        Metric name; used as a directory name in the input paths.
    seed : int
        Seed of the run; used in the ``seed_{seed}`` path component.
    run_id : str
        Name of the results directory located one level above the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the module-level ``dataset_names`` (row ``k``
        comes from fold ``k``) and one column per entry of ``feature_names``.

    Raises
    ------
    ValueError
        If a selected algorithm name is not part of the stored portfolio.
    IndexError
        If ``AS.csv`` has fewer rows than there are datasets.
    """
    learning_task = 'regression'
    task_output = 'multi'

    # NOTE: allow_pickle=True unpickles the file content, which can execute
    # arbitrary code -- only load portfolio files from a trusted source.
    # Converted to a list once so the position lookup below does not rebuild
    # it for every fold.
    algo_portfolio = list(np.load(f'../processed_data/{metric}/algo_portfolio.npy', allow_pickle=True))

    AS_file = f"../{run_id}/seed_{seed}/{metric}/AS.csv"
    predicted_algo = pd.read_csv(AS_file)['AS-R-MO_algo_name']

    # One fold per dataset: the frame below is indexed by dataset_names, so
    # the fold count is taken from it instead of being hard-coded.
    n_folds = len(dataset_names)
    shap_dir = f"../{run_id}/seed_{seed}/{metric}/{learning_task}/{task_output}/shap"

    data = []
    for fold in range(n_folds):
        predicted_algo_fold_idx = algo_portfolio.index(predicted_algo.iloc[fold])
        # The first axis of the stored array is indexed by portfolio position.
        shap_algo_fold = np.load(f"{shap_dir}/test_shap_fold_{fold}.npy")[predicted_algo_fold_idx]
        # Only the first row is kept -- presumably each fold holds a single
        # test dataset (leave-one-out); TODO confirm.
        data.append(shap_algo_fold[0])
    return pd.DataFrame(data, index=dataset_names, columns=feature_names)


# Run settings used for every metric handled below.
seed = 42
run_id = "results"

# Metric names that were also tried here: "AUCROC MICRO",
# "HAMMING LOSS example based".
for metric in ("MACRO F1",):
    print(metric)

    df_fimps = get_data_regression_multi(metric, seed, run_id)
    # Optional transforms that are currently disabled:
    #   df_fimps.abs()
    #   df_fimps.rank(axis=1, method='min', ascending=False)

    generate_plot(df_fimps, domains)


MACRO F1


In [None]:
import pandas as pd
import numpy as np
import umap
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def evaluate_umap(X, y, n_neighbors, min_dist, metric, spread, set_op_mix_ratio, local_connectivity, n_epochs):
    """Embed ``X`` with UMAP and score how well the labels ``y`` separate.

    The tunable UMAP arguments are passed through unchanged, while
    ``random_state=42``, ``learning_rate=0.1`` and ``n_jobs=1`` are fixed
    here.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)``, each computed on
        the embedding with ``y`` as the cluster labels.
    """
    reducer_kwargs = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        spread=spread,
        set_op_mix_ratio=set_op_mix_ratio,
        local_connectivity=local_connectivity,
        random_state=42,
        n_epochs=n_epochs,
        learning_rate=0.1,
        n_jobs=1,
    )
    embedding = umap.UMAP(**reducer_kwargs).fit_transform(X)
    return (
        silhouette_score(embedding, y),
        calinski_harabasz_score(embedding, y),
        davies_bouldin_score(embedding, y),
    )

# Metrics to compare; one subplot column per metric, titled with its name.
metrics = ["HAMMING LOSS example based", "MACRO F1", "AUCROC MICRO",  "MICRO F1", "F1 example based"]
fig = make_subplots(rows=1, cols=len(metrics), subplot_titles=metrics)

# Palette from which the per-algorithm colours are drawn.
color_discrete_sequence = px.colors.qualitative.Plotly

# Collect all unique algorithm names across all metrics
all_algo_names = set()
seed = 42
run_id = "results"

for metric in metrics:
    AS_file = f"../{run_id}/seed_{seed}/{metric}/AS.csv"
    predicted_algo_list = pd.read_csv(AS_file)['AS-R-MO_algo_name']
    all_algo_names.update(predicted_algo_list.unique())

# Algorithm -> colour, shared by every subplot.
# The names are enumerated in sorted order: iterating the set directly gives
# an order that can change between interpreter sessions (string hashing is
# randomised), which would reshuffle the colours and the legend on every run.
# key=str keeps the sort working if a name is missing (NaN) in one of the files.
# Colours repeat once there are more algorithms than palette entries.
color_map = {
    algo: color_discrete_sequence[i % len(color_discrete_sequence)]
    for i, algo in enumerate(sorted(all_algo_names, key=str))
}

# UMAP search grid. It does not depend on the metric, so it is built once.
# Every list currently holds a single value, i.e. the "search" evaluates one
# configuration; add values to a list to widen it.
param_grid = {
    'n_neighbors': [15],  # values noted by the author: 2, 5, 8, 11, 15, 18, 21, 25, 28, 32
    'min_dist': [0.01],  # default is 0.1; values noted by the author: 0.001, 0.01, 0.1, 0.5, 0.7, 0.9
    'metric': ['euclidean'],  # default is euclidean
    'spread': [0.5],  # default is 1.0
    'set_op_mix_ratio': [0.8],  # default is 1.0
    'local_connectivity': [3],  # default is 1
    'n_epochs': [500]  # default for small datasets
}

for i, metric in enumerate(metrics):
    df_fimps = get_data_regression_multi(metric, seed, run_id)
    scaler = StandardScaler()

    # Optional standardisation of the SHAP values (currently disabled):
    # df_fimps = pd.DataFrame(scaler.fit_transform(df_fimps), index=df_fimps.index, columns=df_fimps.columns)

    # Algorithm selected for each row; used as cluster label and point colour.
    AS_file = f"../{run_id}/seed_{seed}/{metric}/AS.csv"
    predicted_algo_list = pd.read_csv(AS_file)['AS-R-MO_algo_name']

    # Grid search: keep the configuration with the highest silhouette score.
    # Starting from -inf guarantees that any finite score is accepted, even
    # the lowest possible silhouette value of -1.
    best_score = float("-inf")
    best_params = None
    combination_count = 0

    for params in ParameterGrid(param_grid):
        combination_count += 1
        score, ch_score, db_score = evaluate_umap(df_fimps, predicted_algo_list, **params)
        print(f"Combination {combination_count}: {params} -> Score: {score}, Ch_score: {ch_score}, DB_score: {db_score}")
        if score > best_score:
            best_score = score
            best_params = params

    if best_params is None:
        # Happens when the grid is empty or every score is NaN.
        raise ValueError(f"No usable UMAP configuration found for metric {metric!r}")

    # Re-fit with the best parameters for plotting. learning_rate and n_jobs
    # repeat the values fixed inside evaluate_umap, so the embedding that is
    # drawn is the same configuration that was scored.
    umap_model = umap.UMAP(**best_params, random_state=42, learning_rate=0.1, n_jobs=1)
    X_umap = umap_model.fit_transform(df_fimps)
    umap_df = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2'])
    umap_df['Predicted_Algorithm'] = predicted_algo_list.values

    # Apply the consistent color mapping
    umap_df['color'] = umap_df['Predicted_Algorithm'].map(color_map)

    scatter = go.Scatter(
        x=umap_df['UMAP1'],
        y=umap_df['UMAP2'],
        mode='markers',
        marker=dict(color=umap_df['color']),
        text=umap_df['Predicted_Algorithm'],
        # Data traces stay out of the legend: the legend lists algorithms,
        # and a trace named after the metric would be a misleading entry.
        showlegend=False,
        name=metric
    )

    fig.add_trace(scatter, row=1, col=i+1)

# Figure-wide styling; the legend is placed just outside the right edge.
legend_style = dict(
    x=1.05,
    y=1,
    traceorder='normal',
    title='Predicted Algorithm',
)
fig.update_layout(
    plot_bgcolor='white',
    width=1800,  # Adjust width as needed
    height=600,  # Adjust height as needed
    title_x=0.5,  # Center the title
    legend=legend_style,
)

# Per subplot: axis titles plus a light-grey frame drawn in paper coordinates
# around the area spanned by the subplot's x and y axis domains.
for col in range(1, len(metrics) + 1):
    fig.update_xaxes(title_text='UMAP Component 1', row=1, col=col)
    fig.update_yaxes(title_text='UMAP Component 2', row=1, col=col)

    x_lo, x_hi = fig['layout'][f'xaxis{col}']['domain']
    y_lo, y_hi = fig['layout'][f'yaxis{col}']['domain']
    fig.add_shape(
        type='rect',
        x0=x_lo,
        y0=y_lo,
        x1=x_hi,
        y1=y_hi,
        xref='paper',
        yref='paper',
        line=dict(color="#D3D3D3"),
    )

# One legend-only trace per algorithm (no visible point) so the legend shows
# the algorithm -> colour mapping.
for algo, color in color_map.items():
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(color=color),
        legendgroup=algo,
        showlegend=True,
        name=algo,
    )
    fig.add_trace(legend_entry)

fig.show()
