In [33]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import pairwise_distances
import umap.umap_ as umap
import plotly.express as px


# Import and view data

In [5]:
# Load the annotated papers table; the path is relative to the notebook's working directory.
data = pd.read_csv('Paradigms_of_AI_evaluation.csv')

In [6]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Paper,Link,Year,Indicator,Distribution,Subject,Measurement,Task origin,Protocol,Reference,Task Modes,Evaluators,Motivation,Discipline,Paradigm
0,MMLU,https://arxiv.org/abs/2009.03300,2021,Performance,Aggregate Case,System,Observed,Sample,Fixed,Objective,Identification,Researchers,Comparison,AI,Benchmarking
1,GPQA,https://arxiv.org/pdf/2311.12022,2023,Performance,Aggregate Case,System,Observed,Design,Fixed,Objective,Identification,Researchers,Comparison,AI,Benchmarking
2,SWE Bench,https://arxiv.org/pdf/2310.06770,2023,Performance,Aggregate Case,System,Observed,Sample,Fixed,Objective,Generation,Researchers,Comparison,AI,Benchmarking
3,The WMDP Benchmark: Measuring and Reducing Mal...,https://arxiv.org/pdf/2403.03218,2024,"Performance, Safety",Aggregate Case,System,Observed,Design,Fixed,Objective,Identification,Researchers,"Assurance, Comparison","AI, Security, Bio, Chem",Benchmarking
4,Safety Gymnasium,https://arxiv.org/pdf/2310.12567,2023,"Performance, Safety",Aggregate Case,"System, Algorithms",Observed,Design,Procedural,Objective,Generation,Researchers,Comparison,AI,Benchmarking


In [8]:
# Number of papers per Paradigm label. Combined labels (e.g. "Evals & Benchmarking")
# are counted as their own category, not split into their parts.
data['Paradigm'].value_counts()

Paradigm
Benchmarking                        69
Exploratory                         14
Construct-oriented                  12
Evals                               11
TEVV                                10
Real-World Impact                    4
Construct-oriented & Exploratory     3
Evals & Benchmarking                 2
Exploratory & Benchmarking           1
Name: count, dtype: int64

# One-hot encode the dimensions

In [9]:
# Leave out the identifier/metadata columns and the Paradigm label so that only
# the remaining columns are passed on to the one-hot encoding step.
analysis_data = data.drop(
    ["Paper", "Link", "Year", "Paradigm"],
    axis="columns",
)
analysis_data.head()

Unnamed: 0,Indicator,Distribution,Subject,Measurement,Task origin,Protocol,Reference,Task Modes,Evaluators,Motivation,Discipline
0,Performance,Aggregate Case,System,Observed,Sample,Fixed,Objective,Identification,Researchers,Comparison,AI
1,Performance,Aggregate Case,System,Observed,Design,Fixed,Objective,Identification,Researchers,Comparison,AI
2,Performance,Aggregate Case,System,Observed,Sample,Fixed,Objective,Generation,Researchers,Comparison,AI
3,"Performance, Safety",Aggregate Case,System,Observed,Design,Fixed,Objective,Identification,Researchers,"Assurance, Comparison","AI, Security, Bio, Chem"
4,"Performance, Safety",Aggregate Case,"System, Algorithms",Observed,Design,Procedural,Objective,Generation,Researchers,Comparison,AI


In [20]:
def _split_labels(value):
    """Split one comma-separated cell into stripped labels; a missing cell has no labels."""
    if pd.isna(value):
        return []
    return [entry.strip() for entry in str(value).split(',')]


def onehot_encode_all_features(df):
    """One-hot encode every column of ``df`` into boolean indicator columns.

    A column is treated as multi-label when at least one of its values is a
    string containing a comma. Each value of such a column is split on commas,
    the pieces are stripped of surrounding whitespace, and one indicator
    column is produced per distinct label. Every other column is encoded with
    ``pd.get_dummies``. Indicator columns are named ``"<column>_<value>"``.

    A missing value yields an all-False row for that column's indicators in
    both branches.

    Parameters
    ----------
    df : pandas.DataFrame
        Categorical features, one row per observation.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index as ``df``; the indicator columns of
        each input column appear in input-column order. A frame without
        columns yields an empty frame with the same index.
    """
    encoded_dfs = []  # One encoded DataFrame per input column

    for column in df.columns:
        # Work on plain Python objects so the check below behaves the same for
        # object, string and categorical dtypes (and never needs the .str accessor).
        cells = df[column].astype(object)
        is_multi_label = cells.map(
            lambda cell: isinstance(cell, str) and ',' in cell
        ).any()

        if is_multi_label:
            # Missing cells become empty label lists rather than being dropped.
            # Dropping them left gaps that pd.concat filled with NaN, and
            # NaN.astype(bool) is True, i.e. every label was switched on.
            label_lists = cells.map(_split_labels)
            mlb = MultiLabelBinarizer()
            encoded = pd.DataFrame(
                mlb.fit_transform(label_lists),
                columns=[f"{column}_{value}" for value in mlb.classes_],
                index=df.index,
            )
        else:
            # Single-label column: get_dummies already gives all-False rows for NaN.
            encoded = pd.get_dummies(df[column], prefix=column)

        encoded_dfs.append(encoded)

    if not encoded_dfs:
        # pd.concat raises on an empty list; return an empty frame on the same index.
        return pd.DataFrame(index=df.index)

    # Concatenate all encoded columns once, instead of repeatedly inside the loop
    return pd.concat(encoded_dfs, axis=1).astype(bool)


In [24]:
# Encode every column of analysis_data as boolean indicator columns.
onehot_encoded_data = onehot_encode_all_features(analysis_data)

In [25]:
# Preview the encoded frame; columns are named "<original column>_<value>".
onehot_encoded_data.head()

Unnamed: 0,Indicator_Behaviour,Indicator_Cost,Indicator_Fairness,Indicator_Performance,Indicator_Robustness and reliability,Indicator_Safety,Distribution_Aggregate Case,Distribution_Extreme Case,Distribution_Functional,Distribution_Manual inspection,...,Discipline_Climate Science,Discipline_Consulting,Discipline_Economics,Discipline_Education,Discipline_Law,Discipline_Linguistics,Discipline_Math,Discipline_Psychology,Discipline_Security,Discipline_Software Engineering
0,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,True,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
# Convert the boolean indicator frame to a NumPy array for the distance computation.
onehot_encoded_data_array = onehot_encoded_data.to_numpy()
onehot_encoded_data_array[:5,:]  # preview the first five rows

array([[False, False, False,  True, False, False,  True, False, False,
        False, False, False,  True, False,  True, False, False,  True,
        False,  True, False, False, False,  True, False, False, False,
         True, False, False,  True, False,  True, False,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False,  True, False, False,  True, False, False,
        False, False, False,  True, False,  True,  True, False, False,
        False,  True, False, False, False,  True, False, False, False,
         True, False, False,  True, False,  True, False,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False,  True, False, False,  True, False, False,
        False, False, False,  True, False,  True, False, False,  True,
        False,  True, False, False, False,  True, False, False,  True,
        False, False, False,  T

# Compute pairwise Jaccard distances

In [30]:
# Jaccard distance between every pair of rows of the boolean indicator array.
# Only one array is passed, so the result is a square row-by-row matrix.
jaccard_distance_matrix = pairwise_distances(onehot_encoded_data_array, metric="jaccard")
jaccard_distance_matrix[:10,:10]  # preview the top-left 10x10 corner

array([[0.        , 0.16666667, 0.16666667, 0.41176471, 0.5       ,
        0.5       , 0.42857143, 0.46666667, 0.3125    , 0.44444444],
       [0.16666667, 0.        , 0.30769231, 0.3125    , 0.4       ,
        0.4       , 0.53333333, 0.35714286, 0.41176471, 0.44444444],
       [0.16666667, 0.30769231, 0.        , 0.5       , 0.4       ,
        0.4       , 0.30769231, 0.5625    , 0.3125    , 0.44444444],
       [0.41176471, 0.3125    , 0.5       , 0.        , 0.47368421,
        0.47368421, 0.65      , 0.44444444, 0.54545455, 0.42857143],
       [0.5       , 0.4       , 0.4       , 0.47368421, 0.        ,
        0.        , 0.58823529, 0.52941176, 0.55      , 0.5       ],
       [0.5       , 0.4       , 0.4       , 0.47368421, 0.        ,
        0.        , 0.58823529, 0.52941176, 0.55      , 0.5       ],
       [0.42857143, 0.53333333, 0.30769231, 0.65      , 0.58823529,
        0.58823529, 0.        , 0.5625    , 0.41176471, 0.44444444],
       [0.46666667, 0.35714286, 0.5625   

# UMAP and interactive plot

In [35]:
# Project to two dimensions. metric='precomputed' means the input is already a
# pairwise distance matrix (the Jaccard distances), not a feature matrix.
# random_state is fixed to seed the embedding; as the warning in the output
# notes, setting it also keeps UMAP on a single job.
umap_model = umap.UMAP(n_components=2, metric='precomputed', random_state=4)
X_umap = umap_model.fit_transform(jaccard_distance_matrix)
X_umap[:5,:]  # preview the first five 2-D coordinates


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


using precomputed metric; inverse_transform will be unavailable


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



array([[ 8.397359 ,  2.865271 ],
       [10.395241 ,  4.1778903],
       [ 9.543428 ,  2.2098448],
       [ 9.973457 ,  3.741352 ],
       [12.459528 ,  2.9773357]], dtype=float32)

In [37]:
# Create a DataFrame for interactive plotting: the two UMAP coordinates plus the
# per-paper fields used for colouring (Paradigm) and hover text (Paper, Year, Link).
# Rows are matched by position, so X_umap must be in the same row order as `data`.
interactive_data = pd.DataFrame({
    "x": X_umap[:, 0],
    "y": X_umap[:, 1],
    "Paradigm": data['Paradigm'].values,
    "Paper": data['Paper'].values,
    "Year": data['Year'].values,
    "Link": data['Link'].values
})

# Cast Paradigm to str -- presumably so Plotly always treats it as a discrete
# category for colouring (NOTE(review): confirm this is the intent).
interactive_data['Paradigm'] = interactive_data['Paradigm'].astype(str)

# Create the interactive scatter plot
fig = px.scatter(
    interactive_data,
    x="x",
    y="y",
    color="Paradigm",
    hover_data={"x": False, "y": False,"Paper":True, "Year":True,"Link":True},  # Hide raw coordinates on hover; show paper, year and link
    color_discrete_sequence=px.colors.qualitative.D3,
    # Fixed category order: single paradigms first, combined ("A & B") labels last.
    category_orders={"Paradigm": [
        "Benchmarking", "Evals", "Construct-oriented", "Exploratory", 
        "Real-World Impact", "TEVV", "Construct-oriented & Exploratory", 
        "Evals & Benchmarking", "Exploratory & Benchmarking"
    ]}
)

fig.update_layout(
    width=1000,  # Adjust plot width
    height=800,  # Adjust plot height
    legend=dict(
        x=0,  # Position it at the left
        y=0,  # Position it at the bottom
        font=dict(size=15),
        xanchor="left",
        yanchor="bottom",
        bgcolor="rgba(255,255,255,0)",  # Alpha 0: fully transparent legend background
    ),
    plot_bgcolor="white",  # White plot background
    paper_bgcolor="white",  # White figure background
    xaxis=dict(
        title=dict(text="UMAP Dimension 1", font=dict(size=18)),
        mirror=True,  # Mirror the axis line onto the opposite side of the plot
        showline=True,  # Draw the x-axis line
        linecolor="lightgray",  
        linewidth=0.8,
    ),
    yaxis=dict(
        title=dict(text="UMAP Dimension 2", font=dict(size=18)),
        mirror=True,  # Mirror the axis line onto the opposite side of the plot
        showline=True,  # Draw the y-axis line
        linecolor="lightgray",  
        linewidth=0.8,
    )
)

# Marker styling: thin black outline, then marker size (applied to all traces).
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_traces(marker=dict(size=17))

# Optional export (currently disabled): uncomment to save the plot as an HTML file and a PNG image
#fig.write_html("UMAP_projections_jaccard_dist_interactive.html")
#fig.write_image('UMAP_projections_jaccard_dist_interactive.png', width=1000, height=800, scale=3)

fig.show()