In [1]:
from transformers import ClapModel, AutoProcessor
import torch
import librosa
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load CLAP model + processor
model = ClapModel.from_pretrained("laion/larger_clap_general")
processor = AutoProcessor.from_pretrained("laion/larger_clap_general")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/776M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [3]:
print('total number of CLAP\'s parameters:', sum(p.numel() for p in model.parameters()))

total number of CLAP's parameters: 193913882


In [4]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('CLAP model size: {:.3f}MB'.format(size_all_mb))

CLAP model size: 740.294MB


# Process audio

In [5]:
import zipfile
import os

zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

extract_dir = "/content/" # You can change this if you want to extract elsewhere
os.makedirs(extract_dir, exist_ok=True)

for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        print(f"Error: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


In [6]:
audio_stimuli = []
stimuli_path = "/content/Exp1/Stimuli/"

for file in sorted(os.listdir(stimuli_path)):
    if file.endswith(".wav"):
        wav_path = os.path.join(stimuli_path, file)
        #Clap has already been trained on a sample rate of 48,000 so we should use what it knows already
        audio, sample_rate = librosa.load(wav_path, sr=48000)
        audio_stimuli.append(audio)

In [7]:
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)
audio_embeddings = model.get_audio_features(**inputs)

In [8]:
print(audio_embeddings.shape)
# Size of the 2^9 vector

torch.Size([59, 512])


# Process text

In [9]:
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]
discrete_captions_perceived = ["I perceive this sound as " + tag for tag in discrete_tags]
print(discrete_captions_perceived)

discrete_captions_induced = ["This sound makes me feel " + tag for tag in discrete_tags]
print(discrete_captions_induced)

dimensional_tags = ["positive", "relaxed", "awake"]
dimensional_captions_perceived = ["I perceive this sound as " + tag for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = ["This sound makes me feel " + tag for tag in dimensional_tags]
print(dimensional_captions_induced)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [10]:
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced

# Using CLAP processor to tokenize (text-->numerical tokens) and prepare text captions
tag_inputs = processor(text=dimensional_captions_induced, return_tensors="pt", padding=True)
# Pass processed numerical tokens (from text) through claps text encoder --> text embeddings
tag_embeds = model.get_text_features(**tag_inputs)

In [11]:
tag_inputs_disc_i = processor(text=discrete_captions_induced, return_tensors="pt", padding=True)
tag_embeds_disc_i = model.get_text_features(**tag_inputs_disc_i)

In [12]:
tag_inputs_dim_p = processor(text=dimensional_captions_perceived, return_tensors="pt", padding=True)
tag_embeds_dim_p = model.get_text_features(**tag_inputs_dim_p)

In [13]:
tag_inputs_disc_p = processor(text=discrete_captions_perceived, return_tensors="pt", padding=True)
tag_embeds_disc_p = model.get_text_features(**tag_inputs_disc_p)

# Generate outputs

In [14]:
# Similarity scores between audio and text embeddings (dot product)
sims = torch.matmul(audio_embeddings, tag_embeds.T)
print(sims.shape)

torch.Size([59, 3])


In [15]:
sims_disc_i = torch.matmul(audio_embeddings, tag_embeds_disc_i.T)
print(sims_disc_i.shape)

torch.Size([59, 5])


In [16]:
sims_dim_p = torch.matmul(audio_embeddings, tag_embeds_dim_p.T)
print(sims_dim_p.shape)

torch.Size([59, 3])


In [17]:
sims_disc_p = torch.matmul(audio_embeddings, tag_embeds_disc_p.T)
print(sims_disc_p.shape)

torch.Size([59, 5])


## Load csv files and extract related columns

In [18]:
IDim_path = '/content/Exp1/Data/IDim/'
#IDim: Indiced dimensional
IDim_responses = []

# All 65 responses concatenated to one data frame
all_dfs = []

for file in sorted(os.listdir(IDim_path)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(IDim_path, file))
        df.columns = df.columns.str.strip()
        required_columns = ['positive', 'relaxed', 'awake']
        if all(col in df.columns for col in required_columns):
            all_dfs.append(df)

mean_positive_vector = []
mean_relaxed_vector = []
mean_awake_vector = []

num_rows = all_dfs[0].shape[0]

# 59 rows
for i in range(num_rows):
    current_row_positives = []
    current_row_relaxeds = []
    current_row_awakes = []

    # For the current row index 'i', collect values from all DataFrames
    for df in all_dfs:
        current_row_positives.append(df.iloc[i]['positive'])
        current_row_relaxeds.append(df.iloc[i]['relaxed'])
        current_row_awakes.append(df.iloc[i]['awake'])

    # Calculate the mean for the current row across all files, for each column
    mean_positive_vector.append(np.mean(current_row_positives))
    mean_relaxed_vector.append(np.mean(current_row_relaxeds))
    mean_awake_vector.append(np.mean(current_row_awakes))

IDim_responses = {
    'positive_vector': mean_positive_vector,
    'relaxed_vector': mean_relaxed_vector,
    'awake_vector': mean_awake_vector
}

# One value per stimulus (mean from all participants rating)
print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
print(f"Length of positive_vector: {len(IDim_responses['positive_vector'])}")
print(f"Length of relaxed_vector: {len(IDim_responses['relaxed_vector'])}")
print(f"Length of awake_vector: {len(IDim_responses['awake_vector'])}")


--- Mean Vectors (Mean across CSVs for each row position) ---
Length of positive_vector: 59
Length of relaxed_vector: 59
Length of awake_vector: 59


In [19]:
PDim_path = '/content/Exp1/Data/PDim/'
#PDim: Percieved dimensional
PDim_responses = []

all_dfs = []

for file in sorted(os.listdir(PDim_path)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(PDim_path, file))
        df.columns = df.columns.str.strip()
        required_columns = ['positive', 'relaxed', 'awake']
        if all(col in df.columns for col in required_columns):
            all_dfs.append(df)

mean_positive_vector = []
mean_relaxed_vector = []
mean_awake_vector = []

num_rows = all_dfs[0].shape[0]

for i in range(num_rows):
    current_row_positives = []
    current_row_relaxeds = []
    current_row_awakes = []

    # For the current row index 'i', collect values from all DataFrames
    for df in all_dfs:
        current_row_positives.append(df.iloc[i]['positive'])
        current_row_relaxeds.append(df.iloc[i]['relaxed'])
        current_row_awakes.append(df.iloc[i]['awake'])

    # Calculate the mean for the current row across all files, for each column
    mean_positive_vector.append(np.mean(current_row_positives))
    mean_relaxed_vector.append(np.mean(current_row_relaxeds))
    mean_awake_vector.append(np.mean(current_row_awakes))

PDim_responses = {
    'positive_vector': mean_positive_vector,
    'relaxed_vector': mean_relaxed_vector,
    'awake_vector': mean_awake_vector
}

print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
print(f"Length of positive_vector: {len(PDim_responses['positive_vector'])}")
print(f"Length of relaxed_vector: {len(PDim_responses['relaxed_vector'])}")
print(f"Length of awake_vector: {len(PDim_responses['awake_vector'])}")


--- Mean Vectors (Mean across CSVs for each row position) ---
Length of positive_vector: 59
Length of relaxed_vector: 59
Length of awake_vector: 59


In [20]:
IDisc_path = '/content/Exp1/Data/IDisc/'
#IDisc: Induced discrete
IDisc_responses = []

all_dfs = []

for file in sorted(os.listdir(IDisc_path)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(IDisc_path, file))
        df.columns = df.columns.str.strip()
        required_columns = ["happiness", "sadness", "anger", "tenderness", "fear"]
        if all(col in df.columns for col in required_columns):
            all_dfs.append(df)

mean_happiness_vector = []
mean_sadness_vector = []
mean_anger_vector = []
mean_tenderness_vector = []
mean_fear_vector = []



num_rows = all_dfs[0].shape[0]

for i in range(num_rows):
    current_row_happiness = []
    current_row_sadness = []
    current_row_anger = []
    current_row_tenderness = []
    current_row_fear = []


    # For the current row index 'i', collect values from all DataFrames
    for df in all_dfs:
        current_row_happiness.append(df.iloc[i]['happiness'])
        current_row_sadness.append(df.iloc[i]['sadness'])
        current_row_anger.append(df.iloc[i]['anger'])
        current_row_tenderness.append(df.iloc[i]['tenderness'])
        current_row_fear.append(df.iloc[i]['fear'])


    # Calculate the mean for the current row across all files, for each column
    mean_happiness_vector.append(np.mean(current_row_happiness))
    mean_sadness_vector.append(np.mean(current_row_sadness))
    mean_anger_vector.append(np.mean(current_row_anger))
    mean_tenderness_vector.append(np.mean(current_row_tenderness))
    mean_fear_vector.append(np.mean(current_row_fear))

IDisc_responses = {
    'happiness_vector': mean_happiness_vector,
    'sadness_vector': mean_sadness_vector,
    'anger_vector': mean_anger_vector,
    'tenderness_vector': mean_tenderness_vector,
    'fear_vector': mean_fear_vector
}

print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
print(f"Length of happiness_vector: {len(IDisc_responses['happiness_vector'])}")
print(f"Length of sadness_vector: {len(IDisc_responses['sadness_vector'])}")
print(f"Length of anger_vector: {len(IDisc_responses['anger_vector'])}")
print(f"Length of tenderness_vector: {len(IDisc_responses['tenderness_vector'])}")
print(f"Length of fear_vector: {len(IDisc_responses['fear_vector'])}")



--- Mean Vectors (Mean across CSVs for each row position) ---
Length of happiness_vector: 59
Length of sadness_vector: 59
Length of anger_vector: 59
Length of tenderness_vector: 59
Length of fear_vector: 59


In [21]:
PDisc_path = '/content/Exp1/Data/PDisc/'
#PDisc: Percieved discrete
PDisc_responses = []

all_dfs = []

for file in sorted(os.listdir(PDisc_path)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(PDisc_path, file))
        df.columns = df.columns.str.strip()
        required_columns = ["happiness", "sadness", "anger", "tenderness", "fear"]
        if all(col in df.columns for col in required_columns):
            all_dfs.append(df)

mean_happiness_vector = []
mean_sadness_vector = []
mean_anger_vector = []
mean_tenderness_vector = []
mean_fear_vector = []



num_rows = all_dfs[0].shape[0]

for i in range(num_rows):
    current_row_happiness = []
    current_row_sadness = []
    current_row_anger = []
    current_row_tenderness = []
    current_row_fear = []


    # For the current row index 'i', collect values from all DataFrames
    for df in all_dfs:
        current_row_happiness.append(df.iloc[i]['happiness'])
        current_row_sadness.append(df.iloc[i]['sadness'])
        current_row_anger.append(df.iloc[i]['anger'])
        current_row_tenderness.append(df.iloc[i]['tenderness'])
        current_row_fear.append(df.iloc[i]['fear'])


    # Calculate the mean for the current row across all files, for each column
    mean_happiness_vector.append(np.mean(current_row_happiness))
    mean_sadness_vector.append(np.mean(current_row_sadness))
    mean_anger_vector.append(np.mean(current_row_anger))
    mean_tenderness_vector.append(np.mean(current_row_tenderness))
    mean_fear_vector.append(np.mean(current_row_fear))

PDisc_responses = {
    'happiness_vector': mean_happiness_vector,
    'sadness_vector': mean_sadness_vector,
    'anger_vector': mean_anger_vector,
    'tenderness_vector': mean_tenderness_vector,
    'fear_vector': mean_fear_vector
}

print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
print(f"Length of happiness_vector: {len(PDisc_responses['happiness_vector'])}")
print(f"Length of sadness_vector: {len(PDisc_responses['sadness_vector'])}")
print(f"Length of anger_vector: {len(PDisc_responses['anger_vector'])}")
print(f"Length of tenderness_vector: {len(PDisc_responses['tenderness_vector'])}")
print(f"Length of fear_vector: {len(PDisc_responses['fear_vector'])}")



--- Mean Vectors (Mean across CSVs for each row position) ---
Length of happiness_vector: 59
Length of sadness_vector: 59
Length of anger_vector: 59
Length of tenderness_vector: 59
Length of fear_vector: 59


# Evaluate

In [22]:
# Find the min and max values in the current sims tensor
old_min = sims.min()
old_max = sims.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula
# Scaling CLAPs predictions to match the human ratings scale (1-9)
scaled_sims = ((sims - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

print(f"Scaled sims shape: {scaled_sims.shape}")
print(f"Scaled sims (first 5 rows):\n{scaled_sims[:10]}\n")
print(f"Scaled sims min value: {scaled_sims.min():.4f}")
print(f"Scaled sims max value: {scaled_sims.max():.4f}\n")

human_ratings_tensor = torch.tensor([
    IDim_responses['positive_vector'],
    IDim_responses['relaxed_vector'],
    IDim_responses['awake_vector']
], dtype=torch.float32).T # Transpose to get shape (59, 3)

print(f"Human ratings tensor shape: {human_ratings_tensor.shape}")
print(f"Human ratings tensor (first 5 rows):\n{human_ratings_tensor[:10]}\n")

Scaled sims shape: torch.Size([59, 3])
Scaled sims (first 5 rows):
tensor([[5.5154, 6.2459, 4.3756],
        [5.0735, 6.2841, 3.4231],
        [4.5980, 6.2105, 4.2557],
        [3.6465, 5.2515, 4.2829],
        [4.3135, 3.9755, 2.7836],
        [3.4054, 3.1892, 2.4026],
        [5.0046, 4.9204, 3.4381],
        [5.6248, 4.9287, 3.4671],
        [5.4228, 5.2849, 3.4730],
        [5.7928, 5.7447, 4.3253]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 3])
Human ratings tensor (first 5 rows):
tensor([[4.4872, 4.8798, 4.5185],
        [4.7682, 5.3057, 4.7414],
        [5.3105, 5.7312, 5.0760],
        [5.2582, 5.2651, 5.4488],
        [3.8897, 3.9540, 5.3549],
        [4.3197, 3.8900, 5.5089],
        [4.9825, 5.1875, 5.5354],
        [5.0934, 4.8657, 5.8208],
        [4.7605, 4.8502, 5.4158],
        [5.2577, 4.8688, 5.7003]])



In [23]:
# Find the min and max values in the current sims tensor
old_min_disc_i = sims_disc_i.min()
old_max_disc_i = sims_disc_i.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula
scaled_sims_disc_i = ((sims_disc_i - old_min_disc_i) / (old_max_disc_i - old_min_disc_i)) * (new_max - new_min) + new_min

print(f"Scaled sims shape: {scaled_sims_disc_i.shape}")
print(f"Scaled sims (first 5 rows):\n{scaled_sims_disc_i[:10]}\n")
print(f"Scaled sims min value: {scaled_sims_disc_i.min():.4f}")
print(f"Scaled sims max value: {scaled_sims_disc_i.max():.4f}\n")

human_ratings_tensor_disc_i = torch.tensor([
    IDisc_responses['happiness_vector'],
    IDisc_responses['sadness_vector'],
    IDisc_responses['anger_vector'],
    IDisc_responses['tenderness_vector'],
    IDisc_responses['fear_vector']
], dtype=torch.float32).T # Transpose to get shape (59, 5)

print(f"Human ratings tensor shape: {human_ratings_tensor_disc_i.shape}")
print(f"Human ratings tensor (first 5 rows):\n{human_ratings_tensor_disc_i[:10]}\n")

Scaled sims shape: torch.Size([59, 5])
Scaled sims (first 5 rows):
tensor([[6.9297, 6.5952, 6.1260, 4.3491, 7.0391],
        [6.2437, 4.0828, 4.7225, 2.4772, 4.7005],
        [5.3833, 5.0553, 4.4227, 3.3485, 5.4494],
        [5.3806, 5.4280, 5.2407, 3.9668, 6.0074],
        [5.3131, 3.8855, 3.5048, 2.4227, 4.6176],
        [3.9015, 2.8038, 2.5668, 1.3331, 3.1783],
        [5.2924, 4.3045, 3.3048, 3.1260, 4.6689],
        [6.1300, 4.6030, 4.4492, 3.6453, 5.2633],
        [5.3839, 4.6822, 3.4106, 3.2569, 4.9301],
        [6.4754, 6.2556, 5.4380, 4.7923, 7.0795]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 5])
Human ratings tensor (first 5 rows):
tensor([[1.8202, 3.3418, 2.7948, 1.9789, 4.1430],
        [1.9091, 3.4459, 2.9059, 2.3785, 2.8558],
        [2.0644, 3.1847, 2.4182, 2.6371, 2.3356],
        [2.5321, 3.1900, 2.1509, 3.0583, 2.0938],
        [1.8391, 2.4733, 4.0144, 1.9902, 3.4942],
        [

In [24]:
# Find the min and max values in the current sims tensor
old_min_disc_p = sims_disc_p.min()
old_max_disc_p = sims_disc_p.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula
scaled_sims_disc_p = ((sims_disc_p - old_min_disc_p) / (old_max_disc_p - old_min_disc_p)) * (new_max - new_min) + new_min
print(scaled_sims_disc_p.shape)

print(f"Scaled sims shape: {scaled_sims_disc_p.shape}")
print(f"Scaled sims (first 5 rows):\n{scaled_sims_disc_p[:10]}\n")
print(f"Scaled sims min value: {scaled_sims_disc_p.min():.4f}")
print(f"Scaled sims max value: {scaled_sims_disc_p.max():.4f}\n")

human_ratings_tensor_disc_p = torch.tensor([
    PDisc_responses['happiness_vector'],
    PDisc_responses['sadness_vector'],
    PDisc_responses['anger_vector'],
    PDisc_responses['tenderness_vector'],
    PDisc_responses['fear_vector']
], dtype=torch.float32).T # Transpose to get shape (59, 5)

print(f"Human ratings tensor shape: {human_ratings_tensor_disc_p.shape}")
print(f"Human ratings tensor (first 5 rows):\n{human_ratings_tensor_disc_p[:10]}\n")

torch.Size([59, 5])
Scaled sims shape: torch.Size([59, 5])
Scaled sims (first 5 rows):
tensor([[6.1976, 6.2456, 6.1956, 4.3895, 6.9023],
        [5.2280, 4.6433, 4.6806, 2.5856, 5.4889],
        [4.7045, 4.8403, 4.5341, 3.6282, 5.1122],
        [5.2751, 5.0938, 4.4440, 3.6542, 5.5221],
        [5.6084, 5.5640, 4.5904, 4.3078, 6.1068],
        [4.9663, 5.2076, 3.9776, 4.4521, 4.9362],
        [5.1661, 5.4503, 4.0557, 4.3041, 5.1573],
        [6.4221, 6.4832, 5.3226, 5.4644, 6.3224],
        [5.8989, 5.5509, 4.4231, 4.4114, 5.4335],
        [7.6100, 7.3957, 6.3430, 6.0621, 7.3122]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 5])
Human ratings tensor (first 5 rows):
tensor([[1.5362, 4.4191, 5.1551, 2.0345, 5.6468],
        [1.7403, 4.8642, 4.2637, 2.5285, 3.9835],
        [2.0486, 5.5669, 3.0558, 2.9492, 2.8565],
        [2.4389, 4.8343, 2.1514, 3.5546, 3.0082],
        [1.5691, 3.6631, 5.0478, 1.9322

In [25]:
# Find the min and max values in the current sims tensor
old_min_dim_p = sims_dim_p.min()
old_max_dim_p = sims_dim_p.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula
scaled_sims_dim_p = ((sims_dim_p - old_min_dim_p) / (old_max_dim_p - old_min_dim_p)) * (new_max - new_min) + new_min

print(f"Scaled sims shape: {scaled_sims_dim_p.shape}")
print(f"Scaled sims (first 5 rows):\n{scaled_sims_dim_p[:10]}\n")
print(f"Scaled sims min value: {scaled_sims_dim_p.min():.4f}")
print(f"Scaled sims max value: {scaled_sims_dim_p.max():.4f}\n")

human_ratings_tensor_dim_p = torch.tensor([
    PDim_responses['positive_vector'],
    PDim_responses['relaxed_vector'],
    PDim_responses['awake_vector']
], dtype=torch.float32).T # Transpose to get shape (59, 3)

print(f"Human ratings tensor shape: {human_ratings_tensor_dim_p.shape}")
print(f"Human ratings tensor (first 5 rows):\n{human_ratings_tensor_dim_p[:10]}\n")

Scaled sims shape: torch.Size([59, 3])
Scaled sims (first 5 rows):
tensor([[5.7782, 5.8491, 4.6984],
        [4.9480, 5.7153, 3.5230],
        [4.5816, 6.5361, 3.3945],
        [4.5833, 5.6016, 3.5071],
        [5.6998, 4.3380, 4.9493],
        [4.8709, 3.5428, 4.1024],
        [4.7499, 5.7886, 4.0852],
        [5.6971, 5.5066, 4.6746],
        [5.3806, 5.5848, 3.9045],
        [6.7121, 6.6467, 5.3355]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 3])
Human ratings tensor (first 5 rows):
tensor([[3.5116, 4.4973, 3.4563],
        [4.1519, 4.9952, 3.2785],
        [4.6982, 5.4154, 4.2416],
        [5.2018, 5.2612, 4.9213],
        [3.1964, 3.4387, 3.9169],
        [3.9328, 3.8416, 4.5261],
        [4.5933, 4.8528, 4.9660],
        [4.5391, 4.6797, 5.4388],
        [4.2891, 4.7658, 4.2599],
        [4.6275, 4.4824, 5.1707]])



 # Comparison Metrics

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr # For Pearson correlation

# Comparison Method 1: Mean Absolute Error (MAE)
# MAE measures the average magnitude of the errors in a set of predictions, without considering their direction.
# Average of all absolute errors across all 59 stimuli and 3 dimensions.
# e.g An MAE of 1.5 means CLAP is typically ±1.5 points away from human ratings.
# Mean is calculated over the matrix so one value is outputted
mae = torch.mean(torch.abs(scaled_sims - human_ratings_tensor))
mae_disc_i = torch.mean(torch.abs(scaled_sims_disc_i - human_ratings_tensor_disc_i))
mae_disc_p = torch.mean(torch.abs(scaled_sims_disc_p - human_ratings_tensor_disc_p))
mae_dim_p = torch.mean(torch.abs(scaled_sims_dim_p - human_ratings_tensor_dim_p))

print(f"Mean Absolute Error (MAE) between scaled_sims and human_ratings (IDim) : {mae:.4f}")
print(f"Mean Absolute Error (MAE) between scaled_sims and human_ratings (IDisc) : {mae_disc_i:.4f}")
print(f"Mean Absolute Error (MAE) between scaled_sims and human_ratings (PDisc) : {mae_disc_p:.4f}")
print(f"Mean Absolute Error (MAE) between scaled_sims and human_ratings (PDim) : {mae_dim_p:.4f}")

# Comparison Method 2: Pearson Correlation Coefficient (per column)
# Pearson correlation measures the linear relationship between two sets of data.
# We'll calculate it for each of the three columns (dimensions).
# Which emotion dimensions does CLAP understand best?

#IDim
correlation_positive, _ = pearsonr(scaled_sims[:, 0].detach().numpy(), human_ratings_tensor[:, 0].numpy())
correlation_relaxed, _ = pearsonr(scaled_sims[:, 1].detach().numpy(), human_ratings_tensor[:, 1].numpy())
correlation_awake, _ = pearsonr(scaled_sims[:, 2].detach().numpy(), human_ratings_tensor[:, 2].numpy())

#PDim
correlation_positive_dim_p, _ = pearsonr(scaled_sims_dim_p[:, 0].detach().numpy(), human_ratings_tensor_dim_p[:, 0].numpy())
correlation_relaxed_dim_p, _ = pearsonr(scaled_sims_dim_p[:, 1].detach().numpy(), human_ratings_tensor_dim_p[:, 1].numpy())
correlation_awake_dim_p, _ = pearsonr(scaled_sims_dim_p[:, 2].detach().numpy(), human_ratings_tensor_dim_p[:, 2].numpy())

# IDisc
correlation_happiness, _ = pearsonr(scaled_sims_disc_i[:, 0].detach().numpy(), human_ratings_tensor_disc_i[:, 0].numpy())
correlation_sadness, _ = pearsonr(scaled_sims_disc_i[:, 1].detach().numpy(), human_ratings_tensor_disc_i[:, 1].numpy())
correlation_anger, _ = pearsonr(scaled_sims_disc_i[:, 2].detach().numpy(), human_ratings_tensor_disc_i[:, 2].numpy())
correlation_tenderness, _ = pearsonr(scaled_sims_disc_i[:, 3].detach().numpy(), human_ratings_tensor_disc_i[:, 3].numpy())
correlation_fear, _ = pearsonr(scaled_sims_disc_i[:, 4].detach().numpy(), human_ratings_tensor_disc_i[:, 4].numpy())

#PDisc
correlation_happiness_disc_p, _ = pearsonr(scaled_sims_disc_p[:, 0].detach().numpy(), human_ratings_tensor_disc_p[:, 0].numpy())
correlation_sadness_disc_p, _ = pearsonr(scaled_sims_disc_p[:, 1].detach().numpy(), human_ratings_tensor_disc_p[:, 1].numpy())
correlation_anger_disc_p, _ = pearsonr(scaled_sims_disc_p[:, 2].detach().numpy(), human_ratings_tensor_disc_p[:, 2].numpy())
correlation_tenderness_disc_p, _ = pearsonr(scaled_sims_disc_p[:, 3].detach().numpy(), human_ratings_tensor_disc_p[:, 3].numpy())
correlation_fear_disc_p, _ = pearsonr(scaled_sims_disc_p[:, 4].detach().numpy(), human_ratings_tensor_disc_p[:, 4].numpy())

print("Pearson Correlation Coefficients (between scaled_sims and human_ratings) IDim:")
print(f"  Positive Dimension: {correlation_positive:.4f}")
print(f"  Relaxed Dimension:  {correlation_relaxed:.4f}")
print(f"  Awake Dimension:    {correlation_awake:.4f}\n")

print("Pearson Correlation Coefficients (between scaled_sims and human_ratings) PDim:")
print(f"  Positive Dimension: {correlation_positive_dim_p:.4f}")
print(f"  Relaxed Dimension:  {correlation_relaxed_dim_p:.4f}")
print(f"  Awake Dimension:    {correlation_awake_dim_p:.4f}\n")

print("Pearson Correlation Coefficients (between scaled_sims and human_ratings) IDisc:")
print(f"  Happiness Dimension: {correlation_happiness:.4f}")
print(f"  Sadness Dimension:  {correlation_sadness:.4f}")
print(f"  Anger Dimension:    {correlation_anger:.4f}")
print(f"  Tenderness Dimension:    {correlation_tenderness:.4f}")
print(f"  Fear Dimension:    {correlation_fear:.4f}\n")

print("Pearson Correlation Coefficients (between scaled_sims and human_ratings) PDisc:")
print(f"  Happiness Dimension: {correlation_happiness_disc_p:.4f}")
print(f"  Sadness Dimension:  {correlation_sadness_disc_p:.4f}")
print(f"  Anger Dimension:    {correlation_anger_disc_p:.4f}")
print(f"  Tenderness Dimension:    {correlation_tenderness_disc_p:.4f}")
print(f"  Fear Dimension:    {correlation_fear_disc_p:.4f}\n")



average_correlation = (correlation_positive + correlation_relaxed + correlation_awake) / 3
average_correlation_disc_i = (correlation_happiness + correlation_sadness + correlation_anger + correlation_tenderness + correlation_fear) / 5
average_correlation_disc_p = (correlation_happiness_disc_p + correlation_sadness_disc_p + correlation_anger_disc_p + correlation_tenderness_disc_p + correlation_fear_disc_p) / 5
average_correlation_dim_p = (correlation_positive_dim_p + correlation_relaxed_dim_p + correlation_awake_dim_p) / 3

print(f"  Average Correlation (IDim): {average_correlation:.4f}")
print(f"  Average Correlation (IDisc): {average_correlation_disc_i:.4f}")
print(f"  Average Correlation (PDisc): {average_correlation_disc_p:.4f}")
print(f"  Average Correlation (PDim): {average_correlation_dim_p:.4f}")

Mean Absolute Error (MAE) between scaled_sims and human_ratings (IDim) : 1.6601
Mean Absolute Error (MAE) between scaled_sims and human_ratings (IDisc) : 2.3063
Mean Absolute Error (MAE) between scaled_sims and human_ratings (PDisc) : 2.2473
Mean Absolute Error (MAE) between scaled_sims and human_ratings (PDim) : 1.1906
Pearson Correlation Coefficients (between scaled_sims and human_ratings) IDim:
  Positive Dimension: 0.6344
  Relaxed Dimension:  0.6316
  Awake Dimension:    -0.4555

Pearson Correlation Coefficients (between scaled_sims and human_ratings) PDim:
  Positive Dimension: 0.7236
  Relaxed Dimension:  0.7134
  Awake Dimension:    -0.2655

Pearson Correlation Coefficients (between scaled_sims and human_ratings) IDisc:
  Happiness Dimension: 0.7445
  Sadness Dimension:  -0.2885
  Anger Dimension:    -0.3966
  Tenderness Dimension:    0.5535
  Fear Dimension:    -0.1141

Pearson Correlation Coefficients (between scaled_sims and human_ratings) PDisc:
  Happiness Dimension: 0.734

In [27]:
# Comparison Method 2: Mean Absolute Percentage Error (MAPE)
# MAPE measures the accuracy of a forecasting method in terms of percentage.
# Formula: MAPE = (1/n) * sum(|(Actual - Forecast) / Actual|) * 100%
# On average, what percentage do CLAP's predictions deviate from human ratings?

# Calculate the absolute percentage error for each element
# Since human responses are between 1 and 9, division by zero is not a concern.
absolute_percentage_error = torch.abs((human_ratings_tensor - scaled_sims) / human_ratings_tensor) * 100
absolute_percentage_error_disc_i = torch.abs((human_ratings_tensor_disc_i - scaled_sims_disc_i) / human_ratings_tensor_disc_i) * 100
absolute_percentage_error_disc_p = torch.abs((human_ratings_tensor_disc_p - scaled_sims_disc_p) / human_ratings_tensor_disc_p) * 100
absolute_percentage_error_dim_p = torch.abs((human_ratings_tensor_dim_p - scaled_sims_dim_p) / human_ratings_tensor_dim_p) * 100

# Calculate the mean of these percentage errors
mape = torch.mean(absolute_percentage_error)
mape_disc_i = torch.mean(absolute_percentage_error_disc_i)
mape_disc_p = torch.mean(absolute_percentage_error_disc_p)
mape_dim_p = torch.mean(absolute_percentage_error_dim_p)

print(f"Mean Absolute Percentage Error (MAPE) IDim: {mape:.2f}%")
print(f"Mean Absolute Percentage Error (MAPE) IDisc: {mape_disc_i:.2f}%")
print(f"Mean Absolute Percentage Error (MAPE) PDisc: {mape_disc_p:.2f}%")
print(f"Mean Absolute Percentage Error (MAPE) PDim: {mape_dim_p:.2f}%\n")

# Comparison Method 3: Root Mean Squared Error (RMSE)
# RMSE measures the square root of the average of the squared differences between predicted and actual values.
# It gives a relatively high weight to large errors.
# Formula: RMSE = sqrt(mean((Actual - Forecast)^2))
# How far, on average, are CLAP’s predictions from human ratings—while heavily punishing big errors?

# Calculate the squared differences
squared_differences = (human_ratings_tensor - scaled_sims)**2
squared_differences_disc_i = (human_ratings_tensor_disc_i - scaled_sims_disc_i)**2
squared_differences_disc_p = (human_ratings_tensor_disc_p - scaled_sims_disc_p)**2
squared_differences_dim_p = (human_ratings_tensor_dim_p - scaled_sims_dim_p)**2


# Calculate the mean of the squared differences (Mean Squared Error - MSE)
mse = torch.mean(squared_differences)
mse_disc_i = torch.mean(squared_differences_disc_i)
mse_disc_p = torch.mean(squared_differences_disc_p)
mse_dim_p = torch.mean(squared_differences_dim_p)

# Calculate the square root to get RMSE
rmse = torch.sqrt(mse)
rmse_disc_i = torch.sqrt(mse_disc_i)
rmse_disc_p = torch.sqrt(mse_disc_p)
rmse_dim_p = torch.sqrt(mse_dim_p)

print(f"Root Mean Squared Error (RMSE) IDim: {rmse:.4f}\n")
print(f"Root Mean Squared Error (RMSE) IDisc: {rmse_disc_i:.4f}\n")
print(f"Root Mean Squared Error (RMSE) PDisc: {rmse_disc_p:.4f}\n")
print(f"Root Mean Squared Error (RMSE) PDim: {rmse_dim_p:.4f}\n")

from sklearn.metrics import r2_score

# Comparison Method 4: R-Squared
# Measures how well CLAP's predictions explain the variance in human ratings.
# What percentage of the changes in human ratings can be predicted by CLAP's model? (1=perfect prediction)

# R-squared IDim
print("\nR-squared scores (IDim):")
r2_valence = r2_score(human_ratings_tensor[:, 0].numpy(), scaled_sims[:, 0].detach().numpy())
print("  valence =", r2_valence)
r2_tension = r2_score(human_ratings_tensor[:, 1].numpy(), scaled_sims[:, 1].detach().numpy())
print("  tension =", r2_tension)
r2_energy = r2_score(human_ratings_tensor[:, 2].numpy(), scaled_sims[:, 2].detach().numpy())
print("  energy =", r2_energy)

# R-squared IDisc
print("\nR-squared scores (IDisc):")
r2_happiness_disc_i = r2_score(human_ratings_tensor_disc_i[:, 0].numpy(), scaled_sims_disc_i[:, 0].detach().numpy())
print("  happiness =", r2_happiness_disc_i)
r2_sadness_disc_i = r2_score(human_ratings_tensor_disc_i[:, 1].numpy(), scaled_sims_disc_i[:, 1].detach().numpy())
print("  sadness =", r2_sadness_disc_i)
r2_anger_disc_i = r2_score(human_ratings_tensor_disc_i[:, 2].numpy(), scaled_sims_disc_i[:, 2].detach().numpy())
print("  anger =", r2_anger_disc_i)
r2_tenderness_disc_i = r2_score(human_ratings_tensor_disc_i[:, 3].numpy(), scaled_sims_disc_i[:, 3].detach().numpy())
print("  tenderness =", r2_tenderness_disc_i)
r2_fear_disc_i = r2_score(human_ratings_tensor_disc_i[:, 4].numpy(), scaled_sims_disc_i[:, 4].detach().numpy())
print("  fear =", r2_fear_disc_i)

# R-squared PDim
print("\nR-squared scores (PDim):")
r2_valence_dim_p = r2_score(human_ratings_tensor_dim_p[:, 0].numpy(), scaled_sims_dim_p[:, 0].detach().numpy())
print("  valence =", r2_valence_dim_p)
r2_tension_dim_p = r2_score(human_ratings_tensor_dim_p[:, 1].numpy(), scaled_sims_dim_p[:, 1].detach().numpy())
print("  tension =", r2_tension_dim_p)
r2_energy_dim_p = r2_score(human_ratings_tensor_dim_p[:, 2].numpy(), scaled_sims_dim_p[:, 2].detach().numpy())
print("  energy =", r2_energy_dim_p)

# R-squared PDisc
print("\nR-squared scores (PDisc):")
r2_happiness_disc_p = r2_score(human_ratings_tensor_disc_p[:, 0].numpy(), scaled_sims_disc_p[:, 0].detach().numpy())
print("  happiness =", r2_happiness_disc_p)
r2_sadness_disc_p = r2_score(human_ratings_tensor_disc_p[:, 1].numpy(), scaled_sims_disc_p[:, 1].detach().numpy())
print("  sadness =", r2_sadness_disc_p)
r2_anger_disc_p = r2_score(human_ratings_tensor_disc_p[:, 2].numpy(), scaled_sims_disc_p[:, 2].detach().numpy())
print("  anger =", r2_anger_disc_p)
r2_tenderness_disc_p = r2_score(human_ratings_tensor_disc_p[:, 3].numpy(), scaled_sims_disc_p[:, 3].detach().numpy())
print("  tenderness =", r2_tenderness_disc_p)
r2_fear_disc_p = r2_score(human_ratings_tensor_disc_p[:, 4].numpy(), scaled_sims_disc_p[:, 4].detach().numpy())
print("  fear =", r2_fear_disc_p)

average_r2 = (r2_valence + r2_energy + r2_tension) /3
average_r2_dim_p = (r2_valence_dim_p + r2_energy_dim_p + r2_tension_dim_p) /3
average_r2_disc_i = (r2_happiness_disc_i + r2_sadness_disc_i + r2_anger_disc_i + r2_tenderness_disc_i + r2_fear_disc_i) / 5
average_r2_disc_p = (r2_happiness_disc_p + r2_sadness_disc_p + r2_anger_disc_p + r2_tenderness_disc_p + r2_fear_disc_p) / 5

print("\nAverage R-squared scores:")
print("IDim: ", average_r2)
print("PDim: ", average_r2_dim_p)
print("IDisc: ", average_r2_disc_i)
print("PDisc: ", average_r2_disc_p)


Mean Absolute Percentage Error (MAPE) IDim: 30.64%
Mean Absolute Percentage Error (MAPE) IDisc: 103.66%
Mean Absolute Percentage Error (MAPE) PDisc: 91.63%
Mean Absolute Percentage Error (MAPE) PDim: 23.18%

Root Mean Squared Error (RMSE) IDim: 2.2099

Root Mean Squared Error (RMSE) IDisc: 2.7769

Root Mean Squared Error (RMSE) PDisc: 2.7365

Root Mean Squared Error (RMSE) PDim: 1.5612


R-squared scores (IDim):
  valence = -0.28426051139831543
  tension = 0.16534847021102905
  energy = -31.580467224121094

R-squared scores (IDisc):
  happiness = -13.728118896484375
  sadness = -15.520496368408203
  anger = -4.017887592315674
  tenderness = -6.920380592346191
  fear = -14.645764350891113

R-squared scores (PDim):
  valence = 0.2767387628555298
  tension = 0.03492605686187744
  energy = -3.6054892539978027

R-squared scores (PDisc):
  happiness = -6.970876693725586
  sadness = -5.412888526916504
  anger = -4.02385139465332
  tenderness = -1.9018855094909668
  fear = -7.021528244018555

