In [1]:
# NOTE: change this to the path in your setup.
# Keep the trailing slash: later cells build sub-paths by plain string
# concatenation (e.g. korsmit_exp1_path + "Stimuli/").
korsmit_exp1_path = "../../data/Korsmit/Exp1/"

In [2]:
import torch
import librosa
import os
import pandas as pd
import numpy as np

In [3]:
from muq import MuQMuLan

# from_pretrained downloads the checkpoint from the Hugging Face hub on first use.
device = 'cpu'

# Load the model, move it to the chosen device and switch it to eval mode.
mulan = MuQMuLan.from_pretrained("OpenMuQ/MuQ-MuLan-large").to(device).eval()

  from .autonotebook import tqdm as notebook_tqdm
  WeightNorm.apply(module, name, dim)


# Process audio

In [4]:
stimuli_path = korsmit_exp1_path+"Stimuli/"

# Sorted so that the embedding order is deterministic across runs/platforms.
wav_names = [name for name in sorted(os.listdir(stimuli_path)) if name.endswith(".wav")]

embeddings = []
for wav_name in wav_names:
    # Load the stimulus at 24 kHz; the returned sample rate is not needed.
    waveform, _ = librosa.load(os.path.join(stimuli_path, wav_name), sr=24000)
    # Add a leading batch axis before handing the waveform to the model.
    wav_batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        embeddings.append(mulan(wavs=wav_batch))

In [None]:
# Combine the per-file embeddings along a new leading axis (one entry per stimulus).
audio_embeddings = torch.stack(embeddings, dim=0)

In [None]:
# Remove axis 1 when it has size 1 (the per-file batch axis kept by the stack above).
audio_embeddings = torch.squeeze(audio_embeddings, dim=1)

In [None]:
# Sanity check of the embedding matrix shape (the recorded run shows 59 x 512).
audio_embeddings.shape

torch.Size([59, 512])

# Process text

In [8]:
# Two caption templates: one for the emotion perceived in the sound,
# one for the emotion the sound induces in the listener.
perceived_prefix = "I perceive this sound as "
induced_prefix = "This sound makes me feel "

# Discrete emotion categories.
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]

discrete_captions_perceived = [f"{perceived_prefix}{tag}" for tag in discrete_tags]
print(discrete_captions_perceived)
discrete_captions_induced = [f"{induced_prefix}{tag}" for tag in discrete_tags]
print(discrete_captions_induced)

# Dimensional descriptors.
dimensional_tags = ["positive", "relaxed", "awake"]

dimensional_captions_perceived = [f"{perceived_prefix}{tag}" for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = [f"{induced_prefix}{tag}" for tag in dimensional_tags]
print(dimensional_captions_induced)

['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [9]:
# Embed the induced-emotion dimensional captions with the text branch of the model.
# Gradients are not needed for inference, so tracking is disabled.
# Column order of the result follows dimensional_tags: positive, relaxed, awake.
with torch.no_grad():
    tag_embeds = mulan(texts = dimensional_captions_induced)

# Generate outputs

In [None]:
# Audio-vs-caption similarity: one row per stimulus, one column per caption
# (the recorded run prints torch.Size([59, 3])).
sims = mulan.calc_similarity(audio_embeddings, tag_embeds)
print(sims.shape)

torch.Size([59, 3])


In [17]:
# Show the raw (unscaled) similarity scores for inspection.
sims

tensor([[-1.5795e-02,  7.1918e-02,  1.2883e-01],
        [-8.3636e-04,  6.9027e-02,  1.4914e-01],
        [ 8.3426e-03,  3.5080e-02,  1.0751e-01],
        [-1.9663e-02,  2.7963e-02,  1.1313e-01],
        [ 4.7456e-02,  5.8541e-02,  1.6332e-01],
        [ 6.2177e-02,  1.0858e-01,  2.0441e-01],
        [-1.6177e-04,  3.9973e-02,  1.3816e-01],
        [ 6.9783e-03,  1.0594e-01,  1.5483e-01],
        [ 4.8948e-03,  1.2536e-01,  1.6362e-01],
        [-2.9681e-02,  8.4724e-02,  1.1794e-01],
        [-2.3657e-02,  4.3358e-02,  1.0542e-01],
        [ 2.1669e-02,  7.7448e-02,  1.0986e-01],
        [-6.0771e-02,  8.9668e-02,  8.9832e-02],
        [ 6.2231e-02,  1.5983e-01,  1.8535e-01],
        [-7.3319e-02,  4.9480e-02,  1.3519e-01],
        [-7.3915e-02,  3.2677e-03,  1.1919e-01],
        [-2.0476e-02,  1.2880e-02,  1.0707e-01],
        [-4.2250e-02,  1.3218e-02,  9.9971e-02],
        [-8.6838e-02,  1.0584e-02,  1.2069e-01],
        [-1.5947e-02,  9.2721e-02,  1.6563e-01],
        [-8.4143e-03

## Load CSV files and extract the relevant columns

In [15]:
IDim_path = korsmit_exp1_path+"Data/IDim/"
required_columns = ['positive', 'relaxed', 'awake']

# Read every CSV in the folder that provides all three rating columns.
# sorted() makes the read order independent of the filesystem's listing order.
all_dfs = []
for file in sorted(os.listdir(IDim_path)):
    if not file.endswith(".csv"):
        continue
    df = pd.read_csv(os.path.join(IDim_path, file))
    df.columns = df.columns.str.strip()  # headers may carry stray whitespace
    if all(col in df.columns for col in required_columns):
        all_dfs.append(df)

# Fail with a clear message instead of an IndexError on all_dfs[0].
if not all_dfs:
    raise FileNotFoundError(
        f"No CSV file with columns {required_columns} found in {IDim_path!r}"
    )

# Rows are averaged by position, so every file must have the same number of rows;
# otherwise the means would silently mix or drop rows.
row_counts = sorted({df.shape[0] for df in all_dfs})
if len(row_counts) != 1:
    raise ValueError(
        f"CSV files in {IDim_path!r} have differing row counts: {row_counts}"
    )
num_rows = all_dfs[0].shape[0]

# NOTE(review): averaging by row position assumes every CSV lists the stimuli in the
# same order, and that this order matches the sorted .wav listing used for the audio
# embeddings -- confirm against the data files.

# Stack to (n_files, n_rows, 3) and average over the file axis.
# NaN values are not skipped: a NaN in any file makes that row's mean NaN.
ratings_stack = np.stack([df[required_columns].to_numpy(dtype=float) for df in all_dfs])
mean_ratings = ratings_stack.mean(axis=0)  # (n_rows, 3)

mean_positive_vector, mean_relaxed_vector, mean_awake_vector = mean_ratings.T.tolist()

IDim_responses = {
    'positive_vector': mean_positive_vector,
    'relaxed_vector': mean_relaxed_vector,
    'awake_vector': mean_awake_vector
}

print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
print(f"Length of positive_vector: {len(IDim_responses['positive_vector'])}")
print(f"Length of relaxed_vector: {len(IDim_responses['relaxed_vector'])}")
print(f"Length of awake_vector: {len(IDim_responses['awake_vector'])}")


--- Mean Vectors (Mean across CSVs for each row position) ---
Length of positive_vector: 59
Length of relaxed_vector: 59
Length of awake_vector: 59


# Evaluate

In [16]:
# Find the min and max values in the current sims tensor
old_min = sims.min()
old_max = sims.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula
scaled_sims = ((sims - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

print(f"Scaled sims shape: {scaled_sims.shape}")
print(f"Scaled sims (first 5 rows):\n{scaled_sims[:10]}\n")
print(f"Scaled sims min value: {scaled_sims.min():.4f}")
print(f"Scaled sims max value: {scaled_sims.max():.4f}\n")

Scaled sims shape: torch.Size([59, 3])
Scaled sims (first 5 rows):
tensor([[3.3413, 5.5953, 7.0578],
        [3.7257, 5.5210, 7.5796],
        [3.9616, 4.6487, 6.5100],
        [3.2420, 4.4658, 6.6544],
        [4.9667, 5.2516, 7.9440],
        [5.3450, 6.5375, 9.0000],
        [3.7431, 4.7744, 7.2974],
        [3.9266, 6.4695, 7.7259],
        [3.8730, 6.9686, 7.9516],
        [2.9845, 5.9244, 6.7778]])

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000



In [18]:
# Build a (num_stimuli, 3) tensor of mean human ratings.
# Column order (positive, relaxed, awake) matches the caption order used for sims.
rating_keys = ('positive_vector', 'relaxed_vector', 'awake_vector')
human_ratings_tensor = torch.tensor(
    [IDim_responses[key] for key in rating_keys],
    dtype=torch.float32,
).T  # transpose from (3, num_stimuli) to (num_stimuli, 3)

# The label is derived from the slice size so the two cannot drift apart.
preview_rows = 10
print(f"Human ratings tensor shape: {human_ratings_tensor.shape}")
print(f"Human ratings tensor (first {preview_rows} rows):\n{human_ratings_tensor[:preview_rows]}\n")

Human ratings tensor shape: torch.Size([59, 3])
Human ratings tensor (first 5 rows):
tensor([[4.4872, 4.8798, 4.5185],
        [4.7682, 5.3057, 4.7414],
        [5.3105, 5.7312, 5.0760],
        [5.2582, 5.2651, 5.4488],
        [3.8897, 3.9540, 5.3549],
        [4.3197, 3.8900, 5.5089],
        [4.9825, 5.1875, 5.5354],
        [5.0934, 4.8657, 5.8208],
        [4.7605, 4.8502, 5.4158],
        [5.2577, 4.8688, 5.7003]])



In [19]:
from scipy.stats import pearsonr  # Pearson correlation
from sklearn.metrics import r2_score

# Column order shared by scaled_sims and human_ratings_tensor.
dimension_names = ["positive", "relaxed", "awake"]

# Comparison Method 1: Mean Absolute Error (MAE)
# Average magnitude of the errors, ignoring their direction.
mae = torch.mean(torch.abs(scaled_sims - human_ratings_tensor))
print(f"Mean Absolute Error (MAE) between scaled_sims and human_ratings: {mae:.4f}\n")

# Comparison Method 2: Mean Absolute Percentage Error (MAPE)
# MAPE = (1/n) * sum(|(Actual - Forecast) / Actual|) * 100%
# Since human responses are between 1 and 9, division by zero is not a concern.
absolute_percentage_error = torch.abs((human_ratings_tensor - scaled_sims) / human_ratings_tensor) * 100
mape = torch.mean(absolute_percentage_error)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\n")

# Comparison Method 3: Root Mean Squared Error (RMSE)
# RMSE = sqrt(mean((Actual - Forecast)^2)); gives relatively high weight to large errors.
squared_differences = (human_ratings_tensor - scaled_sims)**2
mse = torch.mean(squared_differences)
rmse = torch.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")

# NumPy views used by the per-dimension metrics below.
predicted = scaled_sims.detach().numpy()
observed = human_ratings_tensor.numpy()

# Comparison Method 4: Pearson Correlation Coefficient (per column)
# Measures the linear relationship between predictions and ratings for each dimension.
# Unlike MAE/MAPE/RMSE/R-squared, it is unaffected by the positive linear rescaling
# applied to obtain scaled_sims.
correlations = []
for col in range(len(dimension_names)):
    r_value, _ = pearsonr(predicted[:, col], observed[:, col])
    correlations.append(r_value)
correlation_positive, correlation_relaxed, correlation_awake = correlations

print("Pearson Correlation Coefficients (between scaled_sims and human_ratings):")
for name, r_value in zip(dimension_names, correlations):
    label = f"{name.capitalize()} Dimension:"
    print(f"  {label:<19} {r_value:.4f}")

average_correlation = sum(correlations) / len(correlations)
print(f"  Average Correlation: {average_correlation:.4f}")

# R-squared (per column), labelled with the same dimension names as the columns.
# The previous valence/tension/energy labels did not match the rated dimensions.
print("\nR-squared scores:")
r2_scores = [
    r2_score(observed[:, col], predicted[:, col])
    for col in range(len(dimension_names))
]
for name, r2_value in zip(dimension_names, r2_scores):
    print(f"  {name} = {r2_value}")

# Previous variable names are kept so any later cell that reads them still works.
r2_valence, r2_tension, r2_energy = r2_scores


Mean Absolute Error (MAE) between scaled_sims and human_ratings: 1.5578

Mean Absolute Percentage Error (MAPE): 34.77%

Root Mean Squared Error (RMSE): 1.9616

Pearson Correlation Coefficients (between scaled_sims and human_ratings):
  Positive Dimension: 0.2555
  Relaxed Dimension:  -0.3114
  Awake Dimension:    0.2678
  Average Correlation: 0.0706

R-squared scores:
  valence = -2.250750780105591
  tension = -2.301231861114502
  energy = -5.558738708496094
