In [1]:
import os
import zipfile

import librosa
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr # For Pearson correlation
from sklearn.metrics import r2_score # For R-squared scores
from transformers import ClapModel, AutoProcessor

In [2]:
# Hugging Face Hub id of the CLAP checkpoint that is loaded in the next cell.
# The alternatives are kept commented out; leave exactly one line active.
#CLAPmodel = "laion/larger_clap_general"
#CLAPmodel = "laion/larger_clap_music_and_speech"
CLAPmodel = "laion/larger_clap_music"
#CLAPmodel = "laion/clap-htsat-fused"
#CLAPmodel = "laion/clap-htsat-unfused"

In [3]:
# Load CLAP model + processor
# Both objects are created from the same checkpoint id (CLAPmodel), so the
# preprocessing configuration belongs to the weights that are loaded.
model = ClapModel.from_pretrained(CLAPmodel)
processor = AutoProcessor.from_pretrained(CLAPmodel)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/776M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [4]:
# Count every element of every parameter tensor registered on the model.
n_params = sum(p.numel() for p in model.parameters())
print("total number of CLAP's parameters:", n_params)

total number of CLAP's parameters: 193913882


In [5]:
# Bytes held by parameters and by registered buffers:
# element count multiplied by the per-element size of each tensor.
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())

# Convert bytes to units of 1024**2 bytes for display.
size_all_mb = (param_size + buffer_size) / 1024**2
print(f"CLAP model size: {size_all_mb:.3f}MB")

CLAP model size: 740.294MB


In [6]:
# Unpack the uploaded zip archives into folders so the later cells can read
# the stimuli and the response CSVs from disk.

# Archives to unpack; paths are relative to the current working directory.
zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

# Directory that receives the extracted files (change to extract elsewhere).
extract_dir = "/content/"

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract each archive; a missing archive is reported and skipped instead of
# aborting the whole cell.
for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        print(f"Error: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


# Process audio

In [7]:
# Load every .wav file of the stimulus folder, in sorted filename order,
# passing sr=48000 to librosa.load (the same rate is later given to the processor).
# NOTE(review): the order here is lexicographic by filename; the ratings are
# later matched by row position / StimNo - confirm both orders agree
# (e.g. "10.wav" sorts before "2.wav").
stimuli_path = "/content/Exp1/Stimuli/"

audio_stimuli = [
    librosa.load(os.path.join(stimuli_path, name), sr=48000)[0]  # keep the waveform, drop the returned rate
    for name in sorted(os.listdir(stimuli_path))
    if name.endswith(".wav")
]

In [8]:
# Turn the raw waveforms into model inputs (padded batch of PyTorch tensors).
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)

# Inference only: disable autograd so no computation graph is recorded for the
# forward pass. The resulting embeddings carry no grad_fn; the later
# `.detach()` calls remain valid on such tensors.
with torch.no_grad():
    audio_embeddings = model.get_audio_features(**inputs)

In [9]:
# Show the shape of the audio embedding matrix computed in the previous cell.
print(audio_embeddings.shape)

torch.Size([59, 512])


# Process text

In [10]:
# Emotion vocabularies: discrete emotion categories and dimensional rating scales.
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]
dimensional_tags = ["positive", "relaxed", "awake", "like"]

# Each tag is embedded in two sentence frames: a "perceived" one and an
# "induced" one.
# NOTE(review): some combinations read oddly ("... makes me feel like",
# "... perceive this sound as happiness") - confirm these prompts are intended.
discrete_captions_perceived = [f"I perceive this sound as {tag}" for tag in discrete_tags]
print(discrete_captions_perceived)
discrete_captions_induced = [f"This sound makes me feel {tag}" for tag in discrete_tags]
print(discrete_captions_induced)

dimensional_captions_perceived = [f"I perceive this sound as {tag}" for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = [f"This sound makes me feel {tag}" for tag in dimensional_tags]
print(dimensional_captions_induced)

['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake', 'I perceive this sound as like']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake', 'This sound makes me feel like']


In [11]:
# Every caption of the notebook in one list (discrete + dimensional, both frames).
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced

# Inference only: disable autograd so no computation graph is recorded for the
# text forward passes. The embeddings then carry no grad_fn; later `.detach()`
# calls remain valid on such tensors.
with torch.no_grad():
    #For Dimensional Induced (IDim)
    Itag_inputs = processor(text=dimensional_captions_induced, return_tensors="pt", padding=True)
    Itag_embeds = model.get_text_features(**Itag_inputs)

    #For Dimensional Perceived (PDim)
    Ptag_inputs = processor(text=dimensional_captions_perceived, return_tensors="pt", padding=True)
    Ptag_embeds = model.get_text_features(**Ptag_inputs)

    #For all of Dimensional: element-wise mean of the induced and perceived embeddings
    # NOTE(review): if get_text_features returns unit-norm vectors, this mean is
    # generally not unit-norm - confirm whether re-normalisation is wanted.
    tag_embeds = (Itag_embeds + Ptag_embeds)/2

# Generate outputs

In [12]:
# Audio-to-caption similarity matrices: one row per audio embedding,
# one column per dimensional caption.

# Dimensional Induced (IDim)
print("IDim")
Isims = audio_embeddings @ Itag_embeds.T
print(Isims.shape)

# Dimensional Perceived (PDim)
print("PDim")
Psims = audio_embeddings @ Ptag_embeds.T
print(Psims.shape)

# All of Dimensional (averaged induced/perceived caption embeddings)
print("Dim")
sims = audio_embeddings @ tag_embeds.T
print(sims.shape)

IDim
torch.Size([59, 4])
PDim
torch.Size([59, 4])
Dim
torch.Size([59, 4])


## Load csv files and extract related columns

In [14]:
def GetData(path, sims):
  """Average the per-file dimensional ratings and rescale model similarities.

  Every ``.csv`` file in ``path`` whose whitespace-stripped header contains the
  columns ``positive``, ``relaxed``, ``awake`` and ``like`` is read. The four
  columns are averaged across files, row position by row position, and ``sims``
  is min-max rescaled to the range [1, 9] using its global minimum and maximum.

  NOTE(review): rows are matched purely by position (row i of every CSV and
  row i of ``sims``). This assumes every file lists the stimuli in the same
  order as the audio embeddings - confirm against the data files.

  Args:
    path: Directory containing the response CSV files.
    sims: torch tensor of similarities to rescale.

  Returns:
    Tuple ``(scaled_sims, human_ratings_tensor)``: ``sims`` rescaled to [1, 9]
    and a float32 tensor of shape (rows, 4) holding the mean ratings in the
    column order positive, relaxed, awake, like.

  Raises:
    ValueError: if no usable CSV file is found, if a file has fewer rows than
      the first usable file, or if ``sims`` is constant (scaling undefined).
  """
  required_columns = ['positive', 'relaxed', 'awake', 'like']

  # Collect every CSV that provides all required columns; sorted() only makes
  # the read order deterministic.
  all_dfs = []
  for file in sorted(os.listdir(path)):
    if not file.endswith(".csv"):
      continue
    df = pd.read_csv(os.path.join(path, file))
    df.columns = df.columns.str.strip()
    if all(col in df.columns for col in required_columns):
      all_dfs.append(df)

  if not all_dfs:
    raise ValueError(
        f"No CSV file with columns {required_columns} found in {path!r}")

  # The first usable file defines how many rows are averaged; extra rows in
  # later files are ignored, shorter files are an error.
  num_rows = all_dfs[0].shape[0]
  if any(df.shape[0] < num_rows for df in all_dfs):
    raise ValueError(
        f"A CSV file in {path!r} has fewer than {num_rows} rows; "
        "cannot average by row position")

  # Stack to (files, rows, 4) and average over the file axis in one step.
  stacked = np.stack([
      df[required_columns].iloc[:num_rows].to_numpy(dtype=float)
      for df in all_dfs
  ])
  mean_ratings = stacked.mean(axis=0)

  print("\n--- Mean Vectors (Mean across CSVs for each row position) ---")
  for col in required_columns:
    print(f"Length of {col}_vector: {mean_ratings.shape[0]}")

  # Find the min and max values in the current sims tensor
  old_min = sims.min()
  old_max = sims.max()
  if old_max == old_min:
    raise ValueError("sims is constant; min-max scaling is undefined")

  # Define the new desired range
  new_min = 1.0
  new_max = 9.0

  # Apply the min-max scaling formula
  scaled_sims = ((sims - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

  print(f"Scaled sims shape: {scaled_sims.shape}")
  print(f"Scaled sims (first 10 rows):\n{scaled_sims[:10]}\n")
  print(f"Scaled sims min value: {scaled_sims.min():.4f}")
  print(f"Scaled sims max value: {scaled_sims.max():.4f}\n")

  # Shape (rows, 4): one row per row position, one column per rating scale.
  human_ratings_tensor = torch.tensor(mean_ratings, dtype=torch.float32)

  print(f"Human ratings tensor shape: {human_ratings_tensor.shape}")
  print(f"Human ratings tensor (first 10 rows):\n{human_ratings_tensor[:10]}\n")

  return scaled_sims, human_ratings_tensor

# Pair each similarity matrix with the mean ratings read from the matching
# response folder: Isims with the IDim folder, Psims with the PDim folder.
Iscaled_sims, Ihuman_ratings_tensor = GetData('/content/Exp1/Data/IDim', Isims)
Pscaled_sims, Phuman_ratings_tensor = GetData('/content/Exp1/Data/PDim', Psims)


--- Mean Vectors (Mean across CSVs for each row position) ---
Length of positive_vector: 59
Length of relaxed_vector: 59
Length of awake_vector: 59
Length of like_vector: 59
Scaled sims shape: torch.Size([59, 4])
Scaled sims (first 5 rows):
tensor([[7.1647, 7.0362, 6.7361, 6.7575],
        [5.4812, 5.3205, 4.9816, 5.0540],
        [3.6491, 3.4734, 3.0879, 3.2147],
        [3.1602, 3.1239, 2.7137, 2.7706],
        [7.9248, 7.7498, 7.4512, 7.4594],
        [8.4446, 8.2234, 7.9175, 7.9433],
        [2.8712, 2.7444, 2.4006, 2.3843],
        [4.6985, 4.6478, 4.2676, 4.2847],
        [6.2722, 6.0907, 5.6706, 5.8114],
        [5.0022, 4.9391, 4.5388, 4.5949]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 4])
Human ratings tensor (first 5 rows):
tensor([[4.4872, 4.8798, 4.5185, 4.6463],
        [4.7682, 5.3057, 4.7414, 5.0326],
        [5.3105, 5.7312, 5.0760, 5.3854],
        [5.2582, 5.2651, 5.4488, 5.249

In [15]:
# Load pre-aggregated mean vectors from aggregate_data.csv (this already
# includes mean IDim and PDim) and sort by StimNo to match audio alignment.
# NOTE(review): the audio clips are ordered by sorted filename; confirm that
# this equals ascending StimNo.
df = (
    pd.read_csv('/content/Exp1/Data/aggregate_data.csv')
    .sort_values("StimNo")
    .reset_index(drop=True)
)

# Extract the mean dimensional responses
Dim_responses = {
    'positive_vector': df['positive'].tolist(),
    'relaxed_vector': df['relaxed'].tolist(),
    'awake_vector': df['awake'].tolist(),
    'like_vector': df['like'].tolist()
}

print("\n--- Mean Vectors (from aggregate_data.csv) ---")
for vector_name, values in Dim_responses.items():
    print(f"Length of {vector_name}: {len(values)}")

# Find the min and max values in the current sims tensor
old_min = sims.min()
old_max = sims.max()

# Define the new desired range
new_min = 1.0
new_max = 9.0

# Apply the min-max scaling formula (global min/max over all entries of sims)
scaled_sims = ((sims - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

print(f"Scaled sims shape: {scaled_sims.shape}")
print(f"Scaled sims (first 10 rows):\n{scaled_sims[:10]}\n")
print(f"Scaled sims min value: {scaled_sims.min():.4f}")
print(f"Scaled sims max value: {scaled_sims.max():.4f}\n")

human_ratings_tensor = torch.tensor([
    Dim_responses['positive_vector'],
    Dim_responses['relaxed_vector'],
    Dim_responses['awake_vector'],
    Dim_responses['like_vector']
], dtype=torch.float32).T # Transpose: one row per stimulus, one column per scale

print(f"Human ratings tensor shape: {human_ratings_tensor.shape}")
print(f"Human ratings tensor (first 10 rows):\n{human_ratings_tensor[:10]}\n")

# The evaluation compares both tensors element-wise, so they must line up.
assert human_ratings_tensor.shape == scaled_sims.shape, (
    f"ratings {tuple(human_ratings_tensor.shape)} and similarities "
    f"{tuple(scaled_sims.shape)} do not match"
)


--- Mean Vectors (from aggregate_data.csv) ---
Length of positive_vector: 59
Length of relaxed_vector: 59
Length of awake_vector: 59
Length of like_vector: 59
Scaled sims shape: torch.Size([59, 4])
Scaled sims (first 5 rows):
tensor([[7.1851, 7.0450, 6.7276, 6.7696],
        [5.5221, 5.3492, 4.9911, 5.0861],
        [3.7177, 3.5312, 3.1281, 3.2758],
        [3.2041, 3.1543, 2.7241, 2.8042],
        [7.8993, 7.7109, 7.3940, 7.4241],
        [8.4120, 8.1778, 7.8531, 7.9012],
        [2.8822, 2.7414, 2.3805, 2.3856],
        [4.7003, 4.6368, 4.2365, 4.2767],
        [6.3252, 6.1280, 5.6887, 5.8532],
        [5.0145, 4.9376, 4.5174, 4.5967]], grad_fn=<SliceBackward0>)

Scaled sims min value: 1.0000
Scaled sims max value: 9.0000

Human ratings tensor shape: torch.Size([59, 4])
Human ratings tensor (first 5 rows):
tensor([[3.9920, 4.7386, 3.9793, 4.7069],
        [4.4554, 5.1961, 3.9989, 5.0853],
        [4.9997, 5.6211, 4.6525, 5.2916],
        [5.2295, 5.3023, 5.1811, 5.1007],
        [3.

# Evaluate

In [16]:
def format4(x):
    """Return ``x`` converted to float and rendered with four decimal places."""
    return format(float(x), ".4f")

def format2(x):
    """Return ``x`` converted to float and rendered with two decimal places."""
    return format(float(x), ".2f")

def format_tuple(t):
    """Render an iterable of numbers as "(a, b, ...)" with four decimals each."""
    rendered = [format4(value) for value in t]
    return "({})".format(", ".join(rendered))

def DisplayStats(Title, scaled_sims, human_ratings_tensor):
  """Display a table of agreement statistics between model and human ratings.

  The table contains the overall MAE, MAPE and RMSE, plus the Pearson
  correlation and the R-squared score of each of the first four columns
  together with their averages.

  NOTE(review): the row labels say "Valence, Tension, Energy, Like" while the
  columns are built elsewhere from 'positive', 'relaxed', 'awake', 'like' -
  confirm the intended naming.

  Args:
    Title: Header of the first table column.
    scaled_sims: Model predictions, indexed as [row, column]; four columns are used.
    human_ratings_tensor: Reference ratings with the same layout.
  """
  # Work on a graph-free copy; the numeric values are unchanged.
  predictions = scaled_sims.detach()
  residuals = human_ratings_tensor - predictions

  # Mean Absolute Error: average magnitude of the errors, ignoring direction.
  mae = residuals.abs().mean()

  # Mean Absolute Percentage Error: mean(|(actual - forecast) / actual|) * 100.
  # Assumes the human ratings are never zero (ratings on a 1-9 scale) - TODO confirm.
  mape = (residuals / human_ratings_tensor).abs().mul(100).mean()

  # Root Mean Squared Error: sqrt(mean((actual - forecast)^2)); weights large errors more.
  rmse = residuals.pow(2).mean().sqrt()

  # Per-column statistics.
  # Pearson r measures the linear relationship between prediction and rating.
  # R-squared = 1 - (sum of squared residuals / total sum of squares):
  #   1.0 is a perfect prediction, 0.0 is as good as predicting the mean of
  #   the actual values, negative is worse than predicting that mean.
  predicted = predictions.numpy()
  observed = human_ratings_tensor.numpy()
  correlation_scores = []
  r2_scores = []
  for column in range(4):
    coefficient, _ = pearsonr(predicted[:, column], observed[:, column])
    correlation_scores.append(coefficient)
    r2_scores.append(r2_score(observed[:, column], predicted[:, column]))

  average_correlation = np.mean(correlation_scores)
  average_r2 = np.mean(r2_scores)

  Disc_rows = [
    ("MAE", format4(mae)),
    ("Pearson Correlation Coefficient (Valence, Tension, Energy, Like)",
     format_tuple(correlation_scores)),
    ("Average Correlation", format4(average_correlation)),
    ("MAPE", f"{format2(mape)}%"),
    ("RMSE", format4(rmse)),
    ("R-Squared Scores (Valence, Tension, Energy, Like)",
     format_tuple(r2_scores)),
    ("R-Squared Average", format4(average_r2)),
  ]
  Disc_df = pd.DataFrame(Disc_rows, columns=[Title, "Statistics"])

  # Display with column borders, no row index
  border_style = [{"selector": "td, th", "props": [("border", "1px solid gray")]}]
  display(Disc_df.style.hide(axis="index").set_table_styles(border_style))


# One statistics table per condition, separated by a horizontal rule.
stat_tables = [
    ("Dimensional Induced (IDim)", Iscaled_sims, Ihuman_ratings_tensor),
    ("Dimensional Perceived (PDim)", Pscaled_sims, Phuman_ratings_tensor),
    ("Dimensional", scaled_sims, human_ratings_tensor),
]
for table_index, (table_title, predictions, ratings) in enumerate(stat_tables):
    if table_index > 0:
        print("\n" + "-"*80 + "\n")
    DisplayStats(table_title, predictions, ratings)

Dimensional Induced (IDim),Statistics
MAE,1.7650
"Pearson Correlation Coefficient (Valence, Tension, Energy, Like)","(0.4015, 0.3448, 0.0842, 0.3975)"
Average Correlation,0.3070
MAPE,40.23%
RMSE,2.1492
"R-Squared Scores (Valence, Tension, Energy, Like)","(-2.7065, -2.3388, -11.2826, -1.7158)"
R-Squared Average,-4.5109



--------------------------------------------------------------------------------



Dimensional Perceived (PDim),Statistics
MAE,1.7517
"Pearson Correlation Coefficient (Valence, Tension, Energy, Like)","(0.3893, 0.3634, 0.0251, 0.4696)"
Average Correlation,0.3118
MAPE,39.53%
RMSE,2.1568
"R-Squared Scores (Valence, Tension, Energy, Like)","(-2.3502, -2.4752, -3.8102, -1.2972)"
R-Squared Average,-2.4832



--------------------------------------------------------------------------------



Dimensional,Statistics
MAE,1.7463
"Pearson Correlation Coefficient (Valence, Tension, Energy, Like)","(0.4057, 0.3602, 0.0481, 0.4383)"
Average Correlation,0.3131
MAPE,39.17%
RMSE,2.1349
"R-Squared Scores (Valence, Tension, Energy, Like)","(-2.6443, -2.3528, -6.2043, -1.5271)"
R-Squared Average,-3.1821
