In [2]:
from transformers import ClapModel, AutoProcessor
import torch
import librosa
import os
import pandas as pd
import numpy as np

In [3]:
# Load CLAP model + processor
model = ClapModel.from_pretrained("laion/larger_clap_general")
processor = AutoProcessor.from_pretrained("laion/larger_clap_general")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/776M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [4]:
print('total number of CLAP\'s parameters:', sum(p.numel() for p in model.parameters()))

total number of CLAP's parameters: 193913882


In [5]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('CLAP model size: {:.3f}MB'.format(size_all_mb))

CLAP model size: 740.294MB


In [6]:
#This section is to convert the uploaded zip files into folders
import zipfile

# Define the paths to your zip files
zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

# Define the directory where you want to extract the files
extract_dir = "/content/" # You can change this if you want to extract elsewhere

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract each zip file
for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        print(f"Erro r: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


# Process audio

In [7]:
audio_stimuli = []
stimuli_path = "/content/Exp2/Stimuli/"

for file in sorted(os.listdir(stimuli_path)):
    if file.endswith(".wav"):
        wav_path = os.path.join(stimuli_path, file)
        audio, sample_rate = librosa.load(wav_path, sr=48000)
        audio_stimuli.append(audio)

In [8]:
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)
audio_embeddings = model.get_audio_features(**inputs)

In [9]:
print(audio_embeddings.shape)

torch.Size([32, 512])


# Process text

In [10]:
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]

discrete_captions_perceived = ["I perceive this sound as " + tag for tag in discrete_tags]
print(discrete_captions_perceived)
discrete_captions_induced = ["This sound makes me feel " + tag for tag in discrete_tags]
print(discrete_captions_induced)

dimensional_tags = ["positive", "relaxed", "awake"]

dimensional_captions_perceived = ["I perceive this sound as " + tag for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = ["This sound makes me feel " + tag for tag in dimensional_tags]
print(dimensional_captions_induced)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [11]:
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced


# NOTE: currently using only dimensional_captions_induced
inputs_induced = processor(text=discrete_captions_induced, return_tensors="pt", padding=True)
inputs_perceived = processor(text=discrete_captions_perceived, return_tensors="pt", padding=True)

induced_embeddings = model.get_text_features(**inputs_induced)
perceived_embeddings = model.get_text_features(**inputs_perceived)

tag_embeds = (induced_embeddings + perceived_embeddings)/2

## Load csv files and extract related columns

In [12]:
IDim_path = '/content/Exp2/Data/Disc'
IDim_response_dfs = []

for file in os.listdir(IDim_path):
    if file.endswith(".csv"):
        file_path = os.path.join(IDim_path, file)
        try:
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            df.columns = df.columns.str.strip()
            df["happiness"] = (df["Ihappiness"] + df["Phappiness"]) / 2
            df["sadness"] = (df["Isadness"] + df["Psadness"]) / 2
            df["anger"] = (df["Ianger"] + df["Panger"]) / 2
            df["tenderness"] = (df["Itenderness"] + df["Ptenderness"]) / 2
            df["fear"] = (df["Ifear"] + df["Pfear"]) / 2

            required_cols = ["happiness", "sadness", "anger", "tenderness", "fear"]
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDim_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDim_response_dfs:
    master_human_responses_df = pd.concat(IDim_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df}\n")
else:
    raise ValueError("No valid CSV files found or processed in IDim_path.")

Master human responses DataFrame shape: (2432, 5)

Master human responses (first 5 rows):
      happiness  sadness  anger  tenderness   fear
0         1.055    3.525  5.460       5.010  7.515
1         3.515    3.085  3.455       5.055  4.050
2         1.975    4.545  3.020       4.655  4.640
3         2.975    2.035  2.030       1.595  3.965
4         3.025    3.965  4.040       3.525  5.995
...         ...      ...    ...         ...    ...
2427      2.000    1.000  1.000       3.020  1.515
2428      1.480    1.000  1.000       2.475  1.485
2429      1.770    1.000  1.000       1.825  1.500
2430      1.740    1.000  1.000       2.505  1.775
2431      1.805    1.000  1.000       1.775  3.475

[2432 rows x 5 columns]



# Prepare features X and targets y

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

num_participants = len(IDim_response_dfs)
if master_human_responses_df.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

X_list = []
for _ in range(num_participants):
    X_list.extend(audio_embeddings.detach()) # Add a full set of embeddings for each participant

# Convert to NumPy array
X = np.array(X_list)

# Extract y from the concatenated DataFrame
y = master_human_responses_df[['happiness', 'sadness', 'anger', 'tenderness', 'fear']].values

print(f"Shape of X (features) after implicit alignment: {X.shape}")
print(f"Shape of y (labels) after implicit alignment: {y.shape}\n")

# Sanity check: X and y must have the same number of rows
if X.shape[0] != y.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"Training set size (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set size (X_test, y_test): {X_test.shape}, {y_test.shape}\n")

Shape of X (features) after implicit alignment: (2432, 512)
Shape of y (labels) after implicit alignment: (2432, 5)

Training set size (X_train, y_train): (1945, 512), (1945, 5)
Testing set size (X_test, y_test): (487, 512), (487, 5)



# Train regression head (=MLP, a few projection layers)

In [14]:
from sklearn.neural_network import MLPRegressor

mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    verbose=True,
    early_stopping=True,
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor training...")
mlp_regressor.fit(X_train, y_train)
print("\nMLP Regressor training complete.")

Starting MLP Regressor training...
Iteration 1, loss = 3.27636938
Validation score: 0.039201
Iteration 2, loss = 1.73100233
Validation score: 0.077392
Iteration 3, loss = 1.60097876
Validation score: 0.126341
Iteration 4, loss = 1.53436033
Validation score: 0.131740
Iteration 5, loss = 1.51321935
Validation score: 0.153422
Iteration 6, loss = 1.51037921
Validation score: 0.154268
Iteration 7, loss = 1.50668279
Validation score: 0.150348
Iteration 8, loss = 1.50528410
Validation score: 0.155276
Iteration 9, loss = 1.50619815
Validation score: 0.149959
Iteration 10, loss = 1.50863561
Validation score: 0.151471
Iteration 11, loss = 1.51071072
Validation score: 0.148526
Iteration 12, loss = 1.50396293
Validation score: 0.155265
Iteration 13, loss = 1.50603099
Validation score: 0.154695
Iteration 14, loss = 1.50552429
Validation score: 0.154183
Iteration 15, loss = 1.50967664
Validation score: 0.151473
Iteration 16, loss = 1.50961919
Validation score: 0.153760
Iteration 17, loss = 1.5056188

# Evaluate

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
print("Results for Dimensional (Dim)")
print("______________________________________________")
y_pred = mlp_regressor.predict(X_test)

print(f"\nShape of predictions (y_pred): {y_pred.shape}")
print(f"First 5 actual values (y_test):\n{y_test[:5]}")
print(f"First 5 predicted values (y_pred):\n{y_pred[:5]}\n")

# Evaluation Metrics:

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Mean Absolute Percentage Error (MAPE)
absolute_percentage_error = np.abs((y_test - y_pred) / y_test) * 100
mape = np.mean(absolute_percentage_error)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\n")

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Pearson Correlation Coefficient (per dimension)
print("\nPearson Correlation Coefficients (per dimension):")
for i, dim_name in enumerate(['happiness', 'sadness', 'anger', 'tenderness', 'fear']):
    # Check for sufficient variance to calculate correlation
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlation, _ = pearsonr(y_test[:, i], y_pred[:, i])
        print(f"  {dim_name} Dimension: {correlation:.4f}")
    else:
        print(f"  {dim_name} Dimension: Cannot calculate (insufficient variance in data for this dimension)")

correlations = []
for i in range(y_test.shape[1]):
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlations.append(pearsonr(y_test[:, i], y_pred[:, i])[0])
if correlations:
    average_correlation = np.mean(correlations)
    print(f"  Average Pearson Correlation across dimensions: {average_correlation:.4f}")
else:
    print("  No correlations could be calculated for averaging.")


from sklearn.metrics import r2_score

# R-squared
print("\nR-squared scores:")
r2_happiness = r2_score(y_test[:, 0], y_pred[:, 0])
print(f"  happiness = {r2_happiness:.4f}")

r2_sadness = r2_score(y_test[:, 1], y_pred[:, 1])
print(f"  sadness = {r2_sadness:.4f}")

r2_anger = r2_score(y_test[:, 2], y_pred[:, 2])
print(f"  anger = {r2_anger:.4f}")

r2_tenderness = r2_score(y_test[:, 3], y_pred[:, 3])
print(f"  tenderness = {r2_tenderness:.4f}")

r2_fear = r2_score(y_test[:, 4], y_pred[:, 4])
print(f"  fear = {r2_fear:.4f}")

r2_scores = [r2_happiness, r2_sadness, r2_anger, r2_tenderness, r2_fear]

# Calculate the average R²
average_r2 = np.mean(r2_scores)
print(f"\nAverage R-squared across dimensions: {average_r2:.4f}")

Results for Dimensional (Dim)
______________________________________________

Shape of predictions (y_pred): (487, 5)
First 5 actual values (y_test):
[[1.51  1.07  1.085 1.095 1.955]
 [1.645 1.715 2.71  1.47  4.03 ]
 [4.465 2.555 1.045 6.535 1.485]
 [1.99  3.93  3.965 3.135 5.415]
 [2.37  4.26  1.825 4.305 2.31 ]]
First 5 predicted values (y_pred):
[[1.6674327 3.7690456 4.57517   2.3055952 5.0690365]
 [1.8615339 3.486     3.6739357 2.5446885 3.7839491]
 [2.765617  3.2764757 2.0032375 3.604989  2.8273764]
 [1.6406221 3.2009497 3.447986  2.3332233 4.6642604]
 [3.685009  2.549881  1.6195984 4.1705165 2.5454156]]

Mean Absolute Error (MAE): 1.3248
Mean Absolute Percentage Error (MAPE): 61.59%

Root Mean Squared Error (RMSE): 1.6719

Pearson Correlation Coefficients (per dimension):
  happiness Dimension: 0.3979
  sadness Dimension: 0.2167
  anger Dimension: 0.5834
  tenderness Dimension: 0.3073
  fear Dimension: 0.3787
  Average Pearson Correlation across dimensions: 0.3768

R-squared scor