In [1]:
from transformers import ClapModel, AutoProcessor
import torch
import librosa
import os
import pandas as pd
import numpy as np

In [2]:
# Load CLAP model + processor
model = ClapModel.from_pretrained("laion/larger_clap_general")
processor = AutoProcessor.from_pretrained("laion/larger_clap_general")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/776M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [3]:
print('total number of CLAP\'s parameters:', sum(p.numel() for p in model.parameters()))

total number of CLAP's parameters: 193913882


In [4]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('CLAP model size: {:.3f}MB'.format(size_all_mb))

CLAP model size: 740.294MB


In [5]:
#This section is to convert the uploaded zip files into folders
import zipfile

# Define the paths to your zip files
zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

# Define the directory where you want to extract the files
extract_dir = "/content/" # You can change this if you want to extract elsewhere

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract each zip file
for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        print(f"Erro r: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


# Process audio

In [6]:
audio_stimuli = []
stimuli_path = "/content/Exp2/Stimuli/"

for file in sorted(os.listdir(stimuli_path)):
    if file.endswith(".wav"):
        wav_path = os.path.join(stimuli_path, file)
        audio, sample_rate = librosa.load(wav_path, sr=48000)
        audio_stimuli.append(audio)

In [7]:
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)
audio_embeddings = model.get_audio_features(**inputs)

In [8]:
print(audio_embeddings.shape)

torch.Size([32, 512])


# Process text

In [9]:
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]

discrete_captions_perceived = ["I perceive this sound as " + tag for tag in discrete_tags]
print(discrete_captions_perceived)
discrete_captions_induced = ["This sound makes me feel " + tag for tag in discrete_tags]
print(discrete_captions_induced)

dimensional_tags = ["positive", "relaxed", "awake"]

dimensional_captions_perceived = ["I perceive this sound as " + tag for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = ["This sound makes me feel " + tag for tag in dimensional_tags]
print(dimensional_captions_induced)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [10]:
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced


# NOTE: currently using only dimensional_captions_induced
inputs_induced = processor(text=dimensional_captions_induced, return_tensors="pt", padding=True)
inputs_perceived = processor(text=dimensional_captions_perceived, return_tensors="pt", padding=True)

induced_embeddings = model.get_text_features(**inputs_induced)
perceived_embeddings = model.get_text_features(**inputs_perceived)

tag_embeds = (induced_embeddings + perceived_embeddings)/2

## Load csv files and extract related columns

In [11]:
IDim_path = '/content/Exp2/Data/Dim'
IDim_response_dfs = []

for file in os.listdir(IDim_path):
    if file.endswith(".csv"):
        file_path = os.path.join(IDim_path, file)
        try:
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            df["positive"] = (df["Ipositive"] + df["Ppositive"]) / 2
            df["relaxed"] = (df["Irelaxed"] + df["Prelaxed"]) / 2
            df["awake"] = (df["Iawake"] + df["Pawake"]) / 2
            required_cols = ['positive', 'relaxed', 'awake']
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDim_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDim_response_dfs:
    master_human_responses_df = pd.concat(IDim_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df}\n")
else:
    raise ValueError("No valid CSV files found or processed in IDim_path.")

Master human responses DataFrame shape: (2432, 3)

Master human responses (first 5 rows):
      positive  relaxed  awake
0        1.000    1.000  5.000
1        4.340    2.520  7.550
2        3.525    2.505  6.515
3        5.445    3.625  6.540
4        7.005    5.535  5.485
...        ...      ...    ...
2427     3.760    2.975  6.410
2428     8.470    8.055  4.995
2429     8.815    8.705  1.550
2430     7.190    7.440  2.780
2431     3.955    4.025  7.990

[2432 rows x 3 columns]



# Prepare features X and targets y

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

num_participants = len(IDim_response_dfs)
if master_human_responses_df.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

X_list = []
for _ in range(num_participants):
    X_list.extend(audio_embeddings.detach()) # Add a full set of embeddings for each participant

# Convert to NumPy array
X = np.array(X_list)

# Extract y from the concatenated DataFrame
y = master_human_responses_df[['positive', 'relaxed', 'awake']].values

print(f"Shape of X (features) after implicit alignment: {X.shape}")
print(f"Shape of y (labels) after implicit alignment: {y.shape}\n")

# Sanity check: X and y must have the same number of rows
if X.shape[0] != y.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"Training set size (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set size (X_test, y_test): {X_test.shape}, {y_test.shape}\n")

Shape of X (features) after implicit alignment: (2432, 512)
Shape of y (labels) after implicit alignment: (2432, 3)

Training set size (X_train, y_train): (1945, 512), (1945, 3)
Testing set size (X_test, y_test): (487, 512), (487, 3)



# Train regression head (=MLP, a few projection layers)

In [13]:
from sklearn.neural_network import MLPRegressor

mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    verbose=True,
    early_stopping=True,
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor training...")
mlp_regressor.fit(X_train, y_train)
print("\nMLP Regressor training complete.")

Starting MLP Regressor training...
Iteration 1, loss = 7.65524251
Validation score: -0.500034
Iteration 2, loss = 2.00659752
Validation score: -0.296574
Iteration 3, loss = 1.61850233
Validation score: 0.135920
Iteration 4, loss = 1.33247208
Validation score: 0.199116
Iteration 5, loss = 1.30093109
Validation score: 0.226345
Iteration 6, loss = 1.26288134
Validation score: 0.212980
Iteration 7, loss = 1.25185504
Validation score: 0.241485
Iteration 8, loss = 1.24696706
Validation score: 0.240095
Iteration 9, loss = 1.24610029
Validation score: 0.249851
Iteration 10, loss = 1.24003458
Validation score: 0.230030
Iteration 11, loss = 1.24360031
Validation score: 0.236223
Iteration 12, loss = 1.24543889
Validation score: 0.239669
Iteration 13, loss = 1.25019563
Validation score: 0.241817
Iteration 14, loss = 1.24624091
Validation score: 0.240895
Iteration 15, loss = 1.24552948
Validation score: 0.237198
Iteration 16, loss = 1.24618016
Validation score: 0.238173
Iteration 17, loss = 1.24800

# Evaluate

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
print("Results for Dimensional (Dim)")
print("______________________________________________")

y_pred = mlp_regressor.predict(X_test)

print(f"\nShape of predictions (y_pred): {y_pred.shape}")
print(f"First 5 actual values (y_test):\n{y_test[:5]}")
print(f"First 5 predicted values (y_pred):\n{y_pred[:5]}\n")

# Evaluation Metrics:

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Mean Absolute Percentage Error (MAPE)
absolute_percentage_error = np.abs((y_test - y_pred) / y_test) * 100
mape = np.mean(absolute_percentage_error)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\n")

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Pearson Correlation Coefficient (per dimension)
print("\nPearson Correlation Coefficients (per dimension):")
for i, dim_name in enumerate(['Positive', 'Relaxed', 'Awake']):
    # Check for sufficient variance to calculate correlation
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlation, _ = pearsonr(y_test[:, i], y_pred[:, i])
        print(f"  {dim_name} Dimension: {correlation:.4f}")
    else:
        print(f"  {dim_name} Dimension: Cannot calculate (insufficient variance in data for this dimension)")

correlations = []
for i in range(y_test.shape[1]):
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlations.append(pearsonr(y_test[:, i], y_pred[:, i])[0])
if correlations:
    average_correlation = np.mean(correlations)
    print(f"  Average Pearson Correlation across dimensions: {average_correlation:.4f}")
else:
    print("  No correlations could be calculated for averaging.")


from sklearn.metrics import r2_score

# R-squared
print("\nR-squared scores:")
r2_valence = r2_score(y_test[:, 0], y_pred[:, 0])
print(f"  valence = {r2_valence:.4f}")

r2_tension = r2_score(y_test[:, 1], y_pred[:, 1])
print(f"  tension = {r2_tension:.4f}")

r2_energy = r2_score(y_test[:, 2], y_pred[:, 2])
print(f"  energy = {r2_energy:.4f}")

r2_scores = [r2_valence, r2_tension, r2_energy]

# Calculate the average R²
average_r2 = np.mean(r2_scores)
print(f"\nAverage R-squared across dimensions: {average_r2:.4f}")

Results for Dimensional (Dim)
______________________________________________

Shape of predictions (y_pred): (487, 3)
First 5 actual values (y_test):
[[3.495 3.47  6.875]
 [2.555 3.99  3.365]
 [5.545 4.98  4.505]
 [3.745 6.455 7.77 ]
 [8.985 3.495 6.   ]]
First 5 predicted values (y_pred):
[[2.9883497 3.2680185 3.9987533]
 [3.9970763 4.442398  4.7502127]
 [5.0638003 4.8348913 5.6798835]
 [3.7311907 3.5859222 4.8532043]
 [5.9579253 5.9090023 5.598829 ]]

Mean Absolute Error (MAE): 1.2677
Mean Absolute Percentage Error (MAPE): 36.44%

Root Mean Squared Error (RMSE): 1.5750

Pearson Correlation Coefficients (per dimension):
  Positive Dimension: 0.5513
  Relaxed Dimension: 0.5357
  Awake Dimension: 0.4347
  Average Pearson Correlation across dimensions: 0.5073

R-squared scores:
  valence = 0.2987
  tension = 0.2626
  energy = 0.1811

Average R-squared across dimensions: 0.2475


In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
print("Results for Dimensional (Dim)")
print("______________________________________________")

y_pred = mlp_regressor.predict(X_test)

print(f"\nShape of predictions (y_pred): {y_pred.shape}")
print(f"First 5 actual values (y_test):\n{y_test[:5]}")
print(f"First 5 predicted values (y_pred):\n{y_pred[:5]}\n")

# Evaluation Metrics:

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Mean Absolute Percentage Error (MAPE)
absolute_percentage_error = np.abs((y_test - y_pred) / y_test) * 100
mape = np.mean(absolute_percentage_error)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\n")

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Pearson Correlation Coefficient (per dimension)
print("\nPearson Correlation Coefficients (per dimension):")
for i, dim_name in enumerate(['Positive', 'Relaxed', 'Awake']):
    # Check for sufficient variance to calculate correlation
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlation, _ = pearsonr(y_test[:, i], y_pred[:, i])
        print(f"  {dim_name} Dimension: {correlation:.4f}")
    else:
        print(f"  {dim_name} Dimension: Cannot calculate (insufficient variance in data for this dimension)")

correlations = []
for i in range(y_test.shape[1]):
    if np.std(y_test[:, i]) > 1e-6 and np.std(y_pred[:, i]) > 1e-6:
        correlations.append(pearsonr(y_test[:, i], y_pred[:, i])[0])
if correlations:
    average_correlation = np.mean(correlations)
    print(f"  Average Pearson Correlation across dimensions: {average_correlation:.4f}")
else:
    print("  No correlations could be calculated for averaging.")


from sklearn.metrics import r2_score

# R-squared
print("\nR-squared scores:")
r2_valence = r2_score(y_test[:, 0], y_pred[:, 0])
print(f"  valence = {r2_valence:.4f}")

r2_tension = r2_score(y_test[:, 1], y_pred[:, 1])
print(f"  tension = {r2_tension:.4f}")

r2_energy = r2_score(y_test[:, 2], y_pred[:, 2])
print(f"  energy = {r2_energy:.4f}")

r2_scores = [r2_valence, r2_tension, r2_energy]

# Calculate the average R²
average_r2 = np.mean(r2_scores)
print(f"\nAverage R-squared across dimensions: {average_r2:.4f}")

Results for Dimensional (Dim)
______________________________________________

Shape of predictions (y_pred): (487, 3)
First 5 actual values (y_test):
[[3.495 3.47  6.875]
 [2.555 3.99  3.365]
 [5.545 4.98  4.505]
 [3.745 6.455 7.77 ]
 [8.985 3.495 6.   ]]
First 5 predicted values (y_pred):
[[2.9883497 3.2680185 3.9987533]
 [3.9970763 4.442398  4.7502127]
 [5.0638003 4.8348913 5.6798835]
 [3.7311907 3.5859222 4.8532043]
 [5.9579253 5.9090023 5.598829 ]]

Mean Absolute Error (MAE): 1.2677
Mean Absolute Percentage Error (MAPE): 36.44%

Root Mean Squared Error (RMSE): 1.5750

Pearson Correlation Coefficients (per dimension):
  Positive Dimension: 0.5513
  Relaxed Dimension: 0.5357
  Awake Dimension: 0.4347
  Average Pearson Correlation across dimensions: 0.5073

R-squared scores:
  valence = 0.2987
  tension = 0.2626
  energy = 0.1811

Average R-squared across dimensions: 0.2475
