In [None]:
from transformers import ClapModel, AutoProcessor
import torch
import librosa
import os
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Load CLAP model + processor
model = ClapModel.from_pretrained("laion/larger_clap_music")
processor = AutoProcessor.from_pretrained("laion/larger_clap_music")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/776M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/776M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

# Process audio

In [None]:
import zipfile
import os

zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

extract_dir = "/content/" # You can change this if you want to extract elsewhere
os.makedirs(extract_dir, exist_ok=True)

for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        print(f"Error: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


In [None]:
audio_stimuli = []
stimuli_path = "/content/Exp1/Stimuli/"

for file in sorted(os.listdir(stimuli_path)):
    if file.endswith(".wav"):
        wav_path = os.path.join(stimuli_path, file)
        #Clap has already been trained on a sample rate of 48,000 so we should use what it knows already
        audio, sample_rate = librosa.load(wav_path, sr=48000)
        audio_stimuli.append(audio)

In [None]:
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)
audio_embeddings = model.get_audio_features(**inputs)

# Process text

In [None]:
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]

discrete_captions_perceived = ["I perceive this sound as " + tag for tag in discrete_tags]
print(discrete_captions_perceived)
discrete_captions_induced = ["This sound makes me feel " + tag for tag in discrete_tags]
print(discrete_captions_induced)

dimensional_tags = ["positive", "relaxed", "awake"]

dimensional_captions_perceived = ["I perceive this sound as " + tag for tag in dimensional_tags]
print(dimensional_captions_perceived)
dimensional_captions_induced = ["This sound makes me feel " + tag for tag in dimensional_tags]
print(dimensional_captions_induced)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [None]:
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced

# NOTE: currently using only dimensional_captions_induced
tag_inputs = processor(text=dimensional_captions_induced, return_tensors="pt", padding=True)
tag_embeds = model.get_text_features(**tag_inputs)

## Load csv files and extract related columns

In [None]:
# Path to your timbre features CSV
timbre_csv_path = '/content/Analysis/Thesis_Ch5/TimbreToolbox/long_AT1_TimbreToolbox_220509.csv'

# Load CSV
timbre_df = pd.read_csv(timbre_csv_path)

# Strip any extra whitespace from column names (good practice)
timbre_df.columns = timbre_df.columns.str.strip()
timbre_df.columns

Index(['ppno', 'perceived/induced', 'dimensional/discrete', 'sequence',
       'stimNo', 'stim',
       'family (B = brass; W = woodwind; S = string; P = percussion)', 'instr',
       'octave', 'nPlays', 'like', 'positive', 'relaxed', 'tense', 'awake',
       'anger', 'fear', 'sadness', 'happiness', 'tenderness', 'IQR_Pitch',
       'IQR_HarmonicSpectralDeviation', 'IQR_Tristimulus_1',
       'IQR_Tristimulus_2', 'IQR_Tristimulus_3', 'IQR_HarmonicOddToEvenRatio',
       'IQR_Inharmonicity', 'IQR_HarmonicEnergy', 'IQR_NoiseEnergy',
       'IQR_Noisiness', 'IQR_HarmonicToNoiseEnergy',
       'IQR_PartialsToNoiseEnergy', 'Median_Pitch',
       'Median_HarmonicSpectralDeviation', 'Median_Tristimulus_1',
       'Median_Tristimulus_2', 'Median_Tristimulus_3',
       'Median_HarmonicOddToEvenRatio', 'Median_Inharmonicity',
       'Median_HarmonicEnergy', 'Median_NoiseEnergy', 'Median_Noisiness',
       'Median_HarmonicToNoiseEnergy', 'Median_PartialsToNoiseEnergy',
       'IQR_SpectralCentroi

In [None]:
IDim_path = '/content/Exp1/Data/IDim/'
IDim_response_dfs = []

for file in sorted(os.listdir(IDim_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDim_path, file)
        try:
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            required_cols = ['positive', 'relaxed', 'awake']
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDim_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDim_response_dfs:
    master_human_responses_df = pd.concat(IDim_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df.head()}\n")
else:
    raise ValueError("No valid CSV files found or processed in IDim_path.")

Master human responses DataFrame shape: (3835, 3)

Master human responses (first 5 rows):
   positive  relaxed  awake
0      3.68     3.78   4.42
1      5.88     5.98   3.89
2      6.53     5.59   6.59
3      6.26     5.71   6.88
4      2.80     2.62   5.15



# Prepare features X and targets y

In [None]:
feature_names = [
    'stimNo'
    'IQR_Pitch','IQR_HarmonicSpectralDeviation', 'IQR_Tristimulus_1',
    'IQR_Tristimulus_2', 'IQR_Tristimulus_3', 'IQR_HarmonicOddToEvenRatio',
    'IQR_Inharmonicity', 'IQR_HarmonicEnergy', 'IQR_NoiseEnergy',
    'IQR_Noisiness', 'IQR_HarmonicToNoiseEnergy',
    'IQR_PartialsToNoiseEnergy', 'Median_Pitch',
    'Median_HarmonicSpectralDeviation', 'Median_Tristimulus_1',
    'Median_Tristimulus_2', 'Median_Tristimulus_3',
    'Median_HarmonicOddToEvenRatio', 'Median_Inharmonicity',
    'Median_HarmonicEnergy', 'Median_NoiseEnergy', 'Median_Noisiness',
    'Median_HarmonicToNoiseEnergy', 'Median_PartialsToNoiseEnergy',
    'IQR_SpectralCentroid', 'IQR_SpectralSpread', 'IQR_SpectralSkewness',
    'IQR_SpectralKurtosis', 'IQR_SpectralFlatness', 'IQR_SpectralCrest',
    'IQR_SpectralSlope', 'IQR_SpectralDecrease', 'IQR_SpectralRollOff',
    'IQR_SpectralVariation', 'IQR_SpectralFlux', 'Median_SpectralCentroid',
    'Median_SpectralSpread', 'Median_SpectralSkewness',
    'Median_SpectralKurtosis', 'Median_SpectralFlatness',
    'Median_SpectralCrest', 'Median_SpectralSlope',
    'Median_SpectralDecrease', 'Median_SpectralRollOff',
    'Median_SpectralVariation', 'Median_SpectralFlux', 'AttackTime',
    'LogAttackTime', 'AttackSlope', 'DecreaseSlope', 'TemporalCentroid',
    'EffectiveDuration', 'FrequencyOfEnergyModulation',
    'AmplitudeOfEnergyModulation'
]

# Limit timbre_df to the first 59 rows (59 sounds)
X_base = timbre_df.loc[:58].sort_values('stimNo')[feature_names].values

# Repeat each sound's features for all participants
# Assume each participant rated all 59 sounds
num_participants = int(master_human_responses_df.shape[0] / 59)

X = np.tile(X_base, (num_participants, 1))  # shape: (3835, num_features)
print(X.shape)
y = master_human_responses_df[['positive', 'relaxed', 'awake']].values
print(y.shape)

# Sanity check
print(X.shape[0])
print(y.shape[0])
assert X.shape[0] == y.shape[0], "Mismatch between feature and label rows!"


(3835, 54)
(3835, 3)
3835
3835


In [None]:
from sklearn.model_selection import train_test_split


num_participants = len(IDim_response_dfs)
if master_human_responses_df.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

X_list = []
for _ in range(num_participants):
    X_list.extend(audio_embeddings.detach()) # Add a full set of embeddings for each participant

# Convert to NumPy array
X = np.array(X_list)

# Extract y from the concatenated DataFrame
y = master_human_responses_df[['positive', 'relaxed', 'awake']].values

print(f"Shape of X (features) after implicit alignment: {X.shape}")
print(f"Shape of y (labels) after implicit alignment: {y.shape}\n")

# Sanity check: X and y must have the same number of rows
if X.shape[0] != y.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set size (X_test, y_test): {X_test.shape}, {y_test.shape}\n")

Shape of X (features) after implicit alignment: (3835, 512)
Shape of y (labels) after implicit alignment: (3835, 3)

Training set size (X_train, y_train): (3068, 512), (3068, 3)
Testing set size (X_test, y_test): (767, 512), (767, 3)



# Random Forest Regressor Training

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

print("R² Scores:")
for i, label in enumerate(['positive', 'relaxed', 'awake']):
    r2 = r2_score(y_test[:, i], y_pred[:, i])
    print(f"  {label}: {r2:.2f}")

r2_avg = r2_score(y_test, y_pred, multioutput='uniform_average')
print(f"\nAverage R² Score: {r2_avg:.2f}")


R² Scores:
  positive: 0.30
  relaxed: 0.37
  awake: 0.06

Average R² Score: 0.24
