In [32]:
from transformers import ClapModel, AutoProcessor
import torch
import librosa
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from scipy.stats import pearsonr

In [33]:
# Load the CLAP processor and model from one shared checkpoint id so the
# audio/text preprocessing always matches the weights being used.
CLAP_CHECKPOINT = "laion/larger_clap_music"

processor = AutoProcessor.from_pretrained(CLAP_CHECKPOINT)
model = ClapModel.from_pretrained(CLAP_CHECKPOINT)

# Process audio

In [34]:
import zipfile
import os

# Archives expected in the current working directory.
zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

# Destination directory shared by all archives; change it to extract elsewhere.
extract_dir = "/content/"
os.makedirs(extract_dir, exist_ok=True)

for zip_file in zip_files:
    # A missing archive is only reported; the remaining archives are still processed.
    if not os.path.exists(zip_file):
        print(f"Error: {zip_file} not found.")
        continue

    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(extract_dir)
    print(f"Extracted {zip_file} to {extract_dir}")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


In [35]:
stimuli_path = "/content/Exp1/Stimuli/"
audio_stimuli = []

# Only .wav files are stimuli; sorting the names keeps the load order stable.
wav_names = [name for name in sorted(os.listdir(stimuli_path)) if name.endswith(".wav")]

for wav_name in wav_names:
    # Resample to 48,000 Hz on load: CLAP was trained at that sample rate,
    # so the stimuli are presented to it in the form it already knows.
    waveform, _ = librosa.load(os.path.join(stimuli_path, wav_name), sr=48000)
    audio_stimuli.append(waveform)

In [36]:
# Turn the raw waveforms into padded model inputs (PyTorch tensors).
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)

# Inference only: disable autograd so no computation graph is built for the
# forward pass. The resulting embeddings are plain tensors (requires_grad=False).
with torch.no_grad():
    audio_embeddings = model.get_audio_features(**inputs)

# Process text

In [6]:
# Sentence stems for the two caption variants built below:
# one phrased as perception of the sound, one as the feeling it induces.
PERCEIVED_STEM = "I perceive this sound as "
INDUCED_STEM = "This sound makes me feel "

# Discrete emotion labels and their captions.
discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]
discrete_captions_perceived = [f"{PERCEIVED_STEM}{label}" for label in discrete_tags]
discrete_captions_induced = [f"{INDUCED_STEM}{label}" for label in discrete_tags]

# Dimensional labels and their captions.
dimensional_tags = ["positive", "relaxed", "awake"]
dimensional_captions_perceived = [f"{PERCEIVED_STEM}{label}" for label in dimensional_tags]
dimensional_captions_induced = [f"{INDUCED_STEM}{label}" for label in dimensional_tags]

# Show the four caption lists in the order: discrete (perceived, induced),
# then dimensional (perceived, induced).
for caption_list in (
    discrete_captions_perceived,
    discrete_captions_induced,
    dimensional_captions_perceived,
    dimensional_captions_induced,
):
    print(caption_list)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [7]:
# Every caption variant in one list (kept for convenience; not embedded below).
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced

# NOTE: currently using only dimensional_captions_induced
tag_inputs = processor(text=dimensional_captions_induced, return_tensors="pt", padding=True)

# Inference only: disable autograd so the text embeddings carry no
# computation graph (requires_grad=False) and use less memory.
with torch.no_grad():
    tag_embeds = model.get_text_features(**tag_inputs)

## Load CSV files and extract the relevant columns

In [21]:
IDim_path = '/content/Exp1/Data/IDim/'
IDim_response_dfs = []

# Rating columns a file must contain to be included.
required_cols = ['positive', 'relaxed', 'awake']

csv_names = [name for name in sorted(os.listdir(IDim_path)) if name.endswith(".csv")]

for csv_name in csv_names:
    file_path = os.path.join(IDim_path, csv_name)
    try:
        # The regex separator swallows whitespace around commas (needs the python engine).
        df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
        # Header names can still carry stray whitespace at the edges, so strip them.
        df.columns = df.columns.str.strip()

        has_all_ratings = all(col in df.columns for col in required_cols)
        if not has_all_ratings:
            print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")
            continue

        # Keep only the rating columns from this file.
        IDim_response_dfs.append(df[required_cols])

    except Exception as e:
        # A file that cannot be read or processed is reported and skipped.
        print(f"Error reading or processing file {file_path}: {e}")


# Without at least one usable file there is nothing to combine.
if not IDim_response_dfs:
    raise ValueError("No valid CSV files found or processed in IDim_path.")

# Stack the per-file ratings into one frame with a fresh 0..n-1 index.
master_human_responses_df = pd.concat(IDim_response_dfs, ignore_index=True)
print(f"Master human responses DataFrame shape: {master_human_responses_df.shape}\n")
print(f"Master human responses (first 5 rows):\n{master_human_responses_df.head()}\n")

Master human responses DataFrame shape: (3835, 3)

Master human responses (first 5 rows):
   positive  relaxed  awake
0      3.68     3.78   4.42
1      5.88     5.98   3.89
2      6.53     5.59   6.59
3      6.26     5.71   6.88
4      2.80     2.62   5.15



# Prepare features X and targets y

In [25]:
# Step 1: Load timbre data
timbre_path = "/content/Analysis/Thesis_Ch5/TimbreToolbox/long_AT1_TimbreToolbox_220509.csv"
df = pd.read_csv(timbre_path)
df.columns = df.columns.str.strip()

# Step 2: Select 59 unique audio files
# NOTE(review): iloc[1:60] keeps the unique stimuli at positions 1..59 and
# therefore SKIPS the first unique stimulus (position 0). If the intent is
# literally "the first 59", this should be iloc[:59] — confirm before changing,
# since it alters which stimuli are modelled.
unique_stims = df['stim'].drop_duplicates().iloc[1:60]
audio_df = df[df['stim'].isin(unique_stims)].copy()

# Step 3: Group by audio ('stim') and average participant ratings
response_cols = ['positive', 'relaxed', 'awake']
available_responses = [col for col in response_cols if col in audio_df.columns]
if not available_responses:
    raise ValueError("❌ No 'positive', 'relaxed', or 'awake' columns found in data.")

response_avg = audio_df.groupby('stim')[available_responses].mean()

# Step 4: Extract features starting from 'IQR_Pitch' onwards
start_col = audio_df.columns.get_loc('IQR_Pitch')
candidate_cols = audio_df.columns[start_col:]
# Guard against target leakage: if any rating column sits at or after
# 'IQR_Pitch' it must not be used as a predictor of itself.
feature_cols = candidate_cols[~candidate_cols.isin(response_cols)]

# Group by stim and average those numerical features
features_avg = audio_df.groupby('stim')[feature_cols].mean()

# Step 5: Align features and labels
X = features_avg.dropna()  # Drop any stimuli whose averaged features contain NaNs
if X.empty:
    raise ValueError("No stimuli left after dropping rows with NaN features.")
Y = response_avg.loc[X.index]  # Align Y with cleaned X (same stimuli, same order)

# Standardize features (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Random Forest Regressor Training

In [26]:
# rf_models[col]      -> one regressor per rating dimension, fitted on ALL stimuli
# rf_predictions[col] -> pooled out-of-fold truths/predictions plus the per-fold models
rf_models = {}
rf_predictions = {}

# Cross-validation: 5 folds repeated 3 times; the fixed seed makes every call
# to rkf.split() yield the same splits, so all dimensions share identical folds.
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

for col in Y.columns:
    y = Y[col].values

    all_y_true, all_y_pred, all_models = [], [], []

    for train_idx, test_idx in rkf.split(X_scaled):
        X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Deliberately NOT named `model`: that name holds the CLAP model, and
        # overwriting it breaks the embedding cells when they are re-run.
        fold_rf = RandomForestRegressor(n_estimators=100, random_state=42)
        fold_rf.fit(X_train, y_train)
        y_pred = fold_rf.predict(X_test)

        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)
        all_models.append(fold_rf)

    # Store a model fitted on the full data set rather than whichever fold
    # model happened to be trained last (which saw only ~80% of the stimuli).
    final_rf = RandomForestRegressor(n_estimators=100, random_state=42)
    final_rf.fit(X_scaled, y)

    rf_models[col] = final_rf
    rf_predictions[col] = {
        'y_true': np.array(all_y_true),
        'y_pred': np.array(all_y_pred),
        'models': all_models
    }

    print(f"✅ Random Forest model trained for: {col}")


✅ Random Forest model trained for: positive
✅ Random Forest model trained for: relaxed
✅ Random Forest model trained for: awake


# Evaluate

In [37]:
print("📊 Random Forest Regression Statistics\n" + "-" * 50)

# R² per rating dimension, in the iteration order of rf_predictions.
r2s = []

for col, data in rf_predictions.items():
    # Out-of-fold targets and predictions pooled over all folds and repeats.
    y_true = data['y_true']
    y_pred = data['y_pred']
    # The fitted regressor is not needed for these statistics, so it is not
    # bound to `model` here — that name is reserved for the CLAP model.

    r2 = r2_score(y_true, y_pred)
    r2s.append(r2)

    print(f"\n📈 Dimension: {col}")
    print("→ R² Score:", round(r2, 4))

📊 Random Forest Regression Statistics
--------------------------------------------------

📈 Dimension: positive
→ R² Score: 0.8416

📈 Dimension: relaxed
→ R² Score: 0.7477

📈 Dimension: awake
→ R² Score: 0.7095
