In [None]:
# Import libraries
# transformers: Provides access to pre-trained models like CLAP.
from transformers import ClapModel, AutoProcessor

# librosa: Used for audio analysis and loading audio files.
import librosa

# os: Provides a way to interact with the operating system, like navigating directories.
import os

import torch
import pandas as pd
import numpy as np

In [None]:
# Load the pre-trained CLAP model and its processor.
# Both are fetched from the same Hugging Face repository id.
clap_checkpoint = "laion/clap-htsat-unfused"
model = ClapModel.from_pretrained(clap_checkpoint)
processor = AutoProcessor.from_pretrained(clap_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Quick sanity check: count every element across all of CLAP's parameter tensors.
total_parameter_count = 0
for parameter in model.parameters():
    total_parameter_count += parameter.numel()
print("total number of CLAP's parameters:", total_parameter_count)

total number of CLAP's parameters: 153492890


In [None]:
# Estimate how much memory the model's tensors occupy.
# For every tensor: (number of elements) * (bytes per element).
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())

# Buffers are tensors attached to the model that are not learnable parameters,
# but they still take up space, so they are counted the same way.
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())

# Add both totals and convert bytes to MB (dividing by 1024**2).
size_all_mb = (param_size + buffer_size) / 1024**2
print('CLAP model size: {:.3f}MB'.format(size_all_mb))

CLAP model size: 585.912MB


In [None]:
# Unpack the uploaded zip archives into folders.
import zipfile

# Archive paths, resolved relative to the current working directory.
zip_files = ["Exp1.zip", "Exp2.zip", "Analysis.zip"]

# Directory the archives are extracted into (change this to extract elsewhere).
extract_dir = "/content/"

# Create the extraction directory if it doesn't exist.
os.makedirs(extract_dir, exist_ok=True)

# Extract each archive; a missing archive is reported and skipped rather than
# aborting the whole cell.
for zip_file in zip_files:
    if os.path.exists(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_file} to {extract_dir}")
    else:
        # Message typo fixed ("Erro r" -> "Error").
        print(f"Error: {zip_file} not found.")

Extracted Exp1.zip to /content/
Extracted Exp2.zip to /content/
Extracted Analysis.zip to /content/


# Process audio

In [None]:
# Load every .wav stimulus, in sorted filename order, into the list `audio_stimuli`.
stimuli_path = "/content/Exp1/Stimuli/"  # use /content/Exp2/Stimuli/ when running experiment 2
audio_stimuli = []

wav_names = [name for name in sorted(os.listdir(stimuli_path)) if name.endswith(".wav")]
for wav_name in wav_names:
    wav_path = os.path.join(stimuli_path, wav_name)
    print(wav_path)
    # Resample to 48,000 Hz while loading; the same rate is declared to the
    # CLAP processor later in the notebook.
    audio, sample_rate = librosa.load(wav_path, sr=48000)
    audio_stimuli.append(audio)

/content/Exp1/Stimuli/01_B_CTu_1.wav
/content/Exp1/Stimuli/02_B_CTu_2.wav
/content/Exp1/Stimuli/03_B_Tu_3.wav
/content/Exp1/Stimuli/04_B_Tu_4.wav
/content/Exp1/Stimuli/05_B_CTb_1.wav
/content/Exp1/Stimuli/06_B_CTb_2.wav
/content/Exp1/Stimuli/07_B_BTb_3.wav
/content/Exp1/Stimuli/08_B_TTb_4.wav
/content/Exp1/Stimuli/09_B_Ho_3.wav
/content/Exp1/Stimuli/10_B_Ho_4.wav
/content/Exp1/Stimuli/11_B_Ho_5.wav
/content/Exp1/Stimuli/12_B_Tr_4.wav
/content/Exp1/Stimuli/13_B_PTr_5.wav
/content/Exp1/Stimuli/14_B_PTr_6.wav
/content/Exp1/Stimuli/15_W_CBa_1.wav
/content/Exp1/Stimuli/16_W_CBa_2.wav
/content/Exp1/Stimuli/17_W_Ba_3.wav
/content/Exp1/Stimuli/18_W_Ba_4.wav
/content/Exp1/Stimuli/19_W_BCl_2.wav
/content/Exp1/Stimuli/20_W_BCl_3.wav
/content/Exp1/Stimuli/21_W_ClB_4.wav
/content/Exp1/Stimuli/22_W_ClB_5.wav
/content/Exp1/Stimuli/23_W_ClB_6.wav
/content/Exp1/Stimuli/24_W_Ob_4.wav
/content/Exp1/Stimuli/25_W_Ob_5.wav
/content/Exp1/Stimuli/26_W_EH_4.wav
/content/Exp1/Stimuli/27_W_EH_5.wav
/content/Exp1

In [None]:
# Verification that the stimuli were loaded: print a compact summary instead of
# dumping every raw waveform array, which floods the notebook output.
print(f"Loaded {len(audio_stimuli)} audio clips")
for clip_index, clip in enumerate(audio_stimuli[:5]):
    # Only the first few clips are previewed; each clip is the array returned by librosa.load.
    print(f"  clip {clip_index}: {len(clip)} samples, dtype={clip.dtype}, first values={clip[:3]}")

[array([-2.6257991e-05, -1.8168332e-05,  4.3254666e-05, ...,
       -1.1775010e-05,  1.0805696e-06, -3.3297738e-08], dtype=float32), array([-2.1216942e-05, -4.7350541e-06,  1.5635709e-05, ...,
        2.8016274e-08, -1.7804325e-08,  6.9179218e-09], dtype=float32), array([-9.8106102e-06,  1.1119668e-05, -3.5152134e-05, ...,
       -3.4414730e-05, -1.3167290e-05,  4.1392400e-06], dtype=float32), array([-2.9096404e-06, -1.3023089e-05, -4.7573376e-06, ...,
       -3.4261415e-05, -2.2727640e-05,  2.1467149e-06], dtype=float32), array([ 2.7754413e-06, -7.0806864e-06,  1.1189784e-05, ...,
       -1.2227122e-06,  7.4793968e-07, -2.8134855e-07], dtype=float32), array([-6.7172846e-07,  5.4504471e-06, -5.0830917e-05, ...,
       -1.6670765e-06,  1.9083664e-06,  0.0000000e+00], dtype=float32), array([ 1.6268901e-05, -3.3643319e-05,  2.5531041e-05, ...,
       -3.3158278e-05, -2.5886366e-05,  7.7379474e-07], dtype=float32), array([ 6.1509754e-06,  2.7266901e-06, -4.9879025e-05, ...,
       -1.23645

In [None]:
# Convert the raw waveforms in `audio_stimuli` into PyTorch tensors ("pt") for CLAP,
# with padding enabled and the clips declared as 48,000 Hz audio.
inputs = processor(audios=audio_stimuli, return_tensors="pt", padding=True, sampling_rate=48000)

# Feed the tensors to CLAP to extract one embedding (numerical representation) per clip.
# This is inference only, so gradient tracking is switched off: no autograd graph
# is recorded for the batch, which reduces memory use.
with torch.no_grad():
    audio_embeddings = model.get_audio_features(**inputs)

In [None]:
# Show the dimensions of the embedding tensor: (number of audio files, embedding size).
# In the recorded run this is 59 audio files by 512 values per embedding.
embedding_shape = audio_embeddings.shape
print(embedding_shape)

torch.Size([59, 512])


# Process text

In [None]:
# Generate the text prompts for each section.
# There are 2 sections (discrete and dimensional tags), and each section has
# 2 subsections (perceived and induced), giving four prompt lists in total.
perceived_prefix = "I perceive this sound as "
induced_prefix = "This sound makes me feel "


def _build_captions(prefix, tags):
    """Return one caption per tag, formed by appending the tag to `prefix`."""
    captions = []
    for tag in tags:
        captions.append(prefix + tag)
    return captions


discrete_tags = ["happiness", "sadness", "anger", "tenderness", "fear"]
dimensional_tags = ["positive", "relaxed", "awake"]

discrete_captions_perceived = _build_captions(perceived_prefix, discrete_tags)
print(discrete_captions_perceived)
discrete_captions_induced = _build_captions(induced_prefix, discrete_tags)
print(discrete_captions_induced)

dimensional_captions_perceived = _build_captions(perceived_prefix, dimensional_tags)
print(dimensional_captions_perceived)
dimensional_captions_induced = _build_captions(induced_prefix, dimensional_tags)
print(dimensional_captions_induced)


['I perceive this sound as happiness', 'I perceive this sound as sadness', 'I perceive this sound as anger', 'I perceive this sound as tenderness', 'I perceive this sound as fear']
['This sound makes me feel happiness', 'This sound makes me feel sadness', 'This sound makes me feel anger', 'This sound makes me feel tenderness', 'This sound makes me feel fear']
['I perceive this sound as positive', 'I perceive this sound as relaxed', 'I perceive this sound as awake']
['This sound makes me feel positive', 'This sound makes me feel relaxed', 'This sound makes me feel awake']


In [None]:
# Combine all four generated prompt lists into one list.
# (all_tags is not used in this cell; it is kept for any later use.)
all_tags = discrete_captions_perceived + discrete_captions_induced + dimensional_captions_perceived + dimensional_captions_induced

# NOTE: currently only discrete_captions_induced is embedded (the five
# "This sound makes me feel <emotion>" prompts), not dimensional_captions_induced.
tag_inputs = processor(text=discrete_captions_induced, return_tensors="pt", padding=True)

# Inference only: disable gradient tracking so no autograd graph is recorded.
with torch.no_grad():
    tag_embeds = model.get_text_features(**tag_inputs)

# Generate outputs

In [None]:
# Similarity matrix between every audio clip (rows) and every text tag (columns).
# Each entry is the dot product of one audio embedding with one text embedding.
sims = audio_embeddings @ tag_embeds.T
print(sims.shape)

torch.Size([59, 5])


## Load csv files and extract related columns

In [None]:
# Load the 'happiness' ratings from every CSV file in the IDisc data folder.
# NOTE(review): this cell is repeated almost verbatim for each emotion; a shared
# helper taking the column name would remove the duplication.
IDisc_happiness_path = '/content/Exp1/Data/IDisc'
IDisc_happiness_response_dfs = []

# Rating column(s) each file must provide.
required_cols = ['happiness']

# Visit files in sorted order: os.listdir() returns names in arbitrary order,
# which would make the row order of the concatenated frame (and any seeded
# split derived from it) differ between runs/machines.
for file in sorted(os.listdir(IDisc_happiness_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDisc_happiness_path, file)
        try:
            # The separator regex also consumes whitespace around commas
            # (a regex separator requires the python engine).
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDisc_happiness_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            # Best effort: report the problematic file and continue with the rest.
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDisc_happiness_response_dfs:
    master_human_responses_df_happiness = pd.concat(IDisc_happiness_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df_happiness.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df_happiness.head()}\n")
else:
    # Nothing usable was loaded; later cells cannot run without this frame.
    raise ValueError("No valid CSV files found or processed in IDisc_happiness_path.")


Master human responses DataFrame shape: (3894, 1)

Master human responses (first 5 rows):
   happiness
0        1.0
1        1.0
2        1.0
3        1.0
4        1.0



In [None]:
# Load the 'sadness' ratings from every CSV file in the IDisc data folder.
# NOTE(review): this cell is repeated almost verbatim for each emotion; a shared
# helper taking the column name would remove the duplication.
IDisc_sadness_path = '/content/Exp1/Data/IDisc'
IDisc_sadness_response_dfs = []

# Rating column(s) each file must provide.
required_cols = ['sadness']

# Visit files in sorted order: os.listdir() returns names in arbitrary order,
# which would make the row order of the concatenated frame (and any seeded
# split derived from it) differ between runs/machines.
for file in sorted(os.listdir(IDisc_sadness_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDisc_sadness_path, file)
        try:
            # The separator regex also consumes whitespace around commas
            # (a regex separator requires the python engine).
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDisc_sadness_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            # Best effort: report the problematic file and continue with the rest.
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDisc_sadness_response_dfs:
    master_human_responses_df_sadness = pd.concat(IDisc_sadness_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df_sadness.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df_sadness.head()}\n")
else:
    # Nothing usable was loaded; later cells cannot run without this frame.
    raise ValueError("No valid CSV files found or processed in IDisc_sadness_path.")


Master human responses DataFrame shape: (3894, 1)

Master human responses (first 5 rows):
   sadness
0     6.14
1     3.91
2     5.03
3     4.94
4     1.00



In [None]:
# Load the 'anger' ratings from every CSV file in the IDisc data folder.
# NOTE(review): this cell is repeated almost verbatim for each emotion; a shared
# helper taking the column name would remove the duplication.
IDisc_anger_path = '/content/Exp1/Data/IDisc'
IDisc_anger_response_dfs = []

# Rating column(s) each file must provide.
required_cols = ['anger']

# Visit files in sorted order: os.listdir() returns names in arbitrary order,
# which would make the row order of the concatenated frame (and any seeded
# split derived from it) differ between runs/machines.
for file in sorted(os.listdir(IDisc_anger_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDisc_anger_path, file)
        try:
            # The separator regex also consumes whitespace around commas
            # (a regex separator requires the python engine).
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDisc_anger_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            # Best effort: report the problematic file and continue with the rest.
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDisc_anger_response_dfs:
    master_human_responses_df_anger = pd.concat(IDisc_anger_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df_anger.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df_anger.head()}\n")
else:
    # Nothing usable was loaded; later cells cannot run without this frame.
    raise ValueError("No valid CSV files found or processed in IDisc_anger_path.")


Master human responses DataFrame shape: (3894, 1)

Master human responses (first 5 rows):
   anger
0   1.00
1   1.00
2   1.00
3   3.92
4   8.62



In [None]:
# Load the 'tenderness' ratings from every CSV file in the IDisc data folder.
# NOTE(review): this cell is repeated almost verbatim for each emotion; a shared
# helper taking the column name would remove the duplication.
IDisc_tenderness_path = '/content/Exp1/Data/IDisc'
IDisc_tenderness_response_dfs = []

# Rating column(s) each file must provide.
required_cols = ['tenderness']

# Visit files in sorted order: os.listdir() returns names in arbitrary order,
# which would make the row order of the concatenated frame (and any seeded
# split derived from it) differ between runs/machines.
for file in sorted(os.listdir(IDisc_tenderness_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDisc_tenderness_path, file)
        try:
            # The separator regex also consumes whitespace around commas
            # (a regex separator requires the python engine).
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDisc_tenderness_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            # Best effort: report the problematic file and continue with the rest.
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDisc_tenderness_response_dfs:
    master_human_responses_df_tenderness = pd.concat(IDisc_tenderness_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df_tenderness.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df_tenderness.head()}\n")
else:
    # Nothing usable was loaded; later cells cannot run without this frame.
    raise ValueError("No valid CSV files found or processed in IDisc_tenderness_path.")


Master human responses DataFrame shape: (3894, 1)

Master human responses (first 5 rows):
   tenderness
0        1.00
1        2.87
2        1.00
3        1.81
4        1.00



In [None]:
# Load the 'fear' ratings from every CSV file in the IDisc data folder.
# NOTE(review): this cell is repeated almost verbatim for each emotion; a shared
# helper taking the column name would remove the duplication.
IDisc_fear_path = '/content/Exp1/Data/IDisc'
IDisc_fear_response_dfs = []

# Rating column(s) each file must provide.
required_cols = ['fear']

# Visit files in sorted order: os.listdir() returns names in arbitrary order,
# which would make the row order of the concatenated frame (and any seeded
# split derived from it) differ between runs/machines.
for file in sorted(os.listdir(IDisc_fear_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(IDisc_fear_path, file)
        try:
            # The separator regex also consumes whitespace around commas
            # (a regex separator requires the python engine).
            df = pd.read_csv(file_path, sep=r'\s*,\s*', engine='python')
            # Crucial: Strip whitespace from column names
            df.columns = df.columns.str.strip()

            # Ensure required rating columns exist
            if all(col in df.columns for col in required_cols):
                # Select only the relevant columns and append to our list
                IDisc_fear_response_dfs.append(df[required_cols])
            else:
                print(f"Skipping file '{file_path}': Missing required columns ({required_cols}). Found columns: {df.columns.tolist()}")

        except Exception as e:
            # Best effort: report the problematic file and continue with the rest.
            print(f"Error reading or processing file {file_path}: {e}")


# Concatenate all individual DataFrames into one master DataFrame for human responses
if IDisc_fear_response_dfs:
    master_human_responses_df_fear = pd.concat(IDisc_fear_response_dfs, ignore_index=True)
    print(f"Master human responses DataFrame shape: {master_human_responses_df_fear.shape}\n")
    print(f"Master human responses (first 5 rows):\n{master_human_responses_df_fear.head()}\n")
else:
    # Nothing usable was loaded; later cells cannot run without this frame.
    raise ValueError("No valid CSV files found or processed in IDisc_fear_path.")


Master human responses DataFrame shape: (3894, 1)

Master human responses (first 5 rows):
   fear
0  3.19
1  2.19
2  1.25
3  2.97
4  1.00



# Prepare features X and targets y

In [None]:
from sklearn.model_selection import train_test_split

# Build features X (CLAP audio embeddings) and targets y ('happiness' ratings),
# then split them into training and testing sets.

# One DataFrame was appended per loaded CSV file, so the list length is used as
# the participant count.
num_participants_happiness = len(IDisc_happiness_response_dfs)
if master_human_responses_df_happiness.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

# Features: the full embedding matrix repeated once per participant, stacked
# vertically. np.tile does this in one vectorised step instead of building a
# Python list of per-row tensors; .cpu() keeps the NumPy conversion valid even
# if the embeddings live on a GPU.
# NOTE(review): this assumes every participant's rows follow the same order as
# the sorted stimulus files; nothing here verifies it. Confirm against the CSVs.
X_happiness = np.tile(audio_embeddings.detach().cpu().numpy(), (num_participants_happiness, 1))

# Extract y from the concatenated DataFrame (kept as a column vector of shape (n, 1))
y_happiness = master_human_responses_df_happiness[['happiness']].values

print(f"Shape of X (features) after implicit alignment: {X_happiness.shape}")
print(f"Shape of y (labels) after implicit alignment: {y_happiness.shape}\n")

# Sanity check: X and y must have the same number of rows
if X_happiness.shape[0] != y_happiness.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
# NOTE(review): rows are split at random, and X contains the same stimulus
# embeddings repeated for every participant, so identical feature vectors end up
# in both train and test. Consider a group-wise split by stimulus if the goal is
# to measure generalisation to unseen audio.

TEST_PERCENTAGE_happiness = 0.2  # Adjust as needed
TRAIN_PERCENTAGE_happiness = 1 - TEST_PERCENTAGE_happiness  # Adjust as needed

X_train_happiness, X_test_happiness, y_train_happiness, y_test_happiness = train_test_split(
    X_happiness, y_happiness, test_size=TEST_PERCENTAGE_happiness, random_state=42
)

print(f"Training set size (X_train_happiness, y_train_happiness): {X_train_happiness.shape}, {y_train_happiness.shape}")
print(f"Testing set size (X_test_happiness, y_test_happiness): {X_test_happiness.shape}, {y_test_happiness.shape}\n")

Shape of X (features) after implicit alignment: (3894, 512)
Shape of y (labels) after implicit alignment: (3894, 1)

Training set size (X_train_happiness, y_train_happiness): (3115, 512), (3115, 1)
Testing set size (X_test_happiness, y_test_happiness): (779, 512), (779, 1)



In [None]:
from sklearn.model_selection import train_test_split

# Build features X (CLAP audio embeddings) and targets y ('sadness' ratings),
# then split them into training and testing sets.

# One DataFrame was appended per loaded CSV file, so the list length is used as
# the participant count.
num_participants_sadness = len(IDisc_sadness_response_dfs)
if master_human_responses_df_sadness.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

# Features: the full embedding matrix repeated once per participant, stacked
# vertically. np.tile does this in one vectorised step instead of building a
# Python list of per-row tensors; .cpu() keeps the NumPy conversion valid even
# if the embeddings live on a GPU.
# NOTE(review): this assumes every participant's rows follow the same order as
# the sorted stimulus files; nothing here verifies it. Confirm against the CSVs.
X_sadness = np.tile(audio_embeddings.detach().cpu().numpy(), (num_participants_sadness, 1))

# Extract y from the concatenated DataFrame (kept as a column vector of shape (n, 1))
y_sadness = master_human_responses_df_sadness[['sadness']].values

print(f"Shape of X (features) after implicit alignment: {X_sadness.shape}")
print(f"Shape of y (labels) after implicit alignment: {y_sadness.shape}\n")

# Sanity check: X and y must have the same number of rows
if X_sadness.shape[0] != y_sadness.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
# NOTE(review): rows are split at random, and X contains the same stimulus
# embeddings repeated for every participant, so identical feature vectors end up
# in both train and test. Consider a group-wise split by stimulus if the goal is
# to measure generalisation to unseen audio.

TEST_PERCENTAGE_sadness = 0.2  # Adjust as needed
TRAIN_PERCENTAGE_sadness = 1 - TEST_PERCENTAGE_sadness  # Adjust as needed

X_train_sadness, X_test_sadness, y_train_sadness, y_test_sadness = train_test_split(
    X_sadness, y_sadness, test_size=TEST_PERCENTAGE_sadness, random_state=42
)

print(f"Training set size (X_train_sadness, y_train_sadness): {X_train_sadness.shape}, {y_train_sadness.shape}")
print(f"Testing set size (X_test_sadness, y_test_sadness): {X_test_sadness.shape}, {y_test_sadness.shape}\n")

Shape of X (features) after implicit alignment: (3894, 512)
Shape of y (labels) after implicit alignment: (3894, 1)

Training set size (X_train_sadness, y_train_sadness): (3115, 512), (3115, 1)
Testing set size (X_test_sadness, y_test_sadness): (779, 512), (779, 1)



In [None]:
from sklearn.model_selection import train_test_split

# Build features X (CLAP audio embeddings) and targets y ('anger' ratings),
# then split them into training and testing sets.

# One DataFrame was appended per loaded CSV file, so the list length is used as
# the participant count.
num_participants_anger = len(IDisc_anger_response_dfs)
if master_human_responses_df_anger.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

# Features: the full embedding matrix repeated once per participant, stacked
# vertically. np.tile does this in one vectorised step instead of building a
# Python list of per-row tensors; .cpu() keeps the NumPy conversion valid even
# if the embeddings live on a GPU.
# NOTE(review): this assumes every participant's rows follow the same order as
# the sorted stimulus files; nothing here verifies it. Confirm against the CSVs.
X_anger = np.tile(audio_embeddings.detach().cpu().numpy(), (num_participants_anger, 1))

# Extract y from the concatenated DataFrame (kept as a column vector of shape (n, 1))
y_anger = master_human_responses_df_anger[['anger']].values

print(f"Shape of X (features) after implicit alignment: {X_anger.shape}")
print(f"Shape of y (labels) after implicit alignment: {y_anger.shape}\n")

# Sanity check: X and y must have the same number of rows
if X_anger.shape[0] != y_anger.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
# NOTE(review): rows are split at random, and X contains the same stimulus
# embeddings repeated for every participant, so identical feature vectors end up
# in both train and test. Consider a group-wise split by stimulus if the goal is
# to measure generalisation to unseen audio.

TEST_PERCENTAGE_anger = 0.2  # Adjust as needed
TRAIN_PERCENTAGE_anger = 1 - TEST_PERCENTAGE_anger  # Adjust as needed

X_train_anger, X_test_anger, y_train_anger, y_test_anger = train_test_split(
    X_anger, y_anger, test_size=TEST_PERCENTAGE_anger, random_state=42
)

print(f"Training set size (X_train_anger, y_train_anger): {X_train_anger.shape}, {y_train_anger.shape}")
print(f"Testing set size (X_test_anger, y_test_anger): {X_test_anger.shape}, {y_test_anger.shape}\n")

Shape of X (features) after implicit alignment: (3894, 512)
Shape of y (labels) after implicit alignment: (3894, 1)

Training set size (X_train_anger, y_train_anger): (3115, 512), (3115, 1)
Testing set size (X_test_anger, y_test_anger): (779, 512), (779, 1)



In [None]:
from sklearn.model_selection import train_test_split

# Build features X (CLAP audio embeddings) and targets y ('tenderness' ratings),
# then split them into training and testing sets.

# One DataFrame was appended per loaded CSV file, so the list length is used as
# the participant count.
num_participants_tenderness = len(IDisc_tenderness_response_dfs)
if master_human_responses_df_tenderness.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

# Features: the full embedding matrix repeated once per participant, stacked
# vertically. np.tile does this in one vectorised step instead of building a
# Python list of per-row tensors; .cpu() keeps the NumPy conversion valid even
# if the embeddings live on a GPU.
# NOTE(review): this assumes every participant's rows follow the same order as
# the sorted stimulus files; nothing here verifies it. Confirm against the CSVs.
X_tenderness = np.tile(audio_embeddings.detach().cpu().numpy(), (num_participants_tenderness, 1))

# Extract y from the concatenated DataFrame (kept as a column vector of shape (n, 1))
y_tenderness = master_human_responses_df_tenderness[['tenderness']].values

print(f"Shape of X (features) after implicit alignment: {X_tenderness.shape}")
print(f"Shape of y (labels) after implicit alignment: {y_tenderness.shape}\n")

# Sanity check: X and y must have the same number of rows
if X_tenderness.shape[0] != y_tenderness.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
# NOTE(review): rows are split at random, and X contains the same stimulus
# embeddings repeated for every participant, so identical feature vectors end up
# in both train and test. Consider a group-wise split by stimulus if the goal is
# to measure generalisation to unseen audio.

TEST_PERCENTAGE_tenderness = 0.2  # Adjust as needed
TRAIN_PERCENTAGE_tenderness = 1 - TEST_PERCENTAGE_tenderness  # Adjust as needed

X_train_tenderness, X_test_tenderness, y_train_tenderness, y_test_tenderness = train_test_split(
    X_tenderness, y_tenderness, test_size=TEST_PERCENTAGE_tenderness, random_state=42
)

print(f"Training set size (X_train_tenderness, y_train_tenderness): {X_train_tenderness.shape}, {y_train_tenderness.shape}")
print(f"Testing set size (X_test_tenderness, y_test_tenderness): {X_test_tenderness.shape}, {y_test_tenderness.shape}\n")

Shape of X (features) after implicit alignment: (3894, 512)
Shape of y (labels) after implicit alignment: (3894, 1)

Training set size (X_train_tenderness, y_train_tenderness): (3115, 512), (3115, 1)
Testing set size (X_test_tenderness, y_test_tenderness): (779, 512), (779, 1)



In [None]:
from sklearn.model_selection import train_test_split

# Build features X (CLAP audio embeddings) and targets y ('fear' ratings),
# then split them into training and testing sets.

# One DataFrame was appended per loaded CSV file, so the list length is used as
# the participant count.
num_participants_fear = len(IDisc_fear_response_dfs)
if master_human_responses_df_fear.shape[0] % len(audio_stimuli) != 0:
    print("Warning: Total responses is not a perfect multiple of unique audio files. This might indicate inconsistent data or that not all participants rated all items, which could break implicit ordering.")

# Features: the full embedding matrix repeated once per participant, stacked
# vertically. np.tile does this in one vectorised step instead of building a
# Python list of per-row tensors; .cpu() keeps the NumPy conversion valid even
# if the embeddings live on a GPU.
# NOTE(review): this assumes every participant's rows follow the same order as
# the sorted stimulus files; nothing here verifies it. Confirm against the CSVs.
X_fear = np.tile(audio_embeddings.detach().cpu().numpy(), (num_participants_fear, 1))

# Extract y from the concatenated DataFrame (kept as a column vector of shape (n, 1))
y_fear = master_human_responses_df_fear[['fear']].values

print(f"Shape of X (features) after implicit alignment: {X_fear.shape}")
print(f"Shape of y (labels) after implicit alignment: {y_fear.shape}\n")

# Sanity check: X and y must have the same number of rows
if X_fear.shape[0] != y_fear.shape[0]:
    raise ValueError("Number of rows in X and y do not match after implicit alignment. This indicates an issue with the implicit ordering assumption or data loading.")

# --- Split Data into Training and Testing Sets ---
# NOTE(review): rows are split at random, and X contains the same stimulus
# embeddings repeated for every participant, so identical feature vectors end up
# in both train and test. Consider a group-wise split by stimulus if the goal is
# to measure generalisation to unseen audio.

TEST_PERCENTAGE_fear = 0.2  # Adjust as needed
TRAIN_PERCENTAGE_fear = 1 - TEST_PERCENTAGE_fear  # Adjust as needed

X_train_fear, X_test_fear, y_train_fear, y_test_fear = train_test_split(
    X_fear, y_fear, test_size=TEST_PERCENTAGE_fear, random_state=42
)

print(f"Training set size (X_train_fear, y_train_fear): {X_train_fear.shape}, {y_train_fear.shape}")
print(f"Testing set size (X_test_fear, y_test_fear): {X_test_fear.shape}, {y_test_fear.shape}\n")

Shape of X (features) after implicit alignment: (3894, 512)
Shape of y (labels) after implicit alignment: (3894, 1)

Training set size (X_train_fear, y_train_fear): (3115, 512), (3115, 1)
Testing set size (X_test_fear, y_test_fear): (779, 512), (779, 1)



# Train regression head (=MLP, a few projection layers)

In [None]:
from sklearn.neural_network import MLPRegressor

# Regression head for 'happiness': an MLP trained on the audio-embedding features.
mlp_regressor_happiness = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42, #Determines random number generation for weights and bias
    verbose=True,  # prints the per-iteration loss and validation score
    early_stopping=True,  # stop when the validation score stops improving
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor_happiness training...")
# y_train_happiness is a column vector of shape (n, 1); flatten it to 1-D so
# scikit-learn does not have to convert it and emit a DataConversionWarning.
mlp_regressor_happiness.fit(X_train_happiness, y_train_happiness.ravel())
print("\nMLP Regressor_happiness training complete.")


Starting MLP Regressor_happiness training...
Iteration 1, loss = 4.96475863
Validation score: -0.560828
Iteration 2, loss = 2.98814187
Validation score: 0.023531
Iteration 3, loss = 2.30745130
Validation score: 0.128682


  y = column_or_1d(y, warn=True)


Iteration 4, loss = 2.11623284
Validation score: 0.146756
Iteration 5, loss = 2.10428196
Validation score: 0.149369
Iteration 6, loss = 2.08819195
Validation score: 0.155553
Iteration 7, loss = 2.07441400
Validation score: 0.159461
Iteration 8, loss = 2.07432687
Validation score: 0.160989
Iteration 9, loss = 2.06966347
Validation score: 0.152798
Iteration 10, loss = 2.06954639
Validation score: 0.159403
Iteration 11, loss = 2.09502953
Validation score: 0.151574
Iteration 12, loss = 2.07078894
Validation score: 0.163001
Iteration 13, loss = 2.06127625
Validation score: 0.158432
Iteration 14, loss = 2.07085350
Validation score: 0.161810
Iteration 15, loss = 2.08787021
Validation score: 0.158425
Iteration 16, loss = 2.05598277
Validation score: 0.159466
Iteration 17, loss = 2.05325702
Validation score: 0.162968
Iteration 18, loss = 2.05722322
Validation score: 0.163659
Iteration 19, loss = 2.05468982
Validation score: 0.158276
Iteration 20, loss = 2.05973623
Validation score: 0.155890
Ite

In [None]:
from sklearn.neural_network import MLPRegressor

# Regression head for 'sadness': an MLP trained on the audio-embedding features.
mlp_regressor_sadness = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42, #Determines random number generation for weights and bias
    verbose=True,  # prints the per-iteration loss and validation score
    early_stopping=True,  # stop when the validation score stops improving
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor_sadness training...")
# y_train_sadness is a column vector of shape (n, 1); flatten it to 1-D so
# scikit-learn does not have to convert it and emit a DataConversionWarning.
mlp_regressor_sadness.fit(X_train_sadness, y_train_sadness.ravel())
print("\nMLP Regressor_sadness training complete.")


Starting MLP Regressor_sadness training...
Iteration 1, loss = 4.70819772
Validation score: -0.709071
Iteration 2, loss = 2.61160118
Validation score: 0.043071


  y = column_or_1d(y, warn=True)


Iteration 3, loss = 2.08149588
Validation score: 0.053697
Iteration 4, loss = 2.03260920
Validation score: 0.047619
Iteration 5, loss = 2.02509858
Validation score: 0.052831
Iteration 6, loss = 2.05738549
Validation score: 0.068197
Iteration 7, loss = 2.00934880
Validation score: 0.052604
Iteration 8, loss = 2.01836081
Validation score: 0.060892
Iteration 9, loss = 2.00862381
Validation score: 0.067183
Iteration 10, loss = 2.00963527
Validation score: 0.060442
Iteration 11, loss = 2.01606797
Validation score: 0.068911
Iteration 12, loss = 2.00191478
Validation score: 0.067666
Iteration 13, loss = 2.02601772
Validation score: 0.061119
Iteration 14, loss = 2.01071464
Validation score: 0.076444
Iteration 15, loss = 2.01889667
Validation score: 0.045848
Iteration 16, loss = 2.03223015
Validation score: 0.049353
Iteration 17, loss = 2.01071079
Validation score: 0.060165
Iteration 18, loss = 2.02000198
Validation score: 0.057504
Iteration 19, loss = 2.00806136
Validation score: 0.082178
Iter

In [None]:
from sklearn.neural_network import MLPRegressor

# Regression head for 'anger': an MLP trained on the audio-embedding features.
mlp_regressor_anger = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42, #Determines random number generation for weights and bias
    verbose=True,  # prints the per-iteration loss and validation score
    early_stopping=True,  # stop when the validation score stops improving
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor_anger training...")
# y_train_anger is a column vector of shape (n, 1); flatten it to 1-D so
# scikit-learn does not have to convert it and emit a DataConversionWarning.
mlp_regressor_anger.fit(X_train_anger, y_train_anger.ravel())
print("\nMLP Regressor_anger training complete.")


Starting MLP Regressor_anger training...
Iteration 1, loss = 5.35418880
Validation score: -0.640924
Iteration 2, loss = 3.00283726
Validation score: 0.109127
Iteration 3, loss = 2.21415842
Validation score: 0.142668
Iteration 4, loss = 2.12833414
Validation score: 0.151284


  y = column_or_1d(y, warn=True)


Iteration 5, loss = 2.08227591
Validation score: 0.168720
Iteration 6, loss = 2.06687503
Validation score: 0.190452
Iteration 7, loss = 2.07790111
Validation score: 0.172476
Iteration 8, loss = 2.04272372
Validation score: 0.204907
Iteration 9, loss = 2.00898882
Validation score: 0.208395
Iteration 10, loss = 2.00261866
Validation score: 0.215387
Iteration 11, loss = 2.00847483
Validation score: 0.216501
Iteration 12, loss = 1.99559013
Validation score: 0.213956
Iteration 13, loss = 1.98562836
Validation score: 0.220235
Iteration 14, loss = 1.98718725
Validation score: 0.230584
Iteration 15, loss = 1.97498957
Validation score: 0.235870
Iteration 16, loss = 1.97848598
Validation score: 0.243175
Iteration 17, loss = 1.96949281
Validation score: 0.243005
Iteration 18, loss = 1.96707426
Validation score: 0.231886
Iteration 19, loss = 1.99671757
Validation score: 0.232464
Iteration 20, loss = 1.99773440
Validation score: 0.253325
Iteration 21, loss = 1.98250605
Validation score: 0.232742
It

In [None]:
from sklearn.neural_network import MLPRegressor

# MLP regressor for the tenderness target: two hidden layers (100 then 50 units),
# ReLU activations, Adam optimiser.
mlp_regressor_tenderness = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    # Fixed seed: per the scikit-learn docs this governs weight/bias initialisation,
    # the internal validation split used by early stopping, and batch sampling.
    random_state=42,
    verbose=True,  # Prints the loss and validation score after every iteration
    # Training stops once the validation score has failed to improve by at least
    # `tol` for `n_iter_no_change` consecutive iterations.
    early_stopping=True,
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor_tenderness training...")
# fit() expects a 1-D target for single-output regression. Passing a column vector
# makes scikit-learn emit a DataConversionWarning and flatten it internally, so
# flatten it here up front; the fitted model is the same.
mlp_regressor_tenderness.fit(X_train_tenderness, np.ravel(y_train_tenderness))
print("\nMLP Regressor_tenderness training complete.")


Starting MLP Regressor_tenderness training...
Iteration 1, loss = 5.49708322
Validation score: -0.689530
Iteration 2, loss = 3.28206318
Validation score: -0.035302
Iteration 3, loss = 2.60425443
Validation score: 0.025448
Iteration 4, loss = 2.44147610
Validation score: 0.051710


  y = column_or_1d(y, warn=True)


Iteration 5, loss = 2.42448258
Validation score: 0.058381
Iteration 6, loss = 2.41000410
Validation score: 0.059735
Iteration 7, loss = 2.40670244
Validation score: 0.064863
Iteration 8, loss = 2.40346335
Validation score: 0.063286
Iteration 9, loss = 2.39670038
Validation score: 0.055197
Iteration 10, loss = 2.38775857
Validation score: 0.056136
Iteration 11, loss = 2.42853678
Validation score: 0.047857
Iteration 12, loss = 2.41617824
Validation score: 0.055637
Iteration 13, loss = 2.38723347
Validation score: 0.048630
Iteration 14, loss = 2.40245572
Validation score: 0.053077
Iteration 15, loss = 2.42071278
Validation score: 0.055819
Iteration 16, loss = 2.38855857
Validation score: 0.061145
Iteration 17, loss = 2.40251586
Validation score: 0.059707
Iteration 18, loss = 2.38408068
Validation score: 0.064682
Iteration 19, loss = 2.37432230
Validation score: 0.061634
Iteration 20, loss = 2.38566234
Validation score: 0.051409
Iteration 21, loss = 2.39728303
Validation score: 0.055065
It

In [None]:
from sklearn.neural_network import MLPRegressor

# MLP regressor for the fear target: two hidden layers (100 then 50 units),
# ReLU activations, Adam optimiser.
mlp_regressor_fear = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    # Fixed seed: per the scikit-learn docs this governs weight/bias initialisation,
    # the internal validation split used by early stopping, and batch sampling.
    random_state=42,
    verbose=True,  # Prints the loss and validation score after every iteration
    # Training stops once the validation score has failed to improve by at least
    # `tol` for `n_iter_no_change` consecutive iterations.
    early_stopping=True,
    n_iter_no_change=50,
    tol=1e-4
)

print("Starting MLP Regressor_fear training...")
# fit() expects a 1-D target for single-output regression. Passing a column vector
# makes scikit-learn emit a DataConversionWarning and flatten it internally, so
# flatten it here up front; the fitted model is the same.
mlp_regressor_fear.fit(X_train_fear, np.ravel(y_train_fear))
print("\nMLP Regressor_fear training complete.")


Starting MLP Regressor_fear training...
Iteration 1, loss = 4.41717323
Validation score: -0.564094
Iteration 2, loss = 2.54062569
Validation score: 0.020269
Iteration 3, loss = 2.15323577
Validation score: 0.058695
Iteration 4, loss = 2.04798215
Validation score: 0.076390


  y = column_or_1d(y, warn=True)


Iteration 5, loss = 2.01049282
Validation score: 0.087909
Iteration 6, loss = 2.03746319
Validation score: 0.104235
Iteration 7, loss = 1.97012107
Validation score: 0.105624
Iteration 8, loss = 1.96194091
Validation score: 0.126221
Iteration 9, loss = 1.93283182
Validation score: 0.133692
Iteration 10, loss = 1.91678946
Validation score: 0.131168
Iteration 11, loss = 1.91395279
Validation score: 0.139759
Iteration 12, loss = 1.89961539
Validation score: 0.139211
Iteration 13, loss = 1.92480832
Validation score: 0.127112
Iteration 14, loss = 1.89896060
Validation score: 0.137131
Iteration 15, loss = 1.89350704
Validation score: 0.127966
Iteration 16, loss = 1.88972850
Validation score: 0.131655
Iteration 17, loss = 1.87368661
Validation score: 0.138688
Iteration 18, loss = 1.89050755
Validation score: 0.144727
Iteration 19, loss = 1.88178353
Validation score: 0.143850
Iteration 20, loss = 1.91760070
Validation score: 0.142482
Iteration 21, loss = 1.89443847
Validation score: 0.136255
It

# Evaluate

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Evaluate the happiness regressor on its held-out test split.
print("Results for IDisc – happiness")
print("______________________________________________")

# Predict, then force predictions and targets to 1-D so that the metric
# functions and pearsonr receive arrays of matching shape.
y_pred_happiness = np.ravel(mlp_regressor_happiness.predict(X_test_happiness))
y_test_happiness = np.ravel(y_test_happiness)

print(f"\nShape of predictions (y_pred): {y_pred_happiness.shape}")
print(f"First 5 actual values (y_test):\n{y_test_happiness[:5].reshape(-1, 1)}")
print(f"First 5 predicted values (y_pred):\n{y_pred_happiness[:5].reshape(-1, 1)}")

# round() rather than int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999...) would be reported as 28.
print(f"Training/ Test split: {round(TRAIN_PERCENTAGE_happiness * 100)}/{round(TEST_PERCENTAGE_happiness * 100)}\n")

# Evaluation Metrics
mae_happiness = mean_absolute_error(y_test_happiness, y_pred_happiness)
rmse_happiness = np.sqrt(mean_squared_error(y_test_happiness, y_pred_happiness))
r2_happiness = r2_score(y_test_happiness, y_pred_happiness)

print(f"Mean Absolute Error (MAE): {mae_happiness:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_happiness:.4f}")
print(f"R-squared (R²): {r2_happiness:.4f}")

# Pearson Correlation (undefined when either series is practically constant)
if np.std(y_test_happiness) > 1e-6 and np.std(y_pred_happiness) > 1e-6:
    corr_happiness, _ = pearsonr(y_test_happiness, y_pred_happiness)
    print(f"Pearson Correlation (happiness): {corr_happiness:.4f}")
else:
    # Still define the variable so the summary-table cell does not fail with a
    # NameError; NaN is skipped by pandas when averaging.
    corr_happiness = np.nan
    print("Pearson Correlation: Cannot calculate (insufficient variance)")

# Mean Absolute Percentage Error
# Denominator is clipped from below to avoid division by zero.
y_test_happiness_safe = np.clip(y_test_happiness, a_min=1e-6, a_max=None)
mape_happiness = np.mean(np.abs((y_test_happiness - y_pred_happiness) / y_test_happiness_safe)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape_happiness:.2f}%")

Results for IDisc – happiness
______________________________________________

Shape of predictions (y_pred): (779,)
First 5 actual values (y_test):
[[1.  ]
 [2.98]
 [3.12]
 [1.  ]
 [1.  ]]
First 5 predicted values (y_pred):
[[2.1123066]
 [1.8209325]
 [2.9233766]
 [1.9203777]
 [2.3641515]]
Training/ Test split: 80/20

Mean Absolute Error (MAE): 1.5518
Root Mean Squared Error (RMSE): 1.9596
R-squared (R²): 0.1765
Pearson Correlation (happiness): 0.4203
Mean Absolute Percentage Error (MAPE): 81.73%


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Evaluate the sadness regressor on its held-out test split.
print("Results for IDisc – sadness")
print("______________________________________________")

# Predict, then force predictions and targets to 1-D so that the metric
# functions and pearsonr receive arrays of matching shape.
y_pred_sadness = np.ravel(mlp_regressor_sadness.predict(X_test_sadness))
y_test_sadness = np.ravel(y_test_sadness)

print(f"\nShape of predictions (y_pred): {y_pred_sadness.shape}")
print(f"First 5 actual values (y_test):\n{y_test_sadness[:5].reshape(-1, 1)}")
print(f"First 5 predicted values (y_pred):\n{y_pred_sadness[:5].reshape(-1, 1)}")

# round() rather than int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999...) would be reported as 28.
print(f"Training/ Test split: {round(TRAIN_PERCENTAGE_sadness * 100)}/{round(TEST_PERCENTAGE_sadness * 100)}\n")

# Evaluation Metrics
mae_sadness = mean_absolute_error(y_test_sadness, y_pred_sadness)
rmse_sadness = np.sqrt(mean_squared_error(y_test_sadness, y_pred_sadness))
r2_sadness = r2_score(y_test_sadness, y_pred_sadness)

print(f"Mean Absolute Error (MAE): {mae_sadness:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_sadness:.4f}")
print(f"R-squared (R²): {r2_sadness:.4f}")

# Pearson Correlation (undefined when either series is practically constant)
if np.std(y_test_sadness) > 1e-6 and np.std(y_pred_sadness) > 1e-6:
    corr_sadness, _ = pearsonr(y_test_sadness, y_pred_sadness)
    print(f"Pearson Correlation (sadness): {corr_sadness:.4f}")
else:
    # Still define the variable so the summary-table cell does not fail with a
    # NameError; NaN is skipped by pandas when averaging.
    corr_sadness = np.nan
    print("Pearson Correlation: Cannot calculate (insufficient variance)")

# Mean Absolute Percentage Error
# Denominator is clipped from below to avoid division by zero.
y_test_sadness_safe = np.clip(y_test_sadness, a_min=1e-6, a_max=None)
mape_sadness = np.mean(np.abs((y_test_sadness - y_pred_sadness) / y_test_sadness_safe)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape_sadness:.2f}%")

Results for IDisc – sadness
______________________________________________

Shape of predictions (y_pred): (779,)
First 5 actual values (y_test):
[[1.  ]
 [4.95]
 [3.21]
 [1.62]
 [1.  ]]
First 5 predicted values (y_pred):
[[3.308406 ]
 [2.6988063]
 [2.2683558]
 [3.0509307]
 [3.2275805]]
Training/ Test split: 80/20

Mean Absolute Error (MAE): 1.6392
Root Mean Squared Error (RMSE): 1.9989
R-squared (R²): 0.0379
Pearson Correlation (sadness): 0.2243
Mean Absolute Percentage Error (MAPE): 93.43%


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Evaluate the anger regressor on its held-out test split.
print("Results for IDisc – anger")
print("______________________________________________")

# Predict, then force predictions and targets to 1-D so that the metric
# functions and pearsonr receive arrays of matching shape.
y_pred_anger = np.ravel(mlp_regressor_anger.predict(X_test_anger))
y_test_anger = np.ravel(y_test_anger)

print(f"\nShape of predictions (y_pred): {y_pred_anger.shape}")
print(f"First 5 actual values (y_test):\n{y_test_anger[:5].reshape(-1, 1)}")
print(f"First 5 predicted values (y_pred):\n{y_pred_anger[:5].reshape(-1, 1)}")

# round() rather than int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999...) would be reported as 28.
print(f"Training/ Test split: {round(TRAIN_PERCENTAGE_anger * 100)}/{round(TEST_PERCENTAGE_anger * 100)}\n")

# Evaluation Metrics
mae_anger = mean_absolute_error(y_test_anger, y_pred_anger)
rmse_anger = np.sqrt(mean_squared_error(y_test_anger, y_pred_anger))
r2_anger = r2_score(y_test_anger, y_pred_anger)

print(f"Mean Absolute Error (MAE): {mae_anger:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_anger:.4f}")
print(f"R-squared (R²): {r2_anger:.4f}")

# Pearson Correlation (undefined when either series is practically constant)
if np.std(y_test_anger) > 1e-6 and np.std(y_pred_anger) > 1e-6:
    corr_anger, _ = pearsonr(y_test_anger, y_pred_anger)
    print(f"Pearson Correlation (anger): {corr_anger:.4f}")
else:
    # Still define the variable so the summary-table cell does not fail with a
    # NameError; NaN is skipped by pandas when averaging.
    corr_anger = np.nan
    print("Pearson Correlation: Cannot calculate (insufficient variance)")

# Mean Absolute Percentage Error
# Denominator is clipped from below to avoid division by zero.
y_test_anger_safe = np.clip(y_test_anger, a_min=1e-6, a_max=None)
mape_anger = np.mean(np.abs((y_test_anger - y_pred_anger) / y_test_anger_safe)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape_anger:.2f}%")

Results for IDisc – anger
______________________________________________

Shape of predictions (y_pred): (779,)
First 5 actual values (y_test):
[[3.89]
 [1.  ]
 [1.54]
 [2.65]
 [1.  ]]
First 5 predicted values (y_pred):
[[3.2230453]
 [3.9910529]
 [1.869818 ]
 [5.1437526]
 [2.464972 ]]
Training/ Test split: 80/20

Mean Absolute Error (MAE): 1.5104
Root Mean Squared Error (RMSE): 1.9455
R-squared (R²): 0.2263
Pearson Correlation (anger): 0.4770
Mean Absolute Percentage Error (MAPE): 78.60%


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Evaluate the tenderness regressor on its held-out test split.
print("Results for IDisc – tenderness")
print("______________________________________________")

# Predict, then force predictions and targets to 1-D so that the metric
# functions and pearsonr receive arrays of matching shape.
y_pred_tenderness = np.ravel(mlp_regressor_tenderness.predict(X_test_tenderness))
y_test_tenderness = np.ravel(y_test_tenderness)

print(f"\nShape of predictions (y_pred): {y_pred_tenderness.shape}")
print(f"First 5 actual values (y_test):\n{y_test_tenderness[:5].reshape(-1, 1)}")
print(f"First 5 predicted values (y_pred):\n{y_pred_tenderness[:5].reshape(-1, 1)}")

# round() rather than int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999...) would be reported as 28.
print(f"Training/ Test split: {round(TRAIN_PERCENTAGE_tenderness * 100)}/{round(TEST_PERCENTAGE_tenderness * 100)}\n")

# Evaluation Metrics
mae_tenderness = mean_absolute_error(y_test_tenderness, y_pred_tenderness)
rmse_tenderness = np.sqrt(mean_squared_error(y_test_tenderness, y_pred_tenderness))
r2_tenderness = r2_score(y_test_tenderness, y_pred_tenderness)

print(f"Mean Absolute Error (MAE): {mae_tenderness:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_tenderness:.4f}")
print(f"R-squared (R²): {r2_tenderness:.4f}")

# Pearson Correlation (undefined when either series is practically constant)
if np.std(y_test_tenderness) > 1e-6 and np.std(y_pred_tenderness) > 1e-6:
    corr_tenderness, _ = pearsonr(y_test_tenderness, y_pred_tenderness)
    print(f"Pearson Correlation (tenderness): {corr_tenderness:.4f}")
else:
    # Still define the variable so the summary-table cell does not fail with a
    # NameError; NaN is skipped by pandas when averaging.
    corr_tenderness = np.nan
    print("Pearson Correlation: Cannot calculate (insufficient variance)")

# Mean Absolute Percentage Error
# Denominator is clipped from below to avoid division by zero.
y_test_tenderness_safe = np.clip(y_test_tenderness, a_min=1e-6, a_max=None)
mape_tenderness = np.mean(np.abs((y_test_tenderness - y_pred_tenderness) / y_test_tenderness_safe)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape_tenderness:.2f}%")

Results for IDisc – tenderness
______________________________________________

Shape of predictions (y_pred): (779,)
First 5 actual values (y_test):
[[1.  ]
 [7.02]
 [1.83]
 [1.  ]
 [1.  ]]
First 5 predicted values (y_pred):
[[2.491554 ]
 [2.2047465]
 [2.8882365]
 [2.213874 ]
 [2.7581584]]
Training/ Test split: 80/20

Mean Absolute Error (MAE): 1.6364
Root Mean Squared Error (RMSE): 2.0172
R-squared (R²): 0.0954
Pearson Correlation (tenderness): 0.3097
Mean Absolute Percentage Error (MAPE): 86.44%


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Evaluate the fear regressor on its held-out test split.
print("Results for IDisc – fear")
print("______________________________________________")

# Predict, then force predictions and targets to 1-D so that the metric
# functions and pearsonr receive arrays of matching shape.
y_pred_fear = np.ravel(mlp_regressor_fear.predict(X_test_fear))
y_test_fear = np.ravel(y_test_fear)

print(f"\nShape of predictions (y_pred): {y_pred_fear.shape}")
print(f"First 5 actual values (y_test):\n{y_test_fear[:5].reshape(-1, 1)}")
print(f"First 5 predicted values (y_pred):\n{y_pred_fear[:5].reshape(-1, 1)}")

# round() rather than int(): int() truncates, so a fraction such as 0.29
# (0.29 * 100 == 28.999...) would be reported as 28.
print(f"Training/ Test split: {round(TRAIN_PERCENTAGE_fear * 100)}/{round(TEST_PERCENTAGE_fear * 100)}\n")

# Evaluation Metrics
mae_fear = mean_absolute_error(y_test_fear, y_pred_fear)
rmse_fear = np.sqrt(mean_squared_error(y_test_fear, y_pred_fear))
r2_fear = r2_score(y_test_fear, y_pred_fear)

print(f"Mean Absolute Error (MAE): {mae_fear:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_fear:.4f}")
print(f"R-squared (R²): {r2_fear:.4f}")

# Pearson Correlation (undefined when either series is practically constant)
if np.std(y_test_fear) > 1e-6 and np.std(y_pred_fear) > 1e-6:
    corr_fear, _ = pearsonr(y_test_fear, y_pred_fear)
    print(f"Pearson Correlation (Fear): {corr_fear:.4f}")
else:
    # Still define the variable so the summary-table cell does not fail with a
    # NameError; NaN is skipped by pandas when averaging.
    corr_fear = np.nan
    print("Pearson Correlation: Cannot calculate (insufficient variance)")

# Mean Absolute Percentage Error
# Denominator is clipped from below to avoid division by zero.
y_test_fear_safe = np.clip(y_test_fear, a_min=1e-6, a_max=None)
mape_fear = np.mean(np.abs((y_test_fear - y_pred_fear) / y_test_fear_safe)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape_fear:.2f}%")

Results for IDisc – fear
______________________________________________

Shape of predictions (y_pred): (779,)
First 5 actual values (y_test):
[[1.  ]
 [3.98]
 [3.59]
 [3.44]
 [1.  ]]
First 5 predicted values (y_pred):
[[2.6756346]
 [3.0922363]
 [2.6753726]
 [3.6241891]
 [2.2412617]]
Training/ Test split: 80/20

Mean Absolute Error (MAE): 1.5410
Root Mean Squared Error (RMSE): 1.9825
R-squared (R²): 0.1056
Pearson Correlation (Fear): 0.3315
Mean Absolute Percentage Error (MAPE): 84.18%


In [None]:
# Summary table: one row per IDisc emotion model plus a final "Average" row
print("Results for Discrete Induced (IDisc)")
print("______________________________________________")

# One list per metric; positions line up with the "Dimension" column
data = {
    "Dimension": ["happiness", "sadness", "anger","tenderness", "fear"],
    "MAE": [mae_happiness, mae_sadness, mae_anger, mae_tenderness,mae_fear],
    "RMSE": [rmse_happiness, rmse_sadness, rmse_anger, rmse_tenderness, rmse_fear],
    "R²": [r2_happiness, r2_sadness, r2_anger, r2_tenderness, r2_fear],
    "Pearson Corr.": [corr_happiness, corr_sadness, corr_anger, corr_tenderness, corr_fear],
    "MAPE": [mape_happiness, mape_sadness, mape_anger, mape_tenderness, mape_fear]
}
df_metrics = pd.DataFrame(data)

# Build the average row by taking the column mean of every metric column
avg_row = {"Dimension": "Average"}
for metric_name in data:
    if metric_name == "Dimension":
        continue
    avg_row[metric_name] = df_metrics[metric_name].mean()

# Append the average row underneath the per-emotion rows
df_metrics = pd.concat(
    [df_metrics, pd.DataFrame([avg_row])],
    ignore_index=True,
)

# Show the table rounded to four decimals
print(df_metrics.round(4))


Results for Discrete Induced (IDisc)
______________________________________________
    Dimension     MAE    RMSE      R²  Pearson Corr.     MAPE
0   happiness  1.5518  1.9596  0.1765         0.4203  81.7296
1     sadness  1.6392  1.9989  0.0379         0.2243  93.4318
2       anger  1.5104  1.9455  0.2263         0.4770  78.5966
3  tenderness  1.6364  2.0172  0.0954         0.3097  86.4369
4        fear  1.5410  1.9825  0.1056         0.3315  84.1807
5     Average  1.5758  1.9807  0.1283         0.3526  84.8751
