In [42]:
!pip install pandas



In [92]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler


In [93]:
np.random.seed(7) 
import tensorflow as tf
tf.random.set_seed(7)
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import losses
from sklearn.model_selection import KFold
from scipy import stats


In [94]:
num_folds = 10

In [95]:
label_file = "data/labels/track_memorability_scores_beta.csv"
df = pd.read_csv(label_file)
labels = df.score.values

### chroma

In [18]:
chroma_dir = "baseline_representation/chroma"
chroma_data = []

# append chroma according to list to label's order
for track in df.track.values:
    chroma_path = os.path.join(chroma_dir, track.replace(".wav", ".npy"))
    chroma = np.load(chroma_path)
    chroma_data.append(chroma)
# convert data to np array
chroma_data = np.array(chroma_data)

In [19]:
chroma_data.shape

(235, 72)

In [20]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(chroma_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(64, input_dim=chroma_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.0001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    train_features = chroma_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = chroma_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:11,  1.13s/it]

correlations mean: 0.04964517963527892, std: 0.24378401210638145
loss mean: 0.03579263109713793, std: 0.009373647509314638





In [8]:
# correlations mean: 0.04964517963527892, std: 0.24378401210638145
# loss mean: 0.03579263109713793, std: 0.009373648300209108

### mfcc

In [9]:
mfcc_dir = "baseline_representation/mfcc"
mfcc_data = []

# append mfcc according to list to label's order
for track in df.track.values:
    mfcc_path = os.path.join(mfcc_dir, track.replace(".wav", ".npy"))
    mfcc = np.load(mfcc_path)
    mfcc_data.append(mfcc)
# convert data to np array
mfcc_data = np.array(mfcc_data)

In [10]:
mfcc_data.shape

(235, 120)

In [11]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(mfcc_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(64, input_dim=mfcc_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.0001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    # Fit data to model
    train_features = mfcc_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = mfcc_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:12,  1.32s/it]

correlations mean: 0.2025604399531912, std: 0.22066895748641988
loss mean: 0.03040823470801115, std: 0.00535532887904776





In [12]:
# correlations mean: 0.2025604399531912, std: 0.22066895748641988
# loss mean: 0.03040823470801115, std: 0.00535532887904776

### choi

In [13]:
choi_dir = "baseline_representation/choi"
choi_data = []

# append mfcc according to list to label's order
for track in df.track.values:
    choi_path = os.path.join(choi_dir, track.replace(".wav", ".npy"))
    choi = np.load(choi_path)
    choi_data.append(choi)
# convert data to np array
choi_data = np.array(choi_data)

In [14]:
choi_data.shape

(235, 160)

In [21]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(choi_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(100, input_dim=choi_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    # Fit data to model
    train_features = choi_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = choi_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:12,  1.26s/it]

correlations mean: 0.10312894005227528, std: 0.2271080382532025
loss mean: 0.034260318800807, std: 0.011052766953853571





In [16]:
# correlations mean: 0.10312894005227528, std: 0.2271080382532025
# loss mean: 0.034260318800807, std: 0.011052766953853571

### clmr

In [22]:
clmr_dir = "baseline_representation/clmr"
clmr_data = []

# append mfcc according to list to label's order
for track in df.track.values:
    clmr_path = os.path.join(clmr_dir, track.replace(".wav", ".npy"))
    clmr = np.load(clmr_path)
    clmr_data.append(clmr)
# convert data to np array
clmr_data = np.array(clmr_data)

In [23]:
clmr_data.shape

(235, 512)

In [24]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(clmr_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(100, input_dim=clmr_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    # Fit data to model
    train_features = clmr_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = clmr_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:16,  1.64s/it]

correlations mean: 0.052647734917081634, std: 0.19079969278339423
loss mean: 0.0341348934918642, std: 0.00996552185332706





In [20]:
# correlations mean: 0.052647734917081634, std: 0.19079969278339423
# loss mean: 0.0341348934918642, std: 0.00996552185332706

### MusiCNN

In [73]:
musicnn_dir = "baseline_representation/musicnn"
musicnn_data = []

# append mfcc according to list to label's order
for track in df.track.values:
    musicnn_path = os.path.join(musicnn_dir, track.replace(".wav", ".npy"))
    musicnn = np.load(musicnn_path)
    musicnn_data.append(musicnn)
# convert data to np array
musicnn_data = np.array(musicnn_data)

In [74]:
musicnn_data.shape

(235, 4194)

In [75]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(musicnn_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(100, input_dim=musicnn_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    # Fit data to model
    train_features = musicnn_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = musicnn_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:19,  1.92s/it]

correlations mean: 0.23706924302250304, std: 0.14870892357692989
loss mean: 0.02985965069383383, std: 0.008997555588942796





In [None]:
# correlations mean: 0.23706924302250304, std: 0.14870892357692989
# loss mean: 0.02985965069383383, std: 0.008997555588942796

### PANNS

In [96]:
panns_dir = "baseline_representation/panns"
panns_data = []

# append mfcc according to list to label's order
for track in df.track.values:
    panns_path = os.path.join(panns_dir, track.replace(".wav", ".npy"))
    panns = np.load(panns_path)
    panns = panns.flatten()
    panns_data.append(panns)
# convert data to np array
panns_data = np.array(panns_data)

In [97]:
panns_data.shape

(235, 2048)

In [98]:
correlations, losses = [], []
# K-fold Cross Validation model evaluation
kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in tqdm(kfold.split(panns_data, labels)):

    # Define the model architecture
    model = Sequential()
    model.add(layers.Dense(100, input_dim=panns_data.shape[1], activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
    
    # Fit data to model
    train_features = panns_data[train]
    train_features = StandardScaler().fit_transform(train_features)
    # Fit data to model
    history = model.fit(train_features,
                        labels[train],
                        validation_split = 0.1,
                        batch_size=16, 
                        epochs=100,
                       verbose=0,
                       callbacks=[callback])

    test_features = panns_data[test]
    test_features = StandardScaler().fit_transform(test_features)
    pred = model.predict(test_features)
    correlations.append(stats.spearmanr(pred, labels[test])[0])
    losses.append(model.evaluate(test_features, labels[test], verbose=0))

print("correlations mean: {}, std: {}".format(np.array(correlations).mean(), np.array(correlations).std()))
print("loss mean: {}, std: {}".format(np.array(losses).mean(), np.array(losses).std()) )

10it [00:15,  1.55s/it]

correlations mean: 0.1606687015090455, std: 0.2054919432925273
loss mean: 0.03130340985953808, std: 0.01036696120662695





In [None]:
# correlations mean: 0.1606687015090455, std: 0.2054919432925273
# loss mean: 0.03130340985953808, std: 0.01036696120662695