### Linear Combination model

##### Inputs: outputs of MFCC and numerical regression models

##### Output: multi-labels: Danceability, Instrumentalness, Speechiness, Acuosticness, Energy 

In [1]:
import torch
import torchaudio

In [2]:
import os
import requests
import torch

import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### ONLY RUN IN GOOGLE COLAB ###

In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
# labels_file = "Data/tracks_features.csv"
labels_file = "/content/drive/MyDrive/tracks_features.csv"
all_tracks = pd.read_csv(labels_file)

In [9]:
def name_of_file(track_id):
  filename = f"/content/drive/MyDrive/tracks_features_audio/{track_id}_audio.mp3"
  return filename

In [10]:
myDrive = "/content/drive/MyDrive/"
tensor_file = os.path.join(os.path.dirname(myDrive), "mfcc_tensor.pt")

# Load the tensor back and print the shape
loaded_mfcc_tensor = torch.load(tensor_file)
print("Loaded MFCC tensor shape:", loaded_mfcc_tensor.shape)

Loaded MFCC tensor shape: torch.Size([21325, 1, 20, 160])


#### Import MFCC model weights locally if desired 

Will not work in colab, this just shows where the weights are on github

In [None]:
cnn_model_path = "models/cnn_model_weights.pth"
rnn_model_path = "models/rnn_model_weights.pth"
multi_task_model_path = "models/multi_task_rnn_model_weights.pth"

### 1. Load CNN MFCC Model

It should be noted that this model only predicts danceability. If the following weights are taken as is, we should only see an improvement in performance for danceability. 

*** Note that this model may not function in the Linear model since it is not a multitask model ***

Dataset loader for 2D CNN model (Only danceability labels)

In [22]:
import torch

class AudioDataset(torch.utils.data.Dataset):
  def __init__(self, mfcc_tensor, df):
    self.df = df
    self.mfcc = mfcc_tensor
    self.mean = self.mfcc.mean()
    self.std = self.mfcc.std()

    # Standardize MFCC tensor
    self.mfcc = (self.mfcc - self.mean) / self.std

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    label = torch.tensor(self.df.iloc[idx]['danceability'], dtype=torch.float32)
    mfcc = self.mfcc[idx]
    return mfcc, label


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Load the MFCC tensor
myDrive = "/content/drive/MyDrive/"
tensor_file = os.path.join(os.path.dirname(myDrive), "mfcc_tensor.pt")
loaded_mfcc_tensor = torch.load(tensor_file).squeeze(1)
print("Loaded MFCC tensor shape:", loaded_mfcc_tensor.shape)

# Split all_tracks and the loaded MFCC tensor
train_tracks, test_val_tracks = train_test_split(all_tracks, test_size=0.3, random_state=42)
test_tracks, val_tracks = train_test_split(test_val_tracks, test_size=0.5, random_state=42)

train_mfcc, test_val_mfcc = train_test_split(loaded_mfcc_tensor, test_size=0.3, random_state=42)
test_mfcc, val_mfcc = train_test_split(test_val_mfcc, test_size=0.5, random_state=42)

# Create the datasets
train_dataset = AudioDataset(train_mfcc, train_tracks)
val_dataset = AudioDataset(val_mfcc, val_tracks)
test_dataset = AudioDataset(test_mfcc, test_tracks)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)


Loaded MFCC tensor shape: torch.Size([21325, 20, 160])


In [13]:
import torch.nn as nn

# 2D CNN Model
class CNNModel(nn.Module):
  def __init__(self):
    super(CNNModel, self).__init__()
    self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
    self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
    self.fc1 = nn.Linear(32 * 5 * 40, 64)
    self.fc2 = nn.Linear(64, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = nn.functional.relu(self.conv1(x))
    x = nn.functional.max_pool2d(x, 2)
    x = nn.functional.relu(self.conv2(x))
    x = nn.functional.max_pool2d(x, 2)
    x = x.view(x.size(0), -1)
    x = nn.functional.relu(self.fc1(x))
    x = self.fc2(x)

    x = self.sigmoid(x)
    
    # Multiply by 1000, round, and then divide by 1000 for precision purposes
    # x = torch.round(x * 1000) / 1000 # prevents loss from doing down

    return x

In [14]:
data_directory = "/content/drive/MyDrive/"

# loading for colab, choose based on where you place your model paths
model_weights_path = os.path.join(data_directory, "cnn_model_weights.pth")

In [15]:
# Load the model weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = CNNModel().to(device)
loaded_model.load_state_dict(torch.load(model_weights_path))
loaded_model.eval()  # Set the model to evaluation mode


CNNModel(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=6400, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [17]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

true_labels = []
predicted_labels = []

loaded_model.eval()
with torch.no_grad():
  for inputs, labels in test_loader:
    inputs = inputs.unsqueeze(1).to(device)
    labels = labels.to(device)

    outputs = loaded_model(inputs)

    true_labels.extend(labels.cpu().numpy())
    predicted_labels.extend(outputs.squeeze().cpu().numpy())

true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

mse = mean_squared_error(true_labels, predicted_labels)
mae = mean_absolute_error(true_labels, predicted_labels)
rmse = np.sqrt(mse)
r2 = r2_score(true_labels, predicted_labels)
pearson_corr, p_value = pearsonr(true_labels, predicted_labels)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R-squared: {r2:.4f}")
print(f"Pearson's Correlation Coefficient: {pearson_corr:.4f}")


Mean Squared Error: 0.0110
Mean Absolute Error: 0.0800
Root Mean Squared Error: 0.1048
R-squared: 0.6481
Pearson's Correlation Coefficient: 0.8114


### 2. Load 1D RNN MFCC model (Also Danceability only)

In [18]:
# 1D RNN Model
import torch.nn as nn

class RNNModel(nn.Module):
  def __init__(self, input_size=20, hidden_size=64, num_layers=1):
    super(RNNModel, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, 1)

  def forward(self, x):
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
    out, _ = self.lstm(x, (h0, c0))
    out = self.fc(out[:, -1, :])
    return out

In [19]:
data_directory = "/content/drive/MyDrive/"
rnn_model_weights_path = os.path.join(data_directory, "rnn_model_weights.pth")

# Create a new instance of the RNN model
loaded_model = RNNModel()

# Load the model weights from the saved state dictionary
loaded_model.load_state_dict(torch.load(model_weights_path))

# Move the model to the device (if you used GPU while training)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

# Set the model to evaluation mode
loaded_model.eval()

RNNModel(
  (lstm): LSTM(20, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

#### Sample of using the model

In [20]:
import torch

# Choose the index of the sample you want to examine
sample_index = 1010

# Load the saved MFCC tensor
myDrive = "/content/drive/MyDrive/"
tensor_file = os.path.join(os.path.dirname(myDrive), "mfcc_tensor.pt")
loaded_mfcc_tensor = torch.load(tensor_file).squeeze(1)

# Extract the input features (MFCCs) and the true label for the sample
sample_mfcc = loaded_mfcc_tensor[sample_index].unsqueeze(0)
true_label = all_tracks.iloc[sample_index]["danceability"]

# Pass the input features through the trained RNN model to get the outputted label
loaded_model.eval()
with torch.no_grad():
  sample_input = sample_mfcc.permute(0, 2, 1).to(device)
  output_label = loaded_model(sample_input).item()

# Print both the true label and the outputted label
print(f"True label: {true_label}")
print(f"Outputted label: {output_label}")


True label: 0.247
Outputted label: 0.023576393723487854


In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from scipy.stats import pearsonr

loaded_model.eval()
all_outputs = []
all_labels = []

with torch.no_grad():
  for inputs, labels in test_loader:
    inputs = inputs.permute(0, 2, 1)
    inputs, labels = inputs.to(device), labels.to(device)

    outputs = loaded_model(inputs)
    all_outputs.extend(outputs.squeeze().tolist())
    all_labels.extend(labels.tolist())

mse = mean_squared_error(all_labels, all_outputs)
rmse = np.sqrt(mse)
mae = mean_absolute_error(all_labels, all_outputs)
r2 = r2_score(all_labels, all_outputs)
pearson_corr, p_value = pearsonr(all_labels, all_outputs)

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R-squared: {r2:.4f}")
print(f"Pearson's Correlation Coefficient: {pearson_corr:.4f}")


MSE: 0.0104
RMSE: 0.1018
MAE: 0.0786
R-squared: 0.6682
Pearson's Correlation Coefficient: 0.8297


### 3. Multitask RNN model

In first attempt to make Linear model, only this model's output should be combined with the Regression model's output

In [23]:
import torch.nn as nn

class MultiTaskRNNModel(nn.Module):
  def __init__(self, input_size=20, hidden_size=64, num_layers=1, num_outputs=5):
    super(MultiTaskRNNModel, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_outputs = num_outputs
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_outputs)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
    out, _ = self.lstm(x, (h0, c0))
    out = self.fc(out[:, -1, :])
    out = self.sigmoid(out)

    # Multiply by 1000, round, and then divide by 1000 for precision purposes
    #out = torch.round(out * 1000) / 1000

    return out


In [24]:
class AudioDataset(torch.utils.data.Dataset):
  def __init__(self, mfcc_tensor, df):
    self.df = df
    self.mfcc = mfcc_tensor
    self.mean = self.mfcc.mean()
    self.std = self.mfcc.std()

    # Standardize MFCC tensor
    self.mfcc = (self.mfcc - self.mean) / self.std

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    labels = self.df.iloc[idx][['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']].astype(float).values
    label = torch.tensor(labels, dtype=torch.float32)
    mfcc = self.mfcc[idx]
    return mfcc, label

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Load the MFCC tensor
myDrive = "/content/drive/MyDrive/"
tensor_file = os.path.join(os.path.dirname(myDrive), "mfcc_tensor.pt")
loaded_mfcc_tensor = torch.load(tensor_file).squeeze(1)
print("Loaded MFCC tensor shape:", loaded_mfcc_tensor.shape)

# Split all_tracks and the loaded MFCC tensor
train_tracks, test_val_tracks = train_test_split(all_tracks, test_size=0.3, random_state=42)
test_tracks, val_tracks = train_test_split(test_val_tracks, test_size=0.5, random_state=42)

train_mfcc, test_val_mfcc = train_test_split(loaded_mfcc_tensor, test_size=0.3, random_state=42)
test_mfcc, val_mfcc = train_test_split(test_val_mfcc, test_size=0.5, random_state=42)

# Create the datasets
train_dataset = AudioDataset(train_mfcc, train_tracks)
val_dataset = AudioDataset(val_mfcc, val_tracks)
test_dataset = AudioDataset(test_mfcc, test_tracks)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)


Loaded MFCC tensor shape: torch.Size([21325, 20, 160])


In [25]:
data_directory = "/content/drive/MyDrive/"

model_weights_path = os.path.join(data_directory, "multi_task_rnn_model_weights.pth")

# Initialize the model with the same architecture
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = MultiTaskRNNModel().to(device)

# Load the model weights
loaded_model.load_state_dict(torch.load(model_weights_path))

# Set the model to evaluation mode
loaded_model.eval()

MultiTaskRNNModel(
  (lstm): LSTM(20, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=5, bias=True)
  (sigmoid): Sigmoid()
)

#### Sample of using the model for predictions

In [26]:
import torch

# Choose the index of the sample you want to examine
sample_index = 1013

# Load the saved MFCC tensor
myDrive = "/content/drive/MyDrive/"
tensor_file = os.path.join(os.path.dirname(myDrive), "mfcc_tensor.pt")
loaded_mfcc_tensor = torch.load(tensor_file).squeeze(1)

# Extract the input features (MFCCs) and the true labels for the sample
sample_mfcc = loaded_mfcc_tensor[sample_index].unsqueeze(0)
true_labels = all_tracks.iloc[sample_index][['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']].values

# Pass the input features through the trained RNN model to get the outputted labels
loaded_model.eval()
with torch.no_grad():
  sample_input = sample_mfcc.permute(0, 2, 1).to(device)
  output_labels = loaded_model(sample_input).cpu().numpy()[0]

# Print both the true labels and the outputted labels
labels = ['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']
for i, label in enumerate(labels):
  print(f"{label} - True label: {true_labels[i]:.3f}, Outputted label: {output_labels[i]:.3f}")


danceability - True label: 0.331, Outputted label: 0.691
instrumentalness - True label: 0.000, Outputted label: 0.005
acousticness - True label: 0.006, Outputted label: 0.072
energy - True label: 0.471, Outputted label: 0.858
speechiness - True label: 0.030, Outputted label: 0.155


#### Model statistics

In [27]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

true_labels = []
predicted_labels = []

loaded_model.eval()
with torch.no_grad():
  for inputs, labels in test_loader:
    inputs = inputs.permute(0, 2, 1).to(device)
    labels = labels.to(device)

    outputs = loaded_model(inputs)

    true_labels.extend(labels.cpu().numpy())
    predicted_labels.extend(outputs.cpu().numpy())

true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

for i, label in enumerate(['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']):
  mse = mean_squared_error(true_labels[:, i], predicted_labels[:, i])
  mae = mean_absolute_error(true_labels[:, i], predicted_labels[:, i])
  rmse = np.sqrt(mse)
  r2 = r2_score(true_labels[:, i], predicted_labels[:, i])
  pearson_corr, p_value = pearsonr(true_labels[:, i], predicted_labels[:, i])

  print(f"Evaluation metrics for {label}:")
  print(f"Mean Squared Error: {mse:.4f}")
  print(f"Mean Absolute Error: {mae:.4f}")
  print(f"Root Mean Squared Error: {rmse:.4f}")
  print(f"R-squared: {r2:.4f}")
  print(f"Pearson's Correlation Coefficient: {pearson_corr:.4f}")


Evaluation metrics for danceability:
Mean Squared Error: 0.0194
Mean Absolute Error: 0.1109
Root Mean Squared Error: 0.1393
R-squared: 0.3783
Pearson's Correlation Coefficient: 0.6151
Evaluation metrics for instrumentalness:
Mean Squared Error: 0.0520
Mean Absolute Error: 0.1408
Root Mean Squared Error: 0.2281
R-squared: 0.5734
Pearson's Correlation Coefficient: 0.7719
Evaluation metrics for acousticness:
Mean Squared Error: 0.0241
Mean Absolute Error: 0.1134
Root Mean Squared Error: 0.1552
R-squared: 0.8093
Pearson's Correlation Coefficient: 0.9014
Evaluation metrics for energy:
Mean Squared Error: 0.0179
Mean Absolute Error: 0.1068
Root Mean Squared Error: 0.1337
R-squared: 0.7543
Pearson's Correlation Coefficient: 0.8759
Evaluation metrics for speechiness:
Mean Squared Error: 0.0074
Mean Absolute Error: 0.0541
Root Mean Squared Error: 0.0862
R-squared: 0.1204
Pearson's Correlation Coefficient: 0.3632


### 4. Load Numerical based regression model

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [29]:
df = all_tracks

In [32]:
X = df[['valence', 'tempo', 'loudness', 'key', 'mode', 'time_signature']]
y = df[['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness']]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
model = Sequential()
model.add(Dense(64, input_dim=6, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='linear'))

model.compile(loss='mean_squared_error', optimizer='adam')