### Linear Combination model

##### Inputs: outputs of MFCC and numerical regression models

##### Output: multi-labels: Danceability, Instrumentalness, Speechiness, Acousticness, Energy

In [1]:
import torch
import torchaudio

In [2]:
import os
import requests
import torch

import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Per-track feature table: later cells read both the numerical model inputs
# and the target label columns from this frame.
# Colab/Drive alternative: "/content/drive/MyDrive/tracks_features.csv"
labels_file = "tracks_features.csv"
all_tracks = pd.read_csv(filepath_or_buffer=labels_file)

In [5]:
# Pre-computed MFCC features for every track.
# The location can be overridden through the MFCC_TENSOR_PATH environment
# variable so the notebook is not tied to one machine; the original local
# path is kept as the default.
# map_location="cpu" lets a tensor that was saved from a GPU load on a
# CPU-only machine; a later cell moves the tensor to the training device.
# NOTE(review): torch.load unpickles the file -- only load trusted files.
mfcc_tensor_path = os.environ.get(
    "MFCC_TENSOR_PATH", "C://users/khala/Downloads/mfcc_tensor (1).pt"
)
loaded_mfcc_tensor = torch.load(mfcc_tensor_path, map_location="cpu")

#### Import MFCC model weights locally if desired 

This cell will not work in Colab; it only shows where the weight files are located on GitHub.

In [6]:
# Relative paths to the saved weight files, one per model.
linear_model_weights_path = "linear_model_weights.pth"
multi_task_model_path = "multi_task_rnn_model_weights.pth"
rnn_model_path = "rnn_model_weights.pth"
cnn_model_path = "cnn_model_weights.pth"

### 3. Multitask RNN model

In the first attempt at building the linear combination model, only this model's output is combined with the regression model's output.

In [74]:
import torch.nn as nn

class MultiTaskRNNModel(nn.Module):
  """LSTM feature extractor derived from the multi-task RNN.

  The fully connected head (`fc`) and the sigmoid are still constructed,
  presumably so that weights saved from the complete model keep loading,
  but `forward` skips both and returns the LSTM output of the last time
  step because the result is fed into the combiner model.
  """

  def __init__(self, input_size=20, hidden_size=64, num_layers=1, num_outputs=5):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_outputs = num_outputs
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    # The two members below are not used by forward().
    self.fc = nn.Linear(hidden_size, num_outputs)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Run the LSTM and return its output at the final time step.

    Args:
      x: batch-first input, indexed as (batch, time, input_size).

    Returns:
      Tensor of shape (batch, hidden_size).
    """
    # Zero initial hidden and cell states, created on the input's device.
    state_shape = (self.num_layers, x.size(0), self.hidden_size)
    initial_hidden = torch.zeros(state_shape, device=x.device)
    initial_cell = torch.zeros(state_shape, device=x.device)

    sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))

    # No fc / sigmoid / rounding here: the raw hidden features are combined
    # with the regression model's features downstream.
    return sequence_out[:, -1, :]


In [75]:
#data_directory = "/content/drive/MyDrive/"

model_weights_path = "multi_task_rnn_model_weights.pth"

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# crashing inside .cuda().
mt_rnn_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-create the architecture, then restore the trained weights.
# map_location remaps tensors that were saved from a GPU so that the file
# also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
loaded_model = MultiTaskRNNModel()
loaded_model.load_state_dict(torch.load(model_weights_path, map_location=mt_rnn_device))

# eval() was left disabled in the original, so the module stays in its
# default (training) mode. NOTE(review): confirm this is intentional.
#loaded_model.eval()

# Pretrained multi-task RNN; the combiner model calls it as a feature extractor.
trained_mt_rnn = loaded_model.to(mt_rnn_device)

### 4. Load Numerical based regression model

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [76]:
class RegressionModel(nn.Module):
    """Numerical-feature regression network, used here as a feature extractor.

    `layer3` (the 5-output head) is still constructed, so it remains part of
    the state dict, but `forward` stops after `layer2` and returns the
    32-dimensional hidden activation for the combiner model to consume.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(6, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 5)  # not used by forward()

    def forward(self, x):
        """Map a (batch, 6) input to the (batch, 32) ReLU activation of `layer2`."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        # layer3 is intentionally skipped: only hidden features are returned.
        return x


# Freshly initialised (untrained) instance. It is placed on the GPU when one
# is available and on the CPU otherwise, so the cell no longer crashes on
# machines without CUDA.
model = RegressionModel().to("cuda" if torch.cuda.is_available() else "cpu")

In [77]:
data_directory = "/content/drive/MyDrive/"

model_weights_path = "linear_model_weights.pth"

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# crashing inside .cuda().
lm_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Re-create the architecture, then restore the trained weights.
# map_location remaps tensors that were saved from a GPU so that the file
# also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
loaded_model = RegressionModel()
loaded_model.load_state_dict(torch.load(model_weights_path, map_location=lm_device))

# eval() was left disabled in the original, so the module stays in its
# default (training) mode. NOTE(review): confirm this is intentional.
#loaded_model.eval()

# Pretrained regression model; the combiner model calls it as a feature extractor.
trained_lm = loaded_model.to(lm_device)

### 5. Combine linear and multitask rnn model

In [79]:
class CombinerModel(nn.Module):
    """Fuses features from the pretrained multi-task RNN and regression model.

    Each input row is a flat vector: N_MFCC * N_FRAMES MFCC values followed by
    N_NUMERIC numerical features. The two pretrained models are read from the
    notebook globals `trained_mt_rnn` and `trained_lm`. They are not registered
    as sub-modules, so `parameters()` yields only the three linear layers
    defined here and an optimizer built from it never updates them.

    The constructor arguments are accepted for backward compatibility but are
    not used.
    """

    N_MFCC = 20     # MFCC coefficients per frame
    N_FRAMES = 160  # frames per track
    N_NUMERIC = 6   # numerical features appended after the MFCC values

    def __init__(self, input_size=20, hidden_size=64, num_layers=1, num_outputs=5):
        super().__init__()
        self.fc_rnn = nn.Linear(64, 32)
        self.fc_lm = nn.Linear(32, 32)
        self.fc_combiner = nn.Linear(64, 5)

    def forward(self, x):
        """Return sigmoid-activated predictions of shape (batch, 5).

        Args:
            x: tensor of shape (batch, N_MFCC * N_FRAMES + N_NUMERIC).
        """
        mfcc_len = self.N_MFCC * self.N_FRAMES
        x_rnn = x[:, :mfcc_len]
        x_lm = x[:, mfcc_len:mfcc_len + self.N_NUMERIC]

        # NOTE(review): the flat MFCC values come from a (1, 20, 160) entry per
        # track (see the printed tensor shape and the dataset's flatten).
        # Reshaping them to (160, 20) regroups consecutive values rather than
        # transposing the 20x160 grid. Kept as-is because the pretrained RNN
        # presumably saw the same layout during its own training -- confirm.
        x_rnn = torch.reshape(x_rnn, (x.shape[0], self.N_FRAMES, self.N_MFCC))

        # The pretrained extractors are never optimised, so run them without
        # autograd: this avoids back-propagating through the LSTM and stops
        # gradients from piling up on weights that no optimizer ever zeroes.
        # The gradients of the layers defined in this class are unaffected.
        with torch.no_grad():
            feat_rnn = trained_mt_rnn(x_rnn)
            feat_lm = trained_lm(x_lm)

        out_rnn = torch.relu(self.fc_rnn(feat_rnn))
        out_lm = torch.relu(self.fc_lm(feat_lm))

        out = torch.cat((out_rnn, out_lm), 1)
        return torch.sigmoid(self.fc_combiner(out))


# Place the combiner on the GPU when available, otherwise on the CPU.
combiner = CombinerModel().to("cuda" if torch.cuda.is_available() else "cpu")

In [80]:
class CombineDataset(torch.utils.data.Dataset):
  """Pairs each track's standardized MFCC block with its numerical features.

  Item `idx` is a `(vector, label)` tuple:
    * vector: the flattened, standardized MFCC entry for `idx` followed by
      the values of FEATURE_COLUMNS, as float32 on the MFCC tensor's device.
    * label: the values of LABEL_COLUMNS as a float32 tensor (left on the CPU).

  The MFCC tensor is standardized with the mean and standard deviation of the
  tensor passed to this instance, so datasets built from different tensors
  use different statistics.

  Raises:
    ValueError: if `mfcc_tensor` and `df` do not have the same number of rows.
  """

  LABEL_COLUMNS = ['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']
  FEATURE_COLUMNS = ['valence', 'tempo', 'loudness', 'key', 'mode', 'time_signature']

  def __init__(self, mfcc_tensor, df):
    # Rows are matched purely by position, so a length mismatch would pair
    # MFCCs with the wrong track without any error further down.
    if len(mfcc_tensor) != len(df):
      raise ValueError(
          f"mfcc_tensor has {len(mfcc_tensor)} rows but df has {len(df)} rows")

    self.df = df
    self.mean = mfcc_tensor.mean()
    self.std = mfcc_tensor.std()

    # Standardize MFCC tensor
    self.mfcc = (mfcc_tensor - self.mean) / self.std

    # Convert the needed columns once instead of slicing a pandas row on
    # every __getitem__ call.
    self.labels = torch.tensor(
        df[self.LABEL_COLUMNS].astype(float).values, dtype=torch.float32)
    # The features follow the MFCC tensor's device (no hard-coded .cuda()),
    # so the dataset also works on CPU-only machines.
    self.features = torch.tensor(
        df[self.FEATURE_COLUMNS].astype(float).values,
        dtype=torch.float32).to(self.mfcc.device)

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    flat_mfcc = torch.reshape(self.mfcc[idx], (-1,))
    vector = torch.cat((flat_mfcc, self.features[idx]), 0)
    # clone() so callers cannot modify the cached label table in place.
    return vector, self.labels[idx].clone()

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Move the MFCC tensor to the GPU when one is available; otherwise keep it on
# the CPU instead of crashing inside .cuda().
#myDrive = ""

loaded_mfcc_tensor = loaded_mfcc_tensor.to("cuda" if torch.cuda.is_available() else "cpu")

print("Loaded MFCC tensor shape:", loaded_mfcc_tensor.shape)

# Split the track table and the MFCC tensor in the SAME call. This yields the
# same partition as two separate calls with an identical random_state, but it
# guarantees that row i of the table still matches row i of the tensor, and
# scikit-learn rejects the inputs if their lengths differ.
train_tracks, test_val_tracks, train_mfcc, test_val_mfcc = train_test_split(
    all_tracks, loaded_mfcc_tensor, test_size=0.3, random_state=42)
test_tracks, val_tracks, test_mfcc, val_mfcc = train_test_split(
    test_val_tracks, test_val_mfcc, test_size=0.5, random_state=42)

# Create the datasets
train_dataset = CombineDataset(train_mfcc, train_tracks)
val_dataset = CombineDataset(val_mfcc, val_tracks)
test_dataset = CombineDataset(test_mfcc, test_tracks)

# Create the data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

Loaded MFCC tensor shape: torch.Size([21325, 1, 20, 160])


In [None]:
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm # progress bar stuff
import os
# NOTE(review): presumably a debugging aid. Earlier cells have already used
# CUDA by the time this runs, so confirm the setting still takes effect here.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Use the GPU when one is available; otherwise train on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = combiner.to(device)
criterion = nn.SmoothL1Loss()
# Only the combiner's own layers are handed to the optimizer.
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_losses = []  # mean training loss per epoch
val_losses = []    # mean validation loss per epoch

num_epochs = 20
for epoch in range(num_epochs):
  print(f"working on epoch: {epoch}")

  # ---- training pass ----
  model.train()
  running_loss = 0.0
  for inputs, labels in tqdm(train_loader):
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  # Average over the number of batches, the same way the validation loss is
  # averaged below (previously this relied on the leaked loop index).
  epoch_loss = running_loss / len(train_loader)
  train_losses.append(epoch_loss)
  print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {epoch_loss:.4f}")

  # ---- validation pass ----
  model.eval()
  val_running_loss = 0.0
  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs, labels = inputs.to(device), labels.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)
      val_running_loss += loss.item()

  val_epoch_loss = val_running_loss / len(val_loader)
  val_losses.append(val_epoch_loss)
  print(f"Validation Loss: {val_epoch_loss:.4f}")

# Plotting the training and validation losses
plt.plot(train_losses, label="Training Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Losses")
plt.legend()
plt.show()


In [None]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

true_labels = []
predicted_labels = []

# Evaluate on whichever device the combiner's weights live on, rather than
# depending on a `device` variable defined in the training cell.
eval_device = next(combiner.parameters()).device

combiner.eval()
with torch.no_grad():
  for inputs, labels in test_loader:
    # Move the inputs explicitly, as the training loop does, instead of
    # assuming the dataset already returns tensors on the right device.
    outputs = combiner(inputs.to(eval_device))

    # Labels are only needed as NumPy arrays, so they are not sent to the GPU.
    true_labels.extend(labels.cpu().numpy())
    predicted_labels.extend(outputs.cpu().numpy())

true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

# Column order matches the label order produced by the dataset.
for i, label in enumerate(['danceability', 'instrumentalness', 'acousticness', 'energy', 'speechiness']):
  mse = mean_squared_error(true_labels[:, i], predicted_labels[:, i])
  mae = mean_absolute_error(true_labels[:, i], predicted_labels[:, i])
  rmse = np.sqrt(mse)
  r2 = r2_score(true_labels[:, i], predicted_labels[:, i])
  pearson_corr, p_value = pearsonr(true_labels[:, i], predicted_labels[:, i])

  print(f"Evaluation metrics for {label}:")
  print(f"Mean Squared Error: {mse:.4f}")
  print(f"Mean Absolute Error: {mae:.4f}")
  print(f"Root Mean Squared Error: {rmse:.4f}")
  print(f"R-squared: {r2:.4f}")
  print(f"Pearson's Correlation Coefficient: {pearson_corr:.4f}")

### Model architecture tuning

Architecture: remove the final FC layer from the pretrained RNN and LM models. Two hidden layers of size 32 and one hidden layer of size 64.

```

class CombinerModel(nn.Module):
    def __init__(self):
        super(CombinerModel, self).__init__()
        self.fc_rnn = nn.Linear(64, 32)
        self.fc_lm = nn.Linear(32, 32)
        self.fc_combiner = nn.Linear(64, 5)

    def forward(self, x):
        x_rnn = x[:, 0:20*160]
        x_lm = x[:, 20*160:20*160+6]
        batch_size = x.shape[0]
        x_rnn = torch.reshape(x_rnn, (batch_size, 160, 20))

        out_rnn = trained_mt_rnn(x_rnn)
        out_rnn = torch.relu(self.fc_rnn(out_rnn))

        out_lm = trained_lm(x_lm)
        out_lm = torch.relu(self.fc_lm(out_lm))

        out = torch.cat((out_rnn, out_lm), 1)
        out = self.fc_combiner(out)
        out = torch.sigmoid(out)
        
        return out

optimizer = optim.Adam(model.parameters(), lr=0.001)

Evaluation metrics for danceability:
Mean Squared Error: 0.0222
Mean Absolute Error: 0.1188
Root Mean Squared Error: 0.1490
R-squared: 0.2889
Pearson's Correlation Coefficient: 0.5384
Evaluation metrics for instrumentalness:
Mean Squared Error: 0.0820
Mean Absolute Error: 0.2054
Root Mean Squared Error: 0.2863
R-squared: 0.3280
Pearson's Correlation Coefficient: 0.5734
Evaluation metrics for acousticness:
Mean Squared Error: 0.0670
Mean Absolute Error: 0.2047
Root Mean Squared Error: 0.2588
R-squared: 0.4698
Pearson's Correlation Coefficient: 0.6857
Evaluation metrics for energy:
Mean Squared Error: 0.0303
Mean Absolute Error: 0.1375
Root Mean Squared Error: 0.1739
R-squared: 0.5842
Pearson's Correlation Coefficient: 0.7651
Evaluation metrics for speechiness:
Mean Squared Error: 0.0078
Mean Absolute Error: 0.0611
Root Mean Squared Error: 0.0885
R-squared: 0.0716
Pearson's Correlation Coefficient: 0.2841
```

Architecture: keep the final FC layer in the RNN and LM models, but remove the sigmoid in the RNN model. Two hidden layers of size 16 and one hidden layer of size 32.
```
class CombinerModel(nn.Module):
    def __init__(self):
        super(CombinerModel, self).__init__()
        self.fc_rnn = nn.Linear(5, 16)
        self.fc_lm = nn.Linear(5, 16)
        self.fc_combiner = nn.Linear(32, 5)

    def forward(self, x):
        x_rnn = x[:, 0:20*160]
        x_lm = x[:, 20*160:20*160+6]
        batch_size = x.shape[0]
        x_rnn = torch.reshape(x_rnn, (batch_size, 160, 20))

        out_rnn = trained_mt_rnn(x_rnn)
        out_rnn = torch.relu(self.fc_rnn(out_rnn))

        out_lm = trained_lm(x_lm)
        out_lm = torch.relu(self.fc_lm(out_lm))

        out = torch.cat((out_rnn, out_lm), 1)
        out = self.fc_combiner(out)
        out = torch.sigmoid(out)
        
        return out

optimizer = optim.Adam(model.parameters(), lr=0.001)

Evaluation metrics for danceability:
Mean Squared Error: 0.0229
Mean Absolute Error: 0.1201
Root Mean Squared Error: 0.1513
R-squared: 0.2660
Pearson's Correlation Coefficient: 0.5196
Evaluation metrics for instrumentalness:
Mean Squared Error: 0.0833
Mean Absolute Error: 0.2052
Root Mean Squared Error: 0.2886
R-squared: 0.3173
Pearson's Correlation Coefficient: 0.5655
Evaluation metrics for acousticness:
Mean Squared Error: 0.0682
Mean Absolute Error: 0.2059
Root Mean Squared Error: 0.2612
R-squared: 0.4597
Pearson's Correlation Coefficient: 0.6795
Evaluation metrics for energy:
Mean Squared Error: 0.0327
Mean Absolute Error: 0.1410
Root Mean Squared Error: 0.1807
R-squared: 0.5512
Pearson's Correlation Coefficient: 0.7474
Evaluation metrics for speechiness:
Mean Squared Error: 0.0080
Mean Absolute Error: 0.0641
Root Mean Squared Error: 0.0895
R-squared: 0.0516
Pearson's Correlation Coefficient: 0.2591
```

Architecture: keep all layers. Two hidden layers of size 5 and one hidden layer of size 10.
```
class CombinerModel(nn.Module):
    def __init__(self, input_size=20, hidden_size=64, num_layers=1, num_outputs=5):
        super(CombinerModel, self).__init__()
        self.fc_rnn = nn.Linear(5, 5)
        self.fc_lm = nn.Linear(5, 5)
        self.fc_combiner = nn.Linear(10, 5)

    def forward(self, x):
        x_rnn = x[:, 0:20*160]
        x_lm = x[:, 20*160:20*160+6]
        batch_size = x.shape[0]
        x_rnn = torch.reshape(x_rnn, (batch_size, 160, 20))

        out_rnn = trained_mt_rnn(x_rnn)
        out_rnn = torch.relu(self.fc_rnn(out_rnn))

        out_lm = trained_lm(x_lm)
        out_lm = torch.relu(self.fc_lm(out_lm))

        out = torch.cat((out_rnn, out_lm), 1)
        out = self.fc_combiner(out)
        out = torch.sigmoid(out)
        
        return out

optimizer = optim.Adam(model.parameters(), lr=0.001)

Evaluation metrics for danceability:
Mean Squared Error: 0.0236
Mean Absolute Error: 0.1236
Root Mean Squared Error: 0.1536
R-squared: 0.2435
Pearson's Correlation Coefficient: 0.4946
Evaluation metrics for instrumentalness:
Mean Squared Error: 0.0849
Mean Absolute Error: 0.2097
Root Mean Squared Error: 0.2914
R-squared: 0.3040
Pearson's Correlation Coefficient: 0.5519
Evaluation metrics for acousticness:
Mean Squared Error: 0.0716
Mean Absolute Error: 0.2105
Root Mean Squared Error: 0.2676
R-squared: 0.4329
Pearson's Correlation Coefficient: 0.6591
Evaluation metrics for energy:
Mean Squared Error: 0.0338
Mean Absolute Error: 0.1445
Root Mean Squared Error: 0.1839
R-squared: 0.5351
Pearson's Correlation Coefficient: 0.7319
Evaluation metrics for speechiness:
Mean Squared Error: 0.0083
Mean Absolute Error: 0.0603
Root Mean Squared Error: 0.0911
R-squared: 0.0167
Pearson's Correlation Coefficient: 0.1563
```

Changing the learning rate: 0.01, 0.005, 0.002, 0.001.
Architecture: remove the final FC layer in the RNN and LM models.

```
class CombinerModel(nn.Module):
    def __init__(self, input_size=20, hidden_size=64, num_layers=1, num_outputs=5):
        super(CombinerModel, self).__init__()
        self.fc_rnn = nn.Linear(64, 32)
        self.fc_lm = nn.Linear(32, 32)
        self.fc_combiner = nn.Linear(64, 5)

    def forward(self, x):
        x_rnn = x[:, 0:20*160]
        x_lm = x[:, 20*160:20*160+6]
        batch_size = x.shape[0]
        x_rnn = torch.reshape(x_rnn, (batch_size, 160, 20))

        out_rnn = trained_mt_rnn(x_rnn)
        out_rnn = torch.relu(self.fc_rnn(out_rnn))

        out_lm = trained_lm(x_lm)
        out_lm = torch.relu(self.fc_lm(out_lm))

        out = torch.cat((out_rnn, out_lm), 1)
        out = self.fc_combiner(out)
        out = torch.sigmoid(out)
        
        return out

Evaluation metrics for danceability:
Mean Squared Error: 0.0218
Mean Absolute Error: 0.1172
Root Mean Squared Error: 0.1477
R-squared: 0.3004
Pearson's Correlation Coefficient: 0.5485
Evaluation metrics for instrumentalness:
Mean Squared Error: 0.0821
Mean Absolute Error: 0.2002
Root Mean Squared Error: 0.2865
R-squared: 0.3269
Pearson's Correlation Coefficient: 0.5739
Evaluation metrics for acousticness:
Mean Squared Error: 0.0658
Mean Absolute Error: 0.1963
Root Mean Squared Error: 0.2565
R-squared: 0.4791
Pearson's Correlation Coefficient: 0.6941
Evaluation metrics for energy:
Mean Squared Error: 0.0297
Mean Absolute Error: 0.1347
Root Mean Squared Error: 0.1725
R-squared: 0.5912
Pearson's Correlation Coefficient: 0.7705
Evaluation metrics for speechiness:
Mean Squared Error: 0.0076
Mean Absolute Error: 0.0583
Root Mean Squared Error: 0.0874
R-squared: 0.0962
Pearson's Correlation Coefficient: 0.3114
```