In [1]:
from utils.preprocessing import *
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
import pickle

[nltk_data] Downloading package omw-1.4 to /home/eu3neuom/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
FULL_DATASET_PATH = "./data/dataset.csv"
DATASET_PATH = "./data/"
# run `split_dataset` only once to create train/test/val 
# split_dataset(FULL_DATASET_PATH, DATASET_PATH, verbose=True)

In [3]:
# Reading
train_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "train.csv"))
val_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "val.csv"))
test_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "test.csv"))

# Normalizing
for dataframe in [train_dataframe, val_dataframe, test_dataframe]:
    dataframe["lyrics"] = [normalization(song) for song in dataframe["lyrics"]]

# Feature extraction
tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8)
tfidf_vect.fit(train_dataframe["lyrics"])

with open("./app/model/tfidf.pkl", "wb") as file:
    pickle.dump(tfidf_vect, file)

def transform_data(tfidf, dataframe):
    features = tfidf_vect.transform(dataframe["lyrics"])
    return pd.DataFrame(features.todense(), columns=tfidf.get_feature_names_out())

train_features = transform_data(tfidf_vect, train_dataframe)
val_features = transform_data(tfidf_vect, val_dataframe)
test_features = transform_data(tfidf_vect, test_dataframe)


In [4]:
idx = 0
artistToLabel = {}
labelToArtist = {}
for artist in train_dataframe["artist"].tolist():
    if artist not in artistToLabel.keys():
        artistToLabel[artist] = idx
        labelToArtist[idx] = artist
        idx += 1
    
train_labels = [artistToLabel[artist] for artist in train_dataframe["artist"]]
val_labels = [artistToLabel[artist] for artist in val_dataframe["artist"]]
test_labels = [artistToLabel[artist] for artist in test_dataframe["artist"]]

In [4]:
# Remove songs with less than 20 words after normalization
# print(train_dataframe.shape, val_dataframe.shape, test_dataframe.shape)
# for dataframe in [train_dataframe, val_dataframe, test_dataframe]:
#     indexes = []
#     for idx, row in enumerate(dataframe.to_numpy()):
#         if len(row[5]) <= 20:
#             indexes.append(idx)
#     dataframe.drop(index=indexes, inplace=True)
# print(train_dataframe.shape, val_dataframe.shape, test_dataframe.shape)

(4530, 6) (1510, 6) (1510, 6)
(4529, 6) (1510, 6) (1510, 6)


In [98]:
# 35%
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_jobs=5)
parameters = {
    "criterion": ["gini", "entropy"],
    "n_estimators": [100, 1000],
    "max_features": ["auto", "sqrt"]
}
clf = GridSearchCV(clf, parameters, verbose=1)
clf.fit(train_features, train_dataframe["artist"])

score = 0
pred_labels = clf.predict(val_features)
for i, name in enumerate(val_dataframe["artist"]):
    if pred_labels[i] == name:
        score += 1
print(f"Validation acc: [{100.0 * score / len(pred_labels)}]")

score = 0
pred_labels = clf.predict(test_features)
for i, name in enumerate(test_dataframe["artist"]):
    if pred_labels[i] == name:
        score += 1
print(f"Test acc:       [{100.0 * score / len(pred_labels)}]")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Validation acc: [37.83962889330682]
Test acc:       [38.9662027833002]


In [103]:
with open("./app/model/random-forest.pkl", "wb") as file:
    pickle.dump(clf, file)

In [5]:
with open("./app/model/ranom-forwest.pkl", "rb") as file:
    loaded = pickle.load(file)

    score = 0
    pred_labels = loaded.predict(test_features)
    for i, name in enumerate(test_dataframe["artist"]):
        if pred_labels[i] == name:
            score += 1
    print(f"Test acc:       [{100.0 * score / len(pred_labels)}]")

Test acc:       [38.9662027833002]


In [12]:
# 33%
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

clf = SVC()
parameters = {
    "kernel": ["rbf"],
    "gamma": ["scale", "auto"],
    "C": [1, 10]
}
clf = GridSearchCV(clf, parameters, verbose=1)
clf.fit(train_features, train_dataframe["artist"])

score = 0
pred_labels = clf.predict(val_features)
for i, name in enumerate(val_dataframe["artist"]):
    if pred_labels[i] == name:
        score += 1
print(f"Validation acc: [{100.0 * score / len(pred_labels)}]")

score = 0
pred_labels = clf.predict(test_features)
for i, name in enumerate(test_dataframe["artist"]):
    if pred_labels[i] == name:
        score += 1
print(f"Test acc:       [{100.0 * score / len(pred_labels)}]")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Validation acc: [36.42384105960265]
Test acc:       [33.17880794701987]


In [94]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np



class MyDataset(Dataset):
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(5993, 4096),
            nn.Linear(4096, 1024),
            nn.Dropout(),
            nn.Linear(1024, 256),
            nn.Linear(256, 38)
        )

    def forward(self, x):
        return self.layers(x)
        
trainData = MyDataset(train_features.to_numpy().astype(float), train_labels)
validData = MyDataset(val_features.to_numpy().astype(float), val_labels)
testData = MyDataset(test_features.to_numpy().astype(float), test_labels)

batch_size = 8
train_loader = DataLoader(dataset = trainData, batch_size = batch_size, shuffle=True)
valid_loader = DataLoader(dataset = validData, batch_size = batch_size, shuffle=False)
test_loader = DataLoader(dataset = testData, batch_size = 1, shuffle=False)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [95]:
def train(model, numEpochs, criterion, optimizer, path=None):
    train_losses = []
    valid_losses = []
    best_accuracy = -1.0

    for epoch in range(1, numEpochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        for data, target in train_loader:
            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(data.float())
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)

        valid_loss, validation_acc = validation(model, criterion)
        if validation_acc > best_accuracy:
            best_accuracy = validation_acc
            # if path is not None:
            #     aditionalInfo = {"epochs": epoch, "acc": best_accuracy, "structure": model}
            #     saveStateDict(model.state_dict(), path, aditionalInfo)

        train_loss = train_loss / len(train_loader.sampler)
        valid_loss = valid_loss / len(valid_loader.sampler)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        print(
            "Epoch: [{:2}/{:2}] \tTraining Loss: [{:.6f}] \tValidation Loss: {:.6f} \tValidation acc: [{:.6f}] \t Best acc: [{:.6f}]".format(
                epoch, numEpochs, train_loss, valid_loss, validation_acc, best_accuracy
            )
        )

    
def validation(model, criterion):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        valid_loss = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images.float())
            loss = criterion(outputs, labels)
            valid_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        return valid_loss, 100 * correct / total

def test(model):
    ret = []
    model.eval()
    with torch.no_grad():
        for images, imagePaths, _ in test_loader:
            images = images.to(device)
            outputs = model(images.float())
            _, predicted = torch.max(outputs.data, 1)
            ret.append(predicted[0].item())
        return ret

In [97]:
model = DNN().to(device)
learning_rate = 0.0001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train(model, 10, criterion, optimizer, "./models/CNN.ckpt")

Epoch: [ 1/10] 	Training Loss: [3.174205] 	Validation Loss: 2.841482 	Validation acc: [21.471173] 	 Best acc: [21.471173]
Epoch: [ 2/10] 	Training Loss: [2.512903] 	Validation Loss: 2.622900 	Validation acc: [28.429423] 	 Best acc: [28.429423]
Epoch: [ 3/10] 	Training Loss: [1.915205] 	Validation Loss: 2.756847 	Validation acc: [30.682571] 	 Best acc: [30.682571]
Epoch: [ 4/10] 	Training Loss: [1.390478] 	Validation Loss: 3.004896 	Validation acc: [32.471836] 	 Best acc: [32.471836]
Epoch: [ 5/10] 	Training Loss: [0.948332] 	Validation Loss: 3.538494 	Validation acc: [32.935719] 	 Best acc: [32.935719]
Epoch: [ 6/10] 	Training Loss: [0.611056] 	Validation Loss: 4.167316 	Validation acc: [32.471836] 	 Best acc: [32.935719]
Epoch: [ 7/10] 	Training Loss: [0.364365] 	Validation Loss: 4.823866 	Validation acc: [33.134526] 	 Best acc: [33.134526]
Epoch: [ 8/10] 	Training Loss: [0.218034] 	Validation Loss: 5.590315 	Validation acc: [32.471836] 	 Best acc: [33.134526]
Epoch: [ 9/10] 	Training