In [5]:
from utils.preprocessing import *
import pandas as pd
from collections import Counter

In [6]:
# Path of the full, unsplit dataset CSV (input of the commented-out `split_dataset` call below).
FULL_DATASET_PATH = "./data/dataset.csv"
# Directory from which the notebook reads train.csv / val.csv / test.csv.
DATASET_PATH = "./data/"
# Run `split_dataset` only once to create the train/test/val files. The call is
# kept commented out, presumably so that re-running the notebook does not
# regenerate the splits -- uncomment it for the very first run.
# split_dataset(FULL_DATASET_PATH, DATASET_PATH, verbose=True)

In [7]:
# Import `os` explicitly so this cell does not depend on the star import from
# `utils.preprocessing` happening to re-export it.
import os

# Load the three splits stored as CSV files under DATASET_PATH.
train_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "train.csv"))
val_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "val.csv"))
test_dataframe = pd.read_csv(os.path.join(DATASET_PATH, "test.csv"))

# Replace every raw lyric with its normalized form. `normalization` comes from
# the star import of `utils.preprocessing`; the column is overwritten in place,
# so the raw text is no longer available after this cell.
for dataframe in (train_dataframe, val_dataframe, test_dataframe):
    dataframe["lyrics"] = dataframe["lyrics"].map(normalization)


In [8]:
# Collect the distinct artist labels of the training split and report how many there are.
tmp = {artist for artist in train_dataframe["artist"]}
print(len(tmp))

KeyError: 'artists'

In [4]:
# Remove songs whose normalized lyrics (column position 5) have a length of 20 or less.
# NOTE(review): `len` is applied to whatever `normalization` returned; if that is
# a string, this counts characters rather than words -- confirm the intended unit.
print(train_dataframe.shape, val_dataframe.shape, test_dataframe.shape)
for dataframe in [train_dataframe, val_dataframe, test_dataframe]:
    lengths = dataframe.iloc[:, 5].map(len)
    # Select rows by index *label*, not by position: `drop(index=...)` works on
    # labels, so positional counters only match on an untouched RangeIndex and
    # would remove the wrong rows (or raise KeyError) when this cell is re-run.
    too_short = lengths[lengths <= 20].index
    dataframe.drop(index=too_short, inplace=True)
print(train_dataframe.shape, val_dataframe.shape, test_dataframe.shape)

(4530, 6) (1510, 6) (1510, 6)
(4529, 6) (1510, 6) (1510, 6)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vocabulary from the training lyrics only; `fit` returns the
# vectorizer itself, so creation and fitting are chained.
tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8).fit(train_dataframe["lyrics"])

def transform_data(tfidf, dataframe):
    """Turn the ``lyrics`` column of ``dataframe`` into a dense feature table.

    Args:
        tfidf: A fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        dataframe: Any mapping-like object whose ``"lyrics"`` entry holds the
            documents to vectorize.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        feature name reported by ``tfidf``. The result has a fresh default
        index; it does not reuse the index of ``dataframe``.
    """
    # Use the vectorizer that was passed in rather than a notebook-level global,
    # so the function works with any fitted vectorizer.
    features = tfidf.transform(dataframe["lyrics"])
    # `toarray()` yields a plain ndarray (`todense()` returns the legacy np.matrix).
    return pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names_out())

# Vectorize each split with the vocabulary fitted on the training data.
train_features, val_features, test_features = (
    transform_data(tfidf_vect, split)
    for split in (train_dataframe, val_dataframe, test_dataframe)
)


In [8]:
# Random forest baseline. The earlier note on this cell read "35%"
# (presumably the accuracy observed in a previous run).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# A fixed seed keeps the fitted forests, and therefore the selected
# hyper-parameters and reported accuracies, reproducible across re-runs.
clf = RandomForestClassifier(n_jobs=5, random_state=42)
parameters = {
    "criterion": ["gini", "entropy"],
    "n_estimators": [100, 1000],
    # "auto" was only an alias of "sqrt" for classifiers and is rejected by
    # recent scikit-learn releases, so the grid compares "sqrt" with "log2".
    "max_features": ["sqrt", "log2"],
}
clf = GridSearchCV(clf, parameters, verbose=1)
clf.fit(train_features, train_dataframe["artist"])

# Report plain accuracy (in percent) on the validation and test splits.
for label, features, frame in [
    ("Validation acc: ", val_features, val_dataframe),
    ("Test acc:       ", test_features, test_dataframe),
]:
    pred_labels = clf.predict(features)
    # Compare by position: the artist Series may have gaps in its index labels
    # after rows were dropped, while the predictions are a plain array.
    score = sum(
        1 for predicted, actual in zip(pred_labels, frame["artist"]) if predicted == actual
    )
    print(f"{label}[{100.0 * score / len(pred_labels)}]")

In [12]:
# 33%
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Grid-search an RBF-kernel SVM over the kernel width and the regularization strength.
parameters = {
    "kernel": ["rbf"],
    "gamma": ["scale", "auto"],
    "C": [1, 10]
}
clf = GridSearchCV(SVC(), parameters, verbose=1)
clf.fit(train_features, train_dataframe["artist"])

# Validation accuracy: count the predictions that equal the true artist, pairing by position.
pred_labels = clf.predict(val_features)
score = sum(
    1 for predicted, actual in zip(pred_labels, val_dataframe["artist"]) if predicted == actual
)
print(f"Validation acc: [{100.0 * score / len(pred_labels)}]")

# Test accuracy, computed the same way.
pred_labels = clf.predict(test_features)
score = sum(
    1 for predicted, actual in zip(pred_labels, test_dataframe["artist"]) if predicted == actual
)
print(f"Test acc:       [{100.0 * score / len(pred_labels)}]")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Validation acc: [36.42384105960265]
Test acc:       [33.17880794701987]
