In [None]:
import os
import re
import string

import contractions
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [None]:
# Load the data
# header=[0, 1] makes pandas read the first two CSV rows as a two-level column
# header, so columns are addressed with tuples such as ('track', 'lyrics').
df = pd.read_csv('data/fma_cut100_echonest_lyrics_fake_country.csv', header=[0, 1])

In [None]:
# Show one random row as a quick sanity check of the loaded frame.
df.sample()

In [None]:
# Two-level column key of the lyrics text; the preprocessing helpers below index the frame with it.
lyrics = ('track', 'lyrics')

In [None]:
def preprocess_for_nlp(data: pd.DataFrame, column=None) -> pd.DataFrame:
    """
    Normalise the lyrics column for NLP tasks.

    Steps, in order: lower-case; drop the section markers 'chorus' and
    'verse'; strip all punctuation except apostrophes; drop repetition
    markers such as '2x' / 'x2' for the digits 2-9; turn newlines into
    spaces; collapse runs of whitespace and trim both ends.

    Args:
    data: pd.DataFrame - DataFrame with lyrics column. It is modified in place.
    column: key of the column to clean. Defaults to the module-level ``lyrics`` key.

    Returns:
    The same DataFrame object that was passed in, with the column cleaned.
    """
    if column is None:
        column = lyrics

    punctuation_to_remove = string.punctuation.replace("'", "")
    # Escape before building the character class: unescaped, the '\' inside
    # string.punctuation only escapes the ']' that follows it, so backslashes
    # themselves would never be removed.
    punctuation_pattern = f"[{re.escape(punctuation_to_remove)}]"

    text = data[column].str.lower()

    # Plain substring removal: words that merely contain a marker are affected
    # as well (e.g. 'universe' becomes 'uni').
    for marker in ('chorus', 'verse'):
        text = text.str.replace(marker, '', regex=False)

    text = text.str.replace(punctuation_pattern, "", regex=True)

    # Repetition markers are removed one after another ('2x', 'x2', '3x', ...),
    # because an earlier removal can change what a later one matches.
    for digit in range(2, 10):
        text = text.str.replace(f'{digit}x', '', regex=False)
        text = text.str.replace(f'x{digit}', '', regex=False)

    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    data[column] = text
    return data

In [None]:
# Clean the lyrics. preprocess_for_nlp assigns into the frame it receives and
# returns it, so df_pre refers to the same (now modified) object as df.
df_pre = preprocess_for_nlp(data=df)

In [None]:
# Spot-check one random row after the basic cleaning.
df_pre.sample()

In [None]:
# Digits also have to be stripped from the lyrics
def remove_numbers(df: pd.DataFrame, column) -> pd.DataFrame:
    """
    Strip every run of digits, and then every apostrophe, from ``column``.

    The frame is modified in place and the same object is returned.
    """
    without_digits = df[column].str.replace(r'\d+', '', regex=True)
    df[column] = without_digits.str.replace("'", "")
    return df

# Strip digits and apostrophes from the lyrics column; the frame is modified in place and returned.
df_pre = remove_numbers(df_pre, lyrics)

In [None]:
# Spot-check five random rows after digit removal.
df_pre.sample(5)

In [None]:
def handle_contractions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand contractions in the lyrics column, e.g. "don't" becomes "do not".

    Args:
    df: pd.DataFrame - DataFrame with lyrics column; it is left untouched.

    Returns:
    A copy of ``df`` whose lyrics column holds the expanded text.
    """
    # Registers progress_apply on pandas objects, with a labelled progress bar
    tqdm.pandas(desc="Handling contractions")
    expanded = df.copy()
    expanded[lyrics] = expanded[lyrics].progress_apply(contractions.fix)
    return expanded

# Expand contractions on a copy of the cleaned frame.
# NOTE(review): remove_numbers has already stripped apostrophes from this column,
# and the later cells read the lyrics from `df`, not from df_without_contractions —
# confirm that this is intended.
df_without_contractions = handle_contractions(df_pre)

In [None]:
# Spot-check five random rows after contraction expansion.
df_without_contractions.sample(5)

In [None]:
# Fetch the NLTK resources used for tokenising, lemmatising and stop-word lookup
for resource in ('punkt', 'wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Text-normalisation helpers shared by the cells below
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Lemmatise a single lyrics string
def lemmatize_lyrics(lyrics):
    """Tokenise ``lyrics`` and return its lemmatised tokens joined by single spaces.

    The parameter shadows the module-level ``lyrics`` column key inside this function.
    """
    return ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(lyrics))

# Add a lemmatised version of the lyrics as a new column.
# tqdm.pandas() registers progress_apply so the row-wise pass shows a progress bar.
# NOTE(review): the input is df[lyrics]; the contraction-expanded frame built
# earlier (df_without_contractions) is not used here — confirm this is intended.
tqdm.pandas()
df['Lyrics_Lemmatized'] = df[lyrics].progress_apply(lemmatize_lyrics)

In [None]:
# Count tracks per language code. dropna=False also lists the rows whose code
# is missing (NaN), which value_counts would otherwise leave out.
df['track', 'language_code'].value_counts(dropna=False)

In [None]:
# Keep only rows whose language code is 'en' or missing; rows tagged with
# another language (e.g. 'it' or 'es') are dropped.
# .copy() yields an independent frame, so later in-place operations on df do
# not act on a slice of the original frame.
df = df[df['track', 'language_code'].isin(['en', np.nan])].copy()

In [None]:
# Spot-check five lemmatised lyrics.
df['Lyrics_Lemmatized'].sample(5)

In [None]:
# Order the rows by genre. Rebinding the sorted result (instead of inplace=True)
# keeps the cell re-runnable and avoids mutating a filtered frame in place.
df = df.sort_values(('track', 'one_genre'))

In [None]:
# Load model
# Pick the best available backend instead of assuming Apple-silicon 'mps',
# so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Initialize tokenizer and model
tokenizer = RobertaTokenizerFast.from_pretrained("distilroberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=8,
    problem_type="multi_label_classification"
)

# Replace the freshly initialised weights with the fine-tuned checkpoint.
# weights_only=True restricts unpickling to tensor data.
state_dict = torch.load("models/lyrics_genre_model_alphabetically.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval() # Set model to evaluation mode

In [None]:
# Prepare labels
genre_names = df[('track','one_genre')]

# LabelEncoder assigns integer ids following the sorted order of the class names.
label_encoder = LabelEncoder()
genre_ids = label_encoder.fit_transform(genre_names)
y = np.eye(len(label_encoder.classes_))[genre_ids]  # One-hot encode genres

# Show the id -> genre mapping for however many classes were found
# (previously hard-coded to ids 0..7).
print(label_encoder.classes_)
X = df['Lyrics_Lemmatized']

# Check the size of the data set
print(f'Data set: {X.shape[0]} samples')

In [None]:
class LyricsGenreDataset(Dataset):
    """Map-style dataset pairing each lyrics text with its label vector.

    Texts are tokenised lazily, one item at a time, when an item is accessed.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """
        Args:
            texts: object exposing ``tolist()`` (e.g. a pandas Series) holding the texts.
            labels: array-like accepted by ``torch.tensor``; stored as float32.
            tokenizer: callable used to encode one text at a time.
            max_length: length passed to the tokenizer for padding/truncation.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts.tolist()
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return input ids, attention mask and label tensor for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the size-1 leading axis added by return_tensors="pt"
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = self.labels[idx]
        return item

In [None]:
# Build the dataset and a sequential loader over it.
# shuffle=False keeps the batches in the same order as the rows of X / y.
batch_size = 24
dataset = LyricsGenreDataset(texts=X, labels=y, tokenizer=tokenizer)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
def _masked_mean_pool(hidden_states, attention_mask):
    """Average token embeddings over the positions where attention_mask is 1.

    Every sequence is padded to max_length, so a plain mean over the sequence
    axis would also average the padding positions into the feature vector.

    hidden_states: (batch_size, seq_len, hidden_dim)
    attention_mask: (batch_size, seq_len), 1 for real tokens and 0 for padding
    Returns a (batch_size, hidden_dim) tensor.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)  # (batch_size, seq_len, 1)
    token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-padding row
    return (hidden_states * mask).sum(dim=1) / token_counts


features = []

model.eval()  # Set model to evaluation mode
with torch.no_grad():  # Disable gradient computation for inference
    for batch in data_loader:
        # Move inputs to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Get model outputs, including the per-layer hidden states
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        # Last layer's hidden states. Shape: (batch_size, seq_len, hidden_dim)
        hidden_states = outputs.hidden_states[-1]

        # Mean-pool over real tokens only. Shape: (batch_size, hidden_dim)
        # NOTE: features differ from the earlier padding-inclusive mean, so any
        # model trained on previously saved features must be retrained.
        pooled_features = _masked_mean_pool(hidden_states, attention_mask)

        # Collect features on the CPU so they can be concatenated later
        features.append(pooled_features.cpu())

In [None]:
# Inspect the collected per-batch feature tensors.
# NOTE(review): this displays the whole list; showing len(features) and
# features[0].shape would be more compact.
features

In [None]:
import torch

# Concatenate features from all batches along the sample axis; the loader is
# unshuffled, so rows stay in the same order as the dataset.
lyrics_features = torch.cat(features, dim=0)  # Shape: (num_samples, hidden_dim)

In [None]:
# Convert the (CPU) feature tensor to a NumPy array so it can be stored with np.save.
lyrics_features_np = lyrics_features.numpy()

In [None]:
# Display the feature matrix.
lyrics_features_np

In [None]:
# Turn the one-hot rows of y back into integer class indices.
labels = y.argmax(axis=1)

In [None]:
# Display the integer class index of every sample.
labels

In [None]:
# np.save does not create missing directories, so make sure the target exists.
os.makedirs("features", exist_ok=True)

# Features and labels are row-aligned: both follow the order of the unshuffled loader.
np.save("features/lyrics_features_2.npy", lyrics_features_np)
np.save("features/lyrics_labels_2.npy", labels)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model_single_genre(model, test_loader, device):
    """
    Run the model over ``test_loader`` and score it as a single-label classifier.

    The predicted class of a sample is the argmax of the sigmoid of its logits;
    the true class is the argmax of its label vector. Accuracy and macro-F1 are
    printed.

    Returns:
    (test_labels, test_preds, test_probs): lists with one entry per sample —
    true class index, predicted class index, and the per-class probability row.
    """
    model.eval()
    predicted, actual, probabilities = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            # Per-class probabilities for this batch
            batch_probs = torch.sigmoid(logits).cpu().numpy()
            probabilities.extend(batch_probs)

            batch_targets = batch["labels"].to(device).cpu().numpy()
            predicted.extend(batch_probs.argmax(axis=1))
            actual.extend(batch_targets.argmax(axis=1))

    test_accuracy = accuracy_score(actual, predicted)
    test_f1 = f1_score(actual, predicted, average="macro")
    print(f"Test Accuracy: {test_accuracy}")
    print(f"Test F1 Score: {test_f1}")

    return actual, predicted, probabilities

# Run evaluation and get predictions and labels
# (this is a second forward pass over the same loader used for feature extraction)
test_labels, test_preds, test_probs = evaluate_model_single_genre(model, data_loader, device)

In [None]:
from sklearn.metrics import classification_report

def classification_report_single_genre(test_preds, test_labels):
    """Print a per-class classification report.

    Classes are named after the genres of the module-level ``label_encoder``.
    Note the argument order: predictions first, true labels second.
    """
    genre_names = list(label_encoder.classes_)
    report = classification_report(test_labels, test_preds, target_names=genre_names)
    print(report)

# Print the classification report (the function takes predictions first, then true labels)
classification_report_single_genre(test_preds, test_labels)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

import os
# Sets TOKENIZERS_PARALLELISM to "false" for this process — presumably to silence
# the Hugging Face tokenizers parallelism warning; TODO confirm it is still needed here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def plot_single_confusion_matrix(test_labels, test_preds, label_names):
    """
    Plot the raw and the row-normalised confusion matrix of single-genre predictions.

    The normalised figure is also written to
    "confusion_matrix_normalized_lyrics.eps" in the working directory.

    Args:
    test_labels: true class indices; expected to be integers in range(len(label_names)).
    test_preds: predicted class indices, coded like test_labels.
    label_names: class names in index order, used as tick labels on both plots.
    """
    # Pin the label set so the matrix always has one row/column per name, even
    # when a class is absent from both the true labels and the predictions.
    class_ids = np.arange(len(label_names))
    cm = confusion_matrix(test_labels, test_preds, labels=class_ids)

    # Row-normalise; rows without any true samples stay at 0 instead of NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Use latex (requires a working LaTeX installation); applies to both figures
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')

    # Plot Non-Normalized
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
    plt.title("Confusion Matrix (Non-Normalized)")
    plt.ylabel("True Genre")
    plt.xlabel("Predicted Genre")
    plt.show()

    colors = ["#FFFFFF", "#455681"]  # White to #455681 gradient
    custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", colors)

    # Plot the normalized confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_normalized, cmap=custom_cmap, annot=True, fmt=".2f", xticklabels=label_names, yticklabels=label_names, vmax=1.0)
    plt.xlabel("Predicted", fontdict={"fontsize": 12})
    plt.ylabel("True", fontdict={"fontsize": 12})
    plt.tight_layout()
    plt.savefig("confusion_matrix_normalized_lyrics.eps", dpi=300)
    plt.show()

# Plot single confusion matrix, using the encoder's genre names as tick labels
plot_single_confusion_matrix(test_labels, test_preds, list(label_encoder.classes_))




In [None]:
# Print auc-roc curve for each class on one plot
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Uses test_labels / test_probs returned by evaluate_model_single_genre.

# Convert once, outside the loop
true_classes = np.asarray(test_labels)
class_probs = np.asarray(test_probs)
n_classes = class_probs.shape[1]  # one probability column per class

# One-vs-rest ROC: for each class, "is this class" vs. the probability assigned to it
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve((true_classes == i).astype(int), class_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# These are dicts, so np.save stores them pickled; reload them with
# np.load(path, allow_pickle=True).item().
np.save("fpr_lyric_multi.npy", fpr)
np.save("tpr_lyric_multi.npy", tpr)
np.save("roc_auc_lyric_multi.npy", roc_auc)

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
# Plot the ROC curve
plt.figure(figsize=(10, 8))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"{label_encoder.classes_[i]} (AUC = {roc_auc[i]:.2f})")  # Use label_encoder.classes_ for genre names
plt.plot([0, 1], [0, 1], "k--")  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

plt.legend(loc="lower right")  # Adjust legend location if needed
# The former bare plt.style.context(...) calls were removed: a style context
# only takes effect inside a `with` block, so they never changed the figure.
plt.style.use('fast')
plt.savefig("auc_roc_roberta.png", dpi=300)

plt.show()



