In [None]:
! mamba install -c pytorch -c nvidia -c conda-forge pandas spacy tqdm scikit-learn gensim sentence-transformers pytorch pytorch-cuda=12.1 -y
! python -m spacy download en_core_web_sm

In [1]:
import pandas as pd
import spacy
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
# Load CSV file into a pandas DataFrame, the file has a header row with the column names and the separator is a comma
df = pd.read_csv("song_lyrics.csv")

# Print the first 5 rows of the dataframe
print(df.head())

               title  tag     artist  year   views  \
0          Killa Cam  rap    Cam'ron  2004  173166   
1         Can I Live  rap      JAY-Z  1996  468624   
2  Forgive Me Father  rap   Fabolous  2003    4743   
3       Down and Out  rap    Cam'ron  2004  144404   
4             Fly In  rap  Lil Wayne  2005   78271   

                                       features  \
0                   {"Cam\\'ron","Opera Steve"}   
1                                            {}   
2                                            {}   
3  {"Cam\\'ron","Kanye West","Syleena Johnson"}   
4                                            {}   

                                              lyrics  id language_cld3  \
0  [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...   1            en   
1  [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...   3            en   
2  Maybe cause I'm eatin\nAnd these bastards fien...   4            en   
3  [Produced by Kanye West and Brian Miller]\n\n[...   5            en  

In [3]:
# List the different tags in the "tag" column and their frequency
print(df["tag"].value_counts())

# List the different languages in the "language" column and their frequency
print(df["language"].value_counts())

tag
pop        2138587
rap        1724816
rock        793220
rb          196462
misc        181455
country     100316
Name: count, dtype: int64
language
en    3374198
es     275432
fr     189436
pt     167947
ru     166044
       ...   
mt          5
uz          4
tg          3
bs          1
gu          1
Name: count, Length: 84, dtype: int64


In [4]:
# Eliminate rows with missing values in the "tag" column or with a "misc" tag
df = df.dropna(subset=["tag"])
df = df[df["tag"] != "misc"]

# Eliminate rows with missing values in the "language" column or with a language different from "en"
df = df.dropna(subset=["language"])
df = df[df["language"] == "en"]

# Print tag and language frequencies
print(df["tag"].value_counts())
print(df["language"].value_counts())

tag
pop        1393559
rap         964605
rock        633308
rb          155082
country      86658
Name: count, dtype: int64
language
en    3233212
Name: count, dtype: int64


In [5]:
# Balance the dataset by keeping the same number of rows for each tag, the number of rows per tag is 80000
df = df.groupby("tag").apply(lambda x: x.sample(80000), include_groups=True)

# Print tag and language frequencies
print(df["tag"].value_counts())
print(df["language"].value_counts())

# Print the first 5 rows of the dataframe
print(df.head())

# Save the balanced dataset to a new CSV file
df.to_csv("lyrics_balanced.csv", index=False)

  df = df.groupby("tag").apply(lambda x: x.sample(80000), include_groups=True)


tag
country    80000
pop        80000
rap        80000
rb         80000
rock       80000
Name: count, dtype: int64
language
en    400000
Name: count, dtype: int64
                                         title      tag           artist  \
tag                                                                        
country 1082777                Honky Tonk Wine  country  Jerry Lee Lewis   
        979185   If You Should Come Back Today  country    Charley Pride   
        3433877                       Novocain  country    Jessica Meuse   
        3394058                      Boys Girl  country     Kylie Morgan   
        191099                     Its My Time  country     Dolly Parton   

                 year  views features  \
tag                                     
country 1082777  1973     62       {}   
        979185   1968    169       {}   
        3433877  2018     19       {}   
        3394058  2019    274       {}   
        191099   1969   1063       {}   

                

In [2]:
# Load the balanced dataset from the new CSV file if df does not exist
if "df" not in locals():
    df = pd.read_csv("lyrics_balanced.csv")

# Remove all lines between square brackets, including the new line character
df["lyrics"] = df["lyrics"].str.replace(r"\[.*\]\n", "", regex=True)

In [3]:
# Create a spaCy pipeline for English
nlp = spacy.load(
    "en_core_web_sm",
    enable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"],
)


def preprocess_lyrics(doc):
    return " ".join(
        [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    )


# Tokenize, lemmatize, remove stopwords, and convert to lowercase in a single step
df["clean_lyrics"] = list(
    tqdm(nlp.pipe(df["lyrics"].str.lower(), batch_size=1, n_process=16), total=len(df))
)

# Apply the preprocess_lyrics function to the "clean_lyrics" column
df["clean_lyrics"] = df["clean_lyrics"].apply(preprocess_lyrics)

# Print the first 5 rows of the dataframe
print(df.head())

# Save the cleaned dataset to a new CSV file
df.to_csv("lyrics_cleaned.csv", index=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 399934/400000 [13:44<00:00, 659.49it/s]Process Process-9:
Process Process-5:
Process Process-8:
Process Process-16:
Process Process-13:
Process Process-1:
Process Process-7:
Process Process-15:
Process Process-12:
Process Process-11:
Process Process-4:
Process Process-14:
Process Process-2:
Process Process-10:
Traceback (most recent call last):
Process Process-6:
Traceback (most recent call last):
Process Process-3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site

                           title      tag           artist  year  views  \
0                Honky Tonk Wine  country  Jerry Lee Lewis  1973     62   
1  If You Should Come Back Today  country    Charley Pride  1968    169   
2                       Novocain  country    Jessica Meuse  2018     19   
3                      Boys Girl  country     Kylie Morgan  2019    274   
4                    Its My Time  country     Dolly Parton  1969   1063   

  features                                             lyrics       id  \
0       {}  Yeah yeah sweet sweet honky tonk wine keeps me...  1412794   
1       {}  You'd stop at hundred and forty tears I forget...  1303309   
2       {}  It’s just a relapse, it was my last time out t...  5192852   
3       {}  When I get married, if I ever do\nDon't give m...  5133679   
4       {}  It's my time\nGather round girls\nYou I grew u...   207638   

  language_cld3 language_ft language  \
0            en          en       en   
1            en         

In [2]:
# Load the cleaned dataset from the new CSV file if df does not exist
if "df" not in locals():
    df = pd.read_csv("lyrics_cleaned.csv")

# df = df.dropna(subset=["clean_lyrics"])
df = df.dropna(subset=["lyrics"])

# Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(
#     df["clean_lyrics"], df["tag"], test_size=0.2, random_state=42, stratify=df["tag"]
# )
X_train, X_test, y_train, y_test = train_test_split(
    df["lyrics"], df["tag"], test_size=0.2, random_state=42, stratify=df["tag"]
)

In [3]:
# TF-IDF vectorizer
# vectorizer = TfidfVectorizer()

# BoW vectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training set and transform the training and testing sets, show progress bar
X_train_vector = vectorizer.fit_transform(tqdm(X_train))
X_test_vector = vectorizer.transform(tqdm(X_test))

# Train a multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vector, y_train)

# Predict the genre of the testing set
y_pred = classifier.predict(X_test_vector)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

100%|██████████| 319996/319996 [00:12<00:00, 25853.55it/s]
100%|██████████| 79999/79999 [00:03<00:00, 26060.56it/s]


Accuracy: 0.5631570394629933
              precision    recall  f1-score   support

     country       0.62      0.66      0.64     16000
         pop       0.35      0.21      0.27     16000
         rap       0.76      0.75      0.75     16000
          rb       0.53      0.59      0.56     16000
        rock       0.49      0.60      0.54     15999

    accuracy                           0.56     79999
   macro avg       0.55      0.56      0.55     79999
weighted avg       0.55      0.56      0.55     79999



In [4]:
# Train a Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vector, y_train)

# Predict the genre of the testing set
y_pred = classifier.predict(X_test_vector)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.5569819622745285
              precision    recall  f1-score   support

     country       0.62      0.68      0.65     16000
         pop       0.34      0.31      0.32     16000
         rap       0.78      0.72      0.75     16000
          rb       0.56      0.54      0.55     16000
        rock       0.49      0.54      0.51     15999

    accuracy                           0.56     79999
   macro avg       0.56      0.56      0.56     79999
weighted avg       0.56      0.56      0.56     79999



In [4]:
# Tokenize the clean lyrics, showing a progress bar
X_train_tokens = [lyrics.split() for lyrics in tqdm(X_train, desc="Processing X_train")]
X_test_tokens = [lyrics.split() for lyrics in tqdm(X_test, desc="Processing X_test")]

embedding_size = 300
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=embedding_size,
    window=5,
    min_count=1,
    workers=16,
)


# Convert lyrics to word embeddings
def lyrics_to_embeddings(lyrics_tokens, word2vec_model):
    embeddings = []
    for tokens in tqdm(lyrics_tokens, desc="Converting lyrics to embeddings"):
        token_embeddings = [
            word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv
        ]
        if len(token_embeddings) > 0:
            embedding = np.mean(token_embeddings, axis=0)
        else:
            embedding = np.zeros(embedding_size)
        embeddings.append(embedding)
    return np.array(embeddings)


X_train_embeddings = lyrics_to_embeddings(X_train_tokens, word2vec_model)
X_test_embeddings = lyrics_to_embeddings(X_test_tokens, word2vec_model)

# Train a Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_embeddings, y_train)

# Make predictions on the testing data
y_pred = classifier.predict(X_test_embeddings)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Processing X_train: 100%|██████████| 319996/319996 [00:04<00:00, 74921.61it/s] 
Processing X_test: 100%|██████████| 79999/79999 [00:00<00:00, 105955.38it/s]
Converting lyrics to embeddings: 100%|██████████| 319996/319996 [00:42<00:00, 7454.99it/s]
Converting lyrics to embeddings: 100%|██████████| 79999/79999 [00:11<00:00, 6989.67it/s]


Accuracy: 0.5729696621207765
              precision    recall  f1-score   support

     country       0.61      0.69      0.65     16000
         pop       0.36      0.26      0.30     16000
         rap       0.75      0.79      0.77     16000
          rb       0.55      0.57      0.56     16000
        rock       0.52      0.57      0.55     15999

    accuracy                           0.57     79999
   macro avg       0.56      0.57      0.56     79999
weighted avg       0.56      0.57      0.56     79999



In [3]:
# Load the pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
# model = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", device="cuda")

# Encode the clean lyrics
X_train_embeddings = model.encode(
    list(X_train), show_progress_bar=True, convert_to_tensor=True
)
X_test_embeddings = model.encode(
    list(X_test), show_progress_bar=True, convert_to_tensor=True
)

Batches:   0%|          | 0/10000 [00:00<?, ?it/s]

Batches:   0%|          | 0/2500 [00:00<?, ?it/s]

In [4]:
# Train a Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_embeddings.cpu().numpy(), y_train)

# Make predictions on the testing data
y_pred = classifier.predict(X_test_embeddings.cpu().numpy())

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.5703875
              precision    recall  f1-score   support

     country       0.59      0.68      0.63     16000
         pop       0.37      0.26      0.30     16000
         rap       0.75      0.78      0.77     16000
          rb       0.55      0.58      0.56     16000
        rock       0.52      0.55      0.54     16000

    accuracy                           0.57     80000
   macro avg       0.56      0.57      0.56     80000
weighted avg       0.56      0.57      0.56     80000

