First, we manage all imports for the code.

In [None]:
!pip install gensim
!pip install pandas
!pip install nltk
!pip install install -Uq bertopic
!pip install torch

In [None]:
#imports
import pandas as pd
import numpy as np
import random
import ast
from torch import nn
import torch
import re
import nltk
from torch.utils.data import random_split, Subset, DataLoader
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import zipfile
import os
import shutil

import bertopic
from sentence_transformers import SentenceTransformer

Next we extract the zipped data into CSV files. The data is stored as a .zip archive because that is smaller in git; locally it is extracted to CSV.

In [None]:
def _extract_dataset(archive_path, target_dir):
    """Unpack ``archive_path`` into ``target_dir`` and flatten its ``dataset`` folder.

    The archive is extracted into ``target_dir``; every regular file found in
    ``target_dir/dataset`` is then moved one level up into ``target_dir`` and
    the (now empty) ``dataset`` folder is removed.

    Raises:
        OSError: if ``dataset`` still contains entries (e.g. sub-directories)
            after the files were moved, because ``os.rmdir`` only removes
            empty directories.
    """
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(target_dir)

    extracted_dir = os.path.join(target_dir, "dataset")
    for file_name in os.listdir(extracted_dir):
        source = os.path.join(extracted_dir, file_name)
        if os.path.isfile(source):
            shutil.move(source, os.path.join(target_dir, file_name))

    os.rmdir(extracted_dir)


# Doing the work inside a function keeps the loop variables out of the notebook
# namespace, so no trailing `del` is needed (the old `del source, destination`
# raised NameError whenever the extracted folder was empty).
_extract_dataset("data/dataset.zip", "data")

Preprocessing of the datasets. The goal is to obtain a table with the following columns: Name, Lyrics, Genre<br>
df Dataset - Name, Lyrics, Genre<br>
df3 Dataset - 10000 entries of randomly selected lyrics to train doc2vec

In [18]:

# Load the raw datasets.
df = pd.read_csv('data/Spotify-2000.csv')
# Keep only the name and genre; .copy() makes df an independent frame so that
# adding/modifying columns below does not operate on a slice of the raw data.
df = df[['Title', 'Top Genre']].copy()
df2 = pd.read_csv('data/spotify_millsongdata.csv')
df4 = pd.read_csv('data/tcc_ceds_music.csv')
df4 = df4[['track_name', 'genre', 'lyrics']]

# Combine df and df2 by (case-insensitive) title.
# A dict lookup replaces the former nested loop over both frames. When several
# df2 rows share a title, the later row overwrites the earlier one in the dict,
# which matches the old loop where the last matching row won.
lyrics_by_title = dict(zip(df2['song'].str.lower(), df2['text']))

# Titles without a match get an empty string, which the later cell uses as the
# "no lyrics found" marker.
df['lyrics'] = df['Title'].str.lower().map(lyrics_by_title).fillna('')

# Number of songs for which lyrics were found.
found = int((df['lyrics'] != '').sum())

print("finished combining..")

finished combining..


Collect 10000 random entries of lyrics from the millsongdata dataset

In [None]:
# Draw the doc2vec training corpus from the lyrics column of df2.
# random_state makes the sample reproducible across kernel restarts; the size
# is capped at len(df2) so a smaller input file does not raise a ValueError.
df3 = df2['text'].sample(n=min(10000, len(df2)), random_state=42)
print(df3)
print('done')

In [None]:
# Remove songs which were not in both datasets (their lyrics are still '').
# The rows are selected by index *label*: DataFrame.drop is label-based, so the
# positional counters collected before were only correct while df still had
# its original 0..n-1 index.
songs_to_remove = df.index[df['lyrics'] == ''].tolist()
# drop() returns a new frame; rebinding df avoids in-place mutation.
df = df.drop(index=songs_to_remove)

print(df)

Further preprocessing of the lyrics themselves. We remove stopwords (taken from NLTK) and punctuation (via regex) from df and df3.

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = stopwords.words('english')
punc_regex = r'[^\s\w]'      # matches every character that is neither a word character nor whitespace
stopword_regex = r'\b{0}\b'  # template: match the inserted expression as a whole word
space_regex = r'\s\s+'       # runs of two or more whitespace characters
newl_regex = r'\n|\r'

# One precompiled alternation over all stopwords instead of one re.sub call per
# stopword and document. After punctuation removal the text only contains word
# characters and whitespace, so removing whole words in a single pass yields
# the same result as removing them one stopword at a time.
_stopword_pattern = re.compile(
    stopword_regex.format('(?:' + '|'.join(re.escape(word) for word in sw) + ')'),
    flags=re.IGNORECASE,
)


def clean_lyrics(text):
    """Return ``text`` without punctuation and stopwords, with whitespace collapsed.

    Steps, in order:
      1. delete every character matched by ``punc_regex``,
      2. delete every whole-word, case-insensitive occurrence of a stopword,
      3. replace each run of two or more whitespace characters by one space.

    NOTE(review): punctuation is stripped before the stopword step, so any
    stopword entries that contain an apostrophe can never match here.
    """
    text = re.sub(punc_regex, '', text)
    text = _stopword_pattern.sub('', text)
    return re.sub(space_regex, ' ', text)


print(df3.iloc[1])

# Preprocessing of df: assign() builds a new frame instead of writing through
# chained indexing (df['lyrics'].iloc[x] = ...), which pandas does not
# guarantee to propagate to df.
df = df.assign(lyrics=df['lyrics'].map(clean_lyrics))

# Preprocessing of df3 (a Series of raw lyrics).
df3 = df3.map(clean_lyrics)

#print(df)
print(df3.iloc[1])

In [None]:
# Sentence embedding model handed to BERTopic.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = bertopic.BERTopic(language='english', embedding_model=sentence_model, verbose=True)

# Pass a plain list of strings: df's index is no longer contiguous after rows
# were dropped, and a list avoids any label-based lookups on a pandas Series
# inside the library.
topics, probs = topic_model.fit_transform(df["lyrics"].tolist())

print(topics)

In [None]:
# Overview table of the topics found by the fitted model.
topic_overview = topic_model.get_topic_info()
print(topic_overview)

# Entry stored under key 1 of the topic mapping.
all_topics = topic_model.get_topics()
print(all_topics[1])

Tokenize the lyrics and create tagged Documents

In [None]:
nltk.download('punkt')
# NOTE(review): recent NLTK releases appear to load the tokenizer data from
# 'punkt_tab' instead of 'punkt' - confirm against the installed version. On
# releases that do not know this resource the call only reports an error.
nltk.download('punkt_tab')

# One TaggedDocument per lyric in df3. `tags` must be a *list* of tags: a bare
# string such as "123" is treated as the sequence of tags '1', '2', '3', which
# made all documents share the ten digit tags.
tagged_data = [
    TaggedDocument(words=nltk.tokenize.word_tokenize(lyrics), tags=[str(position)])
    for position, lyrics in enumerate(df3)
]

print(len(tagged_data))

Setting up the Doc2Vec model

In [None]:
# Hyperparameters of the document-embedding model.
doc2vec_params = dict(vector_size=100, min_count=1, epochs=30)
doc2vec_model = Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged lyrics, then train on the same corpus
# using the corpus size and epoch count stored on the model.
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(
    tagged_data,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

print("Doc2Vec model finished training")

Building the DataLoader for the machine learning model

In [None]:
columns = ['Token', 'Target']
df_for_dataloader = pd.DataFrame(columns=columns)

# Map every genre to an integer id in order of first appearance.
lookup_dict = {}
for genre in df["Top Genre"]:
    lookup_dict.setdefault(genre, len(lookup_dict))

token_list = []
target_list = []
for lyrics, genre in zip(df["lyrics"], df["Top Genre"]):
    # Model input: the Doc2Vec vector inferred for the tokenized lyrics.
    words = nltk.tokenize.word_tokenize(lyrics)
    token_list.append([doc2vec_model.infer_vector(words)])

    # Target: one-hot vector over the genres, in lookup_dict order.
    one_hot = [1 if genre == known_genre else 0 for known_genre in lookup_dict]
    target_list.append([np.array(one_hot)])

# Every dataset item is a ([vector], [one_hot]) pair; the single-element list
# wrappers are kept because later cells index into them with [0].
dataloader = DataLoader(list(zip(token_list, target_list)), batch_size=32)

In [None]:
# Device the model is moved to; fixed to the CPU for now.
device = torch.device('cpu') # alternative for faster training in the end: torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SongModel(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Architecture: Linear -> Sigmoid -> Linear -> Tanh -> Linear. Both hidden
    layers have ``3 * input_size`` units. All weights use Xavier-uniform
    initialisation and all biases start at zero.

    Args:
        input_size: number of features of one input vector.
        output_size: number of classes (length of one target vector).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = input_size * 3
        self.output_size = output_size
        self.layer_1 = nn.Linear(self.input_size, self.hidden_size)
        self.layer_2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.layer_out = nn.Linear(self.hidden_size, self.output_size)
        # Explicit dim: normalise over the class axis (the last one).
        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        for layer in (self.layer_1, self.layer_2, self.layer_out):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, d):
        """Return the raw class scores (logits) for input ``d``.

        No softmax is applied here: ``CrossEntropyLoss`` applies log-softmax
        itself, so feeding it already-normalised probabilities would squash
        the scores twice. The argmax over the classes is the same for logits
        and probabilities; use ``predict_proba`` when probabilities are needed.
        """
        x = self.sigmoid(self.layer_1(d))
        x = self.tanh(self.layer_2(x))
        return self.layer_out(x)

    def predict_proba(self, d):
        """Return class probabilities (softmax over the last axis) for ``d``."""
        return self.softmax(self(d))

# Read the layer sizes from the first dataset item: the length of its input
# vector and the length of its one-hot target.
first_token, first_target = dataloader.dataset[0]
model = SongModel(first_token[0].size, first_target[0].size).to(device)

# Plain stochastic gradient descent with a cross-entropy objective.
optimizer = torch.optim.SGD(model.parameters(), lr=0.025)
loss_fn = nn.CrossEntropyLoss()

In [None]:
epochs = 1000
# Length of one target vector. Not used inside this loop; kept because it is a
# notebook-level name that other cells may read.
ff = dataloader.dataset[0][1][0].size

for t in range(epochs):
    total_loss = 0.0
    iterations = 0
    for sample in dataloader:
        # Each batch is ([inputs], [targets]); unwrap the single-element lists
        # and move the tensors to the same device as the model.
        model_input = sample[0][0].to(device)
        should = sample[1][0].to(device)

        predict = model(model_input)
        # The one-hot targets are passed as float class probabilities.
        loss = loss_fn(predict, should.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() stores a plain Python float; summing the loss tensors
        # themselves would keep every batch's autograd graph referenced.
        total_loss += loss.item()
        iterations += 1

    # iterations equals len(dataloader) here; max() guards an empty loader.
    print(f"Epoch {t+1:3}. Loss: {total_loss / max(iterations, 1):3.20}", end="\r")