First, we manage all imports for the code.

In [34]:
!pip install gensim

The folder you are executing pip from can no longer be found.


In [35]:
#imports
import pandas as pd
import numpy as np
import random
import ast
from torch import nn
import torch
import re
import nltk
from torch.utils.data import random_split, Subset, DataLoader
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import zipfile
import os
import shutil

Next, we extract the zipped data to CSV files. The dataset is stored as a .zip archive because that takes up less space in the git repository; it is extracted to CSV locally.

In [None]:
# Unpack the archive into data/. The code below expects the archive to contain
# a top-level "dataset" folder, so the files end up in data/dataset/.
with zipfile.ZipFile("data/dataset.zip", "r") as zip_ref:
    zip_ref.extractall("data")

# Bound up front so the cleanup `del` at the end cannot raise NameError when
# data/dataset is empty and the loop body never runs.
source = destination = None

# Move every extracted file one level up, from data/dataset/ into data/.
for file_name in os.listdir("data/dataset"):
    source = os.path.join("data", "dataset", file_name)
    destination = os.path.join("data", file_name)
    if os.path.isfile(source):
        shutil.move(source, destination)

# os.rmdir only removes empty directories, so this fails loudly if anything
# (e.g. a sub-folder, which the loop above skips) was left behind.
os.rmdir("data/dataset")
del source, destination

Preprocessing of the datasets: the goal is to obtain a table with the following columns: Name, Lyrics, Genre.

In [None]:

df = pd.read_csv('data/Spotify-2000.csv')
# Keep only the title and genre. .copy() makes this an independent frame, so
# adding the lyrics column below does not write into a slice of the raw data.
df = df[['Title', 'Top Genre']].copy()
df2 = pd.read_csv('data/spotify_millsongdata.csv')

# Case-insensitive lookup table: lower-cased song title -> lyrics text.
# When a title occurs more than once the last row wins, which is the same
# result the previous nested loop produced (it never stopped at the first hit).
lyrics_by_title = dict(zip(df2['song'].str.lower(), df2['text']))

# Combine the two datasets by title with one dictionary lookup per song
# instead of comparing every title against every row of df2.
# Songs without a match get an empty string so they can be filtered out later.
df['lyrics'] = df['Title'].str.lower().map(lyrics_by_title).fillna('')

# Number of songs for which lyrics were found.
found = int((df['lyrics'] != '').sum())

print("finished combining..")

In [None]:
# Remove songs which were not in both datasets, i.e. rows whose lyrics are
# missing, empty or whitespace-only.
has_no_lyrics = df['lyrics'].fillna('').str.strip() == ''

# Collect index *labels* (not positions) so the drop is correct for any index,
# and rebind instead of dropping in place so re-running this cell is harmless.
songs_to_remove = df.index[has_no_lyrics].tolist()
df = df.drop(index=songs_to_remove)

print(df)

Further preprocessing of the lyrics themselves. We remove punctuation with a regex and remove stopwords using the stopword list from NLTK.

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = stopwords.words('english')
punc_regex = r'[^\s\w]' # matches every character that is neither a word character nor whitespace
stopword_regex = r'\b{0}\b'

# One compiled, case-insensitive alternation over all stopwords. A single pass
# with this pattern removes the same whole words as substituting each stopword
# separately, without running one regex pass per stopword per song.
stopword_pattern = re.compile(
    stopword_regex.format('(?:' + '|'.join(re.escape(word) for word in sw) + ')'),
    flags=re.IGNORECASE,
)


def clean_lyrics(lyrics):
    """Return ``lyrics`` with punctuation and English stopwords removed.

    Punctuation is deleted first, then every whole-word stopword match is
    deleted (case-insensitively). Surrounding whitespace is left untouched.

    NOTE(review): because punctuation is stripped first, a contraction such as
    "don't" becomes "dont" before the stopword pass - verify that the stopword
    list still covers those forms.
    """
    without_punctuation = re.sub(punc_regex, '', lyrics)
    return stopword_pattern.sub('', without_punctuation)


print(df['lyrics'].iloc[1])

# Assign the whole column at once; writing single cells through
# df['lyrics'].iloc[x] is chained assignment and may not reach df.
df['lyrics'] = df['lyrics'].map(clean_lyrics)

print(df['lyrics'].iloc[1])

Tokenize the lyrics and create tagged Documents

In [None]:
# NOTE(review): word_tokenize needs NLTK tokenizer data that this notebook does
# not download in any visible cell - confirm it is available in the environment.

# One TaggedDocument per song. `tags` has to be a *list* of tags: a bare string
# is iterated character by character, so document "12" would be tagged "1" and
# "2" and share its vector with other documents.
tagged_data = [
    TaggedDocument(words=nltk.tokenize.word_tokenize(lyrics), tags=[str(i)])
    for i, lyrics in enumerate(df['lyrics'])
]

# Preview only the first few documents instead of dumping the whole corpus.
print(tagged_data[:3])

Setting up the Doc2Vec model

In [None]:
# Create the Doc2Vec model (vector_size=100, min_count=1, epochs=30), build its
# vocabulary from the tagged lyrics and train it on the same documents.
model = Doc2Vec(vector_size=100, min_count=1, epochs=30)
model.build_vocab(tagged_data)
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

print("Doc2Vec model finished training")

Building the DataLoader for the machine learning model

In [None]:
columns = ['Token', 'Target']

# Build all rows first and create the DataFrame once. Concatenating one row per
# iteration copies the frame every time and gave every row the index label 0.
records = [
    {
        # Document vector inferred by the Doc2Vec model from the song's tokens.
        'Token': model.infer_vector(document.words),
        # TODO: placeholder value - should be the (encoded) genre of the song.
        'Target': 5,
    }
    for document in tagged_data
]
df_for_dataloader = pd.DataFrame(records, columns=columns)

display(df_for_dataloader.iloc[:5])

In [None]:
# Everything runs on the CPU for now. For faster training in the end, use:
#   torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")

class SongModel(nn.Module):
    """Fully connected classifier with two hidden layers.

    Architecture: Linear -> Sigmoid -> Linear -> Tanh -> Linear -> Softmax.
    Both hidden layers have ``3 * input_size`` units. All weights are
    initialised with Xavier-uniform and all biases with zeros.

    Args:
        input_size: Number of features of one input sample.
        output_size: Number of values (classes) the model outputs per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = input_size * 3
        self.output_size = output_size
        self.layer_1 =   nn.Linear(self.input_size, self.hidden_size)
        self.layer_2 =   nn.Linear(self.hidden_size, self.hidden_size)
        self.layer_out = nn.Linear(self.hidden_size, self.output_size)
        # dim=-1 normalises over the last (class) axis. Leaving `dim` unset is
        # deprecated, emits a warning on every forward pass and lets PyTorch
        # guess the axis from the number of input dimensions.
        self.softmax =   nn.Softmax(dim=-1)
        self.sigmoid =   nn.Sigmoid()
        self.tanh =      nn.Tanh()

        # Same initialisation for every linear layer, in layer order.
        for layer in (self.layer_1, self.layer_2, self.layer_out):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, d):
        """Return the softmax-normalised output for the input tensor ``d``.

        The last dimension of ``d`` must have ``input_size`` entries; the last
        dimension of the result has ``output_size`` entries that sum to 1.

        NOTE(review): the result is already a probability distribution.
        ``torch.nn.CrossEntropyLoss`` applies log-softmax itself and expects
        raw scores, so combining it with this model normalises twice -
        consider a loss that takes probabilities or returning raw scores.
        """
        x = self.sigmoid(self.layer_1(d))
        x = self.tanh(self.layer_2(x))
        x = self.softmax(self.layer_out(x))
        return x

# Each sample fed to the network is one inferred Doc2Vec vector, so the input
# size has to equal the Doc2Vec vector_size (100). An input size of 1 cannot
# consume those vectors.
# NOTE(review): 3 output classes looks like a placeholder - confirm against
# the number of genres that end up in the 'Target' column.
# NOTE: from here on `model` refers to the classifier, no longer to the
# Doc2Vec model; re-running the Doc2Vec inference cell afterwards requires
# re-running the Doc2Vec training cell first.
model = SongModel(100, 3).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.025)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader`` and report the mean loss.

    Args:
        dataloader: Iterable of samples; ``sample[0]`` is the model input and
            ``sample[1]`` the target tensor.
        model: Callable that maps the model input to a prediction.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor that supports ``backward()``.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per sample.

    Returns:
        The mean loss per iteration as a Python float, or ``nan`` when the
        dataloader yielded no samples. The same value is printed.
    """
    total_loss = 0.0
    iterations = 0
    for sample in dataloader:
        model_input = sample[0]
        should = sample[1]
        predict = model(model_input)
        # NOTE(review): the target is always cast to float. This fits targets
        # given as per-class probabilities / one-hot rows; integer class
        # indices would need a different dtype - confirm the target format.
        loss = loss_fn(predict, should.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() extracts a plain number. Summing the loss tensors themselves
        # would keep a reference to every iteration's autograd history.
        total_loss += loss.item()
        iterations += 1
        if iterations % 1000 == 0:
            print(f"Predict: {predict}, Loss: {loss}")

    # Divide by the number of processed iterations instead of len(dataloader):
    # works for iterables without __len__ and cannot divide by zero.
    mean_loss = total_loss / iterations if iterations else float('nan')
    print(mean_loss)
    return mean_loss