In [1]:
import csv
import os
import re

import nltk
#nltk.download('punkt')
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk import sent_tokenize
from nltk import word_tokenize

In [2]:
import logging
# Route INFO-level (and above) log records to the console with a timestamp and
# level prefix, so that library progress messages show up in the cell output.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

## Loading files and removing NaNs

In [3]:
# Read the semicolon-separated song table and discard every row that has no
# value in the "filename" column (the original row index is kept).
songs_csv_path = "songlyrics/songs_with_filenames_and_feats_4.csv"
df = pd.read_csv(songs_csv_path, sep=";")
df = df.dropna(subset=["filename"])

In [4]:
# Directory holding the lyrics files; the values of df["filename"] are joined
# onto this path when the lyrics are read.
DATA = "songlyrics/lyrics"

In [5]:
# Names of all entries in the lyrics directory. os.listdir returns them in
# arbitrary order and does not distinguish files from subdirectories.
songs = os.listdir(DATA)

In [6]:
# Sanity check: how many entries the lyrics directory contains (this counts
# directory entries, not rows of df).
print(f"found {len(songs)} unique files")

found 5809 unique files


## String Processing and Tokenization

We extract tokens, bigrams and trigrams from each song's lyrics.

In [7]:
def get_grams(text):
    """Return the unigrams, bigrams and trigrams of ``text`` as one flat list.

    ``text`` is split into tokens with NLTK's ``word_tokenize``. Each bigram
    and trigram is encoded as a single string whose tokens are joined by an
    underscore (``"a_b"``, ``"a_b_c"``).

    Returns a list laid out as: all tokens, then all bigrams, then all
    trigrams, each group in order of appearance.
    """
    unigrams = word_tokenize(text)
    bigrams = ["_".join(pair) for pair in nltk.bigrams(unigrams)]
    trigrams = ["_".join(triple) for triple in nltk.trigrams(unigrams)]
    return unigrams + bigrams + trigrams

In [8]:
def _tagged_lyrics(filename):
    """Read one lyrics file and wrap its lower-cased n-grams in a TaggedDocument.

    ``filename`` is resolved relative to ``DATA`` and is also used as the
    document's single tag.
    """
    lyrics_path = os.path.join(DATA, filename)
    with open(lyrics_path, "r", encoding='utf-8') as handle:
        grams = get_grams(handle.read().lower())
        return TaggedDocument(grams, (filename,))


# One TaggedDocument per row of df, in row order.
# NOTE(review): the training log further down reports fewer unique tags than
# examples, so some filenames appear more than once in df — confirm whether
# those duplicates are intended or should be dropped before training.
data = [_tagged_lyrics(filename) for filename in df["filename"]]

## Doc2Vec model training on extracted features

In [12]:
# Doc2Vec hyperparameters, named so they can be tuned in one place.
VECTOR_SIZE = 100  # dimensionality of each learned document vector
N_WORKERS = 8      # number of training threads

# Passing the corpus to the constructor builds the vocabulary and trains the
# model in one step. Doc2Vec is imported in the notebook's first cell.
# NOTE(review): gensim documents that training with more than one worker
# thread is not exactly reproducible between runs — confirm this is acceptable.
model = Doc2Vec(data, vector_size=VECTOR_SIZE, workers=N_WORKERS)

2021-05-31 16:56:48,638 : INFO : collecting all words and their counts
2021-05-31 16:56:48,640 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-05-31 16:56:52,026 : INFO : collected 2196567 word types and 5715 unique tags from a corpus of 5738 examples and 8199708 words
2021-05-31 16:56:52,026 : INFO : Creating a fresh vocabulary
2021-05-31 16:56:53,604 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 178277 unique words (8.11616490641988%% of original 2196567, drops 2018290)', 'datetime': '2021-05-31T16:56:53.604652', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
2021-05-31 16:56:53,605 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 5413500 word corpus (66.02064366194503%% of original 8199708, drops 2786208)', 'datetime': '2021-05-31T16:56:53.605572', 'gensim': '4.0.1', 'pytho

## Saving the Doc2Vec document vectors to file for further analysis

In [15]:
# `model.dv` is the trained model's store of document vectors (gensim 4 API).
# Keep a handle to it so the vectors can be saved on their own.
doc2vec_vectors = model.dv

In [16]:
# Persist only the document vectors. The target directory is created first so
# that the save also works on a fresh checkout where it does not exist yet.
VECTORS_PATH = "word2vecmodels/docs_grams_debug.model"
os.makedirs(os.path.dirname(VECTORS_PATH), exist_ok=True)
doc2vec_vectors.save(VECTORS_PATH)

2021-05-31 16:58:06,173 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'word2vecmodels/docs_grams_debug.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-05-31T16:58:06.173525', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.10', 'event': 'saving'}
2021-05-31 16:58:06,182 : INFO : saved word2vecmodels/docs_grams_debug.model
