In [1]:
import os
import pandas as pd
import csv
import re

import nltk
#nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize
from gensim.models.doc2vec import TaggedDocument

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Loading files and removing NaNs

In [3]:
df = pd.read_csv("songlyrics/songs_with_filenames_and_feats_6.csv", delimiter=";")
df = df[df['filename'].notna()]

In [4]:
DATA = "songlyrics/lyrics"

In [5]:
songs = os.listdir(DATA)

In [6]:
print(f"found {len(songs)} unique files") 

found 6671 unique files


## String Processing and Tokenization

We extract tokens, bigrams and trigrams from each lyrics

In [7]:
def get_grams(text):
    toks = word_tokenize(text)
    bigs = [a + "_" + b for a,b in nltk.bigrams(toks)]
    trigs = [a + "_" + b + "_" + c for a,b,c in nltk.trigrams(toks)]
    return toks + bigs + trigs

In [8]:
data = list()
for song in df["filename"]:
    with open(os.path.join(DATA,song),"r",encoding='utf-8') as file:
        lyrics = get_grams(file.read().lower())
        data.append(TaggedDocument(lyrics ,(song,)))

## Word2Vec model training on extracted features

In [9]:
from gensim.models import Doc2Vec
model = Doc2Vec(data, vector_size=100, workers=8)

2021-06-23 16:11:05,508 : INFO : collecting all words and their counts
2021-06-23 16:11:05,509 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-06-23 16:11:08,803 : INFO : collected 2311575 word types and 6094 unique tags from a corpus of 6137 examples and 8629734 words
2021-06-23 16:11:08,804 : INFO : Creating a fresh vocabulary
2021-06-23 16:11:10,365 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 184790 unique words (7.994116565545137%% of original 2311575, drops 2126785)', 'datetime': '2021-06-23T16:11:10.341207', 'gensim': '4.0.1', 'python': '3.8.10 (default, Jun  4 2021, 15:09:15) \n[GCC 7.5.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.17', 'event': 'prepare_vocab'}
2021-06-23 16:11:10,366 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 5724104 word corpus (66.33001666100022%% of original 8629734, drops 2905630)', 'datetime': '2021-06-23T16:11:10.366803', 'gensim': '4.0.1', 'pyth

## We save the Doc2Vec vectors to file for further analysis 

In [10]:
doc2vec_vectors = model.dv

In [11]:
doc2vec_vectors.save("word2vecmodels/docs_grams_final.model")

2021-06-23 16:12:23,214 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'word2vecmodels/docs_grams_final.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-06-23T16:12:23.214456', 'gensim': '4.0.1', 'python': '3.8.10 (default, Jun  4 2021, 15:09:15) \n[GCC 7.5.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.17', 'event': 'saving'}
2021-06-23 16:12:23,221 : INFO : saved word2vecmodels/docs_grams_final.model
