In [1]:
import os
import pandas as pd
import csv
import re

import nltk
#nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize
from gensim.models.doc2vec import TaggedDocument

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Loading files and removing NaNs

In [3]:
df = pd.read_csv("songs_with_filenames_and_feats.csv", delimiter=";")
df = df[df['filename'].notna()]

In [4]:
DATA = "songlyrics/lyrics"

In [5]:
songs = os.listdir(DATA)

In [6]:
print(f"found {len(songs)} unique files") 

found 5163 unique files


## String Processing and Tokenization

We extract tokens, bigrams and trigrams from each lyrics

In [7]:
def get_grams(text):
    toks = word_tokenize(text)
    bigs = [a + "_" + b for a,b in nltk.bigrams(toks)]
    trigs = [a + "_" + b + "_" + c for a,b,c in nltk.trigrams(toks)]
    return toks + bigs + trigs

In [8]:
data = list()
for song in df["filename"]:
    with open(os.path.join(DATA,song),"r",encoding='utf-8') as file:
        lyrics = get_grams(file.read().lower())
        data.append(TaggedDocument(lyrics ,(song,)))

## Word2Vec model training on extracted features

In [11]:
from gensim.models import Doc2Vec
model = Doc2Vec(data, vector_size=100, window=10, min_count=5, workers=8)

2021-05-25 11:38:47,701 : INFO : collecting all words and their counts
2021-05-25 11:38:47,702 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-05-25 11:38:53,433 : INFO : collected 4160895 word types and 5163 unique tags from a corpus of 5183 examples and 18486903 words
2021-05-25 11:38:53,434 : INFO : Creating a fresh vocabulary
2021-05-25 11:38:56,023 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 366107 unique words (8.798756036862262%% of original 4160895, drops 3794788)', 'datetime': '2021-05-25T11:38:56.023143', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
2021-05-25 11:38:56,023 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 12693570 word corpus (68.66250123127708%% of original 18486903, drops 5793333)', 'datetime': '2021-05-25T11:38:56.023905', 'gensim': '4.0.1', 'p

## We save the Doc2Vec vectors to file for further analysis 

In [13]:
doc2vec_vectors = model.dv

In [16]:
doc2vec_vectors.save("docs_grams.model")

2021-05-25 11:45:56,098 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'aa.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-05-25T11:45:56.098266', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 18:05:58) \n[GCC 7.3.0]', 'platform': 'Linux-5.11.0-7614-generic-x86_64-with-glibc2.10', 'event': 'saving'}
2021-05-25 11:45:56,110 : INFO : saved aa.model
