In [1]:
import gensim
import pandas as pd
import spacy
from gensim.models import FastText
from gensim.models import Word2Vec

In [2]:
# Location of the combined lyrics CSV, relative to the notebook's working directory.
df_path = "dataframes/lyrics_combined.csv"

In [7]:
# Load the combined lyrics table from df_path; index_col=0 makes the first CSV
# column the row index instead of a regular data column.
df = pd.read_csv(df_path, index_col=0)

In [8]:
# Renumber the rows 0..n-1 and discard the index that was read from the CSV
# (drop=True keeps the old index from being re-inserted as a column).
# Reassigning instead of using inplace=True avoids silently mutating a frame
# that other cells share.
df = df.reset_index(drop=True)

In [9]:
# Preview the first rows of the lyrics table (rendered as the cell's output).
df.head()

Unnamed: 0,Artist,all_lyrics,genre,gender
0,Aaliyah,dirty south can yall really feel me east coas...,pop,female
1,Beyoncé,ive been drinkin ive been drinkin i get filth...,pop,female
2,Britney Spears,oh baby baby oh baby baby oh baby baby how w...,pop,female
3,Carly Rae Jepsen,i threw a wish in the well dont ask me ill ne...,pop,female
4,Charli XCX,i was busy thinkin bout boys boys boys always...,pop,female


In [11]:
# Load the spaCy pipeline that later provides sentence boundaries and lemmas.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

In [12]:
# Concatenate every artist's lyrics into one lower-cased, space-separated corpus string.
# Rows with missing lyrics (NaN) are dropped first: str.join raises TypeError
# as soon as it meets a non-string item.
lyrics = ' '.join(df['all_lyrics'].dropna().str.lower())

In [13]:
# Corpus size in characters; spaCy's nlp.max_length must be at least this large
# for the whole string to be processed in one call.
len(lyrics)

1583735

In [15]:
# spaCy rejects texts longer than nlp.max_length. Derive the limit from the
# corpus itself instead of a hand-copied constant, so the cell keeps working
# when the data grows; never lower a limit that is already large enough.
nlp.max_length = max(nlp.max_length, len(lyrics) + 1)

In [16]:
# Run the spaCy pipeline over the entire corpus as one document; the resulting
# Doc is what the sentence and lemma steps below read from.
processed_lyrics = nlp(lyrics)

In [17]:
# Materialise spaCy's sentence spans into a list so they can be counted and reused.
sentences = list(processed_lyrics.sents)

In [18]:
# Number of sentences spaCy detected in the corpus.
sentence_count = len(sentences)
print(sentence_count)

11868


In [22]:
# Rough corpus size: number of whitespace-separated tokens in the processed text.
whitespace_tokens = processed_lyrics.text.split()
print(len(whitespace_tokens))

326329


#### Preprocess the data for training the model (lemmatize)

In [24]:
# Build the training corpus: one list of lemma strings per detected sentence.
processed_sentences = []
for sentence_span in processed_lyrics.sents:
    # Span.lemma_ is the sentence's lemmas as a single string; split it on whitespace.
    lemma_tokens = sentence_span.lemma_.split()
    processed_sentences.append(lemma_tokens)

#### Word2Vec model

In [28]:
# Hyper-parameters for the Word2Vec model trained on the lemmatized lyrics.
word2vec_params = dict(
    min_count=10,       # pruning the internal dictionary of infrequent words
    vector_size=200,    # number of dimensions (N) each word is mapped onto
    window=2,           # context size: 2 words to the left and 2 to the right
    compute_loss=True,  # keep track of the training loss so it can be read back later
    sg=1,
)
similar_words_model = Word2Vec(sentences=processed_sentences, **word2vec_params)

# Vocabulary size after pruning.
print(len(similar_words_model.wv.key_to_index))

2038


In [29]:
# Read back the loss recorded during training (available because the model
# was created with compute_loss=True) and report it.
training_loss = similar_words_model.get_latest_training_loss()
loss_report = f"Training Loss: {training_loss}"
print(loss_report)

Training Loss: 2385789.75


In [32]:
# Nearest neighbours of 'lover' in the Word2Vec space, printed as (word, similarity) pairs.
lover_neighbours = similar_words_model.wv.most_similar('lover')
for neighbour in lover_neighbours:
    print(neighbour)

('parttime', 0.8531744480133057)
('toro', 0.8056215643882751)
('gimme', 0.7868713736534119)
('jean', 0.7758487462997437)
('rape', 0.7753496170043945)
('danja', 0.7654772996902466)
('ohohoh', 0.7528627514839172)
('merry', 0.7471245527267456)
('woahwoah', 0.7464218735694885)
('mo—', 0.7442200183868408)


### Get similar out-of-corpus words from a FastText model

In [36]:
# Train a FastText model on the lemmatized sentences. FastText is imported with
# the other dependencies in the notebook's first cell rather than mid-notebook.
# NOTE(review): the output recorded for this cell is a TypeError, so the vectors
# queried in the next cell may stem from an earlier run — re-run on a fresh
# kernel to confirm the cell completes.
model = FastText(window=2)  # context window of 2 words
model.build_vocab(corpus_iterable=processed_sentences)
model.train(
    corpus_iterable=processed_sentences,
    total_examples=len(processed_sentences),
    epochs=10,
)

TypeError: can only concatenate str (not "int") to str

In [37]:
# The 20 nearest neighbours of 'love' in the FastText space, as (word, similarity) pairs.
love_neighbours = model.wv.most_similar('love', topn=20)
for neighbour in love_neighbours:
    print(neighbour)

('glove', 0.9545936584472656)
('prove', 0.8232774138450623)
('lovely', 0.8214894533157349)
('lover', 0.8095089793205261)
('loving', 0.7901659607887268)
('logic', 0.7800371646881104)
('loser', 0.7678811550140381)
('shove', 0.7645830512046814)
('los', 0.763531506061554)
('happiness', 0.7496615648269653)
('I’ve', 0.7415767312049866)
('curve', 0.7402692437171936)
('lo', 0.7395698428153992)
('pray', 0.7344256043434143)
('serve', 0.7340152263641357)
('false', 0.7322720289230347)
('lovin', 0.732210636138916)
('remove', 0.7278366088867188)
('lobby', 0.7260144352912903)
('nerve', 0.7228999733924866)
