In [9]:
import numpy as np
import pandas as pd
from gensim.models import nmf, ldamodel
import gensim.models.phrases
from gensim.matutils import corpus2dense
from gensim.corpora.dictionary import Dictionary
from gensim.models.phrases import Phrases
from sklearn.model_selection import train_test_split
from project_functions.modeling import vectorize_data, perform_tsne_analysis

In [10]:
# Load the tokenised lyrics table written by the cleaning stage.
df = pd.read_parquet(path='../data/cleaned/tokenized_data_complete.parquet')

In [11]:
# Display the loaded frame; the notebook truncates it to the first and last rows.
df

Unnamed: 0,title_name,genre_name,artist_name,lyrics,clean_title
0,Onward to Die,Rock-Death-Metal-Black-Metal,Paganizer,"[machine, end, line, spew, forth, death, meat,...",Onward to Die
1,Illumina,Rock-Death-Metal-Black-Metal,Exlibris,"[ready, blade, mental, crusade, matter, right,...",Illumina
2,History Bodies,Rock-Death-Metal-Black-Metal,Amagortis,"[close, eye, leave, control, move, mind, histo...",History Bodies
3,An Eternal Dark Horizon,Rock-Death-Metal-Black-Metal,Throne Of Katarsis,"[ride, northwind, distant, place, know, float,...",An Eternal Dark Horizon
4,An Icon for the Damned,Rock-Death-Metal-Black-Metal,Paganizer,"[immortality, war, dead, rise, worship, icon, ...",An Icon for the Damned
...,...,...,...,...,...
67361,Nuclear Soldier,Reggae,Aswad,"[wan, na, see, nuclear, soldier, wan, na, see,...",Nuclear Soldier
67362,Natty Dread,Reggae,Jaqee,"[start, explain, situation, taste, bitterness,...",Natty Dread
67363,Natty Rebel (2001 Remaster),Reggae,U-Roy,"[natty, rebel, soul, rebel, rebel, soul, rebel...",Natty Rebel (2001 Remaster)
67364,Money Can't Buy Fren,Reggae,Vybz Kartel aka Addi Innocent,"[come, frm, uwi, utech, shi, seh, handsome, ti...",Money Can't Buy Fren


In [12]:
# Hold out 20% of the songs. Stratifying on genre keeps the genre mix the same
# in both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['lyrics'].to_frame(),
    df['genre_name'],
    test_size=0.2,
    random_state=42,
    stratify=df['genre_name'],
)

In [13]:
# Materialise every training song as a plain Python list of tokens.
X_train_corpus = [list(tokens) for tokens in X_train['lyrics']]

In [14]:
# Fit the phrase (bigram) detector on the training split only, so phrase
# statistics never see the held-out lyrics.
bigram_model = Phrases(sentences=X_train_corpus)

In [15]:
# Apply the train-fitted phrase model to each song individually.
# Targeting the `lyrics` column explicitly (instead of DataFrame.apply over every
# column of the frame) keeps this cell re-runnable once `bigrams` exists, and
# `assign` builds a new frame rather than mutating the split result in place.
X_train = X_train.assign(
    bigrams=X_train['lyrics'].apply(lambda tokens: list(bigram_model[list(tokens)]))
)
X_train_bigram_corpus = X_train['bigrams'].tolist()

# Build the vocabulary from the bigram-merged training songs, then drop tokens
# that appear in fewer than 5 songs or in more than 95% of songs.
dct = Dictionary(X_train_bigram_corpus)
dct.filter_extremes(no_below=5, no_above=0.95)

# Sparse bag-of-words (token_id, count) pairs, one list per song.
X_train_bigram_corpus_tokenized = [dct.doc2bow(doc) for doc in X_train_bigram_corpus]

# corpus2dense returns a (num_terms, num_docs) matrix, so transpose to get one
# row per song. Passing num_docs lets gensim preallocate the array instead of
# stacking a list of per-document vectors. Column labels are looked up by token
# id so they are guaranteed to line up with the matrix rows.
X_train_bow = pd.DataFrame(
    corpus2dense(
        X_train_bigram_corpus_tokenized,
        num_terms=len(dct),
        num_docs=len(X_train_bigram_corpus_tokenized),
    ).T,
    columns=[dct[token_id] for token_id in range(len(dct))],
    index=X_train.index,
)

In [16]:
# Display the dense bag-of-words frame (rows: training songs, columns: vocabulary tokens).
X_train_bow

Unnamed: 0,baby,bare,bone,cameron,come,date,dog,everyone,first,get,...,soldier_soldier,dish_dirt,taliban,zillion,silvery_moon,bomb_bomb,discretion,staten,smashin,blackend
23204,9.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15147,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44398,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24167,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58031,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64285,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26061,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Rebuild the vocabulary with a stricter upper cut-off: tokens that appear in
# more than 50% of songs are dropped (the lower bound of 5 songs is unchanged).
# NOTE: this rebinds `dct` and `X_train_bigram_corpus_tokenized`, replacing the
# no_above=0.95 versions; anything run after this cell sees the 0.5 vocabulary.
dct = Dictionary(X_train_bigram_corpus)
dct.filter_extremes(no_below=5, no_above=0.5)

# Sparse bag-of-words (token_id, count) pairs, one list per song.
X_train_bigram_corpus_tokenized = [dct.doc2bow(doc) for doc in X_train_bigram_corpus]

# corpus2dense returns (num_terms, num_docs); transpose for one row per song.
# num_docs lets gensim preallocate the result, and the column labels are looked
# up by token id so they are guaranteed to match the matrix rows.
X_train_bow_lower_threshold = pd.DataFrame(
    corpus2dense(
        X_train_bigram_corpus_tokenized,
        num_terms=len(dct),
        num_docs=len(X_train_bigram_corpus_tokenized),
    ).T,
    columns=[dct[token_id] for token_id in range(len(dct))],
    index=X_train.index,
)

In [18]:
# Display the bag-of-words frame built with the no_above=0.5 vocabulary.
X_train_bow_lower_threshold

Unnamed: 0,baby,bare,bone,cameron,come,date,dog,everyone,first,get,...,soldier_soldier,dish_dirt,taliban,zillion,silvery_moon,bomb_bomb,discretion,staten,smashin,blackend
23204,9.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15147,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44398,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24167,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58031,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64285,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26061,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# NOTE(review): commented-out draft of the test-split transform. It reuses the
# train-fitted `bigram_model` and `dct`; `token_names` is not defined in the
# cells shown here and must be set (e.g. to the training column labels) first.
# X_test['bigrams'] = X_test.apply(lambda x: bigram_model[x])
# X_test_bigram_corpus = list(X_test.bigrams.apply(lambda x: list(x)).values)
# X_test_bigram_corpus_tokenized = [dct.doc2bow(x) for x in X_test_bigram_corpus]
# X_test_bow = pd.DataFrame(corpus2dense(X_test_bigram_corpus_tokenized,
#                                            num_terms = len(dct.token2id)).T,
#                            columns = token_names, index = X_test.index)