In [13]:
## Text Mining Project - Experiments
# imports
import pandas as pd
import numpy as np
import re
import spacy
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.models import nmf
from gensim.models import CoherenceModel
from operator import itemgetter
import pyLDAvis.gensim
import pickle 
import pyLDAvis
from collections import Counter

In [4]:
# Save/Load DataFrame
#all_songs.to_pickle('all_songs.pkl')
all_songs = pd.read_pickle('all_songs.pkl')

In [5]:
# Preprocessing

nlp = spacy.load('de_core_news_lg')
nlp_en = spacy.load('en')
docs = []

print('Tokenizing and removing stop words')
# Tokenization and Lemmatization (remove stop words, numbers and one/two character words)
for index, song in all_songs.iterrows():
    # Remove the artist name from the lyrics
    doc = nlp(song['Lyrics'].replace(song['Artist'],''))
    lemmatized_doc = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.isalpha() and len(token)>2]
    docs.append(lemmatized_doc)

# Remove english stop words and few manual stop words
manual_stop_words = ['ein', 'eine', 'mal', 'mach', 'lass', 'nich', 'sein', 'nix', 'tun', 'mein']
en_stop_words = nlp_en.Defaults.stop_words
docs = [[word for word in doc if not word.lower() in en_stop_words and not word.lower() in manual_stop_words] for doc in docs]

# Add bigrams/trigrams that appear 5 times or more (song texts are short)
bigram = Phrases(docs, min_count=5)
trigram = Phrases(bigram[docs], min_count=5)
print(f'Adding {len(bigram.vocab)} bigrams and {len(trigram.vocab)} trigrams')

for idx in range(len(docs)):
    for token in trigram[bigram[docs[idx]]]:
        if '_' in token:
            docs[idx].append(token)
    
# Filter extremes, these values seem to work well
dictionary = Dictionary(docs)
print(f'Filtering extremes - Number of unique tokens: {len(dictionary)}')
dictionary.filter_extremes(no_below=15, no_above=0.5)

print('Computing BoW representation')
# Bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in docs]

temp = dictionary[0]
id2word = dictionary.id2token

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Tokenizing and removing stop words
Adding 936232 bigrams and 959381 trigrams
Filtering extremes - Number of unique tokens: 108140
Computing BoW representation
Number of unique tokens: 8195
Number of documents: 7467


In [9]:
def get_coherence_score(model):
    coherence_model = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

In [44]:
# Find topic number - model combination with the highest coherence value

chunksize = 1000
passes = 100
iterations = 400
random_state = 18

model_results = {'Topics': [], 'Model': [], 'Coherence': []}
topics_range = range(3, 16)

for t in topics_range:  
        lda_model_ = LdaModel(
                 corpus=corpus,
                 id2word=id2word,
                 num_topics=t,
                 passes=passes,
                 iterations=iterations,
                 chunksize=chunksize,
                 random_state=random_state
        )
        cv = get_coherence_score(lda_model_)
        model_results['Model'].append('lda')
        model_results['Topics'].append(t)
        model_results['Coherence'].append(cv)
        print(f'model: lda, topics: {t}, score: {cv}')
        
for t in topics_range:  
        nmf_model_ = nmf.Nmf(
            corpus=corpus,
            num_topics=t,
            id2word=id2word,
            chunksize=chunksize,
            passes=passes,
            random_state=random_state
        )
        cv = get_coherence_score(nmf_model_)
        model_results['Model'].append('nmf')
        model_results['Topics'].append(t)
        model_results['Coherence'].append(cv)
        print(f'model: nmf, topics: {t}, score: {cv}')

model: lda, topics: 3, score: 0.3880800261358184
model: lda, topics: 4, score: 0.4139295223191907
model: lda, topics: 5, score: 0.4009439882922911
model: lda, topics: 6, score: 0.3912501411616684
model: lda, topics: 7, score: 0.40279858423161446
model: lda, topics: 8, score: 0.3727735490247208
model: lda, topics: 9, score: 0.4133459000515158
model: lda, topics: 10, score: 0.40702006361400594
model: lda, topics: 11, score: 0.41092463308253085
model: lda, topics: 12, score: 0.3878621844172901
model: lda, topics: 13, score: 0.36643452285847683
model: lda, topics: 14, score: 0.3827028079063838
model: lda, topics: 15, score: 0.3837642692500546
model: nmf, topics: 3, score: 0.35425503951984044
model: nmf, topics: 4, score: 0.3750091723051591
model: nmf, topics: 5, score: 0.36665627194491324
model: nmf, topics: 6, score: 0.36812054222826185
model: nmf, topics: 7, score: 0.4028816279027961
model: nmf, topics: 8, score: 0.33994943657547405
model: nmf, topics: 9, score: 0.38773649432746266
model

In [45]:
index, element = max(enumerate(model_results['Coherence']), key=itemgetter(1))

print(model_results['Topics'][index])
print(model_results['Model'][index])
print(model_results['Coherence'][index])

4
lda
0.4139295223191907


In [50]:
# Try LDA with 4 topics

random_state = 18
num_topics = 4
chunksize = 1000
passes = 250
iterations = 400

lda_model = LdaModel(
         corpus=corpus,
         id2word=id2word,
         num_topics=num_topics,
         passes=passes,
         iterations=iterations,
         chunksize=chunksize,
         random_state=random_state
)

print(f'{num_topics} Topics:')
print(get_coherence_score(lda_model))
lda_model.print_topics(num_words=30)

4 Topics:
0.4120638799712101


[(0,
  '0.012*"Bruder" + 0.010*"Geld" + 0.007*"Straße" + 0.006*"Bra" + 0.005*"Benz" + 0.005*"Jungs" + 0.005*"geben" + 0.005*"Gang" + 0.005*"Scheine" + 0.005*"Para" + 0.004*"Kopf" + 0.004*"Block" + 0.004*"Bratan" + 0.004*"weg" + 0.004*"Mama" + 0.004*"Fick" + 0.004*"Digga" + 0.004*"voll" + 0.004*"Gib" + 0.003*"Haze" + 0.003*"komm" + 0.003*"paar" + 0.003*"fick" + 0.003*"laufen" + 0.003*"Knast" + 0.003*"rein" + 0.003*"reden" + 0.003*"Bulle" + 0.003*"zieh" + 0.003*"schnellen"'),
 (1,
  '0.034*"Baby" + 0.016*"Bitch" + 0.016*"Dicka" + 0.015*"yeah" + 0.010*"weiß" + 0.009*"komm" + 0.009*"Bitches" + 0.009*"Money" + 0.007*"Geld" + 0.007*"Yeah" + 0.006*"Komm" + 0.005*"sag" + 0.005*"high" + 0.005*"Nacht" + 0.005*"bitte" + 0.005*"High" + 0.004*"Babe" + 0.004*"Ice" + 0.004*"jaja" + 0.004*"Club" + 0.004*"Party" + 0.004*"wissen" + 0.004*"geben" + 0.004*"Stay" + 0.004*"Gucci" + 0.004*"Boy" + 0.004*"Guck" + 0.003*"lieben" + 0.003*"Cash" + 0.003*"fühlen"'),
 (2,
  '0.011*"Leben" + 0.008*"Welt" + 0.007*"we

In [64]:
# Not implemented anymore
#performance_test_known(lda_model)

Predicted 55 of 77 (71%)
Incorrect labeling pairs: Counter({(2.0, 1): 6, (0.0, 1): 4, (2.0, 3): 3, (1.0, 2): 2, (3.0, 1): 2, (3.0, 0): 2, (0.0, 3): 2, (2.0, 0): 1})
Incorrect predictions: Counter({2.0: 10, 0.0: 6, 3.0: 4, 1.0: 2})
Incorrectly predicted labels: Counter({1: 12, 3: 5, 0: 3, 2: 2})


In [51]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [54]:
# Try NMF with 4 topics as comparison, although coherence scores are lower for NMF models
# The topics don't make as much sense as with the LDA model -> stick to LDA

random_state = 18
num_topics = 4
chunksize = 1000
passes = 250

nmf_model = nmf.Nmf(
            corpus=corpus,
            num_topics=num_topics,
            id2word=id2word,
            chunksize=chunksize,
            passes=passes,
            random_state=random_state
        )

print(f'{num_topics} Topics:')
print(get_coherence_score(nmf_model))
nmf_model.print_topics(num_words=30)

4 Topics:
0.37832338288542194


[(0,
  '0.035*"yeah" + 0.026*"Geld" + 0.021*"Bitch" + 0.014*"Yeah" + 0.011*"lieben" + 0.008*"Bra" + 0.006*"komm" + 0.006*"fick" + 0.005*"Guck" + 0.005*"Fick" + 0.005*"Gang" + 0.005*"Ice" + 0.005*"Cash" + 0.004*"Rapper" + 0.004*"Boss" + 0.003*"Money" + 0.003*"Nutte" + 0.003*"Benz" + 0.003*"Komm" + 0.003*"Rap" + 0.003*"Block" + 0.003*"Mutter" + 0.003*"voll" + 0.003*"hol" + 0.003*"Louis" + 0.003*"paar" + 0.002*"flex" + 0.002*"geb" + 0.002*"guck" + 0.002*"drip"'),
 (1,
  '0.009*"Leben" + 0.006*"sehen" + 0.005*"bleiben" + 0.005*"Bruder" + 0.005*"zieh" + 0.004*"Welt" + 0.004*"Kopf" + 0.004*"Rapper" + 0.004*"raus" + 0.004*"Straße" + 0.004*"Rap" + 0.004*"Weg" + 0.004*"Junge" + 0.003*"weg" + 0.003*"stehen" + 0.003*"Mann" + 0.003*"sagen" + 0.003*"einfach" + 0.003*"denken" + 0.003*"geben" + 0.003*"Hand" + 0.003*"hören" + 0.003*"geh" + 0.003*"Freund" + 0.003*"leben" + 0.003*"reden" + 0.003*"Leute" + 0.003*"voll" + 0.003*"hart" + 0.003*"komm"'),
 (2,
  '0.190*"Bitches" + 0.065*"Money" + 0.042*"Gucc

In [46]:
# 4 Topics look promising, but better seperate the lifestyle topic "Gucci, Money, Geld" and the sex/party topic "Baby, Nacht, Party"
# Fine tune the model parameters eta and alpha for LDA with 5 topics

model_results = {'Alpha': [], 'Eta': [], 'Coherence': []}
alpha_values = [0.1, 0.5, 1, 5, 10, 'auto']
eta_values = [0.1, 0.5, 1, 5, 10, 'auto']

for alpha in alpha_values: 
    for eta in eta_values:
        lda_model_ = LdaModel(
                 corpus=corpus,
                 id2word=id2word,
                 num_topics=t,
                 passes=passes,
                 iterations=iterations,
                 alpha=alpha,
                 eta=eta,
                 chunksize=chunksize,
                 random_state=random_state
        )
        cv = get_coherence_score(lda_model_)
        model_results['Alpha'].append(alpha)
        model_results['Eta'].append(eta)
        model_results['Coherence'].append(cv)
        print(f'Alpha: {alpha}, Eta: {eta}, score: {cv}')

Alpha: 0.1, Eta: 0.1, score: 0.3715038624056148
Alpha: 0.1, Eta: 0.5, score: 0.41660750553151044
Alpha: 0.1, Eta: 1, score: 0.42807141609854304
Alpha: 0.1, Eta: 5, score: 0.4844000999801811
Alpha: 0.1, Eta: 10, score: 0.44727222519292137
Alpha: 0.1, Eta: auto, score: 0.3845428341052875
Alpha: 0.5, Eta: 0.1, score: 0.40070097904363977
Alpha: 0.5, Eta: 0.5, score: 0.4204101550425724
Alpha: 0.5, Eta: 1, score: 0.4155338746080693
Alpha: 0.5, Eta: 5, score: 0.4776549478330427
Alpha: 0.5, Eta: 10, score: 0.4644459908352593
Alpha: 0.5, Eta: auto, score: 0.40000797788204945
Alpha: 1, Eta: 0.1, score: 0.4271346844149047
Alpha: 1, Eta: 0.5, score: 0.4456970870343123
Alpha: 1, Eta: 1, score: 0.4251672692223079
Alpha: 1, Eta: 5, score: 0.4814976034523775
Alpha: 1, Eta: 10, score: 0.4086932751398091
Alpha: 1, Eta: auto, score: 0.43011616857432206
Alpha: 5, Eta: 0.1, score: 0.4193349867731637
Alpha: 5, Eta: 0.5, score: 0.4294292099922394
Alpha: 5, Eta: 1, score: 0.436802134968805
Alpha: 5, Eta: 5, s

In [47]:
index, element = max(enumerate(model_results['Coherence']), key=itemgetter(1))

print(model_results['Alpha'][index])
print(model_results['Eta'][index])
print(model_results['Coherence'][index])

auto
10
0.4859971551318659


In [48]:
# Try LDA 5 topics with eta=10 and alpha='auto'

random_state = 18
num_topics = 5
chunksize = 1000
passes = 250
iterations = 10000

lda_model = LdaModel(
         corpus=corpus,
         id2word=id2word,
         num_topics=num_topics,
         passes=passes,
         iterations=iterations,
         chunksize=chunksize,
         alpha='auto',
         eta=10,
         random_state=random_state
)

print(f'{num_topics} Topics:')
print(get_coherence_score(lda_model))
lda_model.print_topics(num_words=30)

5 Topics:
0.46126473109769084


[(0,
  '0.002*"mhm" + 0.002*"Tanz" + 0.001*"Mhm" + 0.001*"Vulkan" + 0.001*"Raver" + 0.001*"Fahrer" + 0.001*"Chemie" + 0.001*"Salat" + 0.001*"Bello" + 0.000*"Feuerzeug" + 0.000*"Water" + 0.000*"tanz" + 0.000*"bad" + 0.000*"Finale" + 0.000*"Harz" + 0.000*"lala" + 0.000*"vergiften" + 0.000*"Achtung" + 0.000*"Chuck" + 0.000*"reiten" + 0.000*"Fahne" + 0.000*"released" + 0.000*"song" + 0.000*"General" + 0.000*"Walkman" + 0.000*"Hitler" + 0.000*"Kugelhagel" + 0.000*"Mademoiselle" + 0.000*"Arbeit" + 0.000*"ansprechen"'),
 (1,
  '0.003*"like" + 0.002*"Lachs" + 0.002*"ready" + 0.002*"love" + 0.002*"Harry" + 0.002*"Potter" + 0.002*"que" + 0.002*"know" + 0.001*"pas" + 0.001*"heavy" + 0.001*"come" + 0.001*"Bounce" + 0.001*"away" + 0.001*"need" + 0.001*"les" + 0.001*"let" + 0.001*"good" + 0.001*"wanna" + 0.001*"nick" + 0.001*"want" + 0.001*"Champion" + 0.001*"got" + 0.001*"run" + 0.001*"eyes" + 0.001*"Dragon" + 0.001*"Ick" + 0.001*"ick" + 0.001*"believe" + 0.001*"Ferris" + 0.001*"think"'),
 (2,
  '0

In [49]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [59]:
# Really bad results, try 'auto' for parameter eta instead

random_state = 18
num_topics = 5
chunksize = 1000
passes = 250
iterations = 10000

lda_model = LdaModel(
         corpus=corpus,
         id2word=id2word,
         num_topics=num_topics,
         passes=passes,
         iterations=iterations,
         chunksize=chunksize,
         alpha='auto',
         eta='auto',
         random_state=random_state
)

print(f'{num_topics} Topics:')
print(get_coherence_score(lda_model))
lda_model.print_topics(num_words=30)

5 Topics:
0.4067005288169977


[(0,
  '0.020*"Bruder" + 0.015*"Geld" + 0.014*"Straße" + 0.011*"Mama" + 0.010*"Kopf" + 0.009*"Jungs" + 0.007*"raus" + 0.006*"Leben" + 0.006*"Block" + 0.006*"weg" + 0.006*"laufen" + 0.006*"Knast" + 0.005*"bleiben" + 0.005*"Stadt" + 0.005*"Weg" + 0.005*"rein" + 0.005*"reden" + 0.005*"leben" + 0.005*"Bulle" + 0.005*"schnellen" + 0.005*"Berlin" + 0.004*"Bratan" + 0.004*"Scheine" + 0.004*"Kugel" + 0.004*"paar" + 0.004*"voll" + 0.004*"Ghetto" + 0.004*"Feind" + 0.004*"geh" + 0.004*"Benz"'),
 (1,
  '0.041*"Baby" + 0.020*"komm" + 0.017*"weiß" + 0.014*"Komm" + 0.011*"yeah" + 0.011*"sag" + 0.010*"bitte" + 0.010*"Party" + 0.010*"zieh" + 0.009*"Nacht" + 0.008*"Club" + 0.007*"high" + 0.007*"geben" + 0.007*"voll" + 0.006*"tanzen" + 0.006*"wissen" + 0.005*"Babe" + 0.005*"Sex" + 0.005*"bleib" + 0.005*"Frau" + 0.005*"Yeah" + 0.005*"chill" + 0.005*"Nummer" + 0.005*"brauch" + 0.004*"hol" + 0.004*"vorbei" + 0.004*"Flasche" + 0.004*"Haus" + 0.004*"brauchen" + 0.004*"machen"'),
 (2,
  '0.013*"Leben" + 0.009*

In [60]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
