In [1]:
import re
from gensim import corpora
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/home/evelin/Downloads/movies.csv')
df['text_length'] = df.text.apply(lambda x: len(x.split()))
df = df[df['text_length'] >= 3000]

In [3]:
from stop_words import get_stop_words
stop_words = {token: True for token in get_stop_words('de')}

In [4]:
docs = [
    [token.lower() for token in re.findall(r'\w+', doc) if not stop_words.get(token.lower(), False)] for doc in df.text
]

In [5]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)

In [6]:
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [7]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [11]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 19
chunksize = 2000
passes = 20
iterations = 100
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

 

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,

)

In [12]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)



Average topic coherence: -1.8127.
[([(0.0037609572, 'hi'),
   (0.0035872983, 'super'),
   (0.0032684337, 'ne'),
   (0.00299861, 'sex'),
   (0.0029866935, 'total'),
   (0.002493854, 'schatz'),
   (0.0022956661, 'entschuldige'),
   (0.0022896782, 'cool'),
   (0.0022147445, 'glücklich'),
   (0.0019864084, 'mama'),
   (0.0019719838, 'wär'),
   (0.0018373603, 'wow'),
   (0.0018358821, 'schule'),
   (0.0017033662, 'süß'),
   (0.0016856806, 'glaub'),
   (0.0016552717, 'typ'),
   (0.0016065399, 'verliebt'),
   (0.0015956783, 'findest'),
   (0.0015798714, 'tschüss'),
   (0.001540975, 'heiraten')],
  -0.8171661628482765),
 ([(0.0071156383, 'scheiß'),
   (0.005913502, 'ne'),
   (0.0057973745, 'n'),
   (0.0054511493, 'arsch'),
   (0.004096526, 'arschloch'),
   (0.0038678697, 'polizei'),
   (0.003306435, 'boss'),
   (0.0031712072, 'wichser'),
   (0.0031153774, 'verdammte'),
   (0.0029744487, 'typ'),
   (0.0027879984, 'job'),
   (0.0027471315, 'teufel'),
   (0.002680539, 'wagen'),
   (0.0026701884, 

In [13]:
from pathlib import Path

In [14]:
model.save('../dataset/lda.model_2')

In [23]:
#import pyLDAvis.gensim

#pyLDAvis.gensim.prepare(top_topics, corpus, dictionary)
import pyLDAvis
import pyLDAvis.gensim 

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
vis

AttributeError: 'dict' object has no attribute 'token2id'

In [25]:
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=model, 
                              corpus=corpus, 
                              dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [None]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [None]:
no_top_words = 10
display_topics(model, , no_top_words)

In [None]:
df_topics = pd.DataFrame(top_topic)
df_topics

In [None]:
import numpy as np
import seaborn as sns

uniform_data = top_topic
ax = sns.heatmap(uniform_data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):    
    df_dominant_topic_sub = top_topics.Dominant_Topic == i
    doc_lens = [len(d) for d in top_topics.Text]
    ax.hist(doc_lens, bins = 1000, color=cols[i])
    ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx())
    ax.set(xlim=(0, 1000), xlabel='Document Word Count')
    ax.set_ylabel('Number of Documents', color=cols[i])
    ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))

fig.tight_layout()
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,1000,9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()