In [1]:
import re
from gensim import corpora
import pandas as pd
import numpy as np

 ### Einlesen des bereinigten Datasets 

In [2]:
# Location of the cleaned movie dataset (CSV with a `text` column).
DATA_PATH = '/home/evelin/Downloads/movies.csv'
# Documents with fewer whitespace-separated tokens than this are dropped.
MIN_TOKENS = 3000

movies_raw = pd.read_csv(DATA_PATH)

# Whitespace token count per document. The vectorised .str accessor yields NaN
# for missing texts instead of raising, so such rows simply fail the filter below.
token_counts = movies_raw['text'].str.split().str.len()
long_enough = token_counts >= MIN_TOKENS

# Take an explicit copy of the filtered rows so the column assignments below
# operate on an independent frame (avoids SettingWithCopyWarning).
df = movies_raw.loc[long_enough].copy()
df['text_length'] = token_counts.loc[long_enough].astype(int)

# Strip every run of digits from the text.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

In [3]:
from stop_words import get_stop_words

# Lookup table mapping each stop word returned for 'de' to True, so that
# membership can be tested with stop_words.get(token, False).
stop_words = dict.fromkeys(get_stop_words('de'), True)

In [4]:
def _content_tokens(text):
    """Return the lower-cased ``\\w+`` tokens of ``text`` that have no truthy entry in ``stop_words``."""
    lowered = (match.lower() for match in re.findall(r'\w+', text))
    return [tok for tok in lowered if not stop_words.get(tok, False)]


# One token list per document in df.text.
docs = [_content_tokens(doc) for doc in df.text]

In [5]:
from gensim.corpora import Dictionary

# Build the vocabulary (token <-> integer id mapping) from the tokenised documents.
dictionary = Dictionary(documents=docs)

In [6]:
# Vocabulary pruning thresholds: keep tokens that occur in at least NO_BELOW
# documents and in no more than a NO_ABOVE fraction of all documents.
NO_BELOW = 20
NO_ABOVE = 0.5
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)

In [7]:
# Bag-of-words representation of every document, produced by the dictionary.
corpus = list(map(dictionary.doc2bow, docs))

# Erstellen des Modells mit Gensim

In [8]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 19
chunksize = 2000
passes = 20
iterations = 100
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# LDA training is stochastic; a fixed seed makes a fresh
# Restart & Run All reproduce the same topics.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# Train the LDA topic model on the bag-of-words corpus; alpha and eta are
# both set to 'auto'.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

## Parameterwahl

Die Anzahl der Topics wurde nicht zufällig auf 19 festgelegt. Für uns war interessant zu sehen, ob sich mit der Topic-Modelling-Methode die Themen der einzelnen Topics den einzelnen Genres zuordnen lassen.

In [9]:
from pprint import pprint

top_topics = model.top_topics(corpus)

# Average topic coherence: the second element of every top_topics entry is
# the topic's coherence score; their total is divided by the number of topics.
coherence_scores = [entry[1] for entry in top_topics]
avg_topic_coherence = sum(coherence_scores) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)



Average topic coherence: -1.8048.
[([(0.0060538594, 'hi'),
   (0.003764918, 'scheiß'),
   (0.0037261639, 'baby'),
   (0.0035149339, 'dad'),
   (0.0034484137, 'ne'),
   (0.0032951117, 'cool'),
   (0.0030981854, 'arsch'),
   (0.0029963392, 'wow'),
   (0.002888324, 'sex'),
   (0.0028085795, 'mom'),
   (0.0026796209, 'total'),
   (0.0026682469, 'party'),
   (0.0024371338, 'super'),
   (0.0020556278, 'typ'),
   (0.0019398223, 'job'),
   (0.0019098232, 'schatz'),
   (0.0017773634, 'wär'),
   (0.0017434944, 'schwanz'),
   (0.0016570431, 'fick'),
   (0.001652994, 'typen')],
  -0.7624433371519347),
 ([(0.007560946, 'scheiß'),
   (0.0071070828, 'ne'),
   (0.0070372154, 'n'),
   (0.006099476, 'polizei'),
   (0.004894887, 'arsch'),
   (0.004323658, 'arschloch'),
   (0.003624503, 'wichser'),
   (0.0032988545, 'boss'),
   (0.00326313, 'he'),
   (0.0029852386, 'verdammte'),
   (0.0029045844, 'wagen'),
   (0.002809331, 'fahr'),
   (0.0027562105, 'typ'),
   (0.002577195, 'bullen'),
   (0.0025109532, 'j

In [10]:
from pathlib import Path

In [11]:
# Persist the trained model. The target directory is created first so that
# saving does not fail on a fresh checkout where '../dataset' is missing.
model_path = Path('../dataset/lda.model_3')
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))

In [12]:
import pyLDAvis
try:
    # pyLDAvis 3.x ships the gensim adapter as `pyLDAvis.gensim_models`.
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    # Older pyLDAvis releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensim_vis

# Prepare the interactive topic visualisation from the trained model,
# the bag-of-words corpus and the dictionary.
vis = gensim_vis.prepare(topic_model=model,
                         corpus=corpus,
                         dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

### Fazit

Tatsächlich lassen sich in die Inhalte der einzelnen Topics diverse Filmkategorien hineininterpretieren.