In [10]:
import re
from gensim import corpora
import pandas as pd
import numpy as np

In [9]:
# Read ../dataset/movies.csv into a DataFrame; a later cell tokenises its `text` column.
df = pd.read_csv('../dataset/movies.csv')

In [7]:
from stop_words import get_stop_words

# Stop words for language code 'de', stored as dict keys (every value is True)
# so the tokenising cell can test membership with a hash lookup.
stop_words = dict.fromkeys(get_stop_words('de'), True)

In [21]:
# Tokenise every entry of df.text: split into runs of word characters,
# lower-case each token, and keep only tokens that are not stop words.
# The result is one list of tokens per document, in the order of df.text.
docs = [
    [
        word
        for word in map(str.lower, re.findall(r'\w+', text))
        if not stop_words.get(word, False)
    ]
    for text in df.text
]

In [23]:
from gensim.corpora import Dictionary

# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised documents.
dictionary = Dictionary(docs)

In [24]:
# Prune the vocabulary in place: per the gensim documentation, tokens found in
# fewer than `no_below` (20) documents or in more than `no_above` (50%) of all
# documents are removed.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [27]:
# Convert each tokenised document into its bag-of-words representation,
# keeping the documents in their original order.
corpus = list(map(dictionary.doc2bow, docs))

In [28]:
from gensim.models import LdaModel

# Set training parameters.
# NOTE(review): the original line carried a bare "# 10" after 250, presumably an
# earlier setting. The recorded output also contains a topic whose word weights
# are all ~4.3e-05, which may indicate that 250 topics is more than the corpus
# supports - confirm.
num_topics = 250
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# LDA training is stochastic; a fixed seed makes re-runs of this cell repeatable,
# so the coherence figures and topic indices inspected further down stay stable.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# Train the LDA topic model on the bag-of-words corpus; alpha and eta are set
# to 'auto' rather than fixed values.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [32]:
from pprint import pprint

# Each entry is a pair: ([(weight, word), ...], coherence). In the recorded
# output the entries appear in order of decreasing coherence.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The divisor is the number of topics actually returned by the model, not the
# `num_topics` setting from the training cell, so the average stays correct even
# if that setting is edited without retraining.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)



Average topic coherence: -3.2281.
[([(0.0069550304, 'hi'),
   (0.0032717492, 'freundin'),
   (0.0026385838, 'job'),
   (0.0026154127, 'irgendwie'),
   (0.002193366, 'entschuldige'),
   (0.0020931207, 'sex'),
   (0.0020588269, 'glücklich'),
   (0.0020562196, 'eltern'),
   (0.0020346262, 'gefühl'),
   (0.0019266321, 'klingt'),
   (0.0018163878, 'perfekt'),
   (0.0017794288, 'wahrscheinlich'),
   (0.0017438189, 'könntest'),
   (0.0017266438, 'falsch'),
   (0.001723504, 'hattest'),
   (0.0016616496, 'wolltest'),
   (0.0016424569, 'anrufen'),
   (0.0016174113, 'fühle'),
   (0.001599664, 'wow'),
   (0.0015518981, 'ruf')],
  -0.7036495554850629),
 ([(0.18475956, 'ok'),
   (0.008358523, 'hi'),
   (0.007014787, 'schatz'),
   (0.0069177095, 'dad'),
   (0.005519439, 'kumpel'),
   (0.0050841738, 'nochmal'),
   (0.004596995, 'mist'),
   (0.004345123, 'mom'),
   (0.004218963, 'wow'),
   (0.0039234348, 'super'),
   (0.003667533, 'baby'),
   (0.0036338489, 'cool'),
   (0.0033524705, 'teufel'),
   (0.0

   (0.0040406007, 'afghanistan')],
  -1.7272930432175335),
 ([(4.32444e-05, 'daddy'),
   (4.324247e-05, 'hi'),
   (4.3240925e-05, 'baby'),
   (4.3240845e-05, 'scheiß'),
   (4.324071e-05, 'gefragt'),
   (4.3240638e-05, 'carla'),
   (4.3240583e-05, 'verdammte'),
   (4.3240205e-05, 'lola'),
   (4.3239957e-05, 'schau'),
   (4.3239732e-05, 'geschichten'),
   (4.3239623e-05, 'glücklich'),
   (4.3239615e-05, 'klein'),
   (4.3239597e-05, 'fernbedienung'),
   (4.3239546e-05, 'dad'),
   (4.3239535e-05, 'valentine'),
   (4.3239364e-05, 'wow'),
   (4.3239324e-05, 'großmutter'),
   (4.323932e-05, 'zeichen'),
   (4.3239288e-05, 'körper'),
   (4.3239255e-05, 'herz')],
  -1.7939498767439919),
 ([(0.005967026, 'regierung'),
   (0.005456145, 'land'),
   (0.0040661907, 'herr'),
   (0.0037923167, '1'),
   (0.0036334116, 'herren'),
   (0.002892286, 'stellen'),
   (0.0027432966, 'presse'),
   (0.002682888, 'usa'),
   (0.0026420907, 'politik'),
   (0.0025829084, 'partei'),
   (0.002480191, 'gesetz'),
   (0.0

   (0.013352789, 'visum'),
   (0.012678716, 'araber'),
   (0.010933722, 'gaulle'),
   (0.010773301, 'franzose'),
   (0.010077954, 'helene'),
   (0.009877566, 'papiere'),
   (0.009416837, 'franzosen'),
   (0.009059889, 'krieg'),
   (0.008918468, 'schachtel'),
   (0.008831265, 'gefängnis'),
   (0.008804908, 'land'),
   (0.007924958, 'allah'),
   (0.007570593, 'graham'),
   (0.0074534286, 'rechts'),
   (0.0064874333, 'schau'),
   (0.0064329323, 'heb'),
   (0.005897725, 'polizei')],
  -2.5900673354864043),
 ([(0.23781395, 'joe'),
   (0.04063409, 'bernie'),
   (0.014745084, 'lucky'),
   (0.0070542335, 'u'),
   (0.0069228853, 'dino'),
   (0.006256367, 'junior'),
   (0.0056525376, 'joes'),
   (0.004668876, 'cowboy'),
   (0.004129868, 'werd'),
   (0.0038500102, 'knast'),
   (0.0038211704, 'town'),
   (0.0037603148, 'rauf'),
   (0.003754322, 'ken'),
   (0.0036051618, 'meilen'),
   (0.0035803674, 'daisy'),
   (0.0035035103, 'gutes'),
   (0.0034875185, '1'),
   (0.0033693418, 'wütend'),
   (0.003

   (0.005443347, 'kreide'),
   (0.004919749, 'lauf'),
   (0.0048700552, 'rote'),
   (0.0046568927, 'kilometer'),
   (0.0046448284, 'papagei'),
   (0.0045614396, 'reisen'),
   (0.0044847196, 'roten')],
  -3.2709485432455634),
 ([(0.12436524, 'casey'),
   (0.09411033, 'vampir'),
   (0.0701304, 'vampire'),
   (0.04560102, 'jen'),
   (0.031750668, 'blut'),
   (0.023121728, 'anwältin'),
   (0.01983867, 'henrik'),
   (0.016883953, 'lola'),
   (0.008739163, 'pa'),
   (0.00806531, '16'),
   (0.0053254785, 'sandwich'),
   (0.004901647, 'hase'),
   (0.0044728946, 'probleme'),
   (0.004321254, 'weh'),
   (0.0040872265, 'firma'),
   (0.004003486, 'martin'),
   (0.0039910357, 'job'),
   (0.003921821, 'kurier'),
   (0.003844804, 'ar'),
   (0.0038390062, 'irgendwas')],
  -3.2759886418618587),
 ([(0.0618138, 'rennen'),
   (0.036807552, 'andy'),
   (0.019054534, 'gewinnen'),
   (0.017026918, 'fahrer'),
   (0.016153397, 'team'),
   (0.015493879, 'cody'),
   (0.011743942, 'finn'),
   (0.010856119, 'schne

 ([(0.16711648, 'jim'),
   (0.091503546, 'emily'),
   (0.022162832, 'alicia'),
   (0.01593007, 'freddy'),
   (0.011597292, 'dämon'),
   (0.007622909, 'ding'),
   (0.0074028694, 'te'),
   (0.0062238188, 'sir'),
   (0.0051803165, 'sturm'),
   (0.0047149775, 'blut'),
   (0.0046349936, 'töten'),
   (0.004601975, 'baby'),
   (0.0045199203, 'yong'),
   (0.004178734, 'karen'),
   (0.0040269424, 'jimmy'),
   (0.003679002, 'tochter'),
   (0.0035757301, 'earl'),
   (0.0034021614, 'haken'),
   (0.0033218253, 'wang'),
   (0.0032221486, 'dämonen')],
  -4.39713500219618),
 ([(0.25690597, 'david'),
   (0.01779025, 'davis'),
   (0.01628905, 'walker'),
   (0.011020349, 'katherine'),
   (0.008399979, '1'),
   (0.006438295, '5'),
   (0.0060786493, 'craig'),
   (0.0051992317, 'bild'),
   (0.004248524, 'chicago'),
   (0.004245902, 'senator'),
   (0.0042132544, 'fbi'),
   (0.004172584, '7'),
   (0.004122728, '2'),
   (0.0040772753, 'julien'),
   (0.0038979603, 'virus'),
   (0.003871556, 'agent'),
   (0.0038

   (0.006758996, 'hubert'),
   (0.006420742, 'raphael'),
   (0.006245399, 'dolores'),
   (0.005289995, 'kenneth'),
   (0.0047095874, 'salsa'),
   (0.00465154, 'zeitmaschine'),
   (0.004626819, 'sol'),
   (0.0036111719, 'zeitreise'),
   (0.0031681366, 'zeitreisen'),
   (0.0031430144, 'casinos'),
   (0.0029273906, 'shaw'),
   (0.002734831, 'great'),
   (0.0026701447, 'zukunft'),
   (0.00258161, 'adler')],
  -11.48249505462202)]


In [36]:
from pathlib import Path

In [40]:
# Write the trained LDA model to ../dataset/lda.model.
model.save('../dataset/lda.model')

In [45]:
# Display one entry of top_topics (index 12): its (weight, word) pairs and its coherence score.
top_topics[12]

([(0.018916095, 'wohnung'),
  (0.016757697, 'polizei'),
  (0.009495686, 'schwester'),
  (0.008101037, 'christine'),
  (0.0066789743, 'hotel'),
  (0.005279089, 'telefon'),
  (0.0052776886, 'zimmer'),
  (0.0052516563, 'wohnt'),
  (0.0049791434, 'nachricht'),
  (0.004368779, 'rufe'),
  (0.0038878205, 'ruf'),
  (0.00385902, 'anrufen'),
  (0.0038102898, 'fahre'),
  (0.003790587, 'unfall'),
  (0.003750356, 'adresse'),
  (0.0036643303, 'handy'),
  (0.0036165714, 'straße'),
  (0.0030708264, 'tschüss'),
  (0.0030133596, 'foto'),
  (0.0029202525, 'rufen')],
 -1.0197044746796466)