# Topic Modeling

Christoph & Nadia

## LDA (Latent Dirichlet Allocation)

## Imports von hauptsächlich Gensim & SpaCy

In [2]:
from pprint import pprint
import gensim
from gensim import corpora, models, similarities
from gensim.models import LdaModel
from spacy_preprocessing.preprocess import Preprocess
from tabulate import tabulate

import numpy as np
import pickle
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore')

from ipywidgets import widgets
from IPython.display import display, clear_output

from spacy_preprocessing.preprocess import Preprocess
import datetime

## Zusätzliche Preprocessing Funktionen

In [3]:
def is_blacklisted(word):
    return word in [
        'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr', 'polizeimeldung',
        'nicht', 'jahr', 'jährige', 'jährig', 'jähriger', 'polizeiliche', 'polizeilich', '2015', '2016',
        '2014', '2017', '2018', 'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem', 'den', 'diese', 'dieser',
        'diesen', 'diesem', 'um', 'für', 'eine', 'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen', 'anders'
    ]

def is_empty(word):
    return word.strip() == ''

def can_parse_date(word):
    try:
        datetime.datetime.strptime(word, '%d.%m.%Y')
        return True
    except ValueError:
        return False

def can_parse_num_int(word):
    try:
        int(word)
        return True
    except ValueError:
        return False
    
def can_parse_num_float(word):
    try:
        float(word)
        return True
    except ValueError:
        return False
    
def preprocess_after(doc):
    return [
        word 
        for word in doc
        if not is_empty(word)
        and not is_blacklisted(word)
        and not can_parse_date(word)
        and not can_parse_num_int(word)
        and not can_parse_num_float(word)
    ]

## Daten aus pickle Datei laden und Preprocessing durchführen

In [4]:
items = pickle.load(open('./data/items.pkl', 'rb'))
texts = [report['text'] for report in items]
titles = [report['title'] for report in items] 
data = [report['text_pre_processed_v1'] for report in items]

In [5]:
clean_data = [preprocess_after(doc) for doc in data]

## Dictionary und Bag of Words aus jedem Dokument erstellen

In [7]:
id2word = corpora.Dictionary(clean_data)
corpus = [id2word.doc2bow(doc) for doc in clean_data]

## LDA Modell

In [8]:
tmust

In [9]:
pprint(lda_model.print_topics())

[(1,
  '0.021*"hund" + 0.017*"wache" + 0.013*"corsa" + 0.011*"mangel" + '
  '0.010*"verspüren" + 0.010*"besitzerin" + 0.010*"polizeipräsident" + '
  '0.009*"rathenower" + 0.009*"gesteuert" + 0.009*"wilhelmshavener"'),
 (23,
  '0.023*"straße" + 0.009*"finden" + 0.009*"folgend" + 0.008*"sicherheit" + '
  '0.007*"information" + 0.007*"informieren" + 0.007*"platz" + '
  '0.006*"insgesamt" + 0.006*"berliner" + 0.006*"thema"'),
 (24,
  '0.079*"spandau" + 0.025*"reinickendorf" + 0.018*"residenzstraße" + '
  '0.017*"wechselgeld" + 0.015*"tipp" + 0.014*"silvesterrakete" + '
  '0.014*"brunsbütteler" + 0.014*"damm" + 0.013*"bisherig" + 0.012*"bleiben"'),
 (16,
  '0.032*"person" + 0.015*"trinkgelage" + 0.014*"aufzug" + '
  '0.011*"umherfliegend" + 0.011*"teilnehmer" + 0.010*"insgesamt" + '
  '0.010*"einsatzkraft" + 0.009*"verletzen" + 0.009*"straße" + '
  '0.008*"einsatz"'),
 (12,
  '0.046*"staatsanwaltschaft" + 0.025*"gemeinsame" + 0.025*"meldung" + '
  '0.023*"ermittlung" + 0.019*"landeskriminal

## Topics visualisieren

In [10]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

## MatrixSimilarity erstellen

In [11]:
index = similarities.MatrixSimilarity(lda_model[corpus])

## Die Suche

In [12]:
html_widget = widgets.HTML(
    value = ''
)

text_widget = widgets.Text()


def search(sender):  
    preprocess = Preprocess(sender.value)
    search_term_preprocessed = preprocess_after(preprocess.preprocess(sentence_split=False, with_pos=False))
    search_term_bow = id2word.doc2bow(search_term_preprocessed)
    search_term_lda = lda_model[search_term_bow]
    sims = index[search_term_lda]
    similarity_list = list(zip(range(len(sims)), sims, texts, titles))
    similarity_list.sort(key=lambda x: x[1], reverse=True)
    
    results = [f'<h3>{result[3]}</h3><em>Relevanz: {round(result[1] * 100, 2)} %</em><br><p style="width: 80%;">{result[2]}</p>' for result in similarity_list[:10]]
    output = '<hr><br>'.join(results)
    
    html_widget.value = output

text_widget.on_submit(search)    
    
display(text_widget)
display(html_widget)    


Text(value='')

HTML(value='')