# Topic Modeling

Christoph & Nadia

## LDA (Latent Dirichlet Allocation)

## Imports (hauptsächlich Gensim & spaCy)

In [15]:
from pprint import pprint
import gensim
from gensim import corpora, models, similarities
from gensim.models import LdaModel, LsiModel
from spacy_preprocessing.preprocess import Preprocess
from tabulate import tabulate

import numpy as np
import pickle
import pyLDAvis
import pyLDAvis.gensim

import os
import warnings
warnings.filterwarnings('ignore')

from ipywidgets import widgets
from IPython.display import display, clear_output

from spacy_preprocessing.preprocess import Preprocess
import datetime

# When True, the dictionary, corpus, LDA/LSI models and similarity index are
# loaded from ./data instead of being rebuilt; when False they are recomputed
# and written back to ./data.
load_saved = True

## Zusätzliche Preprocessing-Funktionen

In [16]:
# Tokens removed from every document: police-related terms, year numbers,
# abbreviations and common German articles/pronouns/prepositions.
# Built once as a frozenset so each membership test is O(1) instead of
# re-creating and scanning a list for every token.
_BLACKLIST = frozenset([
    'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr', 'polizeimeldung',
    'nicht', 'jahr', 'jährige', 'jährig', 'jähriger', 'polizeiliche', 'polizeilich', '2015', '2016',
    '2014', '2017', '2018', 'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem', 'den', 'diese', 'dieser',
    'diesen', 'diesem', 'um', 'für', 'eine', 'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen', 'anders'
])


def is_blacklisted(word):
    """Return True if ``word`` is one of the blacklisted tokens.

    The comparison is exact and case-sensitive; the blacklist entries are all
    lower-case.

    Args:
        word: The token to check (a string).

    Returns:
        bool: True when the token is in the blacklist, False otherwise.
    """
    return word in _BLACKLIST

def is_empty(word):
    """Return True if ``word`` is empty or consists only of whitespace."""
    stripped = word.strip()
    return stripped == ''

def can_parse_date(word):
    """Return True if ``word`` is a date in ``day.month.year`` form.

    The token is matched against the ``'%d.%m.%Y'`` format; anything that
    ``strptime`` rejects with ``ValueError`` counts as "not a date".
    """
    try:
        datetime.datetime.strptime(word, '%d.%m.%Y')
    except ValueError:
        return False
    else:
        return True

def can_parse_num_int(word):
    """Return True if ``int(word)`` succeeds, False if it raises ``ValueError``."""
    try:
        int(word)
    except ValueError:
        return False
    return True
    
def can_parse_num_float(word):
    """Return True if ``float(word)`` succeeds, False if it raises ``ValueError``.

    Note that ``float`` also accepts spellings such as ``'nan'`` and ``'inf'``,
    so those tokens count as numbers here as well.
    """
    try:
        float(word)
    except ValueError:
        return False
    return True
    
def preprocess_after(doc):
    """Filter a tokenised document down to the tokens worth modelling.

    A token is dropped when it is blank, blacklisted, a ``dd.mm.yyyy`` date,
    or parseable as an integer or float. The remaining tokens are returned in
    their original order.

    Args:
        doc: Iterable of string tokens.

    Returns:
        list: The tokens that passed every filter.
    """
    kept = []
    for token in doc:
        # Cheap structural checks first, same order as the filters are listed.
        if is_empty(token) or is_blacklisted(token):
            continue
        # Dates and plain numbers carry no topical meaning.
        if can_parse_date(token) or can_parse_num_int(token) or can_parse_num_float(token):
            continue
        kept.append(token)
    return kept

## Daten aus Pickle-Datei laden und Preprocessing durchführen

In [29]:
# Load the scraped reports. The file is opened in a `with` block so the handle
# is closed deterministically instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('./data/items.pkl', 'rb') as items_file:
    items = pickle.load(items_file)

# Split the report records into parallel lists; position i in every list
# refers to the same report.
texts = [report['text'] for report in items]
titles = [report['title'] for report in items]
ids = [report['_id'] for report in items]
# presumably a list of tokens per report produced by an earlier preprocessing
# step -- TODO confirm against the code that writes items.pkl
data = [report['text_pre_processed_v1'] for report in items]

In [30]:
# Apply the additional token filters to every pre-processed document.
clean_data = list(map(preprocess_after, data))

## Dictionary und Bag of Words aus jedem Dokument erstellen

In [31]:
# Either reuse the persisted dictionary / bag-of-words corpus or rebuild them
# from the cleaned documents and persist the result for later runs.
# Files are opened in `with` blocks so handles are closed (and, when writing,
# flushed) deterministically.
if load_saved:
    id2word = corpora.Dictionary.load('./data/id2word')
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open('./data/corpus.pkl', 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
else:
    # Map every distinct token to an integer id, then encode each document
    # as a bag of words over that dictionary.
    id2word = corpora.Dictionary(clean_data)
    corpus = [id2word.doc2bow(doc) for doc in clean_data]

    id2word.save('./data/id2word')
    with open('./data/corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)

## LDA-/LSI-Modell

In [32]:
# Number of topics used when training both the LDA and the LSI model.
NUM_TOPICS = 25
# Fixed seed so that retraining the LDA model yields the same topics each run.
LDA_RANDOM_STATE = 42

# Either reuse the persisted models or train them on the bag-of-words corpus
# and persist them for later runs.
if load_saved:
    lda_model = LdaModel.load('./data/lda_model')
    lsi_model = LsiModel.load('./data/lsi_model')
else:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=NUM_TOPICS,
        passes=50,
        alpha='auto',
        eta='auto',
        random_state=LDA_RANDOM_STATE,
    )
    lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=NUM_TOPICS, power_iters=50)
    lda_model.save('./data/lda_model')
    lsi_model.save('./data/lsi_model')

In [33]:
# Show the learned LDA topics as (topic id, weighted term string) pairs.
pprint(lda_model.print_topics())

[(24,
  '0.018*"dienst" + 0.014*"gesundbrunnen" + 0.012*"raubkommissariate" + '
  '0.010*"insgesamt" + 0.010*"gehoben" + 0.010*"getränk" + 0.008*"zeit" + '
  '0.008*"mitarbeiter" + 0.008*"mitte" + 0.007*"genannt"'),
 (14,
  '0.038*"fahrzeug" + 0.024*"versuchen" + 0.023*"audi" + 0.019*"wasser" + '
  '0.018*"stellen" + 0.016*"schwer" + 0.016*"flucht" + 0.015*"stehlen" + '
  '0.015*"fahrer" + 0.014*"versuch"'),
 (6,
  '0.022*"straße" + 0.010*"berliner" + 0.009*"finden" + 0.009*"platz" + '
  '0.009*"folgend" + 0.008*"tiergarten" + 0.008*"sicherheit" + 0.008*"bereich" '
  '+ 0.008*"informieren" + 0.008*"thema"'),
 (16,
  '0.016*"insgesamt" + 0.011*"rahmen" + 0.008*"droge" + 0.007*"pferd" + '
  '0.007*"verstoß" + 0.007*"handel" + 0.007*"kontrollieren" + '
  '0.006*"flughafen" + 0.006*"bezirksübergreifend" + 0.006*"haschisch"'),
 (15,
  '0.061*"steglitz" + 0.053*"zehlendorf" + 0.034*"km/h" + 0.023*"höhe" + '
  '0.018*"euro" + 0.017*"bußgeld" + 0.016*"fahrverbot" + 0.016*"unterwegs" + '
  '0.0

## Topics visualisieren

In [34]:
# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the LDA model, the bag-of-words corpus and
# the dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared visualisation.
vis

## MatrixSimilarity erstellen

In [35]:
# Similarity index over all documents in LDA topic space. Queries against this
# index must therefore also be LDA vectors.
if load_saved:
    index = similarities.MatrixSimilarity.load('./data/index')
else:
    # Pass the feature count explicitly: otherwise it is inferred by scanning
    # the corpus and can come out too small if the highest topic id never
    # appears in any document vector.
    index = similarities.MatrixSimilarity(lda_model[corpus], num_features=lda_model.num_topics)
    index.save('./data/index')

## Die Suche

In [37]:
# Which topic model drives the search: LSI when True, LDA when False.
use_lsi = True
# Number of hits rendered below the search box.
TOP_N_RESULTS = 10

# Query vectors must be compared against an index built in the SAME topic
# space. `index` is built from the LDA model, so LSI queries get their own
# index here; scoring LSI vectors against the LDA index would produce
# meaningless similarities.
if use_lsi:
    search_model = lsi_model
    search_index = similarities.MatrixSimilarity(
        lsi_model[corpus], num_features=lsi_model.num_topics
    )
else:
    search_model = lda_model
    search_index = index

html_widget = widgets.HTML(
    value = ''
)

text_widget = widgets.Text()


def search(sender):
    """Run a similarity search for the text in ``sender`` and render the hits.

    The query is pre-processed like the corpus documents, converted to a bag
    of words, projected into the selected topic space and compared with every
    indexed document. The best ``TOP_N_RESULTS`` matches are written to
    ``html_widget`` as HTML.

    Args:
        sender: The widget that triggered the search; its ``value`` is the
            query text.
    """
    preprocess = Preprocess(sender.value)
    search_term_preprocessed = preprocess_after(preprocess.preprocess(sentence_split=False, with_pos=False))
    search_term_bow = id2word.doc2bow(search_term_preprocessed)

    # Without any known term the query vector is empty and every document
    # would score 0; say so instead of listing arbitrary documents.
    if not search_term_bow:
        html_widget.value = '<em>No results: the query contains no known terms.</em>'
        return

    search_term_vec = search_model[search_term_bow]
    sims = search_index[search_term_vec]

    # Pair every score with its document and keep the best matches. The sort
    # is stable, so equally scored documents keep their corpus order.
    hits = sorted(zip(sims, titles, ids, texts), key=lambda hit: hit[0], reverse=True)[:TOP_N_RESULTS]

    # NOTE(review): titles and texts are inserted into the HTML unescaped --
    # confirm they never contain markup characters, or escape them.
    results = [
        f'<h3>{title}</h3><em>Id: {doc_id} | Similarity Score: {round(score * 100, 2)} %</em><br><p style="width: 80%;">{text}</p>'
        for score, title, doc_id, text in hits
    ]
    html_widget.value = '<hr><br>'.join(results)

# Run the search whenever the user submits the text field.
text_widget.on_submit(search)

display(text_widget)
display(html_widget)


Text(value='')

HTML(value='')