# Topic Modeling

In [9]:
# Imports
import warnings
warnings.simplefilter("ignore")

import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import numpy as np
import pandas as pd

from nltk.corpus import stopwords

import spacy
# Load the German spaCy pipeline once at import time. keep_nouns() and
# lemmatize() call the module-level `nlp`, so leaving this commented out makes
# the vectorization cells fail with NameError on a fresh kernel.
nlp = spacy.load('de_core_news_lg')

In [4]:
# Preprocess functions
def keep_nouns(text):
    """Return only the tokens of ``text`` that are tagged as nouns.

    Relies on a module-level ``nlp`` callable (a spaCy pipeline) to tokenize
    and tag ``text``. Every token whose ``pos_`` is ``'NOUN'`` is emitted in
    its original spelling (``orth_``), padded with one space on each side,
    and the padded pieces are concatenated in document order. Returns the
    empty string when no noun is found.
    """
    padded_nouns = [
        " " + token.orth_ + " "
        for token in nlp(text)
        if token.pos_ == 'NOUN'
    ]
    return "".join(padded_nouns)

def lemmatize(text):
    """Return the lemmas of all tokens in ``text``, joined by single spaces.

    ``text`` is run through the module-level ``nlp`` callable (a spaCy
    pipeline) and each resulting token contributes its ``lemma_``.
    """
    doc = nlp(text)
    lemmas = (token.lemma_ for token in doc)
    return ' '.join(lemmas)

def preprocess(text):
    """Reduce ``text`` to its nouns, then lemmatize what is left.

    Used as the ``preprocessor`` callable of the vectorizers: the raw document
    first goes through ``keep_nouns`` and its result through ``lemmatize``.
    """
    nouns_only = keep_nouns(text)
    return lemmatize(nouns_only)

In [5]:
# Read Data
# Load the cleaned article corpus; the 'Datum' column is parsed as dates on read.
df = pd.read_csv(
    "Data/clean_thesis.csv",
    parse_dates=['Datum'],
    infer_datetime_format=True,
)

In [6]:
# Only German Articles
# Keep rows whose 'Länder' value equals the string below exactly, then
# renumber the index from 0.
df = df.loc[df["Länder"] == "Germany, Federal Republic of"].reset_index(drop=True)

In [7]:
# Filter Irrelevant Sources
# The spreadsheet lists sources with a 'relevant' flag; only articles whose
# 'Quelle' appears among the sources flagged 1.0 are kept.
quellen = pd.read_excel("Data/LexisNexis Quellen.xlsx")
df = df[
    df['Quelle'].isin(
        quellen.loc[quellen['relevant'] == 1.0, 'Quelle']
    )
]

In [11]:
# Drop All Outside Daterange
# Ensure 'Datum' is datetime64 *before* comparing. The conversion is vectorised
# and assign() returns a new frame, so no slice is written to in place.
df = df.assign(Datum=pd.to_datetime(df['Datum']))
# Use pandas Timestamps as bounds: ordering a datetime64 column against
# datetime.date objects is deprecated in pandas 1.x and raises TypeError in 2.x.
start = pd.Timestamp('2015-01-01')
end = pd.Timestamp('2020-08-31')
# Both bounds are inclusive. `end` is midnight of 2020-08-31, the same value
# older pandas obtained by coercing the date object to a datetime.
drange = (df['Datum'] >= start) & (df['Datum'] <= end)
df = df.loc[drange]

In [12]:
# Sample
# Draw at most 10000 articles. The seed is fixed so the same articles are drawn
# on every run (the LDA fits are seeded as well), and n is capped at the frame
# size because DataFrame.sample raises ValueError when n exceeds the number of
# rows and replace=False.
sample = df.sample(n=min(10000, len(df)), random_state=0).reset_index(drop=True)

In [13]:
# TF VECTORIZATION
# Document-term matrix of raw counts. Each document is passed through
# `preprocess` (nouns only, lemmatized) before tokenization.
tf_vectorizer = CountVectorizer(preprocessor=preprocess)
dtm_tf = tf_vectorizer.fit_transform(sample['Text'])
# print(dtm_tf.shape)

In [14]:
# TF-IDF VECTORIZATION
# Built from the count vectorizer's parameters so both matrices are produced
# with the same preprocessing settings.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(sample['Text'])
# print(dtm_tfidf.shape)

In [17]:
# Perform LDA
# Two 10-topic models with the same seed: one on raw counts, one on TF-IDF weights.

# for TF DTM (fit() returns the estimator itself)
lda_tf = LDA(n_components=10, random_state=0).fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LDA(n_components=10, random_state=0)
# Left as the cell's final expression so the fitted estimator is displayed.
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [19]:
# Visualize LDA Result
import pyLDAvis
try:
    # pyLDAvis >= 3.4 renamed the scikit-learn adapter module.
    import pyLDAvis.lda_model as lda_vis
except ImportError:
    # Older releases ship the same adapter as pyLDAvis.sklearn.
    import pyLDAvis.sklearn as lda_vis
pyLDAvis.enable_notebook()

# Interactive topic view for the count-based model; kept as the last
# expression so the notebook renders it.
lda_vis.prepare(lda_tf, dtm_tf, tf_vectorizer)