In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy
from spacy_langdetect import LanguageDetector
import easyocr
import de_core_news_sm
import en_core_web_sm
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

import re
import os
import string
import sys
from pathlib import Path
import pickle

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import gensim
import gensim.corpora as corpora
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

module_path = str(Path.cwd().parents[0] / "Scripts")
if module_path not in sys.path:
    sys.path.append(module_path)
    
import warnings
warnings.filterwarnings('ignore')

from notebook_scripts import split_array, stop_word_removal

## PATHS

In [2]:
poster_text_df_path = "../../Data/poster_text.csv"
corpus_df_path = "../../Data/corpus.csv"

## 1) Data

In [3]:
df = pd.read_csv(poster_text_df_path)
df = df[["Path", "Most_Likely"]]
df["Most_Likely"] = df["Most_Likely"].apply(lambda x: ''.join(str(word) for word in x))
df.head()

Unnamed: 0,Path,Most_Likely
0,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['FORUM', 'Kirchner', 'Das', 'expressionistisc..."
1,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['SEST', 'I4', 'g', "" ' ^ ."", '1472', 'Das Pa ..."
2,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['U', 'M 1 V R (; E R', 'KUNSTHALLE', 'LICHTWA..."
3,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['PRIS O NS', 'AUSSTELLUNGE', 'DER FRIEDRICH-S..."
4,../../Data/PlakateBayreuth/../../Data/PlakateB...,"['Christoph Brech', ""it's about time"", '10 Mai..."


In [4]:
try:
    corpus_df = df = pd.read_csv(corpus_df_path)
    
except:
    print("No file found. Creating Corpus...")
    corpus = []

    for i, row in enumerate(df["Most_Likely"], 1):
        cleaned_row = cleaner(row.split(","))
        corpus.append([cleaned_row])
        if i % 1000 == 0:
            print(f"[{i}/{len(df)}] processed successfully.")
            
    corpus_np = np.array(corpus)
    corpus_df = pd.DataFrame(corpus_np, columns=["Text"])
    corpus_df.to_csv(corpus_df_path, ignore_index=True)
  
print(f"Corpus Length: {len(corpus_df)}")

Corpus Length: 17786


## CONSTANTS

In [5]:
NUMBER_WORDS = 15
NUMBER_TOPICS = 15

### 1.1) Data Preprocessing

In [6]:
# Stopwor
corpus_df['Text']  = corpus_df['Text'].apply(stop_word_removal)

processed_corpus_df = corpus_df[corpus_df["Text"].apply(lambda x: split_array(x) > NUMBER_WORDS)]
corpus = processed_corpus_df["Text"].tolist()
print(f"Original Length: {len(corpus_df)} - Processed Length: {len(corpus)}")

Original Length: 17786 - Processed Length: 2380


## 2) Modelling

In [7]:
data_words = [word.split(" ") for word in corpus]
id2word = corpora.Dictionary(data_words)
corpus_lda = [id2word.doc2bow(text) for text in data_words]

In [8]:
lda_model = gensim.models.LdaMulticore(corpus=corpus_lda,
                                       id2word=id2word,
                                       num_topics=NUMBER_TOPICS)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus_lda]

[(0,
  '0.006*"ausstellung" + 0.005*"haus" + 0.004*"sch" + 0.004*"leitung" + '
  '0.004*"hne" + 0.004*"inszenierung" + 0.003*"musikalische" + 0.003*"richard" '
  '+ 0.003*"and" + 0.003*"augsburg"'),
 (1,
  '0.005*"and" + 0.004*"thomas" + 0.004*"theater" + 0.003*"deutsche" + '
  '0.003*"augsburg" + 0.003*"opernhaus" + 0.003*"eintritt" + 0.003*"haus" + '
  '0.003*"weimar" + 0.002*"freitag"'),
 (2,
  '0.011*"theater" + 0.006*"richard" + 0.006*"steingraeber" + 0.004*"glich" + '
  '0.004*"eintritt" + 0.004*"haus" + 0.003*"freitag" + 0.003*"inszenierung" + '
  '0.003*"dezember" + 0.003*"and"'),
 (3,
  '0.010*"haus" + 0.008*"sch" + 0.006*"theaterkasse" + 0.006*"theater" + '
  '0.005*"freitag" + 0.005*"abendkasse" + 0.004*"prof" + 0.004*"hne" + '
  '0.004*"leitung" + 0.003*"ckfahrt"'),
 (4,
  '0.007*"ausstellung" + 0.007*"haus" + 0.003*"freitag" + 0.003*"sch" + '
  '0.003*"recht" + 0.003*"abendkasse" + 0.003*"glich" + 0.003*"theater" + '
  '0.003*"menschen" + 0.002*"wurde"'),
 (5,
  '0.007*"sc

## 2) Visualisierung

In [9]:
LDAvis_data_filepath = os.path.join('../../imgs/ldav_prepared_'+str(NUMBER_TOPICS))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus_lda, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '../../imgs/ldav_prepared_'+ str(NUMBER_TOPICS) +'.html')
LDAvis_prepared

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
