# Imports

In [10]:
import spacy
import pandas as pd
import matplotlib.pyplot as mltp
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import sklearn as sk
import seaborn as sea
import re 
from tqdm import tqdm

# processing

In [11]:
filename='data/AFD_Wahlprogramm_2021.txt'

In [12]:
with open(filename) as f:
    text = f.read()
    #text = text.replace(r'\\d+\\n', '$')
    #text=   re.sub("\n\n", ".", text)
    #text=   re.sub("\d+.", ".", text)
    text=   re.sub(" \d+\n", ".", text)
    text=   re.sub("\n\d+", " ", text)
    text=   re.sub("\n", " ", text)
sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)

In [31]:
sentences

['Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen',
 'Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz',
 'Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz',
 'Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion',
 'San Francisco erwägt Verbot von Lieferrobotern',
 'Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller',
 'Wo bist du?',
 'Was ist die Hauptstadt von Deutschland?']

In [13]:
text=sentences

In [14]:
len(text)

2086

In [15]:
text[0:20:-1]

[]

In [16]:
import numpy as np
import json
import glob

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lang.de.examples import sentences 
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import pickle 

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

In [17]:
german_stop_words= stopwords.words('german')

In [18]:
def lemmatization_test(texts,  allowed_posttags=['NOUN','ADJ','VERB','ADV']):
    nlp=spacy.load('de_core_news_lg',disable=['parser','ner'])
    texts_out=[]
    for text in tqdm(texts):
        doc= nlp(text)
        new_text=[]
        for token in doc:
            if token.pos_ in allowed_posttags:
                new_text.append(token.lemma_)
        final=' '.join(new_text)
        texts_out.append(final)
    return texts_out

In [19]:
lemma_text=lemmatization_test(text)

100%|██████████| 2086/2086 [00:04<00:00, 453.43it/s]


In [20]:
lemma_text = list(filter(None, lemma_text))

In [21]:
def gen_words(texts):
    final=[]
    for text in tqdm(texts):
        new= gensim.utils.simple_preprocess(text,deacc=True)
        final.append(new)
    return final

In [22]:
data_words=gen_words(lemma_text)

100%|██████████| 1901/1901 [00:00<00:00, 41793.76it/s]


In [23]:
bigram_phrases=gensim.models.Phrases(data_words,min_count=3,threshold=100)
trigram_phases=gensim.models.Phrases(bigram_phrases[data_words],threshold=50)

bigram=gensim.models.phrases.Phraser(bigram_phrases)
trigram=gensim.models.phrases.Phraser(trigram_phases)

def make_bigrams(texts):
    return [bigram[doc] for doc in texts]

def make_trgram(texts):
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams=make_bigrams(data_words)
data_bigrams_trigrams=make_trgram(data_bigrams)

In [24]:
from gensim.models import TfidfModel

id2word=corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus=[id2word.doc2bow(text) for text in texts]

print(corpus[3])

tfidf=TfidfModel(corpus,id2word=id2word)

low_value=0.03
words=[]
words_missing_in_tfdf=[]

for i in tqdm(range(0,len(corpus))):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops=low_value_words+words_missing_in_tfdf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]  
    corpus[i]=new_bow


[(4, 1), (5, 1)]


100%|██████████| 1901/1901 [00:00<00:00, 39137.72it/s]


In [25]:
topics=['climate',
'economy',
'education',
'health',
'infrastructure',
'science',
'social causes',
'politics and ideology',
'technology and entrepreneurship']

In [26]:
num_topics=len(topics)

In [27]:
lda_model=gensim.models.ldamodel.LdaModel(
corpus=corpus,
id2word=id2word,
num_topics=num_topics,
random_state=100,
update_every=1,
chunksize=100,
passes=10,
alpha='auto'
)

In [28]:
dic={}
dic_sum={}
for idx, topic in lda_model.print_topics(num_topics,num_words=100):
    elements=[]
    percent=[]

    for e in topic.split('+'):
        elements.append(e.split('*')[1].replace('"','').strip())
        percent.append(float(e.split('*')[0].replace('"','').strip()))

    dic[str(idx)+'_word']=elements
    dic[str(idx)+'_per']=percent
    dic_sum[idx]=sum(percent)
print(dic_sum)

{0: 0.5030000000000003, 1: 0.47600000000000026, 2: 0.49400000000000027, 3: 0.4700000000000003, 4: 0.5220000000000004, 5: 0.4940000000000003, 6: 0.6790000000000003, 7: 0.49500000000000033, 8: 0.5420000000000004}


In [29]:
pd.DataFrame(data=dic)

Unnamed: 0,0_word,0_per,1_word,1_per,2_word,2_per,3_word,3_per,4_word,4_per,5_word,5_per,6_word,6_per,7_word,7_per,8_word,8_per
0,deutsch,0.057,offentlich,0.026,wieder,0.020,landwirtschaft,0.025,fordern,0.074,setzen,0.032,paypal,0.115,ausbau,0.020,auch,0.063
1,gut,0.027,erhalten,0.023,dazu,0.015,regional,0.022,nur,0.030,ein,0.025,alternativefuer,0.115,digital,0.020,lehnen,0.033
2,kultur,0.018,frei,0.016,insbesondere,0.013,mensch,0.018,familie,0.020,deshalb,0.015,spenden,0.115,eu,0.017,hoch,0.019
3,sprache,0.014,neu,0.013,ermoglichen,0.013,europaisch,0.015,bleiben,0.017,wald,0.012,burger,0.020,versorgung,0.017,fuhren,0.017
4,schule,0.013,nutzung,0.012,landlich,0.013,starken,0.015,daher,0.016,dafur,0.011,land,0.013,klima_energie_technik_digitalisierung,0.016,sozial,0.017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,stellung,0.002,bitte,0.002,bzw,0.002,konventionell,0.002,last,0.002,gesondert,0.002,ungerechtigkeit,0.001,anteil,0.002,wie,0.002
96,kulturgut,0.002,iban,0.002,standig,0.002,islam,0.002,beteiligter,0.002,gesicherte,0.002,app,0.001,fachen,0.002,zugute,0.002
97,rentensystem,0.002,folgend,0.002,lebensabend,0.002,entweren,0.002,festgelegt,0.002,zensur,0.002,endlich,0.001,erwarmung,0.002,derart,0.002
98,festschreiben,0.002,abgabenlast,0.002,rentenalter,0.002,aufheizung,0.002,wirken,0.002,werbung,0.002,grundsicherung,0.001,emissione,0.002,zugang,0.002


In [30]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(lda_model,corpus,id2word,mds='mmds',R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
