# Topic Modeling

### 1) Load Data

In [44]:
# All Imports
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# Load data
# Prefer the project-relative CSV so the notebook runs on any checkout; this
# assumes the working directory is the folder that contains `data/transfer`
# (the export cell writes there with a relative path) — TODO confirm.
ARTICLES_CSV = Path("data") / "transfer" / "cleaned_articles_normalverteilt_3.csv"
# Original machine-specific location, kept as a fallback so existing setups keep working.
LEGACY_ARTICLES_CSV = Path(r'/Users/nicohehlke/DataspellProjects/NAK-Text-Analytics/src/data/transfer/cleaned_articles_normalverteilt_3.csv')
articles_csv = ARTICLES_CSV if ARTICLES_CSV.exists() else LEGACY_ARTICLES_CSV
df = pd.read_csv(articles_csv)

In [46]:
# Create a list of all texts. Missing values (NaN) are replaced by empty
# strings because CountVectorizer only accepts string documents; rows are
# kept (not dropped) so the result stays aligned with `df`.
texts = df["Cleaned_Text"].fillna("").astype(str).tolist()
# Calculate the term frequency (raw counts, not TF-IDF weights)
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(texts)
# Legacy alias: later cells refer to this matrix as `tfidf_matrix`,
# although it contains plain term counts.
tfidf_matrix = count_matrix
# Create LDA-model with 40 topics; the fixed random_state makes runs repeatable
lda_model = LatentDirichletAllocation(n_components=40, random_state=1)
# One row per text, one column per topic
lda_topic_matrix = lda_model.fit_transform(count_matrix)
print(lda_topic_matrix)

[[6.96378830e-05 1.04877690e-02 6.96378830e-05 ... 6.96378830e-05
  6.96378830e-05 6.96378830e-05]
 [2.47524752e-04 2.47524752e-04 2.47524752e-04 ... 2.47524752e-04
  2.47524752e-04 2.47524752e-04]
 [3.83435583e-05 9.01575849e-05 3.83435583e-05 ... 3.83435583e-05
  3.83435583e-05 5.91433085e-03]
 ...
 [1.08225108e-04 1.08225108e-04 1.08225108e-04 ... 1.08225108e-04
  1.08225108e-04 1.08225108e-04]
 [3.52112676e-04 3.52112676e-04 3.52112676e-04 ... 3.52112676e-04
  3.22678185e-01 3.52112676e-04]
 [2.77771976e-02 5.86854460e-05 5.86854460e-05 ... 5.86854460e-05
  9.02255982e-02 5.86854460e-05]]


In [47]:
# Determine, for every text, the topic with the highest weight in its row
# of the LDA document-topic matrix.
dominant_topics = list(np.argmax(lda_topic_matrix, axis=1))
first_text_topic = dominant_topics[0]
print(first_text_topic)
# Store the dominant topic next to the texts
df['dominant_topic'] = dominant_topics
topic_overview = df[['Cleaned_Text', 'dominant_topic']]
print(topic_overview)

18
                                            Cleaned_Text  dominant_topic
0      neu Rumor Porschebörsengang Sixt Berenberg stu...              18
1      Beiersdorf Aktie Kaufempfehlung beflügeln Bere...               5
2      Heidelbergcement klimaneutral Zementwerk Weg B...              36
3      Dax bleiben Rekordhoch Vortag lustlos setzen R...              16
4      Sartorius Impferfolg belasten Papier Laborausr...              23
...                                                  ...             ...
38516  Eqscms Siemens Aktiengesellschaft Veröffentlic...              28
38517  national Luftfahrtkonferenz Hamburg Kanzler Sc...              31
38518  Stiebeleltronwärmepumpe Gifhorn Contijob erhal...              23
38519  national Luftfahrtkonferenz Hamburg Kanzler Sc...              31
38520  Dpaafxüberblick Unternehmen Uhr Roundupvor Woh...              36

[38521 rows x 2 columns]


In [48]:
# Collect the five highest-weighted vocabulary terms of every LDA topic,
# most important term first.
feature_names = vectorizer.get_feature_names_out()
topic_words = [
    [feature_names[term_id] for term_id in np.argsort(weights)[::-1][:5]]
    for weights in lda_model.components_
]
# Open question from the author: what if a topic's most important word is
# already the top word of another topic? Taking the second most important
# word instead would arguably be wrong.
print(topic_words)

[['bank', 'deutsch', 'credit', 'suisse', 'jahr'], ['unternehmen', 'prozent', 'jahr', 'deutsch', 'deutschland'], ['urteil', 'bgh', 'eugh', 'schadenersatz', 'auto'], ['prozent', 'jahr', 'china', 'auto', 'bmw'], ['aufsichtsrat', 'gesellschaft', 'hauptversammlung', 'vergütung', 'vorstand'], ['prozent', 'aktie', 'beachten', 'erhalten', 'bedingung'], ['q3zahlen', 'deu', 'unternehmen', 'q3umsatz', 'termin'], ['ziel', 'eur', 'senken', 'heben', 'deu'], ['daimler', 'truck', 'zalando', 'puma', 'holding'], ['wphg', 'stimmrecht', 'goldman', 'sachs', 'summe'], ['euro', 'milliarde', 'jahr', 'prozent', 'quartal'], ['dax', 'onvista', 'inhalt', 'deutsch', 'information'], ['fresenius', 'fmc', 'medical', 'care', 'unternehmen'], ['aktie', 'jahr', 'euro', 'unternehmen', 'fool'], ['china', 'sagen', 'lützerath', 'polizei', 'deutsch'], ['euro', 'dpaafx', 'originalstudie', 'studie', 'broker'], ['prozent', 'punkt', 'dax', 'aktie', 'deutsch'], ['dax', 'deutsch', 'neu', 'woche', 'stehen'], ['prozent', 'dax', 'punk

In [50]:
# Create topic-clusters with kMeans
num_clusters = 40  # e.g. one cluster for each topic
km = KMeans(n_clusters=num_clusters, random_state=42)
km.fit(tfidf_matrix)

# Predict cluster for each text
texte_cluster = km.predict(tfidf_matrix)
print(texte_cluster)

# Mapping between topics and cluster: for every dominant LDA topic, remember
# the k-means cluster of the first text that has this topic.
# This mapping is informational only and is NOT used for labeling below.
themen_cluster_mapping = {}
for topic_per_text, cluster in zip(dominant_topics, texte_cluster):
    themen_cluster_mapping.setdefault(topic_per_text, cluster)

# Label each text with the most important word of its dominant LDA topic.
# `topic_words` is indexed by LDA topic; k-means cluster ids are an independent
# numbering and must not be used as an index into it (doing so yields labels
# of unrelated topics, or an IndexError when there are more clusters than topics).
automatic_labels = [topic_words[thema][0] for thema in dominant_topics]
df["topic"] = automatic_labels
# Export the DataFrame as an Excel file
df.to_excel('data//transfer//topic_modeling_v1.xlsx', index=False)

[ 8 36  1 ...  1 36  1]


### Baustelle (work in progress – disabled experiments)

In [None]:
# NOTE: everything between the opening and the closing triple quotes below is
# a single string literal, i.e. disabled scratch code that is never executed.
# It sketches a gensim-based LDA pipeline and a word cloud. It reads
# `df['Text']` and `df.text_processed`, which no other cell shown in this
# notebook uses — presumably columns of an earlier data version; verify
# before re-enabling any of it.
"""
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models

# Tokenisierung und Entfernung von Stoppwörtern
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return filtered_words

# Tokenisierte Texte vorbereiten
tokenized_texts = [preprocess_text(text) for text in df['Text']]

# Wort-Dictionary erstellen
dictionary = corpora.Dictionary(tokenized_texts)

# Texte in ein Bag-of-Words-Format umwandeln
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# LDA-Modell erstellen
lda_model = models.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=15)

# Funktion zum Zuordnen eines Textes zu einem Thema
def assign_topic(text):
    bow = dictionary.doc2bow(preprocess_text(text))
    topic_probs = lda_model.get_document_topics(bow)
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)  # Sortieren nach Wahrscheinlichkeit
    return topic_probs[0][0]  # Index des wahrscheinlichsten Themas

# Neue Spalte 'Thema' im DataFrame erstellen
df['Thema'] = df['Text'].apply(assign_topic)

# Ausgabe des DataFrames mit der neuen 'Thema'-Spalte
print(df[['Thema', 'Text']])

# 2. Versuch

In [None]:
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
long_string = ','.join(list(df['Text'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('german')
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]
data = df.text_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

In [None]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])
"""