Read data

In [None]:
import pandas as pd

# Load the tweet archive from a relative CSV path, decoding the file as UTF-8.
csv_path = 'data/realdonaldtrump.csv'
df = pd.read_csv(csv_path, encoding='UTF-8')

# Show the dtype pandas inferred for each column as the cell output.
df.dtypes

Clean up the tweet column and write that to a new dataframe

In [None]:
import re

# Build the cleaned tweet text with vectorised string operations instead of a
# row-by-row `df.at[i, ...]` loop. This is considerably faster, does not
# require the index to be exactly 0..n-1, and leaves the raw `df` untouched,
# so the cell stays idempotent when it is re-run.
content_clean = (
    df['content']
    # Missing tweets become empty text rather than the literal word "nan".
    .fillna('')
    .astype(str)
    .str.lower()
    # Collapse every run of non-alphanumeric characters into a single space.
    .str.replace(r"[^a-zA-Z0-9]+", ' ', regex=True)
)

# New single-column frame holding only the cleaned text.
tweets_df_clean = content_clean.to_frame(name='content')
tweets_df_clean.head()

Create a histogram of tweet lengths (characters per tweet)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Character count of every cleaned tweet; this is the histogram input.
doc_lens = list(map(len, tweets_df_clean.content))

# Draw the length distribution on its own 10x6 inch figure.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(doc_lens, bins=100)
hist_ax.set_title('Verteilung der Tweetlänge')
hist_ax.set_ylabel('Anzahl der Tweets')
hist_ax.set_xlabel('Anzahl der Zeichen pro Tweet')

# Strip the top and right spines of the current figure.
sns.despine()

Create a word cloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Concatenate every tweet into one text. `str(series)` would only yield the
# truncated pandas repr (a few rows plus "Name: content, ... dtype: object"),
# so the cloud would not reflect the full corpus.
all_tweets_text = ' '.join(tweets_df_clean.content.dropna().astype(str))

stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=500,
    max_font_size=40,
    random_state=100,  # fixed seed so the word layout is reproducible
).generate(all_tweets_text)

# Use a fresh figure rather than `plt.figure(1)`, which can re-activate an
# already existing figure number 1 and draw on top of it.
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

Fit an LDA (Latent Dirichlet Allocation) topic model and print the top 10 words per topic

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words term counts: skip terms that occur in more than 95% of the
# tweets or in fewer than 2 tweets, drop English stop words, and cap the
# vocabulary at 1000 terms.
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tf = vectorizer.fit_transform(tweets_df_clean.content)
tf_feature_names = vectorizer.get_feature_names_out()

# 20-topic LDA trained with the online variational method; the fixed
# random_state keeps the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda.fit(tf)

# For each topic, list its highest-weighted terms, strongest first.
n_top_words = 10
for topic_idx, weights in enumerate(lda.components_):
    top_term_ids = weights.argsort()[::-1][:n_top_words]
    print('Thema %d:' % topic_idx)
    print([tf_feature_names[i] for i in top_term_ids])