# Trump Bot: Topic Clustering mit TF-IDF

In [1]:
import re
import operator
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
# Einlesen des Datensatzes, "trump_tweets.csv"
#cp1252 encoding nur für MacOS
train = pd.read_csv("trump_tweets.csv", engine="python", encoding="cp1252")

In [4]:
train.head()

Unnamed: 0,source,text,created_at,favorite_count,id_str
0,Twitter for iPhone,It is a stunner by any stretch of the imaginat...,06-05-2020 13:01:38,38831.0,1.268891e+18
1,Twitter for iPhone,It’s a stupendous number. It’s joyous let’s ca...,06-05-2020 12:59:17,35164.0,1.26889e+18
2,Twitter for iPhone,Oh no the Dems are worried again. The only one...,06-05-2020 12:54:18,59423.0,1.268889e+18
3,Twitter for iPhone,Congratulations to wonderful Charles Payne on ...,06-05-2020 12:51:00,45021.0,1.268888e+18
4,Twitter for iPhone,I will be doing a News Conference at 10:00 A.M...,06-05-2020 12:48:41,59473.0,1.268887e+18


In [5]:
train.shape

(49355, 5)

In [6]:
# Hier wird die Tweet Variable deklariert
tweets = train.text
tweets.head

<bound method NDFrame.head of 0        It is a stunner by any stretch of the imaginat...
1        It’s a stupendous number. It’s joyous let’s ca...
2        Oh no the Dems are worried again. The only one...
3        Congratulations to wonderful Charles Payne on ...
4        I will be doing a News Conference at 10:00 A.M...
                               ...                        
49350    My persona will never be that of a wallflower ...
49351    New Blog Post: Celebrity Apprentice Finale and...
49352    Donald Trump reads Top Ten Financial Tips on L...
49353    Donald Trump will be appearing on The View tom...
49354    Be sure to tune in and watch Donald Trump on L...
Name: text, Length: 49355, dtype: object>

# Tweets "säubern" mit regulären Ausdrücken

In [7]:
def clean_tweet(tweet):
    if (type(tweet) == str):
        if (not re.search(r"^RT.*$", tweet) and not re.search(r"^http.*$", tweet)):
            tweet = str(tweet).lower()
            tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
            tweet = re.sub(r'[_"\-;%()|.,+&=*%]', '', tweet)
            tweet = re.sub(r'\.', ' . ', tweet)
            tweet = re.sub(r'\!', ' !', tweet)
            tweet = re.sub(r'\?', ' ?', tweet)
            tweet = re.sub(r'\,', ' ,', tweet)
            tweet = re.sub(r':', ' : ', tweet)
            tweet = re.sub(r'#', ' # ', tweet)
            tweet = re.sub(r'@', ' @ ', tweet)
            tweet = re.sub(r'd .c .', 'd.c.', tweet)
            tweet = re.sub(r'u .s .', 'd.c.', tweet)
            tweet = re.sub(r' amp ', ' and ', tweet)
            tweet = re.sub(r'pm', ' pm ', tweet)
            tweet = re.sub(r'news', ' news ', tweet)
            tweet = re.sub(r' . . . ', ' ', tweet)
            tweet = re.sub(r' .  .  . ', ' ', tweet)
            tweet = re.sub(r' ! ! ', ' ! ', tweet)
            tweet = re.sub(r'&amp', 'and', tweet)
            return tweet
        else:
            return None
    else:
        return None

In [8]:
# Tweets säubern und alle Tweets in clean_tweets packen, dessen Länge > 0 ist.
clean_tweets = []
for tweet in tweets:
    tweet = clean_tweet(tweet)
    if tweet != "none":
        if tweet != None:
            if len(tweet) > 0:
                clean_tweets.append(tweet)
        
len(clean_tweets)

38098

In [9]:
# Prüfung, ob die Tweets gesäubert wurden:
x = 0
for i in range(x, x+5):
    print("Tweet" + str(i))
    print(clean_tweets[i])
    print()

Tweet0
it is a stunner by any stretch of the imagination !  @ cnbc

Tweet1
it’s a stupendous number it’s joyous let’s call it like it is the market was right it’s stunning !  @ jimcramer   @ cnbc

Tweet2
oh no the dems are worried again the only one that can kill this comeback is sleepy joe biden !

Tweet3
congratulations to wonderful charles payne on having been so optimistic and therefore correct market up big ! !

Tweet4
i will be doing a  news  conference at 10 : 00 am on the jobs numbers ! white house



# Berechnung der TF-IDF Werte

In [10]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(clean_tweets)

In [11]:
#Berechnung der TF-IDF Werte
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
tfidf_vectors=tfidf_vectorizer.fit_transform(clean_tweets)

In [12]:
#So sieht ein einzelner Vektor, nach absteigenden TF-IDF Werten sortiert, aus
example_vector=tfidf_vectors[0]
df = pd.DataFrame(example_vector.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
print(df.sort_values(by=["tfidf"],ascending=False))

                   tfidf
stunner         0.528110
stretch         0.508383
imagination     0.416084
cnbc            0.363452
any             0.259381
...                  ...
fitbafan        0.000000
fitch           0.000000
fiteswithheart  0.000000
fitn            0.000000
élysée          0.000000

[37181 rows x 1 columns]


In [13]:
#Umwandlung der Werte in eine Matrix um die pca-Metode auf die Werte anwenden zu können
matrix = tfidf_vectors.todense()

# Clustern der Tweets in Gruppen

In [14]:
# Mit der Hauptkomponentenanalyse (PCA) reduzieren wir die Dimension eines jeden Tweets auf 1.
pca = PCA(n_components=1, random_state = 2)
pca_tweets = pca.fit_transform(matrix)

In [15]:
# Wir sehen, dass wir immer noch fast die gleiche Anzahl an Tweets haben. 
# Jeden Tweet stellt jetzt allerdings nur noch eine Zahl dar:
print("Anzahl: ", len(pca_tweets))
print(pca_tweets)

Anzahl:  38098
[[-0.05649444]
 [-0.04702602]
 [-0.06852488]
 ...
 [ 0.00189099]
 [-0.04942379]
 [-0.06038127]]


In [16]:
# Ähnliche Tweets gruppieren mit der K-Means-Methode.
# n_clusters gibt an, wieviele unterschiedliche Gruppen wir wollen
kmeans = KMeans(n_clusters=4, max_iter = 1000, n_init = 20, random_state=2).fit(pca_tweets)
labels = kmeans.labels_

In [17]:
# Wie viele Tweets sind in jeder Gruppe enthalten:
pd.DataFrame(labels)[0].value_counts()

0    16950
3    16205
2     3863
1     1080
Name: 0, dtype: int64

In [18]:
# Stopwords aufzählen, damit diese nicht bei den meist vorkommenden Wörter aufgelistet werden:
stop_words = ['be','on','!','at','.',':','...','@',',','#','will','.m','in','a','the','with','to','by','and','my','is',
              'of','for','new','via','are','that','has','have','all','as','it','so','they','do','he','just','this',
              'was','who','your','from','his','about','get','but','am','up','if','can','would','than','should','dont',
              'had','or','were','did','there','got','even','its','an','i', 'not', 'our', 'we','you', '?','no','their', 'us','rt','great',
             'realdonaldtrump', 'trump', 'very', 'thank', 'thanks', 'president', 'donald', 'what', 'news', 'me', 'never', 'out', 'now', 'good',
             'when', 'like', 'one', 'more', 'run', 'time', 'best', 'going', 'much', 'want', 'big', 'make', 'again', 'many', 'been', 'today', 'him', 'pm', 'true',
             'mr', 'them', 'only', 'back', 'yes', 'need', 'why' , 'tonight', 'over', 'really', 'how', 'other', 'being', 'see', 'show', 'doing', 'think', 'must',
            'trump2016', 'makeamericagreatagain', 'apprenticenbc', 'fox', 'foxandfriends', 'can\'t', 'don\'t', '00', 'i\'m', 'know', 'celebapprentice', 'love', 'vote',
             'america', 'her', '7', 'watch', 'please', '2016', 'it\'s', 'tomorrow', 'she', 'country', 'people', 'go', 'first', 'soon', 'nice', 'years', 'hope', 'needs',
             'you\'re', 'work', 'keep', 'day', 'job', 'better', 'working', 'man', 'could', 'ever', 'done', 'say', 'amazing', 'support', 'bad', 'happy', 'believe',
             'right', 'well', 'always', 'last', 'amazing', 'win', 'which', 'because', 'way', 'real', 'u', '–', 'next', 'you\'ve', 'agree', 'running', 'wait', 'total',
             'said', '“', '“donald', '10', 'words', 'wonderful', 'american', 'hard']

In [19]:
# Für jede Gruppe die am meisten vorkommenden Wörter finden:
def most_common_words(group, n_words):
    vocab = {} # das Vokabular für jede Gruppe
    for i in range(len(clean_tweets)):
        if labels[i] == group:
            for word in clean_tweets[i].split():
                if word not in vocab:
                    vocab[word] = 1
                else:
                    vocab[word] += 1
      
    # Sortiere die am häufigst vorkommenden Wörter
    sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
    top_n_words = []
    for word, value in sorted_vocab:
        if word not in stop_words:
            top_n_words.append(word)
        if len(top_n_words) == n_words:
            break
    print(top_n_words)

In [20]:
# die am meisten vorkommenden Worte in jeder Gruppe:
groups = len(np.unique(labels))
for i in range(groups):
    print("Gruppe ", i+1,": ")
    most_common_words(i, 10)

Gruppe  1 : 
['obama', 'interview', 'apprentice', 'night', 'golf', 'jobs', 'course', 'business', 'looking', 'congratulations']
Gruppe  2 : 
['maga', 'kag2020', 'poll', 'americafirst', 'carolina', 'florida', 'honor', 'iowa', 'hampshire', 'pennsylvania']
Gruppe  3 : 
['republican', 'poll', 'business', 'night', 'god', 'look', 'honor', 'sir', 'proud', 'down']
Gruppe  4 : 
['obama', 'democrats', 'fake', 'china', 'border', 'media', 'deal', 'world', 'states', 'united']


Durch diese Gruppen können übergreifende Topics deklariert werden. Zum Beispiel wäre ein mögliches Topic für Gruppe 2 "poll".

In [21]:
def print_tweet_group(group, n_tweets):
    '''Prints the first n_tweets in a group'''
    count = 1
    for i in range(len(clean_tweets)):
        if labels[i] == group:
            print("#{}: {}".format(count, clean_tweets[i]))
            count += 1
            if count == n_tweets+1:
                break

In [22]:
# Die ersten paar Tweets in jeder Gruppe:
n_tweets = 5
for i in range(groups):
    print("Gruppe #",i+1)
    print_tweet_group(i,n_tweets)
    print()

Gruppe # 1
#1: congratulations to wonderful charles payne on having been so optimistic and therefore correct market up big ! !
#2: this is an amazing jobs report ! edward lawrence  @ fox news 
#3: i am so stunned i’ve never seen numbers like this and i’ve been doing this for 30 years ! steve m  @ mariabartiromo
#4: these numbers are incredible !  @ mariabartiromo
#5: really big jobs report great going president trump kidding but true !

Gruppe # 2
#1: thank you matt ! 
#2: thank you mark ! 
#3: 100 correct thank you tom ! 
#4: thank you cowboys see you in new mexico ! 
#5: thank you lou ! 

Gruppe # 3
#1: thanks for your invaluable help in getting a great man passed to run voice of america trying for 25 years and you got it done jim idaho is proud of you ! 
#2: so great to have michael home just arrived very exciting thank you to iran don’t wait until after us election to make the big deal i’m going to win you’ll make a better deal now ! 
#3: unrelated i gave alaska anwr major highways