In [1]:
import pandas as pd
import numpy as np
import re
import markovify
import json

In [2]:
# Einlesen des Datensatzes, "tweets_test" ist nur eine Test Datei.
train = pd.read_csv("trump_tweets.csv", engine="python", encoding="cp1252")
train.head()

Unnamed: 0,source,text,created_at,favorite_count,id_str
0,Twitter for iPhone,It is a stunner by any stretch of the imaginat...,06-05-2020 13:01:38,38831.0,1.268891e+18
1,Twitter for iPhone,It’s a stupendous number. It’s joyous let’s ca...,06-05-2020 12:59:17,35164.0,1.26889e+18
2,Twitter for iPhone,Oh no the Dems are worried again. The only one...,06-05-2020 12:54:18,59423.0,1.268889e+18
3,Twitter for iPhone,Congratulations to wonderful Charles Payne on ...,06-05-2020 12:51:00,45021.0,1.268888e+18
4,Twitter for iPhone,I will be doing a News Conference at 10:00 A.M...,06-05-2020 12:48:41,59473.0,1.268887e+18


In [3]:
# Hier wird die Tweet Variable deklariert
tweets = train.text
tweets.head

<bound method NDFrame.head of 0        It is a stunner by any stretch of the imaginat...
1        It’s a stupendous number. It’s joyous let’s ca...
2        Oh no the Dems are worried again. The only one...
3        Congratulations to wonderful Charles Payne on ...
4        I will be doing a News Conference at 10:00 A.M...
                               ...                        
49350    My persona will never be that of a wallflower ...
49351    New Blog Post: Celebrity Apprentice Finale and...
49352    Donald Trump reads Top Ten Financial Tips on L...
49353    Donald Trump will be appearing on The View tom...
49354    Be sure to tune in and watch Donald Trump on L...
Name: text, Length: 49355, dtype: object>

In [4]:
def clean_tweet(tweet):
    if (type(tweet) == str):
        if (not re.search(r"^RT.*$", tweet) and not re.search(r"^http.*$", tweet)):
            tweet = str(tweet).lower()
            tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
            tweet = re.sub(r'[_"\-;%()|.,+&=*%]', '', tweet)
            tweet = re.sub(r'\.', ' . ', tweet)
            tweet = re.sub(r'\!', ' !', tweet)
            tweet = re.sub(r'\?', ' ?', tweet)
            tweet = re.sub(r'\,', ' ,', tweet)
            tweet = re.sub(r':', ' : ', tweet)
            tweet = re.sub(r'#', ' # ', tweet)
            tweet = re.sub(r'@', ' @ ', tweet)
            tweet = re.sub(r'd .c .', 'd.c.', tweet)
            tweet = re.sub(r'u .s .', 'd.c.', tweet)
            tweet = re.sub(r' amp ', ' and ', tweet)
            tweet = re.sub(r'pm', ' pm ', tweet)
            tweet = re.sub(r'news', ' news ', tweet)
            tweet = re.sub(r' . . . ', ' ', tweet)
            tweet = re.sub(r' .  .  . ', ' ', tweet)
            tweet = re.sub(r' ! ! ', ' ! ', tweet)
            tweet = re.sub(r'&amp', 'and', tweet)
            return tweet
        else:
            return None
    else:
        return None

In [5]:
# Tweets säubern und alle Tweets in clean_tweets packen, dessen Länge > 0 ist.
clean_tweets = []
for tweet in tweets:
    tweet = clean_tweet(tweet)
    if tweet != "none":
        if tweet != None:
            if len(tweet) > 0:
                clean_tweets.append(tweet)
        
len(clean_tweets)

38098

In [6]:
# Prüfung, ob die Tweets gesäubert wurden:
x = 0
for i in range(x, x+5):
    print("Tweet" + str(i))
    print(clean_tweets[i])
    print()

Tweet0
it is a stunner by any stretch of the imagination !  @ cnbc

Tweet1
it’s a stupendous number it’s joyous let’s call it like it is the market was right it’s stunning !  @ jimcramer   @ cnbc

Tweet2
oh no the dems are worried again the only one that can kill this comeback is sleepy joe biden !

Tweet3
congratulations to wonderful charles payne on having been so optimistic and therefore correct market up big ! !

Tweet4
i will be doing a  news  conference at 10 : 00 am on the jobs numbers ! white house



In [7]:
# Funktion, die reguläre Ausdrücke in den jeweiligen Tweets findet und an eine List anhängt.
def find_topic(liste, find_terms):
    liste=[]
    for i in range(len(clean_tweets)):
        for word in clean_tweets[i].split():
            find = re.findall(find_terms,word)
            if find:
                liste.append(clean_tweets[i])
    return liste

In [8]:
# Listen für die jeweiligen Topics, die wir abfragen wollen. Zur besseren Übersicht wird für jedes Topic eine liste vergeben. Grundsätzlich würde eine leere Liste reichen.
dems=[]
republican=[]
iran=[]
election=[]
obama=[]
fake=[]
china=[]
media=[]
maga=[]
kag=[]
america=[]
interview=[]
job=[]
politic=[]
fox=[]
cnn=[]
nbc=[]
clinton=[]
russian=[]
covid=[]
family=[]
germany=[]
industry=[]
communism=[]
korea=[]
myself=[]
military=[]
health=[]
wall=[]
gun=[]

In [9]:
# Funktion auf die Topics anwenden, die wir verwenden wollen
democrats = find_topic(dems, "dems|democrats|democratic")
irans = find_topic(iran, "iran")
republicans = find_topic(republican,"republican")
elections = find_topic(election, "elections|poll|campaign")
obamas = find_topic(obama, "obama|barak|michelle|obamagate")
fakes = find_topic(fake,"fake|fake news")
chinas = find_topic(china,"china|chinese")
medias = find_topic(media,"media|mainstream media")
magas = find_topic(maga,"maga|make america great again")
kags = find_topic(kag,"kag|keep america great")
americas = find_topic(america,"america|american|usa|united states")
interviews = find_topic(interview,"interview|interviews")
jobs = find_topic(job,"jobs|work")
politics = find_topic(politic,"politics|politic")
foxs = find_topic(fox,"fox|fox news")
cnns = find_topic(cnn,"cnn|cnn news")
nbcs = find_topic(nbc,"nbc|nbc news")
clintons = find_topic(clinton,"hillary|clinton|bill")
russians = find_topic(russian,"russians|putin|russian|russia")
covids = find_topic(covid,"flu|corona|covid|virus|pandemic|chinese virus")
familys = find_topic(family,"ivanka|trump|melania")
germanys = find_topic(germany,"german|germany|merkel")
industrys = find_topic(industry,"industry|industrial")
koreas = find_topic(korea,"korea")
myselfs = find_topic(myself,"donald|trump")
militarys = find_topic(military,"military")
healths = find_topic(health,"health|care")
walls = find_topic(wall,"border|wall|mexico")
guns = find_topic(gun,"gun|weapon|rifle|shooting")

In [None]:


covids = find_topic(covid,"flu|corona|covid|virus|pandemic|chinese virus")



In [10]:
# Erstellen des Markovify-Modells, welches gleich in ein Json-Format umgewandelt wird.
markov_dems = markovify.Text(democrats, state_size = 3,well_formed=False).to_json()
markov_guns = markovify.Text(guns, state_size = 3, well_formed=False).to_json()
markov_walls = markovify.Text(walls, state_size = 3, well_formed=False).to_json()
markov_healths = markovify.Text(healths, state_size = 3, well_formed=False).to_json()
markov_militarys = markovify.Text(militarys, state_size = 3, well_formed=False).to_json()
markov_myselfs = markovify.Text(myselfs, state_size = 3, well_formed=False).to_json()
markov_koreas = markovify.Text(koreas, state_size = 3, well_formed=False).to_json()
markov_industrys = markovify.Text(industrys, state_size = 3, well_formed=False).to_json()
markov_germanys = markovify.Text(germanys, state_size = 3, well_formed=False).to_json()
markov_familys = markovify.Text(familys, state_size = 3, well_formed=False).to_json()
markov_covids = markovify.Text(covids, state_size = 3, well_formed=False).to_json()
markov_russians = markovify.Text(russians, state_size = 3, well_formed=False).to_json()
markov_clintons = markovify.Text(clintons, state_size = 3, well_formed=False).to_json()
markov_nbcs = markovify.Text(nbcs, state_size = 3, well_formed=False).to_json()
markov_cnns = markovify.Text(cnns, state_size = 3, well_formed=False).to_json()
markov_foxs = markovify.Text(foxs, state_size = 3, well_formed=False).to_json()
markov_politics = markovify.Text(politics, state_size = 3, well_formed=False).to_json()
markov_jobs = markovify.Text(jobs, state_size = 3, well_formed=False).to_json()
markov_interviews = markovify.Text(interviews, state_size = 3, well_formed=False).to_json()
markov_americas = markovify.Text(americas, state_size = 3, well_formed=False).to_json()
markov_kags = markovify.Text(kags, state_size = 3, well_formed=False).to_json()
markov_magas = markovify.Text(magas, state_size = 3, well_formed=False).to_json()
markov_medias = markovify.Text(medias, state_size = 3, well_formed=False).to_json()
markov_chinas = markovify.Text(chinas, state_size = 3, well_formed=False).to_json()
markov_fakes = markovify.Text(fakes, state_size = 3, well_formed=False).to_json()
markov_obamas = markovify.Text(obamas, state_size = 3, well_formed=False).to_json()
markov_elections = markovify.Text(elections, state_size = 3, well_formed=False).to_json()
markov_republicans = markovify.Text(republicans, state_size = 3, well_formed=False).to_json()
markov_irans = markovify.Text(irans, state_size = 3, well_formed=False).to_json()

In [11]:
# Abspeichern und Export der einzelnen Modelle als ".txt"-Datei.
with open('markov_dems.txt', 'w') as outfile:
    json.dump(dems_json, outfile)
    
with open('markov_healths.txt', 'w') as outfile:
    json.dump(markov_healths, outfile)
    
with open('markov_iran.txt', 'w') as outfile:
    json.dump(markov_irans, outfile)
    
with open('markov_republicans.txt', 'w') as outfile:
    json.dump(markov_republicans, outfile)
    
with open('markov_elections.txt', 'w') as outfile:
    json.dump(markov_elections, outfile)
    
with open('markov_obamas.txt', 'w') as outfile:
    json.dump(markov_obamas, outfile)
    
with open('markov_fakes.txt', 'w') as outfile:
    json.dump(markov_fakes, outfile)
    
with open('markov_china.txt', 'w') as outfile:
    json.dump(markov_chinas, outfile)
    
with open('markov_media.txt', 'w') as outfile:
    json.dump(markov_medias, outfile)
    
with open('markov_maga.txt', 'w') as outfile:
    json.dump(markov_magas, outfile)
    
with open('markov_kag.txt', 'w') as outfile:
    json.dump(markov_kags, outfile)
    
with open('markov_america.txt', 'w') as outfile:
    json.dump(markov_americas, outfile)
    
with open('markov_interview.txt', 'w') as outfile:
    json.dump(markov_interviews, outfile)
    
with open('markov_jobs.txt', 'w') as outfile:
    json.dump(markov_jobs, outfile)
    
with open('markov_politics.txt', 'w') as outfile:
    json.dump(markov_politics, outfile)
    
with open('markov_fox.txt', 'w') as outfile:
    json.dump(markov_foxs, outfile)
    
with open('markov_cnn.txt', 'w') as outfile:
    json.dump(markov_cnns, outfile)
    
with open('markov_nbc.txt', 'w') as outfile:
    json.dump(markov_nbcs, outfile)
    
with open('markov_clinton.txt', 'w') as outfile:
    json.dump(markov_clintons, outfile)
    
with open('markov_russia.txt', 'w') as outfile:
    json.dump(markov_russians, outfile)
    
with open('markov_covid.txt', 'w') as outfile:
    json.dump(markov_covids, outfile)
    
with open('markov_family.txt', 'w') as outfile:
    json.dump(markov_familys, outfile)
    
with open('markov_germany.txt', 'w') as outfile:
    json.dump(markov_germanys, outfile)
    
with open('markov_industry.txt', 'w') as outfile:
    json.dump(markov_industrys, outfile)
    
with open('markov_korea.txt', 'w') as outfile:
    json.dump(markov_koreas, outfile)
    
with open('markov_myself.txt', 'w') as outfile:
    json.dump(markov_myselfs, outfile)
    
with open('markov_military.txt', 'w') as outfile:
    json.dump(markov_militarys, outfile)
    
with open('markov_walls.txt', 'w') as outfile:
    json.dump(markov_walls, outfile)
    
with open('markov_guns.txt', 'w') as outfile:
    json.dump(markov_guns, outfile)

NameError: name 'json' is not defined

In [15]:
# Durch diese Zeile können wir unsere Modelle wieder importieren.
reconstituted_model = markovify.Text.from_json(markov_familys)
print(reconstituted_model.make_short_sentence(140))
print(reconstituted_model.make_sentence_with_start('melania'))

via @ news maxmedia @ dpatten32 time for trump to get to 3 even though they said
melania and i extend our deepest condolences to president reuven rivlin and the entire trump family i want to go to jail for flying trump flag breitbart
