In [1]:
import re
import pickle
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn' based on false positives
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from tqdm.notebook import tqdm
from spacy_sentiws import spaCySentiWS
from spacy_sentiws import spaCySentiWS


tqdm.pandas()

In [2]:
@Language.component("Remove non alphabetic words")
def remove_non_alpha(doc):
    """Keep only the purely alphabetic tokens of ``doc``.

    Parameters
    ----------
    doc : iterable
        Yields token-like objects exposing an ``is_alpha`` flag.

    Returns
    -------
    list
        The tokens whose ``is_alpha`` flag is truthy, in their original order.
        NOTE(review): this is a plain list, not a spaCy ``Doc`` -- confirm the
        installed spaCy version accepts non-``Doc`` return values.
    """
    alphabetic_tokens = []
    for candidate in doc:
        if candidate.is_alpha:
            alphabetic_tokens.append(candidate)
    return alphabetic_tokens

In [3]:
@Language.factory("Detect languages")
def create_language_detector(nlp, name):
    """Factory that builds the language-detection pipeline component.

    ``nlp`` and ``name`` are accepted but not used. The detector is
    constructed with ``language_detection_function=None``.
    """
    detector = LanguageDetector(language_detection_function=None)
    return detector

In [4]:
@Language.factory("Sentiment Appplication")
def create_sentiment_dictionary(nlp, name):
    """Factory that builds the SentiWS sentiment component.

    ``nlp`` and ``name`` are accepted but not used. The component reads its
    dictionary from the relative path ``../data/raw/Sentiment/``.
    """
    sentiment_component = spaCySentiWS(sentiws_path="../data/raw/Sentiment/")
    return sentiment_component

In [5]:
@Language.component("Keep only German documents")
def remove_non_german(doc):
    """Keep only the tokens of sentences whose detected language is German.

    Each sentence in ``doc.sents`` is kept when
    ``sent._.language["language"]`` equals ``"de"``.

    Returns
    -------
    list or Doc
        A flat list of the tokens of all German sentences, or an empty
        ``Doc`` (built on a fresh ``Vocab``) when no sentence is German.
        NOTE(review): the two branches return different types -- callers
        only iterate over the result in the visible notebook.
    """
    german_sentences = list(
        filter(lambda sentence: sentence._.language["language"] == "de", doc.sents)
    )
    # Guard clause: nothing German in this document.
    if not german_sentences:
        return Doc(Vocab([]), words=[], spaces=[])

    german_tokens = []
    for sentence in german_sentences:
        german_tokens.extend(sentence)
    return german_tokens

In [6]:
@Language.component("Remove stopwords")
def remove_stopwords(doc):
    """Drop every token flagged as a stop word.

    Returns a plain list of the tokens whose ``is_stop`` flag is falsy, in
    their original order.

    NOTE: a later cell defines another function with this same Python name
    (registered as "Remove stopwords and punctuation"); after that cell runs,
    the module-level name ``remove_stopwords`` points to the later function.
    """
    content_tokens = []
    for candidate in doc:
        if candidate.is_stop:
            continue
        content_tokens.append(candidate)
    return content_tokens

In [7]:
@Language.component("Lemmatize text")
def lemmatize_text(doc):
    """Replace every token by its lemma string.

    Returns a list with the ``lemma_`` attribute of each item of ``doc``,
    in order (strings, no longer token objects).
    """
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [8]:
@Language.component("Lowercase Text")
def lowercase(doc):
    """Lowercase every item of ``doc`` and return the results as a list.

    Parameters
    ----------
    doc : iterable
        Either plain strings (e.g. the output of the lemmatizing component)
        or token-like objects exposing a ``lower_`` string attribute.

    Returns
    -------
    list of str
        The lowercased text of each item, in order.
    """
    # On spaCy tokens ``lower`` is an integer hash, not a method, so calling
    # ``token.lower()`` raises TypeError; ``lower_`` holds the string form.
    return [
        item.lower() if isinstance(item, str) else item.lower_
        for item in doc
    ]

In [9]:
# Compiled character class matching one or more consecutive code points from
# the ranges / single code points listed below.
# NOTE(review): "\U00002702-\U000027B0" is listed twice, and the ranges
# "\U000024C2-\U0001F251" and "\U00010000-\U0010ffff" are very wide (the
# former spans most of the BMP above U+24C2, the latter every supplementary
# plane), so far more than emoji is matched.
emoji_codes = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F1E0-\U0001F1FF",
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )) + "]+",
    flags=re.UNICODE,
)

@Language.component("Remove emojis")
def remove_emojis(doc):
    """Drop every token whose text starts with a character matched by ``emoji_codes``.

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_tweets.make_doc``, so a new ``Doc`` is returned.

    Fixes two NameErrors of the previous version: the pattern is called
    ``emoji_codes`` (not ``emoji``) and the notebook's pipeline object is
    ``nlp_tweets`` (``nlp_twitter`` is never defined in this notebook).
    """
    # ``match`` anchors at the start of the token text only.
    kept_words = [token.text for token in doc if not emoji_codes.match(token.text)]
    return nlp_tweets.make_doc(' '.join(kept_words))

In [10]:
@Language.component("Remove URLs")
def remove_urls(doc):
    """Drop every token whose ``like_url`` flag is set.

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_tweets.make_doc``, so a new ``Doc`` is returned.

    The previous version called ``nlp_twitter.make_doc``; no object of that
    name is defined in this notebook (the pipeline is ``nlp_tweets``), so the
    component raised NameError when executed.
    """
    kept_words = [token.text for token in doc if not token.like_url]
    return nlp_tweets.make_doc(' '.join(kept_words))

In [11]:
@Language.component("Remove mentions")
def remove_mentions(doc):
    """Drop every token whose text starts with ``@`` (Twitter mentions).

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_tweets.make_doc``, so a new ``Doc`` is returned.

    The previous version called ``nlp_twitter.make_doc``; no object of that
    name is defined in this notebook (the pipeline is ``nlp_tweets``), so the
    component raised NameError when executed.
    """
    # ``re.match("@.*", text)`` succeeds exactly when the text starts with "@".
    kept_words = [token.text for token in doc if not token.text.startswith("@")]
    return nlp_tweets.make_doc(' '.join(kept_words))

In [12]:
@Language.component("Remove stopwords and punctuation")
def remove_stopwords(doc):
    """Return the texts of all tokens that are neither stop words nor punctuation.

    The result is a list of strings (``token.text``), in original order.

    NOTE: this reuses the Python name of the earlier "Remove stopwords"
    component function and therefore shadows it at module level.
    """
    kept_texts = []
    for token in doc:
        # Same short-circuit order as checking ``not is_stop and not is_punct``.
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

In [13]:
# Names of pipeline components, judging by the variable name intended to be
# excluded when loading the model.
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook (the later `spacy.load` call does not receive it) -- confirm
# whether it should be passed to `spacy.load` or can be removed.
pipeline_exclude = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'morphologizer']

In [22]:
# Load the raw CSV inputs (paths are relative to the notebook's directory).
# The scoring cells further down filter `tweets_explored_wow` and
# `speeches_explored` on their `full_name` column and read their `text` column.
# NOTE(review): `tweets_explored` is not used by any other visible cell.
tweets_explored = pd.read_csv("../data/raw/Bundestag_Tweets.csv")
tweets_explored_wow = pd.read_csv("../data/raw/tweets_explored.csv")
speeches_explored = pd.read_csv("../data/raw/speeches_explored.csv")


In [15]:
# Create spacy pipeline
# Load the small German model and add "amp" and "rt" to its default stop-word
# set (presumably leftovers of "&amp;" and retweet markers -- TODO confirm).
nlp_tweets = spacy.load('de_core_news_sm')
nlp_tweets.Defaults.stop_words |= {"amp", "rt"}

# The add_pipe function appends our functions to the default pipeline.
# Order matters: every component receives whatever the previous one returned.
# "Keep only German documents" iterates doc.sents and reads sent._.language,
# so the sentencizer and the language detector are added before it
# (presumably the detector is what sets that attribute -- TODO confirm).
nlp_tweets.add_pipe("sentencizer", last=True)
nlp_tweets.add_pipe("Detect languages", name='Detect languages', last=True)
nlp_tweets.add_pipe("Keep only German documents", name='Keep only German documents', last=True)
# From here on the custom components return plain lists of tokens rather than
# a Doc; the following components only iterate over what they receive.
nlp_tweets.add_pipe("Remove non alphabetic words", name="Remove non alphabetic words", last=True)
nlp_tweets.add_pipe("Remove stopwords", name="Remove stopwords", last=True)
# Lemmatizing and lowercasing are currently disabled:
# nlp_tweets.add_pipe("Lemmatize text", name="Lemmatize text", last=True)
# nlp_tweets.add_pipe("Lowercase Text", name="Lowercase Text", last=True)
# The component name below (including its spelling) must match the name used
# in the @Language.factory registration. As the last expression of the cell,
# the returned component object is echoed as the cell output.
nlp_tweets.add_pipe("Sentiment Appplication", name="Sentiment Appplication", last=True)

<spacy_sentiws.spaCySentiWS at 0x125592250>

In [16]:
# Smoke test: run one German sentence through the pipeline and print the
# SentiWS weight attached to every token that survives the filters.
doc = nlp_tweets('Die Dummheit der Unterwerfung blüht in hübschen Farben.')

for token in doc:
    print(f'{token.text}, {token._.sentiws}')

Dummheit, -0.4877
Unterwerfung, -0.3279
blüht, 0.2028
hübschen, 0.4629
Farben, None


In [17]:
#import packages

import pandas as pd
from textblob_de import TextBlobDE as TextBlob
import numpy as np
from tqdm.notebook import tqdm

tqdm.pandas()

# Load the preprocessed tweets and speeches.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project's own preprocessing step.
import pickle

# Context managers close the file handles again (the bare open() calls used
# before left them open for the lifetime of the kernel).
with open('../data/processed/tweets_processed.p', 'rb') as tweets_file:
    pre_data_twitter = pickle.load(tweets_file)
with open('../data/processed/speeches_processed.p', 'rb') as speeches_file:
    pre_data_speeches = pickle.load(speeches_file)
pre_data_twitter.head()

Unnamed: 0,full_name,date,party,text_preprocessed,text_preprocessed_sentence
0,Ralph Brinkhaus,2021-06-15,CDU,"[fußballfans, freuen, spiel, nationalmannschaf...",fußballfans freuen spiel nationalmannschaft dr...
1,Ralph Brinkhaus,2021-06-11,CDU,"[außenpolitik, wirtschaftlich, souveränität, d...",außenpolitik wirtschaftlich souveränität digit...
2,Ralph Brinkhaus,2021-06-11,CDU,"[nachhaltig, klimawandel, kämpfen, brauchen, a...",nachhaltig klimawandel kämpfen brauchen akzept...
3,Ralph Brinkhaus,2021-06-11,CDU,"[brauchen, pandemie, bezahlen, arbeitsplätze, ...",brauchen pandemie bezahlen arbeitsplätze digit...
4,Ralph Brinkhaus,2021-06-11,CDU,"[wahldebatte, thema, zukunft, passieren, coron...",wahldebatte thema zukunft passieren corona sta...


In [48]:
# Apply the SentiWS dictionary to the tweets of the selected politicians.
# For every politician this collects:
#   [name, mean tweet polarity, #positive tweets, #neutral tweets, #negative tweets]
politicians = [
    'Ralph Brinkhaus', 'Hermann Gröhe', 'Nadine Schön', 'Norbert Röttgen', 'Peter Altmaier', 'Jens Spahn', 'Matthias Hauer',
    'Christian Lindner', 'Marco Buschmann', 'Bettina Stark-Watzinger', 'Alexander Graf Lambsdorff', 'Johannes Vogel', 'Konstantin Kuhle', 'Marie-Agnes Strack-Zimmermann',
    'Lars Klingbeil', 'Saskia Esken', 'Hubertus Heil', 'Heiko Maas', 'Martin Schulz', 'Karamba Diaby', 'Karl Lauterbach',
    'Steffi Lemke', 'Cem Özdemir', 'Katrin Göring-Eckardt', 'Konstantin von Notz', 'Britta Haßelmann', 'Sven Lehmann', 'Annalena Baerbock',
    'Sahra Wagenknecht', 'Bernd Riexinger', 'Niema Movassat', 'Jan Korte', 'Dietmar Bartsch', 'Gregor Gysi', 'Sevim Dağdelen',
    'Alice Weidel', 'Beatrix von Storch', 'Joana Cotar', 'Stephan Brandner', 'Tino Chrupalla', 'Götz Frömming', 'Leif-Erik Holm',
]

data = []
for name in tqdm(politicians):
    # All tweets of this politician, run through the spaCy pipeline.
    tweets_analyzing = tweets_explored_wow.loc[tweets_explored_wow['full_name'] == name]
    tweets_analyzing1 = tweets_analyzing.text.progress_apply(nlp_tweets)

    # Mean SentiWS weight per tweet. Tokens without a dictionary entry carry
    # None and are skipped; a tweet without any scored token gets NaN
    # (set explicitly to avoid numpy's "Mean of empty slice" warning).
    politician_sum = []
    for sentence in tweets_analyzing1:
        sentence_sum = [token._.sentiws for token in sentence if token._.sentiws is not None]
        sentence_score = np.nanmean(sentence_sum) if sentence_sum else np.nan
        politician_sum.append(sentence_score)

    # Average over the tweets that have a score (NaN tweets are ignored).
    politician_score = np.nanmean(politician_sum) if politician_sum else np.nan

    # Number of positive, neutral and negative tweets. Comparisons with NaN
    # are always False, so tweets without any scored token count as neutral,
    # exactly like tweets whose mean is 0.
    positive_p = sum(1 for score in politician_sum if score > 0)
    negative_p = sum(1 for score in politician_sum if score < 0)
    neutral_p = len(politician_sum) - positive_p - negative_p

    # Store one result row per politician.
    data.append([name, politician_score, positive_p, neutral_p, negative_p])
        

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/361 [00:00<?, ?it/s]

  sentence_score=np.nanmean(sentence_sum)


  0%|          | 0/847 [00:00<?, ?it/s]

  0%|          | 0/386 [00:00<?, ?it/s]

  0%|          | 0/2760 [00:00<?, ?it/s]

  0%|          | 0/1084 [00:00<?, ?it/s]

  0%|          | 0/1214 [00:00<?, ?it/s]

  0%|          | 0/10035 [00:00<?, ?it/s]

  0%|          | 0/4647 [00:00<?, ?it/s]

  0%|          | 0/14060 [00:00<?, ?it/s]

  0%|          | 0/2321 [00:00<?, ?it/s]

  0%|          | 0/3245 [00:00<?, ?it/s]

  0%|          | 0/7562 [00:00<?, ?it/s]

  0%|          | 0/4764 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/2582 [00:00<?, ?it/s]

  0%|          | 0/18706 [00:00<?, ?it/s]

  0%|          | 0/933 [00:00<?, ?it/s]

  0%|          | 0/2306 [00:00<?, ?it/s]

  0%|          | 0/302 [00:00<?, ?it/s]

  0%|          | 0/609 [00:00<?, ?it/s]

  0%|          | 0/6893 [00:00<?, ?it/s]

  0%|          | 0/1041 [00:00<?, ?it/s]

  0%|          | 0/2823 [00:00<?, ?it/s]

  0%|          | 0/3337 [00:00<?, ?it/s]

  0%|          | 0/3624 [00:00<?, ?it/s]

  0%|          | 0/3713 [00:00<?, ?it/s]

  0%|          | 0/3638 [00:00<?, ?it/s]

  0%|          | 0/1138 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/2529 [00:00<?, ?it/s]

  0%|          | 0/11694 [00:00<?, ?it/s]

  0%|          | 0/1339 [00:00<?, ?it/s]

  0%|          | 0/3366 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/1732 [00:00<?, ?it/s]

  0%|          | 0/2794 [00:00<?, ?it/s]

  0%|          | 0/5663 [00:00<?, ?it/s]

  0%|          | 0/14052 [00:00<?, ?it/s]

  0%|          | 0/792 [00:00<?, ?it/s]

  0%|          | 0/7265 [00:00<?, ?it/s]

  0%|          | 0/580 [00:00<?, ?it/s]

In [49]:
# Assemble the per-politician tweet results into one table and show it.
dataf = pd.DataFrame(
    data=data,
    columns=[
        'Name',
        'Polarity_mean',
        'Num_pos_tweets',
        'Num_neutral_tweets',
        'Num_neg_tweets',
    ],
)
display(dataf)

Unnamed: 0,Name,Polarity_mean,Num_pos_tweets,Num_neutral_tweets,Num_neg_tweets
0,Ralph Brinkhaus,0.026858,205,70,86
1,Hermann Gröhe,0.008751,350,350,147
2,Nadine Schön,0.080311,163,185,38
3,Norbert Röttgen,0.008242,1291,765,704
4,Peter Altmaier,0.02255,419,480,185
5,Jens Spahn,0.03279,727,236,251
6,Matthias Hauer,-0.046743,2584,5489,1962
7,Christian Lindner,-0.031693,1658,1808,1181
8,Marco Buschmann,-0.065289,3732,6838,3490
9,Bettina Stark-Watzinger,-0.023247,1044,516,761


In [44]:
# Apply the SentiWS dictionary to the speeches of the selected politicians.
# For every politician this collects:
#   [name, mean speech polarity, #positive speeches, #neutral speeches, #negative speeches]
# NOTE: the variable names below still say "tweets"/"sentence" because they
# are shared with the tweet-scoring cell; here they hold speeches.
politicians = [
    'Ralph Brinkhaus', 'Hermann Gröhe', 'Nadine Schön', 'Norbert Röttgen', 'Peter Altmaier', 'Jens Spahn', 'Matthias Hauer',
    'Christian Lindner', 'Marco Buschmann', 'Bettina Stark-Watzinger', 'Alexander Graf Lambsdorff', 'Johannes Vogel', 'Konstantin Kuhle', 'Marie-Agnes Strack-Zimmermann',
    'Lars Klingbeil', 'Saskia Esken', 'Hubertus Heil', 'Heiko Maas', 'Martin Schulz', 'Karamba Diaby', 'Karl Lauterbach',
    'Steffi Lemke', 'Cem Özdemir', 'Katrin Göring-Eckardt', 'Konstantin von Notz', 'Britta Haßelmann', 'Sven Lehmann', 'Annalena Baerbock',
    'Sahra Wagenknecht', 'Bernd Riexinger', 'Niema Movassat', 'Jan Korte', 'Dietmar Bartsch', 'Gregor Gysi', 'Sevim Dağdelen',
    'Alice Weidel', 'Beatrix von Storch', 'Joana Cotar', 'Stephan Brandner', 'Tino Chrupalla', 'Götz Frömming', 'Leif-Erik Holm',
]

data = []
for name in tqdm(politicians):
    # All speeches of this politician, run through the spaCy pipeline.
    tweets_analyzing = speeches_explored.loc[speeches_explored['full_name'] == name]
    tweets_analyzing1 = tweets_analyzing.text.progress_apply(nlp_tweets)

    # Mean SentiWS weight per speech. Tokens without a dictionary entry carry
    # None and are skipped; a speech without any scored token gets NaN
    # (set explicitly to avoid numpy's "Mean of empty slice" warning).
    politician_sum = []
    for sentence in tweets_analyzing1:
        sentence_sum = [token._.sentiws for token in sentence if token._.sentiws is not None]
        sentence_score = np.nanmean(sentence_sum) if sentence_sum else np.nan
        politician_sum.append(sentence_score)

    # Average over the speeches that have a score (NaN speeches are ignored).
    politician_score = np.nanmean(politician_sum) if politician_sum else np.nan

    # Number of positive, neutral and negative speeches. Comparisons with NaN
    # are always False, so speeches without any scored token count as
    # neutral, exactly like speeches whose mean is 0.
    positive_p = sum(1 for score in politician_sum if score > 0)
    negative_p = sum(1 for score in politician_sum if score < 0)
    neutral_p = len(politician_sum) - positive_p - negative_p

    # Store one result row per politician.
    data.append([name, politician_score, positive_p, neutral_p, negative_p])
        

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

  sentence_score=np.nanmean(sentence_sum)


  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/83 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/266 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/333 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/158 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

In [45]:
# Assemble the per-politician speech results into one table and show it.
dataf = pd.DataFrame(
    data=data,
    columns=[
        'Name',
        'Polarity_mean',
        'Num_pos_speeches',
        'Num_neutral_speeches',
        'Num_neg_speeches',
    ],
)
display(dataf)

Unnamed: 0,Name,Polarity_mean,Num_pos_tweets,Num_neutral_tweets,Num_neg_tweets
0,Ralph Brinkhaus,-0.022753,20,4,33
1,Hermann Gröhe,0.004228,60,4,35
2,Nadine Schön,0.015663,44,9,30
3,Norbert Röttgen,-0.061556,5,4,24
4,Peter Altmaier,0.004122,102,23,57
5,Jens Spahn,-0.000652,146,36,84
6,Matthias Hauer,-0.038118,11,2,25
7,Christian Lindner,-0.028265,11,1,38
8,Marco Buschmann,-0.043712,14,0,34
9,Bettina Stark-Watzinger,-0.070444,4,3,21
