# Data preprocessing

## 1. Prepare pipelines

In [1]:
# One-time setup: download the German spaCy pipeline from a shell before running this notebook:
#   python -m spacy download de_core_news_sm

In [2]:
import re
import pickle
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn' based on false positives
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
from tqdm.notebook import tqdm

tqdm.pandas()

Problem: Tweets, die über mehrere Tweets gehen (Threads).

### 2.2 Speeches

Ein Problem sind die Fälle, in denen wir englische und deutsche Sätze in einem Tweet haben.

In [3]:
@Language.component("Remove non alphabetic words")
def remove_non_alpha(doc):
    """Keep only the items whose ``is_alpha`` flag is truthy.

    Args:
        doc: Iterable of token-like objects exposing ``is_alpha``.

    Returns:
        list: The alphabetic items in their original order. The result is a
        plain Python list, not a spaCy ``Doc``.
    """
    # NOTE(review): spaCy documents components as Doc -> Doc; newer spaCy
    # releases may reject a list return value -- confirm against the pinned version.
    alphabetic_tokens = []
    for token in doc:
        if token.is_alpha:
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [4]:
@Language.factory("Detect languages")
def create_language_detector(nlp, name):
    """Factory registered as "Detect languages"; builds a ``LanguageDetector``.

    Args:
        nlp: Pipeline object passed in by the factory mechanism (unused here).
        name: Name of the component instance (unused here).

    Returns:
        LanguageDetector: A new detector instance.
    """
    # Presumably ``None`` selects the library's built-in detection function -- confirm.
    detector = LanguageDetector(language_detection_function=None)
    return detector

In [5]:
@Language.component("Keep only German documents")
def remove_non_german(doc):
    """Keep only the tokens of sentences detected as German.

    Reads ``sent._.language["language"]`` for every sentence in ``doc.sents``;
    this attribute is expected to be set by an upstream language-detection
    component.

    Args:
        doc: Document with sentence boundaries and per-sentence language info.

    Returns:
        A flat list of the tokens of all sentences tagged ``"de"``, or an
        empty ``Doc`` when no sentence is tagged as German.
    """
    german_sentences = []
    for sent in doc.sents:
        if sent._.language["language"] == "de":
            german_sentences.append(sent)

    # Nothing German in this document: hand back an empty Doc.
    if not german_sentences:
        return Doc(Vocab([]), words=[], spaces=[])

    german_tokens = []
    for sent in german_sentences:
        german_tokens.extend(sent)
    return german_tokens

In [6]:
@Language.component("Remove stopwords")
def remove_stopwords(doc):
    """Drop every item flagged as a stop word.

    Note: a later cell defines another function with the same Python name and
    registers it as "Remove stopwords and punctuation". This definition is the
    one decorated with the component name "Remove stopwords".

    Args:
        doc: Iterable of token-like objects exposing ``is_stop``.

    Returns:
        list: The non-stop-word items in their original order.
    """
    content_tokens = []
    for token in doc:
        if token.is_stop:
            continue
        content_tokens.append(token)
    return content_tokens

In [7]:
@Language.component("Lemmatize text")
def lemmatize_text(doc):
    """Replace every token by its lemma string.

    Args:
        doc: Iterable of token-like objects exposing ``lemma_``.

    Returns:
        list: One ``lemma_`` value per input item, in order.
    """
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [8]:
@Language.component("Lowercase Text")
def lowercase(doc):
    """Lower-case every item by calling its ``lower()`` method.

    The items must provide a callable ``lower`` (e.g. ``str``). In the
    pipelines built in this notebook this step is added right after
    "Lemmatize text", which returns a list of lemma strings.

    Args:
        doc: Iterable of items with a ``lower()`` method.

    Returns:
        list: The lower-cased items in their original order.
    """
    lowered = []
    for token in doc:
        lowered.append(token.lower())
    return lowered

In [9]:
# Code point ranges and single code points that make up the emoji character class.
# The entries are concatenated in this exact order to build the pattern.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
    "\U00002500-\U00002BEF",
    "\U00002702-\U000027B0",  # dingbats
    "\U00002702-\U000027B0",  # same range listed twice (harmless inside a character class)
    "\U000024C2-\U0001F251",  # very broad: also covers non-emoji scripts such as CJK
    "\U0001f926-\U0001f937",
    "\U00010000-\U0010ffff",  # all supplementary planes
    "\u2640-\u2642",
    "\u2600-\u2B55",
    "\u200d",  # zero-width joiner
    "\u23cf",
    "\u23e9",
    "\u231a",
    "\ufe0f",  # variation selector-16
    "\u3030",
)

# Matches one or more consecutive characters from the ranges above.
emoji_codes = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)

@Language.component("Remove emojis")
def remove_emojis(doc):
    """Drop every token whose text starts with a character matched by ``emoji_codes``.

    Fixes two undefined names of the previous version: the compiled pattern
    is called ``emoji_codes`` (not ``emoji``), and the result is now built
    from the incoming document's own vocab instead of the global
    ``nlp_twitter``, which is not defined in this notebook.

    Args:
        doc: spaCy ``Doc`` to filter.

    Returns:
        Doc: A new document containing the remaining token texts, each
        followed by a single space.
    """
    # ``match`` anchors at the start of the token text, as before.
    kept_words = [token.text for token in doc if not emoji_codes.match(token.text)]
    return Doc(doc.vocab, words=kept_words)

In [10]:
@Language.component("Remove URLs")
def remove_urls(doc):
    """Drop every token flagged as URL-like (``token.like_url``).

    The result is built from the incoming document's own vocab instead of
    the global ``nlp_twitter``, which is not defined in this notebook and
    would raise a ``NameError`` when the component runs.

    Args:
        doc: spaCy ``Doc`` to filter.

    Returns:
        Doc: A new document containing the remaining token texts, each
        followed by a single space.
    """
    kept_words = [token.text for token in doc if not token.like_url]
    return Doc(doc.vocab, words=kept_words)

In [11]:
@Language.component("Remove mentions")
def remove_mentions(doc):
    """Drop every token whose text starts with ``@`` (Twitter mentions).

    The result is built from the incoming document's own vocab instead of
    the global ``nlp_twitter``, which is not defined in this notebook and
    would raise a ``NameError`` when the component runs.

    Args:
        doc: spaCy ``Doc`` to filter.

    Returns:
        Doc: A new document containing the remaining token texts, each
        followed by a single space.
    """
    # ``startswith("@")`` is equivalent to the former ``re.match("@.*", ...)``.
    kept_words = [token.text for token in doc if not token.text.startswith("@")]
    return Doc(doc.vocab, words=kept_words)

In [12]:
@Language.component("Remove stopwords and punctuation")
def remove_stopwords(doc):
    """Return the texts of all tokens that are neither stop words nor punctuation.

    Note: this reuses the Python name ``remove_stopwords`` of the function
    decorated as "Remove stopwords" in an earlier cell, so the module-level
    name refers to this definition once the cell has run.

    Args:
        doc: Iterable of token-like objects exposing ``is_stop``, ``is_punct``
            and ``text``.

    Returns:
        list: The ``text`` of every kept token, in order.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

In [13]:
# Names of the model components passed as ``exclude=`` when the spaCy model is loaded.
pipeline_exclude = "tok2vec tagger parser attribute_ruler ner morphologizer".split()

Ein Problem sind die Fälle, in denen wir englische und deutsche Sätze in einem Tweet haben.

## 2. Topic Modelling

###  2.1 Tweets

In [14]:
# Load the tweets written by the exploration step.
tweets_csv_path = "../data/interim/tweets_explored.csv"
tweets_explored = pd.read_csv(tweets_csv_path)

In [15]:
# Show the loaded frame as the cell output (the rendered view is truncated for long frames).
tweets_explored

Unnamed: 0,text,retweet_count,like_count,party,full_name,date
0,Mit allen Fußballfans freue ich mich heute auf...,3,32.0,CDU,Ralph Brinkhaus,2021-06-15
1,Mit @antennedowideit habe ich außerdem noch üb...,0,5.0,CDU,Ralph Brinkhaus,2021-06-11
2,Wenn wir nachhaltig gegen den #Klimawandel käm...,0,4.0,CDU,Ralph Brinkhaus,2021-06-11
3,Wir brauchen nach der Pandemie gut bezahlte #A...,0,2.0,CDU,Ralph Brinkhaus,2021-06-11
4,In der #Wahldebatte von @welt und @insm ging e...,2,24.0,CDU,Ralph Brinkhaus,2021-06-11
...,...,...,...,...,...,...
164455,@andreasloeschel @zeitonline @LauraCwiertnia C...,0,1.0,Grüne,Annalena Baerbock,2017-11-09
164456,@DieLinkeBrdburg @Die_Gruenen Ich nehme das an...,0,13.0,Grüne,Annalena Baerbock,2017-11-08
164457,@LucaBrunsch widerspricht uns Grünen doch gar ...,0,0.0,Grüne,Annalena Baerbock,2017-11-08
164458,Weise Fußballerwahrheiten https://t.co/lQzp1p...,1,6.0,Grüne,Annalena Baerbock,2017-11-05


In [16]:
# Build the spaCy pipeline for tweets: load the small German model without
# the components listed in ``pipeline_exclude``.
nlp_tweets = spacy.load('de_core_news_sm', exclude=pipeline_exclude)

# Treat the Twitter artefacts "amp" and "rt" as stop words.
# NOTE(review): ``Defaults`` looks like class-level configuration, so this may
# also affect other pipelines of the same language -- confirm.
nlp_tweets.Defaults.stop_words |= {"amp", "rt"}

# Sentence splitting first, then the custom components in processing order.
nlp_tweets.add_pipe("sentencizer", last=True)
for component_name in (
    "Detect languages",
    "Keep only German documents",
    "Remove non alphabetic words",
    "Remove stopwords",
    "Lemmatize text",
    "Lowercase Text",
):
    nlp_tweets.add_pipe(component_name, name=component_name, last=True)

<function __main__.lowercase(doc)>

In [17]:
# Run every tweet text through the tweet pipeline (with a progress bar) and
# keep the pipeline result in a new column.
tweets_explored["text_preprocessed"] = tweets_explored["text"].progress_apply(nlp_tweets)

  0%|          | 0/164460 [00:00<?, ?it/s]

In [18]:
# Join the preprocessed tokens of each tweet into one space-separated string.
tweets_explored["text_preprocessed_sentence"] = tweets_explored["text_preprocessed"].progress_apply(
    lambda tokens: " ".join(tokens))

# Select the output columns, turn empty strings into missing values and drop
# every row with a missing value. Chaining returns a new frame, so the column
# slice of ``tweets_explored`` is not modified in place.
# ``np.nan`` replaces the ``np.NaN`` alias, which was removed in NumPy 2.0.
tweets_preprocessed = (
    tweets_explored[["full_name", "date", "party", "text", "text_preprocessed",
                     "text_preprocessed_sentence", 'retweet_count', 'like_count']]
    .replace('', np.nan)
    .dropna()
)

# Persist the result; the context manager closes the file handle after the dump.
with open("../data/processed/tweets_processed.p", "wb") as tweets_file:
    pickle.dump(tweets_preprocessed, tweets_file)

  0%|          | 0/164460 [00:00<?, ?it/s]

### 2.2 Speeches

In [19]:
# Load the speeches written by the exploration step.
speeches_csv_path = "../data/interim/speeches_explored.csv"
speeches_explored = pd.read_csv(speeches_csv_path)

In [20]:
# Rename the columns by position so they use the same names as the tweet frame.
# Assumes the CSV has exactly these four columns in this order -- TODO confirm.
speech_column_names = ["text", "date", "full_name", "party"]
speeches_explored.columns = speech_column_names

In [21]:
# Build the spaCy pipeline for speeches: load the small German model without
# the components listed in ``pipeline_exclude``.
nlp_speeches = spacy.load('de_core_news_sm', exclude=pipeline_exclude)

# Sentence splitting first, then the custom components in processing order.
nlp_speeches.add_pipe('sentencizer', last=True)
for component_name in (
    "Detect languages",
    "Keep only German documents",
    "Remove non alphabetic words",
    "Remove stopwords",
    "Lemmatize text",
    "Lowercase Text",
):
    nlp_speeches.add_pipe(component_name, name=component_name, last=True)

<function __main__.lowercase(doc)>

In [22]:
# Run every speech text through the speech pipeline (with a progress bar) and
# keep the pipeline result in a new column.
speeches_explored["text_preprocessed"] = speeches_explored["text"].progress_apply(nlp_speeches)

  0%|          | 0/4099 [00:00<?, ?it/s]

In [23]:
# Join the preprocessed tokens of each speech into one space-separated string.
speeches_explored["text_preprocessed_sentence"] = speeches_explored["text_preprocessed"].progress_apply(
    lambda tokens: " ".join(tokens))

# Select the output columns, turn empty strings into missing values and drop
# every row with a missing value. Chaining returns a new frame, so the column
# slice of ``speeches_explored`` is not modified in place.
# ``np.nan`` replaces the ``np.NaN`` alias, which was removed in NumPy 2.0.
speeches_preprocessed = (
    speeches_explored[["full_name", "date", "party", "text",
                       "text_preprocessed", "text_preprocessed_sentence"]]
    .replace('', np.nan)
    .dropna()
)

# Persist the result; the context manager closes the file handle after the dump.
with open("../data/processed/speeches_processed.p", "wb") as speeches_file:
    pickle.dump(speeches_preprocessed, speeches_file)

  0%|          | 0/4099 [00:00<?, ?it/s]

## 3. Sentiment Analysis

### 3.1 Tweets

### 3.2 Speeches