# Data preprocessing

## 1. Import and view data

In [None]:
import pandas as pd

#### Tweets

In [None]:
# Load the collected tweets; the column pandas labelled "Unnamed: 0" when the
# file was written is reused as the row index.
tweets_raw = pd.read_csv(
    "../data/interim/Bundestag_Tweets.csv",
    index_col="Unnamed: 0",
)

In [None]:
# Preview the first rows of the raw tweet table.
tweets_raw.head()

In [None]:
# Print column dtypes and non-null counts to spot missing values.
tweets_raw.info()

In [None]:
# Discard, in place, every tweet row that has a missing value in any column.
tweets_raw.dropna(axis=0, how="any", inplace=True)

In [None]:
# Lookup table from the value found in the tweets' ``Username`` column to the
# politician's full name.
# NOTE(review): the purely numeric keys "22" and "5" look like placeholder
# values that replaced a real handle in the raw data -- confirm with the scraper.
usernames_to_fullname = {
    "rbrinkhaus": "Ralph Brinkhaus",
    "groehe": "Hermann Gröhe",
    "NadineSchoen": "Nadine Schön",
    "n_roettgen": "Norbert Röttgen",
    "peteraltmaier": "Peter Altmaier",
    "jensspahn": "Jens Spahn",
    "MatthiasHauer": "Matthias Hauer",
    "c_lindner": "Christian Lindner",
    "MarcoBuschmann": "Marco Buschmann",
    "starkwatzinger": "Bettina Stark-Watzinger",
    "Lambsdorff": "Alexander Graf Lambsdorff",
    "johannesvogel": "Johannes Vogel",
    "KonstantinKuhle": "Konstantin Kuhle",
    "MAStrackZi": "Marie-Agnes Strack-Zimmermann",
    "larsklingbeil": "Lars Klingbeil",
    "EskenSaskia": "Saskia Esken",
    "hubertus_heil": "Hubertus Heil",
    "HeikoMaas": "Heiko Maas",
    "MartinSchulz": "Martin Schulz",
    "KarambaDiaby": "Karamba Diaby",
    "Karl_Lauterbach": "Karl Lauterbach",
    "SteffiLemke": "Steffi Lemke",
    "cem_oezdemir": "Cem Özdemir",
    "GoeringEckardt": "Katrin Göring-Eckardt",
    "KonstantinNotz": "Konstantin von Notz",
    "22": "Konstantin von Notz",
    "BriHasselmann": "Britta Haßelmann",
    "svenlehmann": "Sven Lehmann",
    "ABaerbock": "Annalena Baerbock",
    "SWagenknecht": "Sahra Wagenknecht",
    "b_riexinger": "Bernd Riexinger",
    "NiemaMovassat": "Niema Movassat",
    "jankortemdb": "Jan Korte",
    "DietmarBartsch": "Dietmar Bartsch",
    "GregorGysi": "Gregor Gysi",
    "SevimDagdelen": "Sevim Dağdelen",
    "Alice_Weidel": "Alice Weidel",
    "Beatrix_vStorch": "Beatrix von Storch",
    "JoanaCotar": "Joana Cotar",
    "StBrandner": "Stephan Brandner",
    "Tino_Chrupalla": "Tino Chrupalla",
    "GtzFrmming": "Götz Frömming",
    "5": "Götz Frömming",
    "Leif_Erik_Holm": "Leif-Erik Holm",
}

In [None]:
# Translate each Username value into the politician's full name; values that
# are not keys of the mapping are left unchanged by ``replace``.
tweets_raw["full_name"] = tweets_raw["Username"].replace(usernames_to_fullname)

In [None]:
# Re-inspect the table after dropping incomplete rows and adding ``full_name``.
tweets_raw.info()

## 2. Preprocess data

#### Plenary protocols

In [None]:
# Load the already processed Bundestag speeches from the interim data folder.
speeches_raw = pd.read_csv(
    "../data/interim/bundestag_speeches_processed.csv",
)

In [None]:
# Preview the first rows of the speeches table.
speeches_raw.head()

In [None]:
# Print column dtypes and non-null counts of the speeches table.
speeches_raw.info()

In [None]:
# Remove, in place, the rows that have no speech text; gaps in other columns
# are tolerated.
speeches_raw.dropna(
    subset=["speech_content"],
    inplace=True,
)

In [None]:
# Build "<first name> <last name>" so speakers can be matched against the
# full names attached to the tweets.
speeches_raw["full_name"] = (
    speeches_raw["first_name"]
    + " "
    + speeches_raw["last_name"]
)

In [None]:
# Number of distinct speaker names in the full speeches table
# (a missing name is counted as a value of its own).
speeches_raw["full_name"].nunique(dropna=False)

In [None]:
# Keep only the speeches of politicians who also occur in the tweet data.
# ``.copy()`` turns the selection into an independent DataFrame: columns are
# added to ``speeches_subset`` later on, and assigning to the result of a
# boolean selection would otherwise trigger pandas' SettingWithCopyWarning.
speeches_subset = speeches_raw[
    speeches_raw.full_name.isin(tweets_raw.full_name.unique())
].copy()

In [None]:
# Number of distinct speakers left after restricting to politicians with tweets
# (a missing name is counted as a value of its own).
speeches_subset["full_name"].nunique(dropna=False)

### Speeches

In [None]:
from tqdm.notebook import tqdm
# Register tqdm's progress-bar methods (e.g. ``progress_apply``) on pandas objects.
tqdm.pandas()

import spacy
from spacy.language import Language
# The German model has to be installed once beforehand:
#   python -m spacy download de_core_news_sm

In [None]:
# Create the spaCy pipeline that normalises the plenary speeches.
#
# spaCy requires every pipeline component to return a ``Doc`` (newer releases
# reject anything else with a ValueError). The two filtering steps produce
# Python lists, so they are applied *after* the pipeline inside
# ``nlp_speeches`` instead of being registered as components. Calling
# ``nlp_speeches(text)`` still returns the list of remaining token strings.
pipeline_exclude = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'morphologizer']
_speeches_pipeline = spacy.load('de_core_news_sm', exclude=pipeline_exclude)
# Treat the indentation/newline artefacts of the protocols as stop words.
_speeches_pipeline.Defaults.stop_words |= {"\n  ", "\n\n  "}


@Language.component("Lemmatize text")
def lemmatize_text(doc):
    """Replace every token by its lemma and re-tokenize the joined text."""
    lemmas = ' '.join(token.lemma_ for token in doc)
    return _speeches_pipeline.make_doc(lemmas)


@Language.component("Lowercase Text")
def lowercase(doc):
    """Lowercase every token and re-tokenize the joined text."""
    lowered = ' '.join(token.lower_ for token in doc)
    return _speeches_pipeline.make_doc(lowered)


def remove_number_not_year(doc):
    """Return the tokens of *doc* without all-digit tokens of at most three characters.

    Longer digit tokens (such as four-digit years) are kept.
    """
    return [token for token in doc if not token.is_digit or len(token.text) > 3]


def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation."""
    return [token.text for token in doc if not token.is_stop and not token.is_punct]


# The add_pipe function appends our components to the default pipeline.
_speeches_pipeline.add_pipe("Lemmatize text", name="Lemmatize text", last=True)
_speeches_pipeline.add_pipe("Lowercase Text", name="Lowercase Text", last=True)


def nlp_speeches(text):
    """Preprocess one speech and return its cleaned tokens as a list of strings.

    Steps, in order: tokenize, lemmatize, lowercase, drop short numbers,
    drop stop words and punctuation.
    """
    doc = _speeches_pipeline(text)
    return remove_stopwords(remove_number_not_year(doc))

In [None]:
# Run every speech through the preprocessing pipeline, showing a progress bar.
speeches_subset["text_preprocessed"] = (
    speeches_subset["speech_content"].progress_apply(nlp_speeches)
)

In [None]:
# Glue the preprocessed tokens of each speech back into one space-separated string.
speeches_subset["text_preprocessed_sentence"] = (
    speeches_subset["text_preprocessed"].progress_apply(" ".join)
)

In [None]:
# Persist the preprocessed speeches for the later analysis steps.
# NOTE(review): ``text_preprocessed`` presumably holds token lists, which a CSV
# stores only as their string representation -- confirm how it is read back.
speeches_subset.to_csv("../data/processed/speeches_preprocessed.csv")

### Twitter data

In [None]:
# Compiled character class that matches one or more emoji / symbol code points.
# It is deliberately broad: several ranges overlap, and two of them cover far
# more than emojis (see the notes on the individual lines).
import re
emoji = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (duplicate of the previous range)
        u"\U000024C2-\U0001F251"  # very broad span from U+24C2 up to U+1F251
        u"\U0001f926-\U0001f937"  # person gestures (face palm .. shrug)
        u"\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
        u"\u2640-\u2642"  # female / male signs
        u"\u2600-\u2B55"  # miscellaneous symbols up to U+2B55
        u"\u200d"  # zero width joiner (glues emoji sequences together)
        u"\u23cf"  # eject symbol
        u"\u23e9"  # fast-forward symbol
        u"\u231a"  # watch
        u"\ufe0f"  # variation selector-16 (emoji presentation)
        u"\u3030"  # wavy dash
                      "]+", re.UNICODE)

In [None]:
# Create the spaCy pipeline that normalises tweets.
#
# spaCy requires every pipeline component to return a ``Doc`` (newer releases
# reject anything else with a ValueError). The final stop-word filter produces
# a Python list, so it is applied *after* the pipeline inside ``nlp_twitter``
# instead of being registered as a component. Calling ``nlp_twitter(text)``
# still returns the list of remaining token strings.
pipeline_exclude = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'morphologizer']
_twitter_pipeline = spacy.load('de_core_news_sm', exclude=pipeline_exclude)
# Also treat a whitespace artefact and the Twitter leftovers "amp" / "rt" as stop words.
_twitter_pipeline.Defaults.stop_words |= {"\n    ", "amp", "rt"}


@Language.component("Lemmatize text")
def lemmatize_text(doc):
    """Replace every token by its lemma and re-tokenize the joined text."""
    lemmas = ' '.join(token.lemma_ for token in doc)
    return _twitter_pipeline.make_doc(lemmas)


@Language.component("Lowercase Text")
def lowercase(doc):
    """Lowercase every token and re-tokenize the joined text."""
    lowered = ' '.join(token.lower_ for token in doc)
    return _twitter_pipeline.make_doc(lowered)


@Language.component("Remove URLs")
def remove_urls(doc):
    """Drop every token that spaCy flags as URL-like and re-tokenize the rest."""
    kept = ' '.join(token.text for token in doc if not token.like_url)
    return _twitter_pipeline.make_doc(kept)


@Language.component("Remove emojis")
def remove_emojis(doc):
    """Drop every token that starts with a character matched by ``emoji``."""
    kept = ' '.join(token.text for token in doc if not emoji.match(token.text))
    return _twitter_pipeline.make_doc(kept)


@Language.component("Remove mentions")
def remove_mentions(doc):
    """Drop every token that starts with "@" (user mentions)."""
    kept = ' '.join(token.text for token in doc if not token.text.startswith("@"))
    return _twitter_pipeline.make_doc(kept)


def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation."""
    return [token.text for token in doc if not token.is_stop and not token.is_punct]


# The add_pipe function appends our components to the default pipeline.
_twitter_pipeline.add_pipe("Lemmatize text", name="Lemmatize text", last=True)
_twitter_pipeline.add_pipe("Lowercase Text", name="Lowercase Text", last=True)
_twitter_pipeline.add_pipe("Remove URLs", name="Remove URLs", last=True)
_twitter_pipeline.add_pipe("Remove emojis", name="Remove emojis", last=True)
_twitter_pipeline.add_pipe("Remove mentions", name="Remove mentions", last=True)


def nlp_twitter(text):
    """Preprocess one tweet and return its cleaned tokens as a list of strings.

    Steps, in order: tokenize, lemmatize, lowercase, drop URLs, drop emojis,
    drop mentions, drop stop words and punctuation.
    """
    doc = _twitter_pipeline(text)
    return remove_stopwords(doc)

In [None]:
# Run every tweet through the preprocessing pipeline, showing a progress bar.
tweets_raw["text_preprocessed"] = (
    tweets_raw["Text"].progress_apply(nlp_twitter)
)

In [None]:
# Glue the preprocessed tokens of each tweet back into one space-separated string.
tweets_raw["text_preprocessed_sentence"] = (
    tweets_raw["text_preprocessed"].progress_apply(" ".join)
)

In [None]:
# Persist the preprocessed tweets for the later analysis steps.
# NOTE(review): ``text_preprocessed`` presumably holds token lists, which a CSV
# stores only as their string representation -- confirm how it is read back.
tweets_raw.to_csv("../data/processed/tweets_preprocessed.csv")