## 3.4 Data preprocessing

Before we can use the data for topic modeling and sentiment analysis, we need to preprocess it. This is done with a separate spaCy pipeline for each use case.

In [None]:
import re
import pickle

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from collections import Counter

import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
# python -m spacy download de_core_news_sm

from tqdm.notebook import tqdm
tqdm.pandas()

### 3.4.1 Prepare spacy pipelines

For creating the spacy pipelines we define individual components.

In [None]:
@Language.component("Remove non alphabetic words")
def remove_non_alpha(doc):
    """Keep only the tokens whose ``is_alpha`` flag is set.

    Iterates over ``doc`` and returns the surviving tokens as a plain list
    (not as a ``Doc``).
    """
    alphabetic_tokens = []
    for tok in doc:
        if tok.is_alpha:
            alphabetic_tokens.append(tok)
    return alphabetic_tokens

In [None]:
@Language.factory("Detect languages")
def create_language_detector(nlp, name):
    """Factory registered as "Detect languages".

    ``nlp`` and ``name`` are accepted but not used here; the detector is
    built with ``language_detection_function=None``.
    """
    detector = LanguageDetector(language_detection_function=None)
    return detector

In [None]:
@Language.component("Keep only German documents")
def remove_non_german(doc):
    """Drop every sentence that was not detected as German.

    Reads ``doc.sents`` and the ``language`` extension of each sentence.
    Returns a flat list with the tokens of all sentences tagged "de", or an
    empty ``Doc`` when no sentence qualifies.
    """
    german_sentences = list(
        filter(lambda sentence: sentence._.language["language"] == "de", doc.sents)
    )

    # Nothing German in this text: hand an empty document to the next step
    if not german_sentences:
        return Doc(Vocab([]), words=[], spaces=[])

    german_tokens = []
    for sentence in german_sentences:
        german_tokens.extend(sentence)
    return german_tokens

In [None]:
@Language.component("Remove stopwords")
def remove_stopwords(doc):
    """Return a list with all tokens of ``doc`` whose ``is_stop`` flag is not set."""
    return list(filter(lambda tok: not tok.is_stop, doc))

In [None]:
@Language.component("Lemmatize text")
def lemmatize_text(doc):
    """Replace every token by its lemma string (``token.lemma_``).

    Returns a list of strings, one per token, in the original order.
    """
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
@Language.component("Lowercase Text")
def lowercase(doc):
    """Call ``.lower()`` on every item of ``doc`` and return the results as a list."""
    return list(map(lambda item: item.lower(), doc))

In [None]:
# Compiled character class (one or more characters) used to recognise emoji tokens.
# NOTE(review): the range \U00002702-\U000027B0 is listed twice; the duplicate has no effect.
# NOTE(review): \U000024C2-\U0001F251 and \U00010000-\U0010ffff are very broad and also
# cover non-emoji characters (e.g. the CJK ideographs at U+4E00-U+9FFF) - confirm this is intended.
emoji_codes = re.compile("["
                         u"\U0001F600-\U0001F64F"
                         u"\U0001F300-\U0001F5FF"
                         u"\U0001F680-\U0001F6FF"
                         u"\U0001F1E0-\U0001F1FF"
                         u"\U00002500-\U00002BEF"
                         u"\U00002702-\U000027B0"
                         u"\U00002702-\U000027B0"
                         u"\U000024C2-\U0001F251"
                         u"\U0001f926-\U0001f937"
                         u"\U00010000-\U0010ffff"
                         u"\u2640-\u2642"
                         u"\u2600-\u2B55"
                         u"\u200d"
                         u"\u23cf"
                         u"\u23e9"
                         u"\u231a"
                         u"\ufe0f"
                         u"\u3030"
                         "]+", re.UNICODE)

@Language.component("Remove emojis")
def remove_emojis(doc):
    """Drop tokens whose text starts with a character matched by ``emoji_codes``.

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_twitter.make_doc``.
    NOTE(review): ``nlp_twitter`` is not defined in this part of the notebook;
    it has to exist before this component runs.
    """
    kept_texts = []
    for tok in doc:
        # re.match anchors at the start of the token text
        if re.match(emoji_codes, tok.text):
            continue
        kept_texts.append(tok.text)
    return nlp_twitter.make_doc(' '.join(kept_texts))

In [None]:
@Language.component("Remove URLs")
def remove_urls(doc):
    """Drop tokens flagged as URL-like (``token.like_url``).

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_twitter.make_doc``.
    NOTE(review): ``nlp_twitter`` is not defined in this part of the notebook;
    it has to exist before this component runs.
    """
    non_url_texts = (tok.text for tok in doc if not tok.like_url)
    cleaned_text = ' '.join(non_url_texts)
    return nlp_twitter.make_doc(cleaned_text)

In [None]:
@Language.component("Remove mentions")
def remove_mentions(doc):
    """Drop tokens whose text starts with "@".

    The remaining token texts are joined with single spaces and re-tokenized
    with ``nlp_twitter.make_doc``.
    NOTE(review): ``nlp_twitter`` is not defined in this part of the notebook;
    it has to exist before this component runs.
    """
    remaining_texts = []
    for tok in doc:
        is_mention = re.match("@.*", tok.text)
        if not is_mention:
            remaining_texts.append(tok.text)
    return nlp_twitter.make_doc(' '.join(remaining_texts))

In [None]:
@Language.component("Remove stopwords and punctuation")
def remove_stopwords(doc):
    """Return the texts of all tokens that are neither stop words nor punctuation.

    NOTE: this definition rebinds the Python name ``remove_stopwords`` used by
    the "Remove stopwords" component earlier in the file; both components stay
    registered under their own component names.
    """
    return [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]

### 3.4.2 Topic modeling preprocessing

In [None]:
# Built-in pipeline components that are left out when loading the spaCy model
pipeline_exclude = [
    'tok2vec',
    'tagger',
    'parser',
    'attribute_ruler',
    'ner',
    'morphologizer',
]

####  3.4.2.1 Tweets

In [None]:
# Import data
# Reads the tweets CSV from the interim data folder; the path is relative to
# the working directory of the notebook.
tweets_explored = pd.read_csv("../data/interim/tweets_explored.csv")

In [None]:
# Create spacy pipeline
# Load the German model without the components listed in pipeline_exclude
nlp_tweets = spacy.load('de_core_news_sm', exclude=pipeline_exclude)
# Add "amp" and "rt" as stop words (presumably remnants of "&amp;" and retweet markers).
# NOTE(review): Defaults looks like a class-level attribute, so this in-place update
# may also affect other German pipelines created in this session - confirm.
nlp_tweets.Defaults.stop_words |= {"amp", "rt"}

# Add needed pipeline components
# Every component is appended at the end, so they run in the order listed here.
# The order matters:
# - "Keep only German documents" reads doc.sents and the per-sentence language,
#   so the sentencizer and the language detector have to run before it.
# - "Remove non alphabetic words" and "Remove stopwords" read token flags and
#   "Lemmatize text" reads token.lemma_, so they need token objects.
# - "Lowercase Text" calls .lower() on each item and therefore runs last, on the
#   plain strings produced by "Lemmatize text".
nlp_tweets.add_pipe("sentencizer", last=True)
nlp_tweets.add_pipe("Detect languages", name='Detect languages', last=True)
nlp_tweets.add_pipe("Keep only German documents", name='Keep only German documents', last=True)
nlp_tweets.add_pipe("Remove non alphabetic words", name="Remove non alphabetic words", last=True)
nlp_tweets.add_pipe("Remove stopwords", name="Remove stopwords", last=True)
nlp_tweets.add_pipe("Lemmatize text", name="Lemmatize text", last=True)
nlp_tweets.add_pipe("Lowercase Text", name="Lowercase Text", last=True)

In [None]:
# Apply pipeline to text
# Uncomment if you want to update the preprocessing of the data 
# tweets_explored["text_preprocessed"] = tweets_explored.text.progress_apply(nlp_tweets)

In [None]:
# Add sentence structure
# Uncomment if you want to update the preprocessing of the data 
# tweets_explored["text_preprocessed_sentence"] = tweets_explored["text_preprocessed"].progress_apply(
#    lambda x: " ".join(x))

In [None]:
# Subset needed data
# Uncomment if you want to update the preprocessing of the data 
# tweets_preprocessed = tweets_explored[["full_name", "date", "party", "text", "text_preprocessed",
#                                       "text_preprocessed_sentence", 'retweet_count', 'like_count']]

In [None]:
# Drop empty texts
# Uncomment if you want to update the preprocessing of the data
# tweets_preprocessed.replace('', np.NaN, inplace=True)
# tweets_preprocessed.dropna(inplace=True)
# tweets_preprocessed.reset_index(drop = True, inplace = True)

In [None]:
# Save data as pickle file
# Uncomment if you want to update the preprocessing of the data
# pickle.dump(tweets_preprocessed, open("../data/processed/tweets_processed.p", "wb"))

#### 3.4.2.2 Speeches

In [None]:
# Import data
# Reads the speeches CSV from the interim data folder; the path is relative to
# the working directory of the notebook.
speeches_explored = pd.read_csv("../data/interim/speeches_explored.csv")

In [None]:
# Create spacy pipeline
# Load the German model without the components listed in pipeline_exclude
nlp_speeches = spacy.load('de_core_news_sm', exclude=pipeline_exclude)

# Add needed pipeline components
# Every component is appended at the end, so they run in the order listed here.
# The order matters:
# - "Keep only German documents" reads doc.sents and the per-sentence language,
#   so the sentencizer and the language detector have to run before it.
# - "Remove non alphabetic words" and "Remove stopwords" read token flags and
#   "Lemmatize text" reads token.lemma_, so they need token objects.
# - "Lowercase Text" calls .lower() on each item and therefore runs last, on the
#   plain strings produced by "Lemmatize text".
nlp_speeches.add_pipe('sentencizer', last=True)
nlp_speeches.add_pipe("Detect languages", name='Detect languages', last=True)
nlp_speeches.add_pipe("Keep only German documents", name='Keep only German documents', last=True)
nlp_speeches.add_pipe("Remove non alphabetic words", name="Remove non alphabetic words", last=True)
nlp_speeches.add_pipe("Remove stopwords", name="Remove stopwords", last=True)
nlp_speeches.add_pipe("Lemmatize text", name="Lemmatize text", last=True)
nlp_speeches.add_pipe("Lowercase Text", name="Lowercase Text", last=True)

In [None]:
# Apply pipeline to text
# Uncomment if you want to update the preprocessing of the data
# speeches_explored["text_preprocessed"] = speeches_explored.text.progress_apply(nlp_speeches)

In [None]:
# Add sentence structure
# Uncomment if you want to update the preprocessing of the data
# speeches_explored["text_preprocessed_sentence"] = speeches_explored["text_preprocessed"].progress_apply(
#    lambda x: " ".join(x))

In [None]:
# Subset needed data
# Uncomment if you want to update the preprocessing of the data
# speeches_preprocessed = speeches_explored[["full_name", "date", "party", "text",
#                                           "text_preprocessed", "text_preprocessed_sentence"]]

In [None]:
# Define function for removing frequent words
def remove_frequent_words(words_list, most_frequent_words):
    """Return the words of ``words_list`` that are not in ``most_frequent_words``.

    Parameters
    ----------
    words_list : iterable of str
        Words of one document, in order.
    most_frequent_words : iterable of str
        Words to drop. Any iterable of hashable items is accepted, including
        one-shot iterators.

    Returns
    -------
    list
        The kept words; order and duplicates of ``words_list`` are preserved.
    """
    # Build the lookup once: a hash-based container gives constant-time
    # membership tests instead of scanning the whole list for every word,
    # and materializing it keeps one-shot iterators from being exhausted
    # after the first lookup.
    if isinstance(most_frequent_words, (set, frozenset)):
        excluded = most_frequent_words
    else:
        excluded = frozenset(most_frequent_words)
    return [word for word in words_list if word not in excluded]

In [None]:
# Additional preprocessing for Bertopic model
# Uncomment if you want to update the preprocessing of the data
# long_string_speeches= ' '.join(speeches_preprocessed.text_preprocessed_sentence.tolist())
# counter_speeches = Counter(long_string_speeches.split())
# most_frequent_words = []
# for item in counter_speeches.most_common(200):
#    most_frequent_words.append(item[0])

In [None]:
# Add columns with preprocessed text and removed frequent words
# Uncomment if you want to update the preprocessing of the data
# speeches_preprocessed["text_preprocessed_infrequent"] = speeches_preprocessed.text_preprocessed.progress_apply(remove_frequent_words,most_frequent_words = most_frequent_words)
# speeches_preprocessed["text_preprocessed_infrequent_sentence"] = speeches_preprocessed["text_preprocessed_infrequent"].progress_apply(lambda x: " ".join(x))

In [None]:
# Drop empty texts
# Uncomment if you want to update the preprocessing of the data
# speeches_preprocessed.replace('', np.NaN, inplace=True)
# speeches_preprocessed.dropna(inplace=True)
# speeches_preprocessed.reset_index(drop = True, inplace = True)

In [None]:
# Save data as pickle file
# Uncomment if you want to update the preprocessing of the data
# pickle.dump(speeches_preprocessed, open("../data/processed/speeches_processed.p", "wb"))

### 3.4.3 Sentiment analysis preprocessing

#### 3.4.3.1 Tweets

#### 3.4.3.2 Speeches