# Preprocessing final

In [None]:
# download the language-specific spaCy models
!python -m spacy download de_core_news_sm
!python -m spacy download pl_core_news_sm
!python -m spacy download da_core_news_sm

In [None]:
# import packages
import os
import pandas as pd
import re
import spacy
import emoji

### Loading data
Set the working directory and load the data.

In [None]:
# set working directory
# NOTE(review): absolute, machine-specific Windows path; the relative input and output
# paths used by the later cells are resolved against this directory, so the notebook
# only runs as-is on a machine with this folder layout.
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter')

In [None]:
# Germany: import data (the relative path is resolved against the current working directory)
de = pd.read_csv(r'final_data_prepare1\de_original_text.csv')
print(de.shape)  # quick sanity check of the frame's dimensions
de.head()

In [None]:
# Denmark: import data
# Raw string: the backslash in the Windows-style relative path must be taken literally.
# In a normal string '\d' is an invalid escape sequence, which Python reports with a warning.
da = pd.read_csv(r'final_data_prepare1\da_original_text.csv')
print(da.shape)
da.head()

In [None]:
# Poland: import data
# Raw string: the backslash in the Windows-style relative path must be taken literally.
# In a normal string '\p' is an invalid escape sequence, which Python reports with a warning.
pl = pd.read_csv(r'final_data_prepare1\pl_original_text.csv')
print(pl.shape)
pl.head()

### Preprocessing
We create two slightly different preprocessed datasets: One that contains @mentions and a second one that does not contain mentions. This is because we are not entirely sure yet if we want to use the mentions or not for the topic modelling.
  
We conduct the following steps:

* Remove **URLs**.
* Remove **emojis** using the ``emoji`` package.
* **For the dataset without @mentions: Remove the mentions.**
* Remove ``&amp;`` (the HTML code for the **ampersand** symbol)
* **Replace '-' by an empty string:** This is important to keep words together that belong together. E.g. the German 'Impf-Reihenfolge' should be merged into one word 'ImpfReihenfolge' in order to not distort its meaning. This makes lemmatization more difficult esp. because 'ImpfReihenfolge' is not the correct spelling of the word (it should be 'Impfreihenfolge'), but since this will only affect a very small number of words, we deem it acceptable.
* **Remove ':', '\*' and *_*:** Again, this is mostly relevant for German. In German, nouns describing people (e.g. the word for 'doctor') usually come in a male ('Doktor') and female form ('Doktorin'). In recent years, there has been a movement to include both spellings as either 'Doktor_in', 'Doktor:in' or 'Doktor\*in' in an attempt at more gender neutral language. If we replaced these symbols by spaces, we would distort the word's meaning since 'Doktor in' is not the same as 'Doktorin'. We therefore just remove these symbols. This is not relevant for Polish or Danish, but since removing these symbols does not cause any other issue there, we do this for all three datasets.
* Only **keep the remaining alphanumeric characters** (incl. ``#`` for hashtags and ``@`` for the dataset containing @mentions).
* Remove **numbers**.
* Remove **single characters.** They are usually not particularly meaningful: In Polish and German, there are (meaningful) words that only consist of one character. In Danish, there is the 'I' (the plural 'you'; as in 'Hvordan har I det?'). But since this character will be removed by the stopword list anyway, we might as well remove it here already.
* Remove **double, triple etc. whitespaces**.
* Remove **leading and trailing whitespaces**.

**Important:** We do not lowercase words yet. In German, nouns are spelled with a capital first letter and the German lemmatizer does not work well on lowercased words (capitalization does not make a difference for the Polish and Danish lemmatizers). We therefore lowercase all words after we've conducted the lemmatization.

In [None]:
# preprocess: keep @mentions

def preprocess(text):
    """Clean a raw tweet for tokenization while keeping @mentions and #hashtags.

    Steps, in order: remove URLs and emojis, drop '&amp;', delete the joining
    characters '-', '_', '*' and ':' (so that 'Impf-Reihenfolge' or 'Doktor:in'
    stay one word), keep only word characters plus '@' and '#', remove digits and
    single ASCII letters, and normalise whitespace. Case is left unchanged.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    # remove URLs
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                  ' ', text)

    # remove emojis with the 'emoji' package, replacing each one by a whitespace
    # emoji 2.0 removed get_emoji_regexp() in favour of replace_emoji(); prefer the
    # new function and only fall back to the regex on older versions
    if hasattr(emoji, 'replace_emoji'):
        text = emoji.replace_emoji(text, replace=' ')
    else:
        text = re.sub(emoji.get_emoji_regexp(), ' ', text)

    # remove all '&amp;' (the HTML code for the ampersand symbol)
    text = re.sub('&amp;', '', text)

    # replace '-', '_', '*' and ':' by an empty string (one pass instead of four)
    text = re.sub(r'[-_*:]', '', text)

    # keep all word characters incl. @ and #
    # this removes all other weird/funny characters
    text = ' '.join(re.findall(r'[\w@#]+', text))

    # remove numbers; note: this will remove the '19' in Covid19, but we do not see this as an issue
    text = re.sub(r'\d+', ' ', text)

    # remove single characters (because they are not particularly meaningful)
    # note: only ASCII letters are matched, so single non-ASCII letters are kept
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # collapse runs of whitespace, then remove leading and trailing whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# preprocess: remove @mentions

def preprocess_without_mentions(text):
    """Clean a raw tweet for tokenization and drop all @mentions; #hashtags are kept.

    Steps, in order: remove URLs, emojis and @mentions, drop '&amp;', delete the
    joining characters '-', '_', '*' and ':' (so that 'Impf-Reihenfolge' or
    'Doktor:in' stay one word), keep only word characters plus '#', remove digits
    and single ASCII letters, and normalise whitespace. Case is left unchanged.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    # remove URLs
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                  ' ', text)

    # remove emojis with the 'emoji' package, replacing each one by a whitespace
    # emoji 2.0 removed get_emoji_regexp() in favour of replace_emoji(); prefer the
    # new function and only fall back to the regex on older versions
    if hasattr(emoji, 'replace_emoji'):
        text = emoji.replace_emoji(text, replace=' ')
    else:
        text = re.sub(emoji.get_emoji_regexp(), ' ', text)

    # remove @mentions
    # no trailing space is required: a mention at the very end of the tweet or directly
    # before punctuation ('@user!', '@user:') must be removed as well, otherwise the bare
    # user name survives the character filter below
    text = re.sub(r'@\w+', ' ', text)

    # remove all '&amp;' (the HTML code for the ampersand symbol)
    text = re.sub('&amp;', '', text)

    # replace '-', '_', '*' and ':' by an empty string (one pass instead of four)
    text = re.sub(r'[-_*:]', '', text)

    # keep all word characters incl. #
    # that removes all weird/funny characters
    text = ' '.join(re.findall(r'[\w#]+', text))

    # remove numbers; note: this will remove the '19' in Covid19, but we do not see this as an issue
    text = re.sub(r'\d+', ' ', text)

    # remove single characters (because they are not particularly meaningful)
    # note: only ASCII letters are matched, so single non-ASCII letters are kept
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # collapse runs of whitespace, then remove leading and trailing whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Germany: derive both cleaned text columns from the raw tweet text
for column, cleaner in (('preprocess', preprocess),
                        ('preprocess_no_mention', preprocess_without_mentions)):
    de[column] = de['text'].apply(cleaner)

# inspect the first rows of the extended dataframe
de.head()

In [None]:
# Denmark: derive both cleaned text columns from the raw tweet text
for column, cleaner in (('preprocess', preprocess),
                        ('preprocess_no_mention', preprocess_without_mentions)):
    da[column] = da['text'].apply(cleaner)

# inspect the first rows of the extended dataframe
da.head()

In [None]:
# Poland: derive both cleaned text columns from the raw tweet text
for column, cleaner in (('preprocess', preprocess),
                        ('preprocess_no_mention', preprocess_without_mentions)):
    pl[column] = pl['text'].apply(cleaner)

# inspect the first rows of the extended dataframe
pl.head()

### Checking if there are any completely empty strings

In [None]:
# dataframes to inspect and, in the same order, the labels used in the report
check_list = [de, da, pl]
countries = ['German', 'Danish', 'Polish']

for i, frame in enumerate(check_list):

    # count the empty strings in the text without mentions
    # (that is the stricter preprocessing, so it is the most likely to end up empty)
    c = sum(1 for text in frame['preprocess_no_mention'] if not text)

    print(f"{countries[i]} data: There are {c} issues with empty strings.")

### Loading (and customizing) spaCy models

We load the models 'de_core_news_sm', 'da_core_news_sm' and 'pl_core_news_sm'. We customize the tokenizer so that it does not split hashtags; this way we keep the information about which hashtags are used.

In [None]:
# GERMAN CORPUS

# loading the model
de_nlp = spacy.load('de_core_news_sm')
    
# make sure that hashtags won't be split

# retrieve the default token-matching regex pattern
# NOTE(review): _get_regex_pattern is a private spaCy helper (leading underscore) and may
# change between versions; presumably it yields None when the language defines no
# token_match, in which case the pattern built below would contain the literal text
# 'None' - TODO confirm for the installed spaCy version
de_re_token_match = spacy.tokenizer._get_regex_pattern(de_nlp.Defaults.token_match)

# add #hashtag pattern
de_re_token_match = f"({de_re_token_match}|#\\w+)"
# install the combined pattern as the tokenizer's token_match, so that '#' followed by
# word characters is matched as one unit
de_nlp.tokenizer.token_match = re.compile(de_re_token_match).match

In [None]:
# DANISH CORPUS

# loading the model
da_nlp = spacy.load('da_core_news_sm')

# make sure that hashtags won't be split

# retrieve the default token-matching regex pattern
# NOTE(review): _get_regex_pattern is a private spaCy helper (leading underscore) and may
# change between versions; presumably it yields None when the language defines no
# token_match, in which case the pattern built below would contain the literal text
# 'None' - TODO confirm for the installed spaCy version
da_re_token_match = spacy.tokenizer._get_regex_pattern(da_nlp.Defaults.token_match)

# add #hashtag pattern
da_re_token_match = f"({da_re_token_match}|#\\w+)"
# install the combined pattern as the tokenizer's token_match, so that '#' followed by
# word characters is matched as one unit
da_nlp.tokenizer.token_match = re.compile(da_re_token_match).match

In [None]:
# POLISH CORPUS

# loading the model
pl_nlp = spacy.load('pl_core_news_sm')
    
# make sure that hashtags won't be split

# retrieve the default token-matching regex pattern
# NOTE(review): _get_regex_pattern is a private spaCy helper (leading underscore) and may
# change between versions; presumably it yields None when the language defines no
# token_match, in which case the pattern built below would contain the literal text
# 'None' - TODO confirm for the installed spaCy version
pl_re_token_match = spacy.tokenizer._get_regex_pattern(pl_nlp.Defaults.token_match)

# add #hashtag pattern
pl_re_token_match = f"({pl_re_token_match}|#\\w+)"
# install the combined pattern as the tokenizer's token_match, so that '#' followed by
# word characters is matched as one unit
pl_nlp.tokenizer.token_match = re.compile(pl_re_token_match).match

### Tokenization and lemmatization

In [None]:
# GERMANY: define tokenizer and lemmatizer function using spaCy
def de_tokenize(text):
    """Tokenize German text with spaCy and return the lowercased non-stopword tokens.

    The text is run through the German pipeline; every token whose ``is_stop``
    attribute is set is dropped, the remaining token texts are lowercased.
    """
    return [token.text.lower() for token in de_nlp(text) if not token.is_stop]

def de_lemmatize(text):
    """Lemmatize German text with spaCy and return the lowercased non-stopword lemmas.

    Lowercasing happens only here, after the pipeline has run on the original
    (cased) text; tokens whose ``is_stop`` attribute is set are dropped.
    """
    return [token.lemma_.lower() for token in de_nlp(text) if not token.is_stop]

¤

In [None]:
# DENMARK: define tokenizer function using spaCy

# tokenizer
def da_tokenize(text):
    """Tokenize Danish text with spaCy and return the lowercased non-stopword tokens.

    The text is run through the Danish pipeline; every token whose ``is_stop``
    attribute is set is dropped, the remaining token texts are lowercased.
    """
    return [token.text.lower() for token in da_nlp(text) if not token.is_stop]


# lemmatizer
def da_lemmatize(text):
    """Lemmatize Danish text with spaCy and return the lowercased non-stopword lemmas.

    Lowercasing happens only here, after the pipeline has run on the original
    (cased) text; tokens whose ``is_stop`` attribute is set are dropped.
    """
    return [token.lemma_.lower() for token in da_nlp(text) if not token.is_stop]

In [None]:
# POLAND: define tokenizer and lemmatizer function using spaCy

# tokenizer
def pl_tokenize(text):
    """Tokenize Polish text with spaCy and return the lowercased non-stopword tokens.

    The text is run through the Polish pipeline; every token whose ``is_stop``
    attribute is set is dropped, the remaining token texts are lowercased.
    """
    return [token.text.lower() for token in pl_nlp(text) if not token.is_stop]

# lemmatizer
def pl_lemmatize(text):
    """Lemmatize Polish text with spaCy and return the lowercased non-stopword lemmas.

    Lowercasing happens only here, after the pipeline has run on the original
    (cased) text; tokens whose ``is_stop`` attribute is set are dropped.
    """
    return [token.lemma_.lower() for token in pl_nlp(text) if not token.is_stop]

### Germany

#### For the preprocessed tweets incl. mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text incl. mentions
for column, func in (('token', de_tokenize), ('lemma', de_lemmatize)):
    de[column] = de['preprocess'].apply(func)

#### For the preprocessed tweets without mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text without mentions
for column, func in (('token_no_mention', de_tokenize), ('lemma_no_mention', de_lemmatize)):
    de[column] = de['preprocess_no_mention'].apply(func)

### Denmark

#### For the preprocessed tweets incl. mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text incl. mentions
for column, func in (('token', da_tokenize), ('lemma', da_lemmatize)):
    da[column] = da['preprocess'].apply(func)

#### For the preprocessed tweets without mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text without mentions
for column, func in (('token_no_mention', da_tokenize), ('lemma_no_mention', da_lemmatize)):
    da[column] = da['preprocess_no_mention'].apply(func)

### Poland

#### For the preprocessed tweets incl. mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text incl. mentions
for column, func in (('token', pl_tokenize), ('lemma', pl_lemmatize)):
    pl[column] = pl['preprocess'].apply(func)

#### For the preprocessed tweets without mentions

In [None]:
# apply functions: one new column per spaCy helper, both computed from the cleaned text without mentions
for column, func in (('token_no_mention', pl_tokenize), ('lemma_no_mention', pl_lemmatize)):
    pl[column] = pl['preprocess_no_mention'].apply(func)

### Saving the dataframe

In [None]:
# show the first three rows of each country's dataframe, in the order de, da, pl
display(de.head(3), da.head(3), pl.head(3))

In [None]:
# saving the different dataframes as files
df_list = [de, da, pl]

fil_name_list = ['de_preprocess', 'da_preprocess', 'pl_preprocess']

# to_csv does not create missing folders, so make sure the output directory exists
out_dir = 'final_data_preprocess'
os.makedirs(out_dir, exist_ok=True)

# pair every dataframe with its file name instead of indexing both lists by position
for frame, fil_name in zip(df_list, fil_name_list):
    # os.path.join yields the same 'final_data_preprocess\<name>.csv' path on Windows
    frame.to_csv(os.path.join(out_dir, f"{fil_name}.csv"), index=False)