# 3: Preprocessing for word2vec

In [None]:
# download the language-specific spaCy models (German, Polish, Danish)
!python -m spacy download de_core_news_sm
!python -m spacy download pl_core_news_sm
!python -m spacy download da_core_news_sm

In [None]:
# import packages
import os
import pandas as pd
import re
import spacy
import emoji

### Loading data
Set the working directory and load the data.

In [None]:
# set working directory
# The raw-data folder can be overridden with the TWITTER_RAW_DATA_DIR
# environment variable; the original local path remains the default.
raw_data_dir = os.environ.get(
    'TWITTER_RAW_DATA_DIR',
    r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter\raw data',
)

# the loading cells below read the CSV files by bare file name, so they
# depend on this being the current working directory
os.chdir(raw_data_dir)

In [None]:
# Germany: read the raw tweets (file name is relative to the working directory)
de = pd.read_csv(filepath_or_buffer='de_original_text.csv')

# print (rows, columns); the preview below is the cell's displayed output
print(de.shape)
de.head()

In [None]:
# Denmark: read the raw tweets (file name is relative to the working directory)
da = pd.read_csv(filepath_or_buffer='da_original_text.csv')

# print (rows, columns); the preview below is the cell's displayed output
print(da.shape)
da.head()

In [None]:
# Poland: read the raw tweets (file name is relative to the working directory)
pl = pd.read_csv(filepath_or_buffer='pl_original_text.csv')

# print (rows, columns); the preview below is the cell's displayed output
print(pl.shape)
pl.head()

### Preprocessing
We conduct the following steps:

* Remove URLs.
* Remove @mentions.
* Remove emojis using the ``emoji`` package
* Replace ``&amp;`` (the HTML code for the ampersand symbol) by ``&``
* Only keep the remaining alphanumeric characters incl. ``&``
* Remove numbers
* Remove single characters (because we anticipate that single characters won't be relevant to find interesting new keywords for our Twitter query).
* Lowercase all words.
* Remove double, triple etc. whitespaces.
* Remove leading and trailing whitespaces.

In [None]:
# preprocess

# URL matcher, compiled once and reused for every tweet
_URL_PATTERN = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)


def preprocess(text):
    """Clean one raw tweet for word2vec training.

    Steps, in order: remove URLs, @mentions and emojis; turn the HTML entity
    ``&amp;`` into ``&``; keep only word characters and ``&``; remove digits;
    drop tokens that are a single letter; lowercase; normalise whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        Lowercased tokens separated by single spaces, without leading or
        trailing whitespace.
    """
    # missing values reach this function as float NaN when applied to a column
    if not isinstance(text, str):
        return ''

    # remove URLs
    text = _URL_PATTERN.sub(' ', text)

    # remove @mentions; no trailing space is required, so mentions at the end
    # of the tweet or directly followed by punctuation are removed as well
    text = re.sub(r'@\w+', ' ', text)

    # remove emojis with the 'emoji' package: replace_emoji() is the current
    # API, get_emoji_regexp() only exists in older releases of the package
    if hasattr(emoji, 'replace_emoji'):
        text = emoji.replace_emoji(text, replace=' ')
    else:
        text = re.sub(emoji.get_emoji_regexp(), ' ', text)

    # replace all '&amp;' (the HTML code for the ampersand symbol) by &
    text = text.replace('&amp;', '&')

    # keep runs of word characters (Unicode letters, digits, underscore) and
    # the & symbol; everything else becomes a separator
    # NOTE(review): this also strips '#', so hashtags reach the tokenizer
    # without their '#' prefix - confirm that this is intended
    text = ' '.join(re.findall(r'[\w&]+', text))

    # remove numbers
    text = re.sub(r'\d+', ' ', text)

    # drop single-letter tokens wherever they occur (start, end, or several in
    # a row); a lone '&' is kept. Splitting and re-joining also collapses
    # repeated whitespace and trims both ends.
    tokens = [tok for tok in text.split() if not (len(tok) == 1 and tok.isalpha())]

    # lowercase all words
    return ' '.join(tokens).lower()

In [None]:
# Germany: clean every tweet and store the result in a new 'preprocess' column
de['preprocess'] = de['text'].map(preprocess)

# preview the updated dataframe
de.head()

In [None]:
# Denmark: clean every tweet and store the result in a new 'preprocess' column
da['preprocess'] = da['text'].map(preprocess)

# preview the updated dataframe
da.head()

In [None]:
# Poland: clean every tweet and store the result in a new 'preprocess' column
pl['preprocess'] = pl['text'].map(preprocess)

# preview the updated dataframe
pl.head()

### Loading (and customizing) spaCy models

We load the models 'de_core_news_sm', 'da_core_news_sm' and 'pl_core_news_sm'. We customize each tokenizer so that it does not split hashtags, which lets us keep the information about which hashtags are used.

In [None]:
# GERMAN CORPUS

# load the small German spaCy pipeline
de_nlp = spacy.load('de_core_news_sm')

# make sure that hashtags won't be split:
# a string accepted by tokenizer.token_match is kept as one token, so we accept
# whatever the tokenizer accepted before plus '#' followed by word characters.
# The existing matcher may be None, so it is combined as a callable rather than
# by interpolating its pattern into a new regex string.
# NOTE(review): preprocess() keeps only word characters and '&', so the
# 'preprocess' column no longer contains '#'; this rule only has an effect on
# text that still carries hashtags.
de_default_token_match = de_nlp.tokenizer.token_match
de_hashtag_match = re.compile(r'#\w+').match

if de_default_token_match is None:
    de_nlp.tokenizer.token_match = de_hashtag_match
else:
    de_nlp.tokenizer.token_match = (
        lambda string: de_default_token_match(string) or de_hashtag_match(string)
    )

In [None]:
# DANISH CORPUS

# load the small Danish spaCy pipeline
da_nlp = spacy.load('da_core_news_sm')

# make sure that hashtags won't be split:
# a string accepted by tokenizer.token_match is kept as one token, so we accept
# whatever the tokenizer accepted before plus '#' followed by word characters.
# The existing matcher may be None, so it is combined as a callable rather than
# by interpolating its pattern into a new regex string.
# NOTE(review): preprocess() keeps only word characters and '&', so the
# 'preprocess' column no longer contains '#'; this rule only has an effect on
# text that still carries hashtags.
da_default_token_match = da_nlp.tokenizer.token_match
da_hashtag_match = re.compile(r'#\w+').match

if da_default_token_match is None:
    da_nlp.tokenizer.token_match = da_hashtag_match
else:
    da_nlp.tokenizer.token_match = (
        lambda string: da_default_token_match(string) or da_hashtag_match(string)
    )

In [None]:
# POLISH CORPUS

# load the small Polish spaCy pipeline
pl_nlp = spacy.load('pl_core_news_sm')

# make sure that hashtags won't be split:
# a string accepted by tokenizer.token_match is kept as one token, so we accept
# whatever the tokenizer accepted before plus '#' followed by word characters.
# The existing matcher may be None, so it is combined as a callable rather than
# by interpolating its pattern into a new regex string.
# NOTE(review): preprocess() keeps only word characters and '&', so the
# 'preprocess' column no longer contains '#'; this rule only has an effect on
# text that still carries hashtags.
pl_default_token_match = pl_nlp.tokenizer.token_match
pl_hashtag_match = re.compile(r'#\w+').match

if pl_default_token_match is None:
    pl_nlp.tokenizer.token_match = pl_hashtag_match
else:
    pl_nlp.tokenizer.token_match = (
        lambda string: pl_default_token_match(string) or pl_hashtag_match(string)
    )

### Tokenization
For word2vec, we only tokenize the words (and drop stop words) but we don't lemmatize them. We want all words exactly as they are used by people in their tweets in order to improve our Twitter search query.

In [None]:
# GERMANY: define tokenizer function using spaCy
def de_tokenize(text):
    """Split a preprocessed German tweet into tokens and drop stop words.

    Parameters
    ----------
    text : str
        Preprocessed tweet text.

    Returns
    -------
    list of str
        The text of every token that is not a stop word, in original order.
    """
    # only tokenization is needed here: the result uses nothing but each
    # token's text and its is_stop flag, so the remaining pipeline components
    # (tagger, parser, NER, ...) are not run
    doc = de_nlp.make_doc(text)

    # keep the tokens whose .is_stop attribute is False, i.e. drop stop words
    return [token.text for token in doc if not token.is_stop]

In [None]:
# DENMARK: define tokenizer function using spaCy
def da_tokenize(text):
    """Split a preprocessed Danish tweet into tokens and drop stop words.

    Parameters
    ----------
    text : str
        Preprocessed tweet text.

    Returns
    -------
    list of str
        The text of every token that is not a stop word, in original order.
    """
    # only tokenization is needed here: the result uses nothing but each
    # token's text and its is_stop flag, so the remaining pipeline components
    # (tagger, parser, NER, ...) are not run
    doc = da_nlp.make_doc(text)

    # keep the tokens whose .is_stop attribute is False, i.e. drop stop words
    return [token.text for token in doc if not token.is_stop]

In [None]:
# POLAND: define tokenizer function using spaCy
def pl_tokenize(text):
    """Split a preprocessed Polish tweet into tokens and drop stop words.

    Parameters
    ----------
    text : str
        Preprocessed tweet text.

    Returns
    -------
    list of str
        The text of every token that is not a stop word, in original order.
    """
    # only tokenization is needed here: the result uses nothing but each
    # token's text and its is_stop flag, so the remaining pipeline components
    # (tagger, parser, NER, ...) are not run
    doc = pl_nlp.make_doc(text)

    # keep the tokens whose .is_stop attribute is False, i.e. drop stop words
    return [token.text for token in doc if not token.is_stop]

In [None]:
# GERMANY: tokenize every cleaned tweet; each cell of the new column holds a list of tokens
de['preprocess_token'] = de['preprocess'].map(de_tokenize)

In [None]:
# DENMARK: tokenize every cleaned tweet; each cell of the new column holds a list of tokens
da['preprocess_token'] = da['preprocess'].map(da_tokenize)

In [None]:
# POLAND: tokenize every cleaned tweet; each cell of the new column holds a list of tokens
pl['preprocess_token'] = pl['preprocess'].map(pl_tokenize)

### Saving the dataframe

In [None]:
# preview the first three rows of each country's dataframe, in the order de, da, pl
for frame in (de, da, pl):
    display(frame.head(3))

In [None]:
# change working directory
# The output folder can be overridden with the TWITTER_W2V_OUTPUT_DIR
# environment variable; the original local path remains the default.
output_dir = os.environ.get(
    'TWITTER_W2V_OUTPUT_DIR',
    r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter\preprocess_word2vec',
)

# create the folder on first use instead of failing when it does not exist yet
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)

# saving the different dataframes as files
df_list = [de, da, pl]

fil_name_list = ['de_preprocess', 'da_preprocess', 'pl_preprocess']

# NOTE(review): the 'preprocess_token' column holds Python lists; in a CSV they
# are presumably stored as their string representation and need to be parsed
# again after loading - confirm with the notebook that reads these files
for frame, file_stem in zip(df_list, fil_name_list):
    frame.to_csv(f"{file_stem}.csv", index=False, encoding='utf8')