# Preprocessing

In [2]:
import re
import string

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In this section, we conduct only light preprocessing, since the Wikipedia documents do not contain much noise.
In the following preprocessing steps, we split each page at its ‘See also’ section and keep only the text before it, which removes potentially irrelevant links. We then replace all non-alphanumeric characters, such as quotation marks, line breaks and punctuation, with spaces. Finally, we lowercase the text before removing stopwords, lemmatizing and tokenizing it.

In [None]:
# functions for preprocessing
def clean_text(text):
    """Return the lowercased page text with non-word characters collapsed.

    The text is cut at the first occurrence of the literal string
    ``'See also'`` and only the part before it is kept. Every run of
    non-word characters (anything other than letters, digits and the
    underscore) is then replaced by a single space, and the result is
    lowercased.

    Args:
        text: Raw page text as a string.

    Returns:
        The cleaned, lowercased string. Leading or trailing runs of
        non-word characters are turned into a single space, not stripped.
    """
    # keep only the text that precedes the first 'See also'
    body = text.split('See also')[0]
    # raw string literal: '\W' in a normal string is an invalid escape
    # sequence and triggers a warning on recent Python versions
    return re.sub(r'\W+', ' ', body).lower()

def remove_stopwords(text):
    """Remove NLTK's default English stopwords from a space-separated text.

    The text is split on single space characters, every token that is an
    English stopword is dropped, and the remaining tokens are joined with
    single spaces again. Matching is exact and case-sensitive, so the text
    should already be lowercased.

    Args:
        text: Text whose words are separated by single spaces.

    Returns:
        The text without stopwords. Empty tokens (from leading, trailing or
        repeated spaces) are kept, so the surrounding spacing is preserved.
    """
    stop_words = set(stopwords.words('english'))
    # Filtering tokens instead of substituting ' word ' patterns also removes
    # stopwords at the very start or end of the text and repeated adjacent
    # stopwords, which a space-bounded pattern cannot match.
    return ' '.join(word for word in text.split(' ') if word not in stop_words)

def lemmatize(text):
    """Lemmatize every token of ``text`` and return them joined by spaces.

    Args:
        text: The text to lemmatize, as a single string.

    Returns:
        A string made of the lemmatized tokens separated by single spaces.
    """
    wordnet = WordNetLemmatizer()
    # the lemmatizer is applied word by word, so tokenize the text first
    tokens = word_tokenize(text)
    # each token is lemmatized with the lemmatizer's default settings
    return ' '.join([wordnet.lemmatize(token) for token in tokens])

def word_tokenize(text):
    """Return the tokens that NLTK's ``WordPunctTokenizer`` produces for ``text``."""
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(text)

We create three different columns of preprocessed text to work with to accommodate different needs for our text analysis methods:

1. The first (`cleaned_text`) simply contains the preprocessed text as described above, with stopwords still included.
2. In the second (`lemmatized`), stopwords have been removed and all remaining words have been lemmatized.
3. The third (`tokens`) is a tokenized version of the second column.

In [None]:
# Build the three preprocessed columns from the raw text column.

# cleaned_text: cleaned and lowercased text, stopwords still included
df['cleaned_text'] = df['text'].astype(str).apply(clean_text)

# lemmatized: stopwords removed first, then the remaining words lemmatized
df['lemmatized'] = (
    df['cleaned_text'].astype(str)
    .apply(remove_stopwords)
    .astype(str)
    .apply(lemmatize)
)

# tokens: the lemmatized text split into tokens
df['tokens'] = df['lemmatized'].astype(str).apply(word_tokenize)