In [1]:
import contractions
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import re
import spacy
import sys
import tensorflow as tf
import unicodedata

from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize

%matplotlib inline

# Parameters

In [2]:
# folders
# os.path.expanduser('~') resolves the home directory even when the HOME
# environment variable is not set; os.getenv("HOME") returns None in that
# case and the os.path.join below would raise a TypeError.
home = os.path.expanduser('~')
nlp_repo = os.path.join(home, 'git/nlp-product-sentiment-classification')

# preprocessing parameters
# target file for the pickled (sequences, word_index) tuple
preprocessed_corpus_path_TF = os.path.join(
    nlp_repo, 'data/03_processed/product_descr_preprocessed_TF.p')

# target file for the pickled (results_oh, word_index_oh) tuple
preprocessed_corpus_path_TF_oh = os.path.join(
    nlp_repo, 'data/03_processed/product_descr_preprocessed_TF_oh.p')

# max_words = vocabulary size, i.e. the number of most frequent words
# that the tokenizer keeps.
# We set it to 10,000, although in our particular case we have fewer words.
# We do this to parameterise the code.
# Alternatively, we can set it to the length of our vocabulary (word_index).
max_words = 10000

# Tokenizing the Text

In [3]:
# Make the sibling ../src folder importable.
# home and nlp_repo are already assigned in the parameters cell above,
# so they are not assigned a second time here.
src_dir = os.path.join(os.getcwd(), '..', 'src')
# Guard the append: re-running this cell must not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
# locations of the processed train and test files inside the repository
train_csv_path = os.path.join(nlp_repo, 'data/03_processed/Train.csv')
test_csv_path = os.path.join(nlp_repo, 'data/03_processed/Test.csv')

# read both files (train first, then test) into DataFrames
train_descr, test_descr = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

# Helpers

Sources: 
* text wrangling: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 & https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/NLP%20Strategy%20I%20-%20Processing%20and%20Understanding%20Text.ipynb

#### Remove Accented Characters

In [5]:
def remove_accented_chars(text):
    """ Strips accents from characters, e.g. â becomes a.

    The text is decomposed with Unicode NFKD normalization and every
    code point that cannot be encoded as ASCII is then dropped.
    """

    # split accented characters into base character + combining mark
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards everything outside the ASCII range
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# test
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

#### Expand Contractions

In [6]:
# test
print(contractions.fix("Y'all can't expand contractions I'd think"))

you all can not expand contractions I would think


#### Text Lemmatization

In [7]:
# Load the small English spaCy pipeline.
# The former parse/tag/entity keyword arguments were dropped: they are not
# parameters of spacy.load, and spaCy v3 does not accept unknown keywords.
nlp = spacy.load('en_core_web_sm')

In [8]:
def lemmatize_text(text):
    """ Replaces every token by its lemma, for ex. went -> go.

    Tokens whose lemma is the placeholder '-PRON-' keep their original
    text. The resulting tokens are joined with single spaces.
    """

    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            # keep the surface form instead of the placeholder
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)


# test
lemmatize_text(
    "My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

#### Remove Special Characters

In [9]:
def remove_special_characters(text, remove_digits=False):
    """ Removes characters like "#".

    Args:
        - text - the string to clean
        - remove_digits - if True, the digits 0-9 are removed as well

    Returns:
        - the text containing only ASCII letters, whitespace and
          (unless remove_digits is set) digits
    """

    # 'A-Z', not 'A-z': the range A-z also covers the ASCII characters
    # [ \ ] ^ _ ` located between 'Z' and 'a', so they would never be removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

#test
remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)

'Well this was fun What do you think '

#### Remove Stopwords

In [10]:
# start from NLTK's English stopword list
stopword_list = nltk.corpus.stopwords.words('english')
# negations are useful for a sentiment analysis, so take them out of the
# stopword list to keep them in the text
for negation in ('no', 'not'):
    stopword_list.remove(negation)

In [11]:
def remove_stopwords(text, is_lower_case=False):
    """ Drops stopwords like "the", "and" etc. from the text.

    The text is split with word_tokenize, tokens contained in the
    module-level stopword_list are discarded and the remaining tokens are
    joined with single spaces.

    Args:
        - text - the string to filter
        - is_lower_case - if True, tokens are compared as they are;
          if False, each token is lower-cased for the comparison only
          (kept tokens retain their original casing)
    """

    def _is_stopword(token):
        lookup = token if is_lower_case else token.lower()
        return lookup in stopword_list

    stripped_tokens = (token.strip() for token in word_tokenize(text))
    return ' '.join(
        token for token in stripped_tokens if not _is_stopword(token))


# test
remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

#### Building a Text Normalizer

Why did I choose to lemmatize and not to stem: 
* stemming is generally faster than lemmatizing. However, the dataset at hand is relatively small and speed won't be decisive
* the result of lemmatizing is a meaningful word of the language (e.g. the infinitive or the singular form), which in my opinion will contribute to the sentiment analysis
* as the language is English one can use well-developed libraries like NLTK

In [12]:
def normalize_corpus(corpus,
                     accented_char_removal=True,
                     contraction_expansion=True,
                     text_lower_case=True,
                     text_lemmatization=True,
                     special_char_removal=True,
                     stopword_removal=True,
                     remove_digits=True
                     ):
    """
    Cleans every document of a corpus, one document at a time.

    The steps run in this fixed order: accent removal, contraction
    expansion, lower-casing, newline replacement, lemmatization, special
    character removal, whitespace collapsing, stopword removal.
    Newline replacement and whitespace collapsing are always applied.

    Args:
        - corpus - iterable of text documents to be normalised
        Each of the following switches enables one step when set to True:
        - accented_char_removal - sets characters like â to a
        - contraction_expansion - expands phrases like I'm to I am
        - text_lower_case - turns all characters to lower case; the flag
          is also handed to remove_stopwords as is_lower_case
        - text_lemmatization - lemmatizes text
        - special_char_removal - removes characters like "#"
        - stopword_removal - removes stop words such as "the", "and" etc.
        - remove_digits - handed to remove_special_characters; only has
          an effect when special_char_removal is set

    Returns:
        - list with one normalized string per document, in input order
    """

    # the patterns do not depend on the document, so build them once
    # NOTE: this character class matches '|' in addition to \r and \n
    newline_run = re.compile(r'[\r|\n|\r\n]+')
    # matches the characters { } . ( ) !  ('(-)' is the range from ( to ))
    isolated_chars = re.compile(r'([{.(-)!}])')
    space_run = re.compile(' +')

    def _normalize_document(doc):
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        if contraction_expansion:
            doc = contractions.fix(doc)

        if text_lower_case:
            doc = doc.lower()

        # turn runs of line breaks into a single space
        doc = newline_run.sub(' ', doc)

        if text_lemmatization:
            doc = lemmatize_text(doc)

        if special_char_removal:
            # put a space after each matched character so that the
            # neighbouring words stay separated once it is removed
            doc = isolated_chars.sub("\\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # collapse runs of spaces
        doc = space_run.sub(' ', doc)

        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        return doc

    return [_normalize_document(doc) for doc in corpus]

In [13]:
samples_normalized = normalize_corpus(train_descr['Product_Description'])

In [15]:
#samples_normalized

In [16]:
tokenizer = Tokenizer(
    num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,
    split=' ', char_level=False, oov_token='oov', document_count=0,
)

In [17]:
samples = samples_normalized

In [18]:
# creates the vocabulary of index-word
tokenizer.fit_on_texts(samples)

In [19]:
# transforms the text to integers
sequences = tokenizer.texts_to_sequences(samples)

In [20]:
word_index = tokenizer.word_index

In [21]:
word_index['oov']

1

In [24]:
# Persist the integer sequences together with the vocabulary.
# The context manager closes (and thereby flushes) the file handle as soon
# as the dump is done; a bare open() call would leave it open.
with open(preprocessed_corpus_path_TF, "wb") as corpus_file:
    pickle.dump((sequences, word_index), corpus_file)

# One-hot encoding (for Bag-of-Words)

"Manual" word-level one-hot encoding

For the "manual" one-hot encoding we are "cheating" a bit, i.e. we will use the tokenised sequences (sequences) and the vocabulary (word_index) from above to generate some of the values (e.g. max_len).

We do this, because the manual part is just for demonstration purposes and we won't be using it. 
Furthermore, it will enable comparability between bag-of-words and the other models.

In [25]:
# max_len = sequence length - the text is cut off after this number of words
# usually this parameter can be manually defined.
# However, since our tokenised sequences are not that long anyway, we define it as the maximum sequence length in our list of tokenised sequences
max_len = np.max([len(x) for x in sequences])

In [26]:
results = np.zeros(shape=(len(sequences),
                          max_len,
                          max(word_index.values()) + 1))

In [27]:
# Fill the one-hot tensor: results[i, j, k] = 1 when the j-th token of
# sample i has the vocabulary index k.
for i, sequence in enumerate(sequences):
    # `sequences` comes from texts_to_sequences and therefore already holds
    # the integer vocabulary indices. Looking them up in word_index (which is
    # keyed by the word strings) returns None, and indexing with None sets the
    # whole last axis to 1 instead of a single position.
    for j, token_index in enumerate(sequence[:max_len]):
        results[i, j, token_index] = 1

Tensorflow word-level one-hot encoding (analogous to the "manual" one-hot encoding, it is just for demonstration purposes)

In [28]:
dimensions_descr = len(word_index)

# before proceeding to one-hot encoding with TF, we need to pad the sequences.
# Otherwise it will give us an error due to the different lengths of the sequences
results_tf_oh_prep = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=dimensions_descr)

#results_tf_oh = tf.one_hot(indices=results_tf_oh_prep, depth=dimensions_descr)
#results_tf_oh = tf.reduce_max(results_tf_oh, 0)

Keras word-level one-hot encoding

In [29]:
tokenizer_oh = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,
                         split=' ', char_level=False, oov_token='oov', document_count=0,)
tokenizer_oh.fit_on_texts(samples)

In [30]:
sequences_oh = tokenizer_oh.texts_to_sequences(samples)

In [31]:
results_oh = tokenizer_oh.texts_to_matrix(samples, mode='binary')

In [32]:
word_index_oh = tokenizer_oh.word_index
print(f'Found {len(word_index_oh)} unique tokens.')

Found 7100 unique tokens.


In [33]:
# Persist the binary bag-of-words matrix together with its vocabulary.
# The context manager closes (and thereby flushes) the file handle as soon
# as the dump is done; a bare open() call would leave it open.
with open(preprocessed_corpus_path_TF_oh, "wb") as corpus_oh_file:
    pickle.dump((results_oh, word_index_oh), corpus_oh_file)