In [1]:
import contractions
import keras
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import progressbar
import random
import re
import spacy
import sys
import tensorflow as tf
import unicodedata


from collections import Counter
from keras import layers
from keras import models
from keras import utils
from keras.utils import to_categorical
from nltk import word_tokenize
from tensorflow import keras


nltk.download('punkt')
nltk.download('perluniprops')

# The Moses detokenizer produces nicer text, but it is optional: importing or
# constructing it may fail (for example when the installed NLTK does not ship
# the module). In that case decode_indices falls back to joining the tokens
# with single spaces.
try:
    from nltk.tokenize.moses import MosesDetokenizer
    detokenizer = MosesDetokenizer()
    use_moses_detokenizer = True
except Exception as moses_error:
    # Still best-effort, but `except Exception` lets KeyboardInterrupt and
    # SystemExit propagate, and the reason for the fallback is made visible.
    detokenizer = None
    use_moses_detokenizer = False
    print("Moses detokenizer unavailable, using plain joins:", repr(moses_error))

%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/iskriyanavasileva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/iskriyanavasileva/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


# Parameters

In [2]:
# Corpus parameters
# corpus_path =

# Preprocessing parameters
# File in which preprocess_corpus pickles the (indices, vocabulary) tuple; if the
# file already exists, preprocessing is skipped.
preprocessed_corpus_path = "product_descr_preprocessed.p"
# Number of most frequent tokens kept as the vocabulary.
most_common_words_number = 10000

# Training parameters
# NOTE(review): none of the values below are referenced in the visible part of
# the notebook; the descriptions are inferred from the names - confirm.
train_anyway = False  # presumably forces retraining even if a saved model exists
model_path = "product_descr.h5"  # presumably where the trained model is stored
#dataset_size = 5000
sequence_length = 30
epochs = 10
batch_size = 128
hidden_size = 1000

# Helpers

In [3]:
def encode_sequence(sequence, vocabulary):
    """ Encodes a sequence of tokens into a sequence of indices.

    Args:
    - sequence - iterable of tokens to encode
    - vocabulary - list of known tokens; a token's index is its position in this list

    Returns:
    - list with the vocabulary index of every token of `sequence` that occurs in
      `vocabulary`, in the original order; unknown tokens are dropped
    """

    try:
        # Build the token -> index table once, so each lookup is O(1) instead of
        # two linear scans of the vocabulary per token. setdefault keeps the
        # first position of a duplicated token, exactly like list.index does.
        index_of = {}
        for position, token in enumerate(vocabulary):
            index_of.setdefault(token, position)
        return [index_of[element] for element in sequence if element in index_of]
    except TypeError:
        # Unhashable tokens cannot be dictionary keys: fall back to list scans.
        return [vocabulary.index(element) for element in sequence if element in vocabulary]


def decode_indices(indices, vocabulary):
    """ Decodes a sequence of indices and returns a string.

    Args:
    - indices - iterable of positions into `vocabulary`
    - vocabulary - indexable collection mapping a position to its token

    Returns:
    - the decoded tokens as one string: detokenized with the module-level
      `detokenizer` when `use_moses_detokenizer` is set, otherwise joined with
      single spaces
    """

    decoded_tokens = [vocabulary[index] for index in indices]
    # Plain truthiness test instead of comparing a flag against True.
    if use_moses_detokenizer:
        return detokenizer.detokenize(decoded_tokens, return_str=True)
    return " ".join(decoded_tokens)

# The Corpus

In [4]:
# Resolve the home directory portably: os.getenv("HOME") returns None when the
# variable is not set, which would make os.path.join below raise a TypeError.
home = os.path.expanduser("~")
nlp_repo = os.path.join(home, 'git/nlp-product-sentiment-classification')
src_dir = os.path.join(os.getcwd(), '..', 'src')
# Make the project's src/ directory importable; the membership check keeps
# re-runs of this cell from appending the same entry again.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
# Read the training CSV from the local checkout rooted at nlp_repo.
train_csv_path = os.path.join(nlp_repo, 'data/03_processed/Train.csv')
train_descr = pd.read_csv(train_csv_path)

In [6]:
# Peek at the raw product descriptions as a NumPy array.
train_descr['Product_Description'].to_numpy()

array(['The Web Designer\x89Ûªs Guide to iOS (and Android) Apps, today @mention 10 a.m! {link} #sxsw',
       'RT @mention Line for iPad 2 is longer today than yesterday. #SXSW  // are you getting in line again today just for fun?',
       'Crazy that Apple is opening a temporary store in Austin tomorrow to handle the rabid #sxsw eye pad too seekers.',
       ...,
       'RT @mention RT @mention Download 20+ free tracks from @mention Music Sampler @mention including @glove! {link} #SXSW',
       "OH at Texas Social Media Awards: 'You don't need to ask your mother anymore. Just Google it.' #sxswi #sxsw",
       '#Google launching a &quot;major&quot; new social network at #sxsw ... Wonder what that can be...'],
      dtype=object)

In [7]:
# Raw (not yet normalized) descriptions as a plain Python list; this is the
# list that is later handed to preprocess_corpus.
corpus_prep = train_descr['Product_Description'].tolist()
# Show the first 16 entries.
corpus_prep[:16]

['The Web Designer\x89Ûªs Guide to iOS (and Android) Apps, today @mention 10 a.m! {link} #sxsw',
 'RT @mention Line for iPad 2 is longer today than yesterday. #SXSW  // are you getting in line again today just for fun?',
 'Crazy that Apple is opening a temporary store in Austin tomorrow to handle the rabid #sxsw eye pad too seekers.',
 'The lesson from Google One Pass: In this digital environment, users want to purchase across every platform with one tool. #sxsw #elonsxsw',
 'RT @mention At the panel: &quot;Your mom has an ipad, designing for boomers&quot; #sxsw',
 'RT @mention I think my effing hubby is in line for an #iPad 2. Can someone point him towards the line-up for wife number #2. #sxswi #sxsw',
 '&quot;Android users are not iPhone users. (They use the Options menu, and Contextual menu)&quot; ~@mention #sxsw',
 'Wow! RT@mention We interrupt your regularly scheduled #sxsw geek programming with big news {link}  #google #circles',
 'Google to Launch New Social Network Called Circl

# Text Normalization

Sources: 
* text wrangling: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 & https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/NLP%20Strategy%20I%20-%20Processing%20and%20Understanding%20Text.ipynb

In [8]:
# Count the distinct descriptions in the raw corpus (duplicates collapse in the set).
test = set(corpus_prep)
len(test)

6352

#### Remove Accented Characters

In [9]:
def remove_accented_chars(text):
    """Return `text` reduced to ASCII.

    The text is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks; every character that is still not ASCII after
    that step is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

#### Expand Contractions

In [10]:
# Demo: contractions.fix rewrites contracted forms into their expanded form.
print(contractions.fix("Y'all can't expand contractions I'd think"))

you all can not expand contractions I would think


#### Remove Special Characters

In [11]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
    - text - the string to clean
    - remove_digits - if True, digits are removed as well

    Returns:
    - the cleaned string
    """
    # The upper bound of the second range must be an upper-case "Z": the range
    # "A-z" also spans the ASCII characters [ \ ] ^ _ ` that sit between "Z"
    # and "a", so those were never removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)

'Well this was fun What do you think '

#### Text Lemmatization

In [12]:
# Load the small English spaCy pipeline used by lemmatize_text.
# NOTE(review): parse/tag/entity are passed to spacy.load as extra keyword
# arguments; whether they are accepted depends on the installed spaCy
# version - confirm against the version this notebook targets.
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)

In [13]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by spaces.

    The text is run through the module-level spaCy pipeline `nlp`. A token
    whose lemma is the placeholder '-PRON-' keeps its original surface form.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)


lemmatize_text(
    "My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

#### Text Stemming

In [14]:
def simple_stemmer(text):
    """Stem every whitespace-separated word of `text` with NLTK's Porter stemmer.

    Returns the stemmed words joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)


simple_stemmer(
    "My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

#### Remove Stopwords

In [15]:
# The stop-word corpus is not among the packages downloaded at the top of the
# notebook, so fetch it here; otherwise a fresh environment fails with a
# LookupError when the corpus is read.
nltk.download('stopwords', quiet=True)

# Negations are useful for sentiment analysis, so they must not be treated as
# stop words. Filtering (instead of list.remove) does not raise when a word is
# absent from the list.
negation_words = {'no', 'not'}
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in negation_words]

In [36]:
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of `text` that appears in the module-level `stopword_list`.

    Args:
    - text - the string to filter; it is split with word_tokenize
    - is_lower_case - if True, tokens are compared as they are; otherwise the
      lower-cased token is compared (the kept token retains its original case)

    Returns:
    - the remaining tokens joined by single spaces
    """
    kept_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

#### Building a Text Normalizer

In [22]:
def normalize_corpus(corpus,
                     contraction_expansion=True,
                     accented_char_removal=True,
                     text_lower_case=True,
                     text_lemmatization=True,
                     special_char_removal=True,
                     stopword_removal=True,
                     remove_digits=True
                     ):
    """
    This function normalizes the text and prepares it for the corpus pre-processing

    Args:
    - corpus - iterable of documents (strings) to be normalized
    If set to true, the following functions are applied accordingly:
    - contraction_expansion - expands phrases like I'm to I am
    - accented_char_removal - sets characters like â to a
    - text_lower_case - turns all characters to lower case
    - text_lemmatization - lemmatizes text
    - special_char_removal - removes characters like "#"
    - stopword_removal - removes stop words such as "the", "and" etc.
    - remove_digits - also removes digits; only has an effect together with
      special_char_removal

    Returns:
    - normalized_corpus - list with one normalized string per input document
    """

    # Compile the loop-invariant patterns once instead of once per document.
    # NOTE: inside a character class "|" is a literal, so besides \r and \n
    # this pattern also turns pipe characters into spaces (kept as is).
    newline_pattern = re.compile(r'[\r|\n|\r\n]+')
    special_char_pattern = re.compile(r'([{.(-)!}])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:

        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        # expand contractions
        if contraction_expansion:
            doc = contractions.fix(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # remove extra newlines
        doc = newline_pattern.sub(' ', doc)

        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)

        # remove special characters and / or digits
        if special_char_removal:
            # add a space after these characters so the words around them do
            # not get glued together once the characters are removed
            doc = special_char_pattern.sub("\\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)

        # The two steps below used to be nested inside the special-character
        # branch, so special_char_removal=False skipped stop-word removal and
        # returned an empty list. They now run for every document.
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [39]:
# Normalize every description with the default settings and keep the result
# next to the raw text in a new 'Clean_Text' column.
train_descr['Clean_Text'] = normalize_corpus(train_descr['Product_Description'])
# NOTE(review): norm_corpus is not referenced again in the visible cells -
# preprocess_corpus is called with the raw corpus_prep; confirm that is intended.
norm_corpus = list(train_descr['Clean_Text'])
# Compare the raw and the cleaned text of the second row.
train_descr.iloc[1][['Product_Description', 'Clean_Text']].to_dict()

{'Product_Description': 'RT @mention Line for iPad 2 is longer today than yesterday. #SXSW  // are you getting in line again today just for fun?',
 'Clean_Text': 'rt mention line for ipad be long today than yesterday sxsw be you get in line again today just for fun '}

# Pre-processing of the Corpus

Sources: 
* pre-processing: DSR bootcamp
* TF dataset: https://stackoverflow.com/questions/58362316/how-do-i-go-from-pandas-dataframe-to-tensorflow-batchdataset-for-nlp

In [None]:
def preprocess_corpus(corpus_prep):
    """
    Preprocesses the corpus: tokenizes it, derives the vocabulary and
    index-encodes every document. The result is pickled to
    `preprocessed_corpus_path` as the tuple (indices, vocabulary).

    Nothing is done if that file already exists.

    Args:
    corpus_prep - iterable of documents (strings) to be pre-processed

    Returns:
    None - the result is only written to disk
    """

    if os.path.exists(preprocessed_corpus_path):
        print("Corpus already preprocessed.")
        return

    print("Preprocessing corpus...")

    # Getting the vocabulary
    # 1. Tokenizing
    print("Tokenizing...")
    # Lower-case the tokens once and reuse them for both the vocabulary and
    # the encoding. Encoding the original-case tokens against the lower-cased
    # vocabulary dropped every token that contains an upper-case letter.
    tokenized_docs = [[token.lower() for token in word_tokenize(description)]
                      for description in corpus_prep]
    corpus_tokens = [token for doc_tokens in tokenized_docs for token in doc_tokens]
    print("Number of tokens:", len(corpus_tokens))
    print("Building vocabulary...")
    word_counter = Counter(corpus_tokens)
    print("Length of vocabulary before pruning:", len(word_counter))

    # 2. Derive the vocabulary - the `most_common_words_number` most used words
    vocabulary = [key for key, _ in word_counter.most_common(
        most_common_words_number)]
    print("Length of vocabulary after pruning:", len(vocabulary))

    # 3. Converting to indices (one index list per document)
    print("Index-encoding...")
    indices = [encode_sequence(doc_tokens, vocabulary)
               for doc_tokens in tokenized_docs]
    print("Number of indices:", len(indices))

    # 4. Saving - the context manager closes the file even if pickling fails
    print("Saving file...")
    with open(preprocessed_corpus_path, "wb") as corpus_file:
        pickle.dump((indices, vocabulary), corpus_file)


preprocess_corpus(corpus_prep)

In [None]:
# Load the (indices, vocabulary) tuple written by preprocess_corpus. The
# configured path is reused so that the reader and the writer cannot drift apart.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
indices, vocabulary = pd.read_pickle(preprocessed_corpus_path)

# Turning the Encoded Sequences into a Tensorflow Dataset

In [None]:
# Pad every index sequence at the end so that all rows have the same length.
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    indices, padding='post')
# One label per description: the labels must have as many entries as
# train_seqs has rows, otherwise building the dataset from the two arrays
# fails. (A leftover `[:3]` slice used to keep only the first three labels.)
train_labels = train_descr['Sentiment'].to_numpy().flatten()

In [None]:
# Inspect the padded sequences and the labels.
print(train_seqs)
print(train_labels)

In [None]:
# Pair every padded sequence with its label in a TensorFlow dataset.
train_ds = tf.data.Dataset.from_tensor_slices((train_seqs, train_labels))

#### Check that the transformation went OK

In [None]:
# Take the first (sequence, label) element of the dataset for a sanity check.
test_descr, test_label = list(train_ds.take(1))[0]

In [None]:
# The sampled sequence as a NumPy array.
test_descr.numpy()

In [None]:
# The label that belongs to the sampled sequence.
test_label.numpy()

In [None]:
# Map the sampled indices back to text with the loaded vocabulary.
decoded_test_descr = decode_indices(test_descr, vocabulary)
decoded_test_descr