In [1]:
import contractions
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import progressbar
import random
import re
import spacy
import sys
import unicodedata


from collections import Counter
from nltk import word_tokenize


nltk.download('punkt')
nltk.download('perluniprops')
# The stop-word corpus is read further down through nltk.corpus.stopwords;
# fetch it here as well so a fresh environment can run the notebook top to bottom.
nltk.download('stopwords')

# The Moses detokenizer gives nicer output, but it may be unavailable in the
# installed NLTK. Fall back to plain whitespace joining in that case.
try:
    from nltk.tokenize.moses import MosesDetokenizer
    detokenizer = MosesDetokenizer()
    use_moses_detokenizer = True
except Exception:
    # Only ordinary failures (missing module, missing resource) are handled;
    # a bare `except:` would also swallow KeyboardInterrupt and SystemExit.
    detokenizer = None
    use_moses_detokenizer = False

%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/iskriyanavasileva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/iskriyanavasileva/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


# Parameters

In [2]:
# folders
# expanduser("~") resolves the home directory even when the HOME environment
# variable is not set; os.getenv("HOME") would return None in that case and
# os.path.join would then raise a TypeError.
home = os.path.expanduser("~")
nlp_repo = os.path.join(home, 'git/nlp-product-sentiment-classification')

# Corpus parameters
# corpus_path

# Preprocessing parameters
preprocessed_corpus_path = os.path.join(
    nlp_repo, 'data/03_processed/product_descr_preprocessed.p')
indices_test_path = os.path.join(
    nlp_repo, 'data/03_processed/indices_test_path.p')
# Upper bound on the vocabulary size. The data is scarce, so this cap is not
# expected to be reached; it is kept for the sake of parametrization.
most_common_words_number = 10000

# Helpers

Sources: 
* text wrangling: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 & https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/NLP%20Strategy%20I%20-%20Processing%20and%20Understanding%20Text.ipynb

#### Remove Accented Characters

In [3]:
def remove_accented_chars(text):
    """ Strips diacritics from the text, e.g. turns â into a """

    # NFKD splits an accented letter into its base letter plus a combining
    # mark; the ASCII round-trip then drops every non-ASCII code point.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

#### Expand Contractions

In [4]:
# Demo: contractions.fix rewrites contracted forms as their full words.
expanded_example = contractions.fix("Y'all can't expand contractions I'd think")
print(expanded_example)

you all can not expand contractions I would think


#### Text Lemmatization

In [5]:
# Load the small English spaCy pipeline that lemmatize_text relies on.
# The former `parse=True, tag=True, entity=True` arguments were spaCy 1.x
# switches; spaCy 3 raises a TypeError on unknown keyword arguments, so the
# pipeline is loaded with its default components instead.
nlp = spacy.load('en_core_web_sm')

In [6]:
def lemmatize_text(text):
    """ Replaces every word by its lemma, e.g. "went" becomes "go" """

    lemmas = []
    for token in nlp(text):
        # When the lemma is the '-PRON-' placeholder, keep the original
        # surface form of the token instead.
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)


lemmatize_text(
    "My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

#### Text Stemming

In [7]:
def simple_stemmer(text):
    """ Reduces each whitespace-separated word to its Porter stem, e.g. "going" becomes "go".
    The stem is not always identical to the lemma (root) of the word. """

    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)


simple_stemmer(
    "My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

#### Remove Special Characters

In [8]:
def remove_special_characters(text, remove_digits=False):
    """
    Removes every character that is not an ASCII letter, a digit or whitespace,
    for ex. "#".

    Args:
    - text - the string to clean
    - remove_digits - if True, digits are removed as well

    Returns:
    - the cleaned string
    """

    # The upper-case range must be A-Z: the former A-z range also covered the
    # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)

'Well this was fun What do you think '

#### Remove Stopwords

In [9]:
# Start from NLTK's English stop words, but keep the negations "no" and "not":
# they are useful for a sentiment analysis and must survive stop-word removal.
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)

In [10]:
def remove_stopwords(text, is_lower_case=False):
    """ Drops stop words such as "the" or "and" from the text.

    Set is_lower_case to True when the text is already lower-cased; tokens are
    then compared to the stop-word list as they are instead of being
    lower-cased first.
    """

    kept_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

#### Encoder & Decoder

In [11]:
def encode_sequence(sequence, vocabulary):
    """
    Encodes a sequence of tokens into a sequence of indices.

    Args:
    - sequence - iterable of tokens
    - vocabulary - list of known tokens; the position of a token is its index

    Returns:
    - list with the vocabulary index of every known token, in sequence order;
      tokens that are not in the vocabulary are skipped
    """
    # Build the token -> index table once instead of scanning the vocabulary
    # list twice (`in` and `.index`) for every token. setdefault keeps the
    # first position of a repeated entry, exactly as list.index does.
    token_to_index = {}
    for position, token in enumerate(vocabulary):
        token_to_index.setdefault(token, position)
    return [token_to_index[token] for token in sequence if token in token_to_index]


def decode_indices(indices, vocabulary):
    """ Turns a sequence of vocabulary indices back into a single string. """
    tokens = []
    for index in indices:
        tokens.append(vocabulary[index])
    # Use the Moses detokenizer when it was loaded, plain joining otherwise.
    if use_moses_detokenizer == True:
        return detokenizer.detokenize(tokens, return_str=True)
    return " ".join(tokens)

# The Text

In [12]:
# Resolve the repository location and make the sibling `src` folder importable.
# expanduser("~") also works when the HOME environment variable is unset,
# where os.getenv("HOME") would return None and break os.path.join.
home = os.path.expanduser("~")
nlp_repo = os.path.join(home, 'git/nlp-product-sentiment-classification')
src_dir = os.path.join(os.getcwd(), '..', 'src')
# Re-running this cell must not pile up duplicate entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [13]:
# Locate both CSV files first, then read them into data frames.
train_csv_path = os.path.join(nlp_repo, 'data/03_processed/Train.csv')
test_csv_path = os.path.join(nlp_repo, 'data/03_processed/Test.csv')

train_descr, test_descr = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [14]:
# Show the raw product descriptions of the training set as a NumPy array.
train_descr['Product_Description'].to_numpy()

array(['The Web Designer\x89Ûªs Guide to iOS (and Android) Apps, today @mention 10 a.m! {link} #sxsw',
       'RT @mention Line for iPad 2 is longer today than yesterday. #SXSW  // are you getting in line again today just for fun?',
       'Crazy that Apple is opening a temporary store in Austin tomorrow to handle the rabid #sxsw eye pad too seekers.',
       ...,
       'RT @mention RT @mention Download 20+ free tracks from @mention Music Sampler @mention including @glove! {link} #SXSW',
       "OH at Texas Social Media Awards: 'You don't need to ask your mother anymore. Just Google it.' #sxswi #sxsw",
       '#Google launching a &quot;major&quot; new social network at #sxsw ... Wonder what that can be...'],
      dtype=object)

In [15]:
# Raw training descriptions as a plain Python list, plus a look at the first 16.
description_column = train_descr['Product_Description']
corpus_prep = description_column.tolist()
corpus_prep[:16]

['The Web Designer\x89Ûªs Guide to iOS (and Android) Apps, today @mention 10 a.m! {link} #sxsw',
 'RT @mention Line for iPad 2 is longer today than yesterday. #SXSW  // are you getting in line again today just for fun?',
 'Crazy that Apple is opening a temporary store in Austin tomorrow to handle the rabid #sxsw eye pad too seekers.',
 'The lesson from Google One Pass: In this digital environment, users want to purchase across every platform with one tool. #sxsw #elonsxsw',
 'RT @mention At the panel: &quot;Your mom has an ipad, designing for boomers&quot; #sxsw',
 'RT @mention I think my effing hubby is in line for an #iPad 2. Can someone point him towards the line-up for wife number #2. #sxswi #sxsw',
 '&quot;Android users are not iPhone users. (They use the Options menu, and Contextual menu)&quot; ~@mention #sxsw',
 'Wow! RT@mention We interrupt your regularly scheduled #sxsw geek programming with big news {link}  #google #circles',
 'Google to Launch New Social Network Called Circl

# Text Normalization

#### Building a Text Normalizer

In [16]:
def normalize_corpus(corpus,
                     accented_char_removal=True,
                     contraction_expansion=True,
                     text_lower_case=True,
                     text_lemmatization=True,
                     special_char_removal=True,
                     stopword_removal=True,
                     remove_digits=True
                     ):
    """
    This function normalizes the text and prepares it for the corpus pre-processing

    Args:
    - corpus - iterable of documents (strings) to be normalized
    If set to true, the following functions are applied accordingly:
    - accented_char_removal - sets characters like â to a
    - contraction_expansion - expands phrases like I'm to I am
    - text_lower_case - turns all characters to lower case
    - text_lemmatization - lemmatizes text
    - special_char_removal - removes characters like "#"
    - stopword_removal - removes stop words such as "the", "and" etc.
    - remove_digits - also removes digits; only has an effect when
      special_char_removal is set

    Newlines are always replaced by a space and runs of spaces are always
    collapsed, regardless of the flags.

    Returns:
    - normalized_corpus - list with one normalized string per input document
    """

    # Puts a space after each of { . ( ) ! } so they are isolated before the
    # special characters are stripped. Inside the class, `(-)` is the range
    # from '(' to ')', i.e. just the two parentheses; a hyphen is not matched.
    # Compiled once here instead of once per document.
    special_char_pattern = re.compile(r'([{.(-)!}])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:

        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        # expand contractions
        if contraction_expansion:
            doc = contractions.fix(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # remove extra newlines
        # NOTE(review): inside a character class `|` is a literal, so pipe
        # characters are replaced by a space here too. Kept unchanged so the
        # resulting tokens stay the same.
        doc = re.sub(r'[\r|\n|\r\n]+', ' ', doc)

        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)

        # remove special characters and / or digits
        if special_char_removal:
            doc = special_char_pattern.sub("\\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # The remaining steps used to be nested under the special-character
        # branch, so with special_char_removal=False no document was ever
        # appended and the function returned an empty list.

        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [17]:
#train_descr['Clean_Text'] = normalize_corpus(
#    train_descr['Product_Description'])
#corpus_norm = list(train_descr['Clean_Text'])
#corpus_norm[:3]

#train_descr.iloc[1][['Product_Description', 'Clean_Text']].to_dict()

# Pre-processing of the Corpus

Sources: 
* pre-processing: DSR bootcamp

In [18]:
def preprocess_corpus(corpus_prep):
    """
    Normalizes, tokenizes and index-encodes the corpus and caches the result.

    Nothing is computed when a file already exists at preprocessed_corpus_path.
    Otherwise the tuple (indices, vocabulary) is pickled to that path, where
    vocabulary holds at most most_common_words_number tokens ordered by
    frequency and indices holds one list of vocabulary indices per document.

    Args:
    corpus_prep - iterable of raw text documents to be pre-processed

    Returns:
    None; progress is reported through print
    """

    if not os.path.exists(preprocessed_corpus_path):
        print("Preprocessing corpus...")

        # Getting the vocabulary
        # 1. Normalizing & Tokenizing
        print("Normalising & Tokenizing...")
        corpus_norm = normalize_corpus(corpus_prep)
        corpus_string = [word_tokenize(description) for description in corpus_norm]
        corpus_tokens = [item for sublist in corpus_string for item in sublist]

        print("Number of tokens:", len(corpus_tokens))
        print("Building vocabulary...")
        word_counter = Counter(corpus_tokens)
        print("Length of vocabulary before pruning:", len(word_counter))

        # 2. Derive the vocabulary - the most_common_words_number most used words
        vocabulary = [key for key, _ in word_counter.most_common(
            most_common_words_number)]
        print("Length of vocabulary after pruning:", len(vocabulary))

        # 3. Converting to indices
        print("Index-encoding...")
        indices = [encode_sequence(sequence, vocabulary)
                   for sequence in corpus_string]
        print("Number of indices:", len(indices))

        # 4. Saving
        print("Saving file...")
        # The context manager closes (and thereby flushes) the file; the
        # handle was previously left open after pickle.dump.
        with open(preprocessed_corpus_path, "wb") as corpus_file:
            pickle.dump((indices, vocabulary), corpus_file)
        print("File Saved")

    else:
        print("Corpus already preprocessed.")


preprocess_corpus(train_descr['Product_Description'])

Preprocessing corpus...
Normalising & Tokenizing...
Number of tokens: 77698
Building vocabulary...
Length of vocabulary before pruning: 7119
Length of vocabulary after pruning: 7119
Index-encoding...
Number of indices: 6364
Saving file...
File Saved


In [20]:
def indices_test(corpus_prep_test):
    """
    Normalizes and tokenizes the test data and index-encodes it with the
    vocabulary stored at preprocessed_corpus_path.

    Tokens that are not part of that vocabulary are dropped by encode_sequence.
    The tuple (encoded test descriptions, vocabulary) is pickled to
    indices_test_path.

    Args:
    corpus_prep_test - iterable of raw test documents

    Returns:
    None
    """
    corpus_norm_test = normalize_corpus(corpus_prep_test)

    corpus_string_test = [word_tokenize(description) for description in corpus_norm_test]

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    _, vocabulary = pd.read_pickle(preprocessed_corpus_path)

    # Named differently from the function so the local does not shadow it.
    encoded_test = [encode_sequence(sequence, vocabulary)
                    for sequence in corpus_string_test]

    # The context manager closes the file; the handle was previously leaked.
    with open(indices_test_path, "wb") as indices_file:
        pickle.dump((encoded_test, vocabulary), indices_file)

indices_test(test_descr['Product_Description'])

In [21]:
# Read back both cached (indices, vocabulary) tuples.
# NOTE: from here on `indices_test` refers to the loaded list and no longer
# to the function of the same name.
train_payload = pd.read_pickle(preprocessed_corpus_path)
indices, vocabulary = train_payload
test_payload = pd.read_pickle(indices_test_path)
indices_test, _ = test_payload

In [22]:
# First three encoded test descriptions (indices into the training vocabulary).
indices_test[:3]

[[4, 1, 21, 0, 9, 7, 294, 12, 1, 28, 399, 535, 1162, 28, 552],
 [4, 1, 7, 581, 11, 724, 94, 1777, 0],
 [4, 1, 73, 1, 166, 307, 322, 759, 6, 8, 166, 63, 234, 1065, 109, 0]]

In [23]:
# First three encoded training descriptions.
indices[:3]

[[154, 3719, 294, 450, 17, 11, 18, 1, 2, 0],
 [4, 1, 24, 3, 137, 18, 513, 0, 15, 24, 18, 145],
 [451, 6, 22, 44, 8, 12, 161, 710, 3720, 0, 1354, 3721, 3722]]

In [24]:
# Round-trip check: decode the first three training descriptions again.
decoded_indices = []
for encoded_description in indices[:3]:
    decoded_indices.append(decode_indices(encoded_description, vocabulary))
decoded_indices

['web designeruas guide ios android app today mention link sxsw',
 'rt mention line ipad long today yesterday sxsw get line today fun',
 'crazy apple open temporary store austin tomorrow handle rabid sxsw eye pad seeker']

In [25]:
# Small sample for a step-by-step walk-through of the pipeline. Despite the
# `_test` suffix these are the first three *training* descriptions.
corpus_prep_test = train_descr['Product_Description'][:3]

In [26]:
# Normalize the sample with the default normalize_corpus settings.
corpus_norm_test = normalize_corpus(corpus_prep_test)

In [27]:
# Tokenize every normalized description; one token list per description.
corpus_string_test = list(map(word_tokenize, corpus_norm_test))
corpus_string_test

[['web',
  'designeruas',
  'guide',
  'ios',
  'android',
  'app',
  'today',
  'mention',
  'link',
  'sxsw'],
 ['rt',
  'mention',
  'line',
  'ipad',
  'long',
  'today',
  'yesterday',
  'sxsw',
  'get',
  'line',
  'today',
  'fun'],
 ['crazy',
  'apple',
  'open',
  'temporary',
  'store',
  'austin',
  'tomorrow',
  'handle',
  'rabid',
  'sxsw',
  'eye',
  'pad',
  'seeker']]

In [28]:
# Flatten the per-description token lists into one long token list.
corpus_tokens_test = []
for description_tokens in corpus_string_test:
    corpus_tokens_test.extend(description_tokens)
corpus_tokens_test

['web',
 'designeruas',
 'guide',
 'ios',
 'android',
 'app',
 'today',
 'mention',
 'link',
 'sxsw',
 'rt',
 'mention',
 'line',
 'ipad',
 'long',
 'today',
 'yesterday',
 'sxsw',
 'get',
 'line',
 'today',
 'fun',
 'crazy',
 'apple',
 'open',
 'temporary',
 'store',
 'austin',
 'tomorrow',
 'handle',
 'rabid',
 'sxsw',
 'eye',
 'pad',
 'seeker']

In [29]:
# Count how often every token occurs in the sample.
word_counter_test = Counter(corpus_tokens_test)
word_counter_test

Counter({'web': 1,
         'designeruas': 1,
         'guide': 1,
         'ios': 1,
         'android': 1,
         'app': 1,
         'today': 3,
         'mention': 2,
         'link': 1,
         'sxsw': 3,
         'rt': 1,
         'line': 2,
         'ipad': 1,
         'long': 1,
         'yesterday': 1,
         'get': 1,
         'fun': 1,
         'crazy': 1,
         'apple': 1,
         'open': 1,
         'temporary': 1,
         'store': 1,
         'austin': 1,
         'tomorrow': 1,
         'handle': 1,
         'rabid': 1,
         'eye': 1,
         'pad': 1,
         'seeker': 1})

In [30]:
# Vocabulary of the sample: tokens ordered from most to least frequent,
# capped at most_common_words_number entries.
ranked_tokens_test = word_counter_test.most_common(most_common_words_number)
vocabulary_test = [token for token, _count in ranked_tokens_test]
vocabulary_test

['today',
 'sxsw',
 'mention',
 'line',
 'web',
 'designeruas',
 'guide',
 'ios',
 'android',
 'app',
 'link',
 'rt',
 'ipad',
 'long',
 'yesterday',
 'get',
 'fun',
 'crazy',
 'apple',
 'open',
 'temporary',
 'store',
 'austin',
 'tomorrow',
 'handle',
 'rabid',
 'eye',
 'pad',
 'seeker']

In [31]:
# Disabled scratch experiment: copy the vocabulary and insert an extra token
# ('abcd') at the front of the list.
#vocabulary_new = vocabulary_test[:]
#vocabulary_test[0:0] = ['abcd']
vocabulary_test

['today',
 'sxsw',
 'mention',
 'line',
 'web',
 'designeruas',
 'guide',
 'ios',
 'android',
 'app',
 'link',
 'rt',
 'ipad',
 'long',
 'yesterday',
 'get',
 'fun',
 'crazy',
 'apple',
 'open',
 'temporary',
 'store',
 'austin',
 'tomorrow',
 'handle',
 'rabid',
 'eye',
 'pad',
 'seeker']

In [32]:
# Number of distinct tokens in the sample vocabulary.
print(len(vocabulary_test))

29


In [33]:
# Encode the sample with its own vocabulary.
# NOTE: this rebinds `indices_test`, replacing the list loaded from disk earlier.
indices_test = []
for token_sequence in corpus_string_test:
    indices_test.append(encode_sequence(token_sequence, vocabulary_test))
indices_test

[[4, 5, 6, 7, 8, 9, 0, 2, 10, 1],
 [11, 2, 3, 12, 13, 0, 14, 1, 15, 3, 0, 16],
 [17, 18, 19, 20, 21, 22, 23, 24, 25, 1, 26, 27, 28]]

In [34]:
# Map every index of the encoded sample back to its token and print one token
# per line. The list comprehension used to wrap print(), which produced the
# output only as a side effect and left decoded_tokens_test as a list of None.
decoded_tokens_test = [vocabulary_test[index]
                       for sublist in indices_test for index in sublist]
for decoded_token in decoded_tokens_test:
    print(decoded_token)

web
designeruas
guide
ios
android
app
today
mention
link
sxsw
rt
mention
line
ipad
long
today
yesterday
sxsw
get
line
today
fun
crazy
apple
open
temporary
store
austin
tomorrow
handle
rabid
sxsw
eye
pad
seeker


In [35]:
# Round-trip check: decode each encoded sample description back into a string.
decoded_test_descr = []
for encoded_description in indices_test:
    decoded_test_descr.append(decode_indices(encoded_description, vocabulary_test))
decoded_test_descr

['web designeruas guide ios android app today mention link sxsw',
 'rt mention line ipad long today yesterday sxsw get line today fun',
 'crazy apple open temporary store austin tomorrow handle rabid sxsw eye pad seeker']