In [1]:
import contractions
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import sys
import tensorflow as tf

from keras.preprocessing.text import Tokenizer

%matplotlib inline

# Parameters

In [2]:
# Folders
# os.path.expanduser("~") resolves the home directory even when the HOME
# environment variable is not set (e.g. on Windows); os.getenv("HOME") would
# return None there and os.path.join would raise a TypeError.
home = os.path.expanduser("~")
nlp_repo = os.path.join(home, 'git/nlp-product-sentiment-classification')

# Preprocessing parameters: output paths of the pickled artefacts.
# Tokenised sequences + vocabulary.
preprocessed_corpus_path_TF = os.path.join(
    nlp_repo, 'data/03_processed/product_descr_preprocessed_TF.p')

# One-hot (bag-of-words) matrix + vocabulary.
preprocessed_corpus_path_TF_oh = os.path.join(
    nlp_repo, 'data/03_processed/product_descr_preprocessed_TF_oh.p')

# max_words = vocabulary size, i.e. the number of most frequent words to keep.
# We set it to 10,000, although in our particular case we have fewer.
# We do this to parametrise the code.
# Alternatively, we could set it to the length of our vocabulary (word_index).
max_words = 10000

# Tokenizing the Text

In [3]:
# Make modules in the sibling `src` directory importable from this notebook.
# `home` and `nlp_repo` are already defined in the Parameters cell, so they
# are not re-assigned here.
src_dir = os.path.join(os.getcwd(), '..', 'src')

# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
# Locations of the processed train / test product descriptions.
train_csv_path = os.path.join(nlp_repo, 'data/03_processed/Train.csv')
test_csv_path = os.path.join(nlp_repo, 'data/03_processed/Test.csv')

# Read both CSV files into DataFrames (train first, then test).
train_descr, test_descr = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

In [5]:
# Word-level tokenizer: keeps at most `max_words` words, lower-cases the text,
# strips the listed punctuation / whitespace characters, splits on spaces and
# uses the token 'oov' for out-of-vocabulary words.
tokenizer = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token='oov',
    document_count=0,
)

In [6]:
# Product descriptions of the training set; this column is the text corpus
# that the tokenizers below are fitted on.
samples = train_descr['Product_Description']

In [7]:
# Build the tokenizer's vocabulary from the training descriptions.
tokenizer.fit_on_texts(samples)

In [8]:
# Turn every description into a list of integer token indices
# (one list per description, lengths differ between descriptions).
sequences = tokenizer.texts_to_sequences(samples)

In [9]:
# Vocabulary learned by the tokenizer: maps each word (str) to its integer index.
word_index = tokenizer.word_index

In [10]:
# Persist the tokenised sequences together with the vocabulary.
# The context manager makes sure the file is flushed and closed once the dump
# is done; the bare open() used before left the handle open.
with open(preprocessed_corpus_path_TF, "wb") as corpus_file:
    pickle.dump((sequences, word_index), corpus_file)

# One-hot encoding (for Bag-of-Words)

"Manual" word-level one-hot encoding

For the "manual" one-hot encoding we are "cheating" a bit, i.e. we will use the tokenised sequences (`sequences`) and the vocabulary (`word_index`) generated above to derive some of the values (e.g. `max_len`).

We do this because the manual part is just for demonstration purposes and we won't be using it.
Furthermore, it will enable comparability between bag-of-words and the other models.

In [11]:
# max_len is the sequence length: text is cut off after this many tokens.
# It is usually chosen by hand, but since our tokenised sequences are short
# anyway we simply take the length of the longest one.
sequence_lengths = [len(tokens) for tokens in sequences]
max_len = np.max(sequence_lengths)

In [12]:
# Dense one-hot tensor, initialised to zeros, with axes
# (description, token position, vocabulary index). The last axis is one
# larger than the highest vocabulary index so that this index is addressable.
# NOTE(review): np.zeros defaults to float64, so this array can get large.
one_hot_shape = (len(sequences), max_len, max(word_index.values()) + 1)
results = np.zeros(shape=one_hot_shape)

In [13]:
# Fill the one-hot tensor: for description i, position j, switch on the
# entry that belongs to the token at that position.
#
# `sequences` already holds integer token indices (it is the output of
# texts_to_sequences), so the token itself is the index to set. Looking it up
# in `word_index` (which is keyed by word strings) returned None, and
# indexing with None set the *whole* vocabulary row to 1 instead of a single
# entry.
for i, sequence in enumerate(sequences):
    # Only the first max_len tokens of a sequence are encoded.
    for j, token_index in enumerate(sequence[:max_len]):
        results[i, j, token_index] = 1

TensorFlow word-level one-hot encoding (analogous to the "manual" one-hot encoding, it is just for demonstration purposes)

In [14]:
# Vocabulary size (number of distinct words known to the tokenizer).
dimensions_descr = len(word_index)

# Before proceeding to one-hot encoding with TF, we need to pad the sequences.
# Otherwise it will give us an error due to the different lengths of the
# sequences.
# The padding length is the longest tokenised sequence (max_len), not the
# vocabulary size: padding every description to `dimensions_descr` tokens
# only adds thousands of zero columns without carrying any information.
results_tf_oh_prep = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=max_len)

# Disabled sketch of the TF one-hot / bag-of-words step (left disabled as in
# the original notebook). Token indices run up to len(word_index), so the
# depth needs the extra +1, and the reduction has to run over the token
# position axis (axis=1) to obtain one bag-of-words vector per description.
#results_tf_oh = tf.one_hot(indices=results_tf_oh_prep, depth=dimensions_descr + 1)
#results_tf_oh = tf.reduce_max(results_tf_oh, axis=1)

Keras word-level one-hot encoding

In [15]:
# Separate tokenizer for the Keras one-hot (bag-of-words) encoding, set up
# with the same options as the tokenizer used for the sequences above.
tokenizer_oh = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token='oov',
    document_count=0,
)

# Learn the vocabulary from the training descriptions.
tokenizer_oh.fit_on_texts(samples)

In [16]:
# Integer-encoded descriptions produced by the one-hot tokenizer.
# NOTE(review): not referenced by the remaining cells visible here.
sequences_oh = tokenizer_oh.texts_to_sequences(samples)

In [17]:
# Bag-of-words encoding: one row per description; with mode='binary' an entry
# only marks whether the word occurs in the description, not how often.
results_oh = tokenizer_oh.texts_to_matrix(samples, mode='binary')

In [18]:
# Vocabulary of the one-hot tokenizer: maps each word (str) to its integer index.
word_index_oh = tokenizer_oh.word_index
# Report the vocabulary size as a quick sanity check.
print(f'Found {len(word_index_oh)} unique tokens.')

Found 8589 unique tokens.


In [19]:
# Persist the bag-of-words matrix together with its vocabulary.
# The context manager makes sure the file is flushed and closed once the dump
# is done; the bare open() used before left the handle open.
with open(preprocessed_corpus_path_TF_oh, "wb") as one_hot_file:
    pickle.dump((results_oh, word_index_oh), one_hot_file)