In [1]:
# for data
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# for processing
import re
import nltk

# for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# for word embedding
import gensim
import gensim.downloader as gensim_api

# for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

# for bert language model
# import transformers


In [2]:
DATA_PATH = 'data/amazon_reviews_grocery_100k.tsv'

In [3]:
# read tsv into dataframe
df = pd.read_csv(DATA_PATH, sep='\t', error_bad_lines=False)

df.head



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 1925: expected 15 fields, saw 22\nSkipping line 1977: expected 15 fields, saw 22\nSkipping line 35265: expected 15 fields, saw 22\nSkipping line 53357: expected 15 fields, saw 22\n'
b'Skipping line 68361: expected 15 fields, saw 22\nSkipping line 70922: expected 15 fields, saw 22\nSkipping line 73503: expected 15 fields, saw 22\nSkipping line 85612: expected 15 fields, saw 22\n'


<bound method NDFrame.head of       marketplace  customer_id       review_id  product_id  product_parent  \
0              US     42521656  R26MV8D0KG6QI6  B000SAQCWC       159713740   
1              US     12049833  R1OF8GP57AQ1A0  B00509LVIQ       138680402   
2              US       107642  R3VDC1QB6MC4ZZ  B00KHXESLC       252021703   
3              US      6042304  R12FA3DCF8F9ER  B000F8JIIC       752728342   
4              US     18123821   RTWHVNV6X4CNJ  B004ZWR9RQ       552138758   
...           ...          ...             ...         ...             ...   
98635          US     27635275  R1PHJCOXJ43DD5  B00V3W9SMU       217641189   
98636          US      1369561   R2ESP8RM8ERG4  B00FY2UESM       243349133   
98637          US     49479456  R1VRW0RFCHM8F4  B001L9MQK6       626575660   
98638          US      9965703  R1ZKNAAKDNX73G  B00OX6FIMW       403680832   
98639          US     11372488  R1L2GI2QOIBQJU  B00IN4HABM       974652159   

                                 

### Preprocessing:

In [5]:
# filter columns
df = df[['review_body', 'star_rating']]

df.head

<bound method NDFrame.head of                                              review_body  star_rating
0      As a family allergic to wheat, dairy, eggs, nu...            5
1      My favorite nut.  Creamy, crunchy, salty, and ...            5
2      This green tea tastes so good! My girlfriend l...            5
3      I love Melissa's brand but this is a great sec...            5
4                                                   good            5
...                                                  ...          ...
98635          This was a gift. Exactly what I expected.            5
98636  This is absolutely the most savory brittle I h...            5
98637                              Exactly what I wanted            5
98638                                      Not impressed            1
98639                                             Yummy.            5

[98640 rows x 2 columns]>

In [6]:
# rename columns
df = df.rename(columns={'review_body': 'text', 'star_rating': 'y'})

df.head

<bound method NDFrame.head of                                                     text  y
0      As a family allergic to wheat, dairy, eggs, nu...  5
1      My favorite nut.  Creamy, crunchy, salty, and ...  5
2      This green tea tastes so good! My girlfriend l...  5
3      I love Melissa's brand but this is a great sec...  5
4                                                   good  5
...                                                  ... ..
98635          This was a gift. Exactly what I expected.  5
98636  This is absolutely the most savory brittle I h...  5
98637                              Exactly what I wanted  5
98638                                      Not impressed  1
98639                                             Yummy.  5

[98640 rows x 2 columns]>

In [7]:
# map rating to class (<=1 = 0 | <1 = 1)

df['y'] = df['y'].map(lambda x: 0 if int(x) <= 1 else 1)

# print 5 random rows
df.sample(5)


Unnamed: 0,text,y
6733,My favorite - great flavor at a great price.,1
56381,Love these nuts! Roasted and salted perfectly!,1
35004,I ordered this Gourmet Italian Meat Lovers gif...,1
26525,My kids love these. I have a 2 year old and a ...,1
51591,This is a great product. It's organic so it wo...,1


In [8]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''


def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    # clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    # Tokenize (convert from string to list)
    lst_text = text.split()
    # remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]

    # Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    # Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    # back to string from list
    text = " ".join(lst_text)

    return text


In [9]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
# applying preprocessing to dataset
df["text_clean"] = df["text"].apply(lambda x: utils_preprocess_text(
    x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))

df.head()


Unnamed: 0,text,y,text_clean
0,"As a family allergic to wheat, dairy, eggs, nu...",1,family allergic wheat dairy egg nut several th...
1,"My favorite nut. Creamy, crunchy, salty, and ...",1,favorite nut creamy crunchy salty slightly swe...
2,This green tea tastes so good! My girlfriend l...,1,green tea taste good girlfriend love
3,I love Melissa's brand but this is a great sec...,1,love melissa brand great second cant get melis...
4,good,1,good


In [11]:
# split dataset
dtf_train, dtf_test = model_selection.train_test_split(df, test_size=0.2)

# get target
y_train = dtf_train["y"].values
y_test = dtf_test["y"].values


### Preparing word embeddings:

In [16]:
nlp = gensim_api.load("word2vec-google-news-300")


MemoryError: Unable to allocate 3.35 GiB for an array with shape (3000000, 300) and data type float32

In [None]:
corpus = df_train["text_clean"]

# create list of lists of unigrams
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = [" ".join(lst_words[i:i+1])
                 for i in range(0, len(lst_words), 1)]
    lst_corpus.append(lst_grams)

# detect bigrams and trigrams
bigrams_detector = gensim.models.phrases.Phrases(
    lst_corpus, delimiter=" ".encode(), min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(
    bigrams_detector[lst_corpus], delimiter=" ".encode(), min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)


In [None]:
# fit w2v
nlp = gensim.models.word2vec.Word2Vec(
    lst_corpus, size=300, window=8, min_count=1, sg=1, iter=30)


In [None]:
## tokenize text
tokenizer = kprocessing.text.Tokenizer(
    lower=True, split=' ', oov_token="NaN", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index

## create sequence
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)

## padding sequence
X_train = kprocessing.sequence.pad_sequences(
    lst_text2seq, maxlen=15, padding="post", truncating="post")


In [None]:
sns.heatmap(X_train == 0, vmin=0, vmax=1, cbar=False)
plt.show()
