In [None]:
!pip install kaggle


In [2]:
# Search Kaggle for datasets matching "Amazon review"; the `ref` column of the
# output is the identifier passed to `kaggle datasets download -d`.
!kaggle datasets list -s "Amazon review"

ref                                                           title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
snap/amazon-fine-food-reviews                                 Amazon Fine Food Reviews                           242MB  2017-05-01 18:51:31         177023       2178  0.7941176        
eswarchandt/amazon-music-reviews                              Amazon Musical Instruments Reviews                   5MB  2020-03-29 02:59:52          16240        295  1.0              
kritanjalijain/amazon-reviews                                 Amazon reviews                                       1GB  2021-05-15 09:45:40          12404        136  1.0              
grikomsn/amazon-cell-phones-reviews                           Amazon Cell P

In [4]:
# Download the 'bittlingmayer/amazonreviews' dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d 'bittlingmayer/amazonreviews'

amazonreviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Extract the downloaded archive into ./dataset, then delete the archive.
import glob
import os
import zipfile

file = './amazonreviews.zip'
with zipfile.ZipFile(file, 'r') as zip_ref:
    zip_ref.extractall('dataset')

# os.remove needs no shell (so it also works where `rm` is unavailable) and
# raises on failure instead of returning an exit status that is easy to ignore.
os.remove(file)

0

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from keras import models, layers, optimizers, Sequential

from keras._tf_keras.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline

import os
# List the contents of ./dataset/ to confirm the archive was extracted.
print(os.listdir("./dataset/"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


## Reading the text

In [6]:
def get_labels_and_texts(file):
    """Read a bz2-compressed, line-oriented review file into labels and texts.

    Every line is decoded as UTF-8. The character at index 9 is parsed as an
    integer label and stored minus one; everything from index 10 onward, with
    surrounding whitespace stripped, is the review text.
    NOTE(review): this presumably matches a ``__label__<digit> <text>`` line
    layout — confirm against the dataset.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.

    Returns:
        A tuple ``(labels, texts)`` where ``labels`` is a NumPy array of ints
        and ``texts`` is a list of strings, in file order.

    Raises:
        IndexError / ValueError: if a line is too short or index 9 is not a digit.
    """
    labels = []
    texts = []
    # The context manager closes the compressed file even if parsing raises;
    # previously the handle stayed open until garbage collection.
    with bz2.BZ2File(file) as compressed:
        for line in compressed:
            x = line.decode("utf-8")
            labels.append(int(x[9]) - 1)
            texts.append(x[10:].strip())
    return np.array(labels), texts

# Load both compressed splits fully into memory as (labels array, list of texts).
train_labels, train_texts = get_labels_and_texts('./dataset/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('./dataset/test.ft.txt.bz2')

## Preprocessing

In [7]:
import re

# Matches any single non-word character (punctuation, whitespace, symbols).
NON_ALPHANUM = re.compile(r'[\W]')
# Matches anything that is not a lowercase ASCII letter, a digit, or whitespace.
# The digit range used to be `0-1`, which silently deleted the digits 2-9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lower-case and strip each text down to ASCII letters, digits and spaces.

    For every input string: lower-case it, replace each non-word character
    with a single space, then delete every remaining character that is not
    ``a-z``, ``0-9`` or whitespace (this also removes underscores and
    non-ASCII letters, which the first pattern leaves in place).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list of normalized strings, in input order.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Apply the same cleaning to both splits. The names are rebound, so the raw
# strings are no longer available after this cell.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

## Train/Validation Split

In [8]:
# Hold out 20% of the training data for validation; the fixed random_state
# makes the split itself reproducible.
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=57643892, test_size=0.2)

In [9]:
# Vocabulary cap handed to the Tokenizer as `num_words`.
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Fit on the training texts only, then reuse the same tokenizer for every split.
tokenizer.fit_on_texts(train_texts)
# The *_texts names are rebound from strings to integer sequences here, so this
# cell must not be re-run on a live kernel without re-running the cells above.
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

import pickle
# Persist the fitted tokenizer. The context manager flushes and closes the
# file deterministically instead of leaking the handle from a bare open().
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

## Padding Sequences


In [11]:
# Every split is padded to the length of the longest training sequence.
MAX_LENGTH = max(map(len, train_texts))


def _pad_to_max_length(sequences):
    """Return `sequences` passed through pad_sequences with maxlen=MAX_LENGTH."""
    return pad_sequences(sequences, maxlen=MAX_LENGTH)


# Rebind one split at a time so each unpadded list can be freed before the
# next split is padded.
train_texts = _pad_to_max_length(train_texts)
val_texts = _pad_to_max_length(val_texts)
test_texts = _pad_to_max_length(test_texts)

In [15]:
# def build_rnn_model():
#     sequences = layers.Input(shape=(MAX_LENGTH,))
#     embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
#     x = layers.GRU(128, return_sequences=True)(embedded)
#     x = layers.GRU(128)(x)
#     x = layers.Dense(32, activation='relu')(x)
#     x = layers.Dense(100, activation='relu')(x)
#     predictions = layers.Dense(1, activation='sigmoid')(x)
#     model = models.Model(inputs=sequences, outputs=predictions)
#     model.compile(
#         optimizer='rmsprop',
#         loss='binary_crossentropy',
#         metrics=['binary_accuracy']
#     )
#     return model



def build_rnn_model():
    """Build and compile the stacked-GRU binary classifier.

    The architecture is: integer-sequence input of length ``MAX_LENGTH`` ->
    64-dimensional embedding over ``MAX_FEATURES`` token ids -> two GRU layers
    of 128 units (the first returns the full sequence) -> Dense(32, relu) ->
    Dense(100, relu) -> Dense(1, sigmoid).

    ``MAX_LENGTH`` and ``MAX_FEATURES`` are read from module scope at call time.

    Returns:
        A ``Sequential`` model compiled with the ``rmsprop`` optimizer,
        ``binary_crossentropy`` loss and the ``binary_accuracy`` metric.
    """
    layer_stack = [
        layers.Input(shape=(MAX_LENGTH,)),  # fixed-length token-id sequences
        layers.Embedding(MAX_FEATURES, 64),
        layers.GRU(128, return_sequences=True),  # feeds the full sequence to the next GRU
        layers.GRU(128),
        layers.Dense(32, activation='relu'),
        layers.Dense(100, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    compile_options = {
        'optimizer': 'rmsprop',
        'loss': 'binary_crossentropy',
        'metrics': ['binary_accuracy'],
    }
    model.compile(**compile_options)

    return model

    
# Instantiate the compiled model; MAX_LENGTH and MAX_FEATURES must already be defined.
rnn_model = build_rnn_model()

In [16]:
# Train for a single epoch in mini-batches of 128, using the held-out split as
# validation data.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt part
# way through the epoch, so the metrics further down appear to come from a
# partially trained model — confirm before quoting them.
rnn_model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=1,
    validation_data=(val_texts, val_labels), )

[1m 4015/22500[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m6:11:32[0m 1s/step - binary_accuracy: 0.8148 - loss: 0.3676

KeyboardInterrupt: 

In [17]:
# Persist the model to an HDF5 file in the working directory.
# NOTE(review): recent Keras releases appear to treat .h5 as a legacy format and
# recommend the native .keras format — confirm, and switch once anything that
# loads rnn_model.h5 has been updated.
rnn_model.save("rnn_model.h5")



This model should reach an accuracy similar to the CNN model. The random seeds are not set, so results vary from run to run, but accuracy can go as high as 95%.

In [18]:
# Score the model on the test split: predicted probabilities feed ROC AUC
# directly, while accuracy and F1 use them thresholded at 0.5.
preds = rnn_model.predict(test_texts)
hard_preds = 1 * (preds > 0.5)
accuracy = accuracy_score(test_labels, hard_preds)
f1 = f1_score(test_labels, hard_preds)
roc_auc = roc_auc_score(test_labels, preds)
print('Accuracy score: {:0.4}'.format(accuracy))
print('F1 score: {:0.4}'.format(f1))
print('ROC AUC score: {:0.4}'.format(roc_auc))

[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1111s[0m 89ms/step
Accuracy score: 0.9277
F1 score: 0.928
ROC AUC score: 0.9787


In [15]:
# Run the trained model on a couple of hand-written reviews, pushing them
# through the same normalize -> tokenize -> pad pipeline as the training data.
pred = [
    "This product is good",
    "This product is okayish"
]
pred = normalize_texts(pred)
pred = tokenizer.texts_to_sequences(pred)
# Pad to the sequence length the model was built with instead of a hard-coded
# 255: that literal overwrote the MAX_LENGTH computed from the training data
# and could disagree with the model's actual input length.
pred = pad_sequences(pred, maxlen=rnn_model.input_shape[1])
temp = rnn_model.predict(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
