## Extract the dataset from Kaggle

In [1]:
# Install the Kaggle command-line client, which the cells below use to search for and download the dataset.
%pip install kaggle

Note: you may need to restart the kernel to use updated packages.


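Set up your Kaggle API token (the `kaggle.json` file from your Kaggle account settings) before running the cells below; the `kaggle` commands need it to authenticate.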

In [2]:
# Search Kaggle for datasets matching "Amazon review" to look up the dataset reference.
!kaggle datasets list -s "Amazon review"

ref                                                           title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
snap/amazon-fine-food-reviews                                 Amazon Fine Food Reviews                           242MB  2017-05-01 18:51:31         171384       2161  0.7941176        
eswarchandt/amazon-music-reviews                              Amazon Musical Instruments Reviews                   5MB  2020-03-29 02:59:52          15643        293  1.0              
kritanjalijain/amazon-reviews                                 Amazon reviews                                       1GB  2021-05-15 09:45:40          11303        131  1.0              
grikomsn/amazon-cell-phones-reviews                           Amazon Cell P

In [3]:
#Download dataset
# Saves the bittlingmayer/amazonreviews archive as amazonreviews.zip in the current working directory.
!kaggle datasets download -d bittlingmayer/amazonreviews

Downloading amazonreviews.zip to e:\IEEE\GoLocal\SentAnalysis




  0%|          | 0.00/493M [00:00<?, ?B/s]
  0%|          | 1.00M/493M [00:01<10:18, 834kB/s]
  0%|          | 2.00M/493M [00:01<05:11, 1.65MB/s]
  1%|          | 3.00M/493M [00:01<03:27, 2.48MB/s]
  1%|          | 4.00M/493M [00:01<02:38, 3.24MB/s]
  1%|          | 5.00M/493M [00:01<02:12, 3.87MB/s]
  1%|          | 6.00M/493M [00:02<01:54, 4.45MB/s]
  1%|▏         | 7.00M/493M [00:02<01:43, 4.92MB/s]
  2%|▏         | 8.00M/493M [00:02<01:36, 5.27MB/s]
  2%|▏         | 9.00M/493M [00:02<01:31, 5.52MB/s]
  2%|▏         | 10.0M/493M [00:02<01:28, 5.72MB/s]
  2%|▏         | 11.0M/493M [00:02<01:26, 5.86MB/s]
  2%|▏         | 12.0M/493M [00:03<01:24, 5.94MB/s]
  3%|▎         | 13.0M/493M [00:03<01:24, 5.97MB/s]
  3%|▎         | 14.0M/493M [00:03<01:23, 6.03MB/s]
  3%|▎         | 15.0M/493M [00:03<01:22, 6.06MB/s]
  3%|▎         | 16.0M/493M [00:03<01:22, 6.09MB/s]
  3%|▎         | 17.0M/493M [00:04<01:21, 6.11MB/s]
  4%|▎         | 18.0M/493M [00:04<01:21, 6.09MB/s]
  4%|▍         | 19.0

In [4]:
#Extracting zip file
import glob
import os
import zipfile

file = './amazonreviews.zip'

# The archive is deleted once it has been extracted, so guard on its presence:
# this lets the cell be re-run after ./dataset has already been populated
# instead of failing on the missing zip.
if os.path.exists(file):
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall('dataset')
    # Only reached when extraction succeeded; frees the disk space of the zip.
    os.remove(file)
elif os.path.isdir('dataset'):
    print('Archive already extracted to ./dataset; nothing to do.')
else:
    # Neither the archive nor the extracted folder exists: stop here with a
    # clear message rather than failing later when the data is read.
    raise FileNotFoundError(
        f"{file} not found and ./dataset does not exist; run the download cell first."
    )

In [5]:
##Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from keras import models, layers, optimizers
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline

# Input data files are available in the "./dataset" directory.
import os
print(os.listdir("./dataset/"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


## Reading the text

The text is held in a compressed format. Luckily, we can still read it line by line. The first word gives the label, so we have to convert that into a number and then take the rest to be the comment.

In [6]:
def get_labels_and_texts(file):
    """
    Read a bz2-compressed review file and split every line into label and text.

    The label digit is read from index 9 of each decoded line (i.e. after a
    9-character prefix); everything after it is taken as the review text.
    Blank lines are skipped.

    file: path of the compressed file to read
    returns (labels, texts): labels is a numpy array holding the label digit
    minus one, texts is a list with the stripped remainder of each line
    """
    labels = []
    texts = []
    # The context manager closes the file handle even if decoding or parsing
    # raises part-way through the file.
    with bz2.BZ2File(file) as compressed:
        for line in compressed:
            x = line.decode("utf-8")
            # A blank line carries no label; indexing x[9] on it would fail.
            if not x.strip():
                continue
            labels.append(int(x[9]) - 1)
            texts.append(x[10:].strip())
    return np.array(labels), texts

# Read both compressed splits fully into memory: labels as numpy arrays, texts as lists of strings.
train_labels, train_texts = get_labels_and_texts('./dataset/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('./dataset/test.ft.txt.bz2')

## Text Preprocessing

1. Convert to lowercase
2. Substitute runs of non-alphanumeric characters with a single space
3. Remove non-ASCII characters

In [7]:
def preprocess(texts):
    """
    Normalise every sentence in texts.

    texts: list of sentences to be pre-processed
    returns a new list with one cleaned string per input sentence, in order
    """
    def _clean(sentence):
        lowered = sentence.lower()
        # Collapse each run of characters outside [0-9a-zA-Z] into one space.
        spaced = re.sub(r'[^0-9a-zA-Z]+', ' ', lowered)
        # Drop anything outside the ASCII range. After the substitution above
        # only letters, digits and spaces remain, so this is a safety net.
        return re.sub(r'[^\x00-\x7F]','', spaced)

    return [_clean(sentence) for sentence in texts]
        
# Replace the raw strings of both splits with their cleaned versions (the raw texts are not kept).
train_texts = preprocess(train_texts)
test_texts = preprocess(test_texts)

In [8]:
#split train data into train and validation data - try keeping test_size = 0.2
from sklearn.model_selection import train_test_split

# A fixed random_state makes the train/validation partition identical on every
# run, so the results below can be reproduced after a kernel restart.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)



In [9]:
#Use Tokenizer to convert text into model usable form

def tokenizeText(texts, tokenizer=None, fit=True):
    """
    Convert texts into sequences of integer character ids.

    texts: list of strings to encode
    tokenizer: optional Tokenizer to use; when omitted a new character-level
        Tokenizer is created, as before
    fit: when True the tokenizer's vocabulary is built from texts before
        encoding; pass False to encode with an already fitted tokenizer
    returns (tokenized_text, vocab_size): the encoded sequences and the
        number of known tokens plus one (index 0 is reserved by Keras and is
        the value used for padding)
    """
    if tokenizer is None:
        tokenizer = Tokenizer(char_level=True)
    if fit:
        tokenizer.fit_on_texts(texts)
    tokenized_text = tokenizer.texts_to_sequences(texts)
    vocab_size = len(tokenizer.word_index) + 1

    return tokenized_text, vocab_size

# Fit the vocabulary on the training split only and reuse that same mapping for
# validation and test. A tokenizer fitted separately on each split assigns ids
# from that split's own character frequencies, so the same character can end up
# with different ids and the model would be evaluated on inconsistently encoded
# input.
text_tokenizer = Tokenizer(char_level=True)
train_texts, vocab_size = tokenizeText(train_texts, tokenizer=text_tokenizer)
val_texts = tokenizeText(val_texts, tokenizer=text_tokenizer, fit=False)[0]
test_texts = tokenizeText(test_texts, tokenizer=text_tokenizer, fit=False)[0]

# train_texts ,val_texts ,test_texts now contains model usable form of texts

#Do padding
#A fixed length of 100 is used for every split rather than the longest sequence found
#The padded arrays are saved back into train_texts ,val_texts ,test_texts

MAX_SEQUENCE_LENGTH = 100

train_texts = pad_sequences(train_texts, maxlen=MAX_SEQUENCE_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_SEQUENCE_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_SEQUENCE_LENGTH)

## Model

In [10]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import Dropout

def build_lstm_model():
    """
    Build and compile the bidirectional LSTM binary classifier.

    The embedding's input dimension is read from the module-level vocab_size.
    returns the compiled Sequential model
    """
    network = Sequential([
        Embedding(vocab_size, 128),
        Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
        Dense(512, activation='relu'),
        Dropout(0.5),
        # Single sigmoid unit: the output is a score between 0 and 1.
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

lstm_model = build_lstm_model()

In [11]:
#Training
# Only the first 1000 training samples and the first 200 validation samples
# are used here, not the full splits.
lstm_model.fit(
    train_texts[:1000], 
    train_labels[:1000], 
    batch_size=128,
    epochs=5,
    validation_data=(val_texts[:200], val_labels[:200]), )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b6cd8ebc50>

## Results

In [12]:
# Score the first 1000 test samples.
preds = lstm_model.predict(test_texts[:1000])
eval_labels = test_labels[:1000]
# Hard 0/1 predictions obtained by thresholding the model output at 0.5.
hard_preds = 1 * (preds > 0.5)

# Accuracy and F1 use the thresholded predictions; ROC AUC uses the raw scores.
for report_line, metric_fn, predicted in (
    ('Accuracy score: {:0.4}', accuracy_score, hard_preds),
    ('F1 score: {:0.4}', f1_score, hard_preds),
    ('ROC AUC score: {:0.4}', roc_auc_score, preds),
):
    print(report_line.format(metric_fn(eval_labels, predicted)))

Accuracy score: 0.507
F1 score: 0.1085
ROC AUC score: 0.5542


## Try it out

In [25]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

your_comment = "this product is so trash!"

lst = []
# NOTE(review): this tokenizer is fitted on the single comment below, so its
# character ids come from this comment alone and are not guaranteed to match
# the ids the model was trained with. The comment is also not passed through
# preprocess(). Encoding it with the tokenizer fitted on the training texts
# would make the prediction meaningful — TODO.
tok1 = Tokenizer(char_level=True)
tok1.fit_on_texts([your_comment])
comment_sequences = tok1.texts_to_sequences([your_comment])
# Pad to the same length (100) that was used for the training sequences.
comment_sequences = pad_sequences(comment_sequences, maxlen=100)
print(comment_sequences)
preds = lstm_model.predict(comment_sequences)
print(preds[0])

# The model output is thresholded at 0.5: above is reported as positive.
if preds[0]>0.5:
    print("Positive")
else:
    print("Negative")

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  3  4  5  1  2  8  6  7  9 10 11  3  2  5  1  2  1  7  2  3  6
  12  1  4 13]]
[0.34090525]
Negative


In [None]:
#Run this cell only if dataset is no longer needed

#Deleting dataset as it exceeds 100 MB, github max limit
import os
import shutil

# shutil.rmtree works on every platform, whereas shelling out to `rm` fails on
# Windows and reports failure only through an ignored exit status. The guard
# makes the cell a no-op when the folder has already been removed.
if os.path.isdir("dataset"):
    shutil.rmtree("dataset")