## Extract dataset from kaggle

In [2]:
%pip install kaggle

Note: you may need to restart the kernel to use updated packages.


Set up your kaggle api token

In [3]:
!kaggle datasets list -s "Amazon review"

ref                                                           title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
snap/amazon-fine-food-reviews                                 Amazon Fine Food Reviews                           242MB  2017-05-01 18:51:31         170254       2155  0.7941176        
eswarchandt/amazon-music-reviews                              Amazon Musical Instruments Reviews                   5MB  2020-03-29 02:59:52          15524        293  1.0              
kritanjalijain/amazon-reviews                                 Amazon reviews                                       1GB  2021-05-15 09:45:40          11059        128  1.0              
grikomsn/amazon-cell-phones-reviews                           Amazon Cell P

In [5]:
#Download dataset
!kaggle datasets download -d bittlingmayer/amazonreviews

Downloading amazonreviews.zip to e:\IEEE\GoLocal\SentAnalysis




  0%|          | 0.00/493M [00:00<?, ?B/s]
  0%|          | 1.00M/493M [00:01<10:22, 828kB/s]
  0%|          | 2.00M/493M [00:01<05:11, 1.65MB/s]
  1%|          | 3.00M/493M [00:01<03:13, 2.66MB/s]
  1%|          | 5.00M/493M [00:01<02:08, 3.99MB/s]
  1%|▏         | 7.00M/493M [00:02<01:34, 5.37MB/s]
  2%|▏         | 8.00M/493M [00:02<01:32, 5.52MB/s]
  2%|▏         | 9.00M/493M [00:02<01:30, 5.59MB/s]
  2%|▏         | 10.0M/493M [00:02<01:25, 5.92MB/s]
  2%|▏         | 11.0M/493M [00:02<01:33, 5.38MB/s]
  2%|▏         | 12.0M/493M [00:03<01:34, 5.32MB/s]
  3%|▎         | 13.0M/493M [00:03<01:33, 5.37MB/s]
  3%|▎         | 14.0M/493M [00:03<01:30, 5.56MB/s]
  3%|▎         | 15.0M/493M [00:03<01:31, 5.48MB/s]
  3%|▎         | 16.0M/493M [00:03<01:26, 5.82MB/s]
  3%|▎         | 17.0M/493M [00:03<01:27, 5.68MB/s]
  4%|▎         | 18.0M/493M [00:04<01:25, 5.83MB/s]
  4%|▍         | 19.0M/493M [00:04<01:19, 6.23MB/s]
  4%|▍         | 20.0M/493M [00:04<01:11, 6.94MB/s]
  4%|▍         | 21.0

In [6]:
#Extracting zip file
import glob
import zipfile

file = './amazonreviews.zip'
with zipfile.ZipFile(file, 'r') as zip_ref:
    zip_ref.extractall('dataset')

import os
os.system("rm amazonreviews.zip")

1

In [8]:
##Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from keras import models, layers, optimizers
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline

# Input data files are available in the "./dataset" directory.
import os
print(os.listdir("./dataset/"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


## Reading the text

The text is held in a compressed format. Luckily, we can still read it line by line. The first word gives the label, so we have to convert that into a number and then take the rest to be the comment.

In [9]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts

train_labels, train_texts = get_labels_and_texts('./dataset/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('./dataset/test.ft.txt.bz2')

## Text Preprocessing

1. Convert to lowercase
2. substitute non alphanumeric characters with whitespace
3. Remove noascii characters

In [10]:
def preprocess(texts):
    """
    texts parameter is list of sentences which needs to be pre-processes
    returns list containing each item of texts preprocessed
    """
    preprocessed_texts = []

    for line in texts:
        line = line.lower()
        line = re.sub(r'[^0-9a-zA-Z]+', ' ', line)
        line = re.sub(r'[^\x00-\x7F]','', line)
        preprocessed_texts.append(line)

    return preprocessed_texts
        
train_texts = preprocess(train_texts)
test_texts = preprocess(test_texts)

In [11]:
#split train data into train and validation data - try keeping test_size = 0.2
from sklearn.model_selection import train_test_split


train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2);



In [12]:
#Use Tokenizer to convert text into model usable form

#code here

def tokenizeText(texts):

    tokenized_text = []

    ntoken = Tokenizer()

    for line in texts:
        ntoken.fit_on_texts(line)
        list_words = text_to_word_sequence(line)
        tokenized_text.append(list_words)

    return tokenized_text

train_texts = tokenizeText(train_texts)
val_texts = tokenizeText(val_texts)
test_texts = tokenizeText(test_texts)

# train_texts ,val_texts ,test_texts now contains model usable form of texts

#Do padding
#find maxlen
#Use padding to convert all vectors to maxlen and save them in train_texts ,val_texts ,test_texts itself

#code here

train_texts = pad_sequences(train_texts, maxlen=100)
test_texts = pad_sequences(test_texts, maxlen=100)
val_texts = pad_sequences(val_texts, maxlen=100)

KeyboardInterrupt: 

## Model

In [None]:
def build_lstm_model():
    #code here

    #return model
    
lstm_model = build_rnn_model()

In [None]:
#Training
lstm_model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=1,
    validation_data=(val_texts, val_labels), )

## Results

In [None]:
preds = lstm_model.predict(test_texts)
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

## Try it out

In [None]:
your_comment = "Type here.."


lst = []
lst.append(your_comment)
preds = lstm_model.predict(lst)

if preds[0]>0.5:
    print("Positive")
else:
    print("Negative")

In [None]:
#Run this cell only if dataset is no longer needed

#Deleting dataset as it exceeds 100 MB, github max limit
import os 
os.system("rm -r dataset")