In [None]:
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Digital_Music_5.json.gz

--2024-08-18 10:51:55--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Digital_Music_5.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19408584 (19M) [application/x-gzip]
Saving to: ‘Digital_Music_5.json.gz’


2024-08-18 10:51:55 (124 MB/s) - ‘Digital_Music_5.json.gz’ saved [19408584/19408584]



In [None]:
!pip install keras-preprocessing

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


### Imports

In [None]:
import numpy as np
import json
import pandas as pd
import gzip
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from keras_preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

### Load the dataset: Digital_Music_5

In [None]:
def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: location of the ``.json.gz`` file; each line must hold one JSON document.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a line is not valid JSON.
  """
  # The context manager closes the archive as soon as the generator is
  # exhausted or closed, instead of leaving the handle open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a pandas DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), and that
  position becomes the row index of the returned frame.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read the whole downloaded archive into a single DataFrame (one row per record).
df = getDF('Digital_Music_5.json.gz')

In [None]:
# Hyperparameters shared by the cells below.
# Length of every token-id sequence after pad_sequences.
MAX_LEN = 100
# Mini-batch size handed to every DataLoader.
BATCH_SIZE = 64
# Passed to SentimentRNN as embed_size (width of the embedding vectors).
EMBEDDING_DIM = 100
# Passed to SentimentRNN as hidden_size (width of the recurrent hidden state).
HIDDEN_DIM = 128
# Number of output classes for the full task (assumes five distinct 'overall' ratings — TODO confirm).
NUM_CLASSES = 5
# Number of passes over the training data for each model.
EPOCHS = 15

### PreProcess

In [None]:
# Keep only the review text and the rating. The explicit .copy() makes `data`
# an independent frame, so the assignment below modifies it directly and no
# longer raises pandas' SettingWithCopyWarning.
data = df[['reviewText', 'overall']].copy()
# Missing reviews become empty strings and every entry is coerced to str,
# so the tokenizer only ever receives text.
data['reviewText'] = data['reviewText'].fillna('').astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviewText'] = data['reviewText'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviewText'] = data['reviewText'].astype(str)


### Tokenize

In [None]:
# Build the vocabulary from every review, then encode each review as a list of integer word ids.
# NOTE(review): the vocabulary is fitted before the train/validation split, so it also
# covers words that only occur in validation reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['reviewText'])
sequences = tokenizer.texts_to_sequences(data['reviewText'])
# word -> integer id mapping learned above; its size is used later to derive vocab_size.
word_index = tokenizer.word_index

In [None]:
# Bring every encoded review to exactly MAX_LEN ids (pad_sequences defaults decide the pad/truncate side).
X = pad_sequences(sequences, maxlen=MAX_LEN)
# Shift the ratings down by one (assumes ratings start at 1, giving labels that start at 0 — TODO confirm).
y = data['overall'] - 1

In [None]:
# Re-encode the shifted ratings as consecutive integer class ids, the target format
# expected by the CrossEntropyLoss used in train_model.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### AmazonDataset Class Implementation

In [None]:
class AmazonDataset(Dataset):
    """Map-style dataset pairing token-id sequences with integer class labels.

    Both inputs are converted with ``np.asarray`` so that ``__getitem__``
    always indexes by position. Without the conversion, a pandas Series or
    DataFrame whose index is not 0..n-1 (for example the result of filtering
    another frame) would be looked up by index label and either raise
    ``KeyError`` or return the wrong row.
    """

    def __init__(self, X, y):
        """Store the samples and labels as NumPy arrays.

        Args:
            X: array-like of equal-length token-id sequences, one row per sample.
            y: array-like of class labels, aligned with ``X`` by position.

        Raises:
            ValueError: if ``X`` and ``y`` hold a different number of samples.
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th (token ids, label) pair as ``torch.long`` tensors."""
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

### DataLoaders

In [None]:
# Wrap both splits in datasets and batch them with BATCH_SIZE.
# Only the training loader shuffles; the validation loader keeps its order.
train_dataset = AmazonDataset(X_train, y_train)
val_dataset = AmazonDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Implementing a Custom Sentiment Analysis Model with Recurrent Neural Networks (RNN, GRU, LSTM)

In [None]:
class SentimentRNN(nn.Module):
    """Embedding -> stacked recurrent layer -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: width of each embedding vector.
        hidden_size: width of the recurrent hidden state.
        num_classes: number of output logits.
        num_layers: number of stacked recurrent layers.
        model_type: ``'RNN'`` or ``'GRU'`` select that cell type; any other
            value (including the default ``'LSTM'``) selects an LSTM.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_classes, num_layers, model_type='LSTM'):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Unknown names fall through to the LSTM, like the default.
        recurrent_cls = {'RNN': nn.RNN, 'GRU': nn.GRU}.get(model_type, nn.LSTM)
        self.rnn = recurrent_cls(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits computed from the recurrent output at the last time step.

        ``x`` holds integer token ids; with ``batch_first=True`` the batch is
        dimension 0 and time is dimension 1.
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

### Train the model

In [None]:
def train_model(model, train_loader, val_loader, epochs, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    The model is moved to the module-level ``device``. After each epoch the
    mean training loss and the validation accuracy are printed and recorded.

    Args:
        model: classifier whose forward pass returns one row of logits per sample.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate; the default matches the previously hard-coded value.

    Returns:
        dict with two lists holding one entry per epoch: ``'train_loss'``
        (mean batch loss) and ``'val_accuracy'``.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    history = {'train_loss': [], 'val_accuracy': []}

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in tqdm(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Mean of the per-batch losses (each batch counts equally, whatever its size).
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_train_loss}")

        # Validation
        model.eval()
        y_pred = []
        y_true = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                # Index of the largest logit is the predicted class.
                _, predicted = torch.max(outputs, 1)
                y_pred.extend(predicted.cpu().numpy())
                y_true.extend(y_batch.cpu().numpy())

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Validation Accuracy: {accuracy:.4f}")

        history['train_loss'].append(avg_train_loss)
        history['val_accuracy'].append(accuracy)

    return history

In [None]:
# +1 leaves room for id 0: Keras' Tokenizer starts word ids at 1, and pad_sequences pads with 0 by default.
vocab_size = len(word_index) + 1
# Train a fresh model for every (cell type, depth) combination on the same split.
# NOTE(review): no torch seed is set in the visible cells, so the figures vary between runs.
for model_type in ['RNN', 'GRU', 'LSTM']:
    for num_layers in [1, 2, 3]:
        print(f"Training {model_type} with {num_layers} layers...")
        model = SentimentRNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, num_layers, model_type)
        train_model(model, train_loader, val_loader, EPOCHS)

Training RNN with 1 layers...


100%|██████████| 2123/2123 [00:13<00:00, 152.81it/s]


Epoch 1/15, Loss: 0.6360953835752063
Validation Accuracy: 0.7968


100%|██████████| 2123/2123 [00:12<00:00, 169.98it/s]


Epoch 2/15, Loss: 0.5841559007293595
Validation Accuracy: 0.8090


100%|██████████| 2123/2123 [00:12<00:00, 168.45it/s]


Epoch 3/15, Loss: 0.5408564049064467
Validation Accuracy: 0.8141


100%|██████████| 2123/2123 [00:12<00:00, 169.06it/s]


Epoch 4/15, Loss: 0.5127957907561923
Validation Accuracy: 0.8118


100%|██████████| 2123/2123 [00:12<00:00, 170.93it/s]


Epoch 5/15, Loss: 0.48669798022949406
Validation Accuracy: 0.8154


100%|██████████| 2123/2123 [00:12<00:00, 170.62it/s]


Epoch 6/15, Loss: 0.46035969971178614
Validation Accuracy: 0.8185


100%|██████████| 2123/2123 [00:12<00:00, 171.08it/s]


Epoch 7/15, Loss: 0.4241601028113336
Validation Accuracy: 0.8205


100%|██████████| 2123/2123 [00:13<00:00, 162.37it/s]


Epoch 8/15, Loss: 0.40053699517581354
Validation Accuracy: 0.8194


100%|██████████| 2123/2123 [00:12<00:00, 170.00it/s]


Epoch 9/15, Loss: 0.3737712168308906
Validation Accuracy: 0.8184


100%|██████████| 2123/2123 [00:12<00:00, 169.97it/s]


Epoch 10/15, Loss: 0.35459841345272364
Validation Accuracy: 0.8225


100%|██████████| 2123/2123 [00:12<00:00, 171.74it/s]


Epoch 11/15, Loss: 0.34514253792535904
Validation Accuracy: 0.8116


100%|██████████| 2123/2123 [00:12<00:00, 170.44it/s]


Epoch 12/15, Loss: 0.3146163904675323
Validation Accuracy: 0.8237


100%|██████████| 2123/2123 [00:12<00:00, 170.00it/s]


Epoch 13/15, Loss: 0.2992718676197265
Validation Accuracy: 0.8106


100%|██████████| 2123/2123 [00:12<00:00, 171.40it/s]


Epoch 14/15, Loss: 0.2936378240838008
Validation Accuracy: 0.8133


100%|██████████| 2123/2123 [00:12<00:00, 170.02it/s]


Epoch 15/15, Loss: 0.2772157283476148
Validation Accuracy: 0.8207
Training RNN with 2 layers...


100%|██████████| 2123/2123 [00:12<00:00, 164.57it/s]


Epoch 1/15, Loss: 0.6388513785990914
Validation Accuracy: 0.8049


100%|██████████| 2123/2123 [00:12<00:00, 165.34it/s]


Epoch 2/15, Loss: 0.5803251469466577
Validation Accuracy: 0.8080


100%|██████████| 2123/2123 [00:13<00:00, 154.26it/s]


Epoch 3/15, Loss: 0.5494532076061979
Validation Accuracy: 0.8054


100%|██████████| 2123/2123 [00:12<00:00, 164.87it/s]


Epoch 4/15, Loss: 0.5509994549819787
Validation Accuracy: 0.8108


100%|██████████| 2123/2123 [00:12<00:00, 165.47it/s]


Epoch 5/15, Loss: 0.5142758163609036
Validation Accuracy: 0.8148


100%|██████████| 2123/2123 [00:12<00:00, 165.38it/s]


Epoch 6/15, Loss: 0.49273870800443265
Validation Accuracy: 0.7997


100%|██████████| 2123/2123 [00:12<00:00, 164.36it/s]


Epoch 7/15, Loss: 0.4692281597397881
Validation Accuracy: 0.8159


100%|██████████| 2123/2123 [00:12<00:00, 164.24it/s]


Epoch 8/15, Loss: 0.44107432366540505
Validation Accuracy: 0.8172


100%|██████████| 2123/2123 [00:12<00:00, 166.17it/s]


Epoch 9/15, Loss: 0.42230633544124524
Validation Accuracy: 0.8154


100%|██████████| 2123/2123 [00:12<00:00, 165.17it/s]


Epoch 10/15, Loss: 0.4013751830609127
Validation Accuracy: 0.8180


100%|██████████| 2123/2123 [00:12<00:00, 163.31it/s]


Epoch 11/15, Loss: 0.383849385088758
Validation Accuracy: 0.8165


100%|██████████| 2123/2123 [00:13<00:00, 157.93it/s]


Epoch 12/15, Loss: 0.3656009626427856
Validation Accuracy: 0.8211


100%|██████████| 2123/2123 [00:12<00:00, 164.67it/s]


Epoch 13/15, Loss: 0.3547272767680353
Validation Accuracy: 0.8127


100%|██████████| 2123/2123 [00:12<00:00, 165.58it/s]


Epoch 14/15, Loss: 0.3435685298371809
Validation Accuracy: 0.8110


100%|██████████| 2123/2123 [00:12<00:00, 164.33it/s]


Epoch 15/15, Loss: 0.3314139735562533
Validation Accuracy: 0.8197
Training RNN with 3 layers...


100%|██████████| 2123/2123 [00:13<00:00, 156.25it/s]


Epoch 1/15, Loss: 0.631854942509772
Validation Accuracy: 0.8074


100%|██████████| 2123/2123 [00:13<00:00, 159.00it/s]


Epoch 2/15, Loss: 0.5809371119055387
Validation Accuracy: 0.8038


100%|██████████| 2123/2123 [00:12<00:00, 165.02it/s]


Epoch 3/15, Loss: 0.5850692111200395
Validation Accuracy: 0.8083


100%|██████████| 2123/2123 [00:12<00:00, 164.22it/s]


Epoch 4/15, Loss: 0.5506551731491898
Validation Accuracy: 0.8100


100%|██████████| 2123/2123 [00:12<00:00, 165.36it/s]


Epoch 5/15, Loss: 0.5329924421915736
Validation Accuracy: 0.8132


100%|██████████| 2123/2123 [00:13<00:00, 162.85it/s]


Epoch 6/15, Loss: 0.5394376659528074
Validation Accuracy: 0.8084


100%|██████████| 2123/2123 [00:14<00:00, 149.33it/s]


Epoch 7/15, Loss: 0.5053599703665673
Validation Accuracy: 0.8102


100%|██████████| 2123/2123 [00:13<00:00, 158.80it/s]


Epoch 8/15, Loss: 0.49740029707923006
Validation Accuracy: 0.8117


100%|██████████| 2123/2123 [00:13<00:00, 155.89it/s]


Epoch 9/15, Loss: 0.4701563420648907
Validation Accuracy: 0.8132


100%|██████████| 2123/2123 [00:13<00:00, 159.20it/s]


Epoch 10/15, Loss: 0.46208305863968
Validation Accuracy: 0.8140


100%|██████████| 2123/2123 [00:13<00:00, 157.75it/s]


Epoch 11/15, Loss: 0.45062004679702555
Validation Accuracy: 0.8145


100%|██████████| 2123/2123 [00:13<00:00, 159.10it/s]


Epoch 12/15, Loss: 0.431386947624817
Validation Accuracy: 0.8149


100%|██████████| 2123/2123 [00:13<00:00, 158.73it/s]


Epoch 13/15, Loss: 0.4233378591230889
Validation Accuracy: 0.8130


100%|██████████| 2123/2123 [00:13<00:00, 158.68it/s]


Epoch 14/15, Loss: 0.41399705592144564
Validation Accuracy: 0.8184


100%|██████████| 2123/2123 [00:13<00:00, 155.71it/s]


Epoch 15/15, Loss: 0.3911173820383303
Validation Accuracy: 0.8085
Training GRU with 1 layers...


100%|██████████| 2123/2123 [00:13<00:00, 156.90it/s]


Epoch 1/15, Loss: 0.5718672447998008
Validation Accuracy: 0.8192


100%|██████████| 2123/2123 [00:13<00:00, 162.67it/s]


Epoch 2/15, Loss: 0.4856114283281287
Validation Accuracy: 0.8258


100%|██████████| 2123/2123 [00:13<00:00, 160.99it/s]


Epoch 3/15, Loss: 0.42741256664510163
Validation Accuracy: 0.8304


100%|██████████| 2123/2123 [00:13<00:00, 162.98it/s]


Epoch 4/15, Loss: 0.37020554466877575
Validation Accuracy: 0.8350


100%|██████████| 2123/2123 [00:13<00:00, 161.15it/s]


Epoch 5/15, Loss: 0.316113880587618
Validation Accuracy: 0.8300


100%|██████████| 2123/2123 [00:13<00:00, 162.76it/s]


Epoch 6/15, Loss: 0.27077142954630723
Validation Accuracy: 0.8324


100%|██████████| 2123/2123 [00:13<00:00, 160.69it/s]


Epoch 7/15, Loss: 0.23554532644016474
Validation Accuracy: 0.8293


100%|██████████| 2123/2123 [00:13<00:00, 161.77it/s]


Epoch 8/15, Loss: 0.2107854013507146
Validation Accuracy: 0.8304


100%|██████████| 2123/2123 [00:13<00:00, 161.79it/s]


Epoch 9/15, Loss: 0.19172346204680707
Validation Accuracy: 0.8355


100%|██████████| 2123/2123 [00:13<00:00, 152.12it/s]


Epoch 10/15, Loss: 0.17941439652914604
Validation Accuracy: 0.8326


100%|██████████| 2123/2123 [00:13<00:00, 161.82it/s]


Epoch 11/15, Loss: 0.17002077871302754
Validation Accuracy: 0.8346


100%|██████████| 2123/2123 [00:13<00:00, 162.68it/s]


Epoch 12/15, Loss: 0.1642439126155944
Validation Accuracy: 0.8283


100%|██████████| 2123/2123 [00:13<00:00, 161.69it/s]


Epoch 13/15, Loss: 0.16021483274044235
Validation Accuracy: 0.8325


100%|██████████| 2123/2123 [00:12<00:00, 163.49it/s]


Epoch 14/15, Loss: 0.1555187467001361
Validation Accuracy: 0.8348


100%|██████████| 2123/2123 [00:13<00:00, 162.25it/s]


Epoch 15/15, Loss: 0.15419568879914997
Validation Accuracy: 0.8355
Training GRU with 2 layers...


100%|██████████| 2123/2123 [00:16<00:00, 129.40it/s]


Epoch 1/15, Loss: 0.5698063305762934
Validation Accuracy: 0.8183


100%|██████████| 2123/2123 [00:16<00:00, 130.95it/s]


Epoch 2/15, Loss: 0.4819225368357131
Validation Accuracy: 0.8274


100%|██████████| 2123/2123 [00:16<00:00, 129.26it/s]


Epoch 3/15, Loss: 0.41687716958492477
Validation Accuracy: 0.8316


100%|██████████| 2123/2123 [00:16<00:00, 128.54it/s]


Epoch 4/15, Loss: 0.3500354866472214
Validation Accuracy: 0.8348


100%|██████████| 2123/2123 [00:16<00:00, 131.37it/s]


Epoch 5/15, Loss: 0.2888845354037862
Validation Accuracy: 0.8341


100%|██████████| 2123/2123 [00:16<00:00, 129.73it/s]


Epoch 6/15, Loss: 0.2408028104997473
Validation Accuracy: 0.8349


100%|██████████| 2123/2123 [00:16<00:00, 131.12it/s]


Epoch 7/15, Loss: 0.20907794459039908
Validation Accuracy: 0.8329


100%|██████████| 2123/2123 [00:16<00:00, 128.96it/s]


Epoch 8/15, Loss: 0.18798546528060286
Validation Accuracy: 0.8360


100%|██████████| 2123/2123 [00:16<00:00, 130.91it/s]


Epoch 9/15, Loss: 0.1767728171077501
Validation Accuracy: 0.8371


100%|██████████| 2123/2123 [00:16<00:00, 130.35it/s]


Epoch 10/15, Loss: 0.16768313917931077
Validation Accuracy: 0.8355


100%|██████████| 2123/2123 [00:16<00:00, 126.98it/s]


Epoch 11/15, Loss: 0.16218811056076132
Validation Accuracy: 0.8268


100%|██████████| 2123/2123 [00:16<00:00, 130.76it/s]


Epoch 12/15, Loss: 0.16005172015282268
Validation Accuracy: 0.8332


100%|██████████| 2123/2123 [00:16<00:00, 128.78it/s]


Epoch 13/15, Loss: 0.15431078198947998
Validation Accuracy: 0.8323


100%|██████████| 2123/2123 [00:16<00:00, 131.07it/s]


Epoch 14/15, Loss: 0.15163485273574945
Validation Accuracy: 0.8340


100%|██████████| 2123/2123 [00:16<00:00, 130.62it/s]


Epoch 15/15, Loss: 0.15180548892878049
Validation Accuracy: 0.8341
Training GRU with 3 layers...


100%|██████████| 2123/2123 [00:19<00:00, 108.40it/s]


Epoch 1/15, Loss: 0.5704602151688011
Validation Accuracy: 0.8189


100%|██████████| 2123/2123 [00:19<00:00, 106.94it/s]


Epoch 2/15, Loss: 0.4836850115784597
Validation Accuracy: 0.8290


100%|██████████| 2123/2123 [00:20<00:00, 105.66it/s]


Epoch 3/15, Loss: 0.4196120619394843
Validation Accuracy: 0.8311


100%|██████████| 2123/2123 [00:19<00:00, 110.62it/s]


Epoch 4/15, Loss: 0.35471073186417207
Validation Accuracy: 0.8358


100%|██████████| 2123/2123 [00:19<00:00, 109.44it/s]


Epoch 5/15, Loss: 0.29585602548582285
Validation Accuracy: 0.8338


100%|██████████| 2123/2123 [00:19<00:00, 107.26it/s]


Epoch 6/15, Loss: 0.2511654904395609
Validation Accuracy: 0.8365


100%|██████████| 2123/2123 [00:19<00:00, 107.96it/s]


Epoch 7/15, Loss: 0.2192575060150503
Validation Accuracy: 0.8338


100%|██████████| 2123/2123 [00:19<00:00, 106.97it/s]


Epoch 8/15, Loss: 0.19895509751254778
Validation Accuracy: 0.8327


100%|██████████| 2123/2123 [00:20<00:00, 105.26it/s]


Epoch 9/15, Loss: 0.186679978128935
Validation Accuracy: 0.8320


100%|██████████| 2123/2123 [00:19<00:00, 108.00it/s]


Epoch 10/15, Loss: 0.1777089090795869
Validation Accuracy: 0.8337


100%|██████████| 2123/2123 [00:19<00:00, 108.30it/s]


Epoch 11/15, Loss: 0.16932195791508783
Validation Accuracy: 0.8329


100%|██████████| 2123/2123 [00:19<00:00, 106.62it/s]


Epoch 12/15, Loss: 0.16547779758556014
Validation Accuracy: 0.8323


100%|██████████| 2123/2123 [00:19<00:00, 106.88it/s]


Epoch 13/15, Loss: 0.16242059109643894
Validation Accuracy: 0.8328


100%|██████████| 2123/2123 [00:19<00:00, 108.02it/s]


Epoch 14/15, Loss: 0.1582850795616477
Validation Accuracy: 0.8308


100%|██████████| 2123/2123 [00:19<00:00, 107.41it/s]


Epoch 15/15, Loss: 0.15650911878634752
Validation Accuracy: 0.8272
Training LSTM with 1 layers...


100%|██████████| 2123/2123 [00:15<00:00, 137.54it/s]


Epoch 1/15, Loss: 0.5824823654798533
Validation Accuracy: 0.8184


100%|██████████| 2123/2123 [00:15<00:00, 141.40it/s]


Epoch 2/15, Loss: 0.4945502528044282
Validation Accuracy: 0.8242


100%|██████████| 2123/2123 [00:14<00:00, 141.95it/s]


Epoch 3/15, Loss: 0.43651319351670204
Validation Accuracy: 0.8276


100%|██████████| 2123/2123 [00:14<00:00, 142.67it/s]


Epoch 4/15, Loss: 0.3814619013471981
Validation Accuracy: 0.8346


100%|██████████| 2123/2123 [00:15<00:00, 138.87it/s]


Epoch 5/15, Loss: 0.32836873913942727
Validation Accuracy: 0.8331


100%|██████████| 2123/2123 [00:14<00:00, 143.47it/s]


Epoch 6/15, Loss: 0.2809141648093654
Validation Accuracy: 0.8344


100%|██████████| 2123/2123 [00:14<00:00, 142.40it/s]


Epoch 7/15, Loss: 0.24204883477034234
Validation Accuracy: 0.8325


100%|██████████| 2123/2123 [00:15<00:00, 141.50it/s]


Epoch 8/15, Loss: 0.21247683421086153
Validation Accuracy: 0.8354


100%|██████████| 2123/2123 [00:15<00:00, 138.85it/s]


Epoch 9/15, Loss: 0.19192325702490695
Validation Accuracy: 0.8339


100%|██████████| 2123/2123 [00:15<00:00, 140.67it/s]


Epoch 10/15, Loss: 0.17542250248596064
Validation Accuracy: 0.8363


100%|██████████| 2123/2123 [00:15<00:00, 140.61it/s]


Epoch 11/15, Loss: 0.1670752644970543
Validation Accuracy: 0.8394


100%|██████████| 2123/2123 [00:15<00:00, 141.04it/s]


Epoch 12/15, Loss: 0.16052921972923187
Validation Accuracy: 0.8275


100%|██████████| 2123/2123 [00:14<00:00, 142.59it/s]


Epoch 13/15, Loss: 0.1542091474044073
Validation Accuracy: 0.8308


100%|██████████| 2123/2123 [00:14<00:00, 142.66it/s]


Epoch 14/15, Loss: 0.15096898103291648
Validation Accuracy: 0.8341


100%|██████████| 2123/2123 [00:14<00:00, 141.86it/s]


Epoch 15/15, Loss: 0.1462686746922996
Validation Accuracy: 0.8259
Training LSTM with 2 layers...


100%|██████████| 2123/2123 [00:19<00:00, 106.16it/s]


Epoch 1/15, Loss: 0.6002119762179877
Validation Accuracy: 0.8119


100%|██████████| 2123/2123 [00:20<00:00, 104.15it/s]


Epoch 2/15, Loss: 0.5137308602955226
Validation Accuracy: 0.8219


100%|██████████| 2123/2123 [00:20<00:00, 104.79it/s]


Epoch 3/15, Loss: 0.45543488184208586
Validation Accuracy: 0.8257


100%|██████████| 2123/2123 [00:20<00:00, 104.65it/s]


Epoch 4/15, Loss: 0.398253861991965
Validation Accuracy: 0.8312


100%|██████████| 2123/2123 [00:20<00:00, 105.53it/s]


Epoch 5/15, Loss: 0.3400308308395585
Validation Accuracy: 0.8286


100%|██████████| 2123/2123 [00:20<00:00, 104.87it/s]


Epoch 6/15, Loss: 0.28578911202131047
Validation Accuracy: 0.8329


100%|██████████| 2123/2123 [00:20<00:00, 104.61it/s]


Epoch 7/15, Loss: 0.24385692860213473
Validation Accuracy: 0.8295


100%|██████████| 2123/2123 [00:20<00:00, 103.75it/s]


Epoch 8/15, Loss: 0.2122362684414626
Validation Accuracy: 0.8346


100%|██████████| 2123/2123 [00:20<00:00, 105.86it/s]


Epoch 9/15, Loss: 0.19326937047732196
Validation Accuracy: 0.8272


100%|██████████| 2123/2123 [00:20<00:00, 105.65it/s]


Epoch 10/15, Loss: 0.17863972313540671
Validation Accuracy: 0.8316


100%|██████████| 2123/2123 [00:19<00:00, 107.59it/s]


Epoch 11/15, Loss: 0.16901330789704677
Validation Accuracy: 0.8290


100%|██████████| 2123/2123 [00:19<00:00, 107.83it/s]


Epoch 12/15, Loss: 0.163092518424474
Validation Accuracy: 0.8331


100%|██████████| 2123/2123 [00:19<00:00, 106.38it/s]


Epoch 13/15, Loss: 0.15611402003083147
Validation Accuracy: 0.8245


100%|██████████| 2123/2123 [00:20<00:00, 103.51it/s]


Epoch 14/15, Loss: 0.15264948389834662
Validation Accuracy: 0.8306


100%|██████████| 2123/2123 [00:20<00:00, 105.03it/s]


Epoch 15/15, Loss: 0.14990361432842753
Validation Accuracy: 0.8312
Training LSTM with 3 layers...


100%|██████████| 2123/2123 [00:25<00:00, 83.09it/s]


Epoch 1/15, Loss: 0.5954705359825704
Validation Accuracy: 0.8068


100%|██████████| 2123/2123 [00:25<00:00, 83.07it/s]


Epoch 2/15, Loss: 0.5119073413491417
Validation Accuracy: 0.8235


100%|██████████| 2123/2123 [00:25<00:00, 82.70it/s]


Epoch 3/15, Loss: 0.45520650979010385
Validation Accuracy: 0.8287


100%|██████████| 2123/2123 [00:25<00:00, 81.71it/s]


Epoch 4/15, Loss: 0.40029456908979116
Validation Accuracy: 0.8297


100%|██████████| 2123/2123 [00:25<00:00, 82.76it/s]


Epoch 5/15, Loss: 0.34521575168203306
Validation Accuracy: 0.8308


100%|██████████| 2123/2123 [00:25<00:00, 82.64it/s]


Epoch 6/15, Loss: 0.2943991203733171
Validation Accuracy: 0.8342


100%|██████████| 2123/2123 [00:25<00:00, 82.42it/s]


Epoch 7/15, Loss: 0.2527094401605164
Validation Accuracy: 0.8337


100%|██████████| 2123/2123 [00:25<00:00, 82.69it/s]


Epoch 8/15, Loss: 0.22018524885240784
Validation Accuracy: 0.8309


100%|██████████| 2123/2123 [00:25<00:00, 81.73it/s]


Epoch 9/15, Loss: 0.19918915413379726
Validation Accuracy: 0.8362


100%|██████████| 2123/2123 [00:25<00:00, 82.74it/s]


Epoch 10/15, Loss: 0.18270045416311195
Validation Accuracy: 0.8346


100%|██████████| 2123/2123 [00:25<00:00, 82.73it/s]


Epoch 11/15, Loss: 0.1707886285313067
Validation Accuracy: 0.8357


100%|██████████| 2123/2123 [00:25<00:00, 82.72it/s]


Epoch 12/15, Loss: 0.1629443381550216
Validation Accuracy: 0.8367


100%|██████████| 2123/2123 [00:25<00:00, 82.63it/s]


Epoch 13/15, Loss: 0.15710299157809943
Validation Accuracy: 0.8283


100%|██████████| 2123/2123 [00:25<00:00, 81.76it/s]


Epoch 14/15, Loss: 0.1526015721980063
Validation Accuracy: 0.8320


100%|██████████| 2123/2123 [00:25<00:00, 82.84it/s]


Epoch 15/15, Loss: 0.15037217759231156
Validation Accuracy: 0.8387


### Delete the Data for the Second and Fourth Classes, Train on Classes 1, 3, and 5, and Then Test on Classes 2 and 4

In [None]:
def preprocess(data):
  """Turn a review frame into model inputs and shifted labels.

  Uses the module-level ``tokenizer`` and ``MAX_LEN``. Returns a pair: the
  token-id sequences padded by ``pad_sequences`` to ``MAX_LEN``, and the
  ``'overall'`` column minus one.
  """
  token_ids = tokenizer.texts_to_sequences(data['reviewText'])
  padded_ids = pad_sequences(token_ids, maxlen=MAX_LEN)
  shifted_ratings = data['overall'] - 1
  return padded_ids, shifted_ratings

In [None]:
# Training pool: only the reviews rated 1, 3 or 5.
train_data_filtered = data[data['overall'].isin([1, 3, 5])]

# Held-out pool: the reviews rated 2 or 4, which are excluded from training.
test_data_classes_2_and_4 = data[data['overall'].isin([2, 4])]

In [None]:
# Encode both pools with the tokenizer fitted earlier; labels come back as rating - 1.
X_train_filtered, y_train_filtered = preprocess(train_data_filtered)

X_test_classes_2_and_4, y_test_classes_2_and_4 = preprocess(test_data_classes_2_and_4)

In [None]:
# Compress the remaining labels (rating - 1, i.e. 0, 2 and 4) to consecutive class ids 0, 1, 2.
# NOTE(review): this refits the shared `le`, replacing the fit made on the full five-class labels.
y_train_filtered = le.fit_transform(y_train_filtered)

In [None]:
# Same 80/20 split and loader setup as for the full task, applied to the filtered pool.
# The trailing underscore distinguishes these objects from the five-class ones.
X_train_, X_val_, y_train_, y_val_ = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2, random_state=42)

train_dataset_ = AmazonDataset(X_train_, y_train_)
val_dataset_ = AmazonDataset(X_val_, y_val_)

# Only the training loader shuffles.
train_loader_ = DataLoader(train_dataset_, batch_size=BATCH_SIZE, shuffle=True)
val_loader_ = DataLoader(val_dataset_, batch_size=BATCH_SIZE)

In [None]:
# Configuration used for the held-out-rating experiment: a three-layer LSTM.
model_type = 'LSTM'
num_layers = 3

# The literal 3 is the number of output classes: only ratings 1, 3 and 5 remain in the training pool.
model = SentimentRNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, 3, num_layers, model_type)

train_model(model, train_loader_, val_loader_, EPOCHS)

100%|██████████| 1811/1811 [00:22<00:00, 82.11it/s]


Epoch 1/15, Loss: 0.21083171971549586
Validation Accuracy: 0.9443


100%|██████████| 1811/1811 [00:22<00:00, 81.60it/s]


Epoch 2/15, Loss: 0.1499315571041972
Validation Accuracy: 0.9542


100%|██████████| 1811/1811 [00:21<00:00, 82.68it/s]


Epoch 3/15, Loss: 0.1191663354653752
Validation Accuracy: 0.9577


100%|██████████| 1811/1811 [00:21<00:00, 82.65it/s]


Epoch 4/15, Loss: 0.09456544354725639
Validation Accuracy: 0.9534


100%|██████████| 1811/1811 [00:21<00:00, 84.31it/s]


Epoch 5/15, Loss: 0.07655402035249294
Validation Accuracy: 0.9586


100%|██████████| 1811/1811 [00:21<00:00, 84.32it/s]


Epoch 6/15, Loss: 0.06237495501041268
Validation Accuracy: 0.9558


100%|██████████| 1811/1811 [00:21<00:00, 83.74it/s]


Epoch 7/15, Loss: 0.05245318947990401
Validation Accuracy: 0.9579


100%|██████████| 1811/1811 [00:21<00:00, 82.50it/s]


Epoch 8/15, Loss: 0.047043873069546646
Validation Accuracy: 0.9593


100%|██████████| 1811/1811 [00:21<00:00, 82.80it/s]


Epoch 9/15, Loss: 0.042252564825007534
Validation Accuracy: 0.9588


100%|██████████| 1811/1811 [00:21<00:00, 82.66it/s]


Epoch 10/15, Loss: 0.03776661164047837
Validation Accuracy: 0.9602


100%|██████████| 1811/1811 [00:21<00:00, 82.61it/s]


Epoch 11/15, Loss: 0.03717431433929294
Validation Accuracy: 0.9605


100%|██████████| 1811/1811 [00:21<00:00, 82.35it/s]


Epoch 12/15, Loss: 0.03654553305240958
Validation Accuracy: 0.9582


100%|██████████| 1811/1811 [00:21<00:00, 82.51it/s]


Epoch 13/15, Loss: 0.03395439852993963
Validation Accuracy: 0.9595


100%|██████████| 1811/1811 [00:22<00:00, 82.30it/s]


Epoch 14/15, Loss: 0.0331034573501625
Validation Accuracy: 0.9601


100%|██████████| 1811/1811 [00:21<00:00, 82.67it/s]


Epoch 15/15, Loss: 0.03326133815912684
Validation Accuracy: 0.9603


In [None]:
# y_test_classes_2_and_4 holds rating - 1, so its values are 1 (rating 2) and 3 (rating 4).
# Each is mapped to the id of the trained class on the same side of the scale:
# rating 2 -> class 0 (the rating-1 class), rating 4 -> class 2 (the rating-5 class).
label_mapping_test = {1: 0, 3: 2}
# Iterating the Series yields its values, so the result is a plain positional array.
y_test_classes_2_and_4_mapped = np.array([label_mapping_test[label] for label in y_test_classes_2_and_4])
test_dataset_classes_2_and_4 = AmazonDataset(X_test_classes_2_and_4, y_test_classes_2_and_4_mapped)
test_loader_classes_2_and_4 = DataLoader(test_dataset_classes_2_and_4, batch_size=BATCH_SIZE)

In [None]:
# Evaluate the three-class model on the held-out reviews rated 2 and 4.
# A summary (accuracy plus a confusion table) replaces the earlier per-sample print,
# which wrote one line per review and overflowed the notebook's output limit.
model.eval()
y_pred_classes_2_and_4 = []
y_true_classes_2_and_4 = []

with torch.no_grad():
    for X_batch, y_batch in test_loader_classes_2_and_4:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        # Index of the largest logit is the predicted class id (0, 1 or 2).
        _, predicted = torch.max(outputs, 1)
        y_pred_classes_2_and_4.extend(predicted.cpu().numpy())
        # Labels are only compared on the CPU, so they never need to move to the device.
        y_true_classes_2_and_4.extend(y_batch.numpy())

# Share of held-out reviews assigned to the class their rating was mapped to.
accuracy_classes_2_and_4 = accuracy_score(y_true_classes_2_and_4, y_pred_classes_2_and_4)
print(f"Accuracy on held-out classes: {accuracy_classes_2_and_4:.4f}")

# Translate class ids back to the 0-based labels (rating - 1) used by the earlier
# per-sample print: mapped target 0 -> 1, otherwise 3; prediction 0 -> 0, 1 -> 2, 2 -> 4.
predicted_label_names = {0: 0, 1: 2, 2: 4}
real_labels = pd.Series(
    [1 if int(label) == 0 else 3 for label in y_true_classes_2_and_4],
    name='Real label',
)
predicted_labels = pd.Series(
    [predicted_label_names.get(int(label), int(label)) for label in y_pred_classes_2_and_4],
    name='Predicted label',
)
# Rows: real label, columns: predicted label, cells: number of reviews.
pd.crosstab(real_labels, predicted_labels)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 1, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 2
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Predicted label: 4
Real label: 3, Pr