In [32]:
 import torch
from torch import nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
np.random.seed(42)

In [35]:
DATA_PATH = r"/content/tripadvisor_hotel_reviews.csv" # Make sure the path is correct and the file exists

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [36]:
neutral_range = {"low": 4, "high": 5}
df["Sentiment"] = "neutral"
df["Sentiment"].loc[df["Rating"] < neutral_range["low"]] = "negative"
df["Sentiment"].loc[df["Rating"] >= neutral_range["high"]] = "positive"
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"].loc[df["Rating"] < neutral_range["low"]] = "negative"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"].loc[df["Rating"] >= neutral_range["high"]] = "positive"


Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,neutral
1,ok nothing special charge diamond member hilto...,2,negative
2,nice rooms not 4* experience hotel monaco seat...,3,negative
3,"unique, great stay, wonderful time hotel monac...",5,positive
4,"great stay great stay, went seahawk game aweso...",5,positive


In [37]:
X_train, X_validation, y_train, y_validation = train_test_split(
    df["Review"], df["Sentiment"], stratify=df["Sentiment"], test_size=0.2, random_state=42
)

# RandomBaseline class
class RandomBaseline:
    def __init__(self):
        self.categories = {}

    def fit(self, data, target_col):
        cat_names = data[target_col].unique()
        agg = data.groupby(target_col).count()
        for n in cat_names:
            self.categories[n] = agg.loc[n][0] / len(data)

    def predict(self, data):
        return np.random.choice(list(self.categories.keys()), len(data), p=list(self.categories.values()))

In [38]:
rb = RandomBaseline()
rb.fit(df.iloc[X_train.index], "Sentiment")
pred = rb.predict(X_validation)
print(accuracy_score(y_validation, pred))  # 0.3273969260795316

# Tokenizer
tokenizer = get_tokenizer("basic_english")
print(tokenizer("the place was nice"))  # ['the', 'place', 'was', 'nice']

# Tokenized review iterator
def tokenized_review_iterator(reviews):
    for r in reviews:
        yield tokenizer(r)


0.33959502317638446
['the', 'place', 'was', 'nice']


In [39]:
vocab = build_vocab_from_iterator(tokenized_review_iterator(X_train), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab(['the', 'place', 'was', 'nice']))  # [33, 31, 3826, 15]

# Mappings
target_map = {"positive": 0, "neutral": 1, "negative": 2}
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: target_map[x]

[32, 30, 3985, 15]


In [40]:
# ReviewDataset class
class ReviewDataset(Dataset):
    def __init__(self, X, y, text_pipeline, label_pipeline):
        self.X = X
        self.y = y
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = torch.tensor(self.text_pipeline(self.X.iloc[idx]), dtype=torch.long)
        length = torch.tensor(len(text), dtype=torch.long)
        label = torch.tensor(self.label_pipeline(self.y.iloc[idx]), dtype=torch.long)
        return {"text": text, "length": length, "labels": label}

train_dataset = ReviewDataset(X_train, y_train, text_pipeline, label_pipeline)
test_dataset = ReviewDataset(X_validation, y_validation, text_pipeline, label_pipeline)

def collate(batch):
    batch.sort(key=lambda x: x["length"], reverse=True)
    text, lengths, labels = zip(*[d.values() for d in batch])
    text = torch.nn.utils.rnn.pad_sequence(text, batch_first=True) # pad_sequence pads the tensors with 0s to make them the same length
    lengths = torch.stack(lengths)
    labels = torch.stack(labels)
    return text, lengths, labels

In [41]:
# SentimentLSTM class
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, n_layers, num_class):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=n_layers, batch_first=True)
        self.drop = nn.Dropout(0.5)
        self.batch_norm = nn.BatchNorm1d(n_layers * hidden_size)
        self.dense = nn.Linear(n_layers * hidden_size, num_class)

    def dense_parameters(self):
        return list(self.lstm.parameters()) + list(self.dense.parameters())

    def forward(self, encoded_text, lengths):
        batch_size = lengths.shape[0]
        embedded = self.embedding(encoded_text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
        _, (hidden, cell) = self.lstm(packed_embedded)
        hidden = hidden.permute([1, 0, 2]).contiguous().view(batch_size, -1)
        hidden = self.drop(hidden)
        hidden = self.batch_norm(hidden)
        hidden = self.dense(hidden)
        return hidden


In [42]:
# Training parameters
n_epoch = 2
lr = 1e-4
batch_size = 16

# Model parameters
embedding_dim = 150
hidden_size = 98
n_layers = 5

# Define dictionaries to store losses and accuracies
losses = {"train": []}
accuracies = {"train": []}

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate) # Create train_loader




In [43]:
# Instantiate the SentimentLSTM model with the desired parameters
model = SentimentLSTM(len(vocab), embedding_dim, hidden_size, n_layers, len(target_map))

model = model.to(device)

# Define optimizers
optimizer_dense = torch.optim.Adam(model.dense_parameters(), lr=lr) # Define optimizer_dense for dense parameters
optimizer_sparse = torch.optim.SparseAdam(list(model.embedding.parameters()), lr=lr) # Define optimizer_sparse for sparse parameters

criterion = nn.CrossEntropyLoss() # Define loss function
for n in range(n_epoch):
    epoch_loss = []
    epoch_acc = []

    for encoded_text, lengths, labels in train_loader:
        model = model.train()
        optimizer_dense.zero_grad()
        optimizer_sparse.zero_grad()
        encoded_text, lengths, labels = encoded_text.to(device), lengths.to(device), labels.to(device)
        y_pred = model(encoded_text, lengths)
        loss = criterion(y_pred, labels)
        loss.backward()
        optimizer_sparse.step()

        optimizer_dense.step()
        epoch_loss.append(loss.item())
        acc = accuracy_score(labels.detach().cpu(), y_pred.argmax(1).detach().cpu())
        epoch_acc.append(acc)

    avg_loss = sum(epoch_loss) / len(epoch_loss)
    avg_acc = sum(epoch_acc) / len(epoch_acc)
    print(f"Epoch: {n}, Train Loss: {avg_loss:.4f}; Train Acc: {avg_acc:.4f}")
    losses["train"].append(avg_loss)
    accuracies["train"].append(avg_acc)

    epoch_loss = []
    epoch_acc = []

Epoch: 0, Train Loss: 1.1114; Train Acc: 0.4070
Epoch: 1, Train Loss: 0.9703; Train Acc: 0.5249


In [45]:
# Import the necessary module
from torch.utils.data import DataLoader
# Assuming you have a 'test_dataset'
validation_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate)
# Create a list to store validation losses
losses["validation"] = []
# Create a list to store validation accuracies
accuracies["validation"] = [] # Added this line to initialize accuracies['validation']
# Create validation_loader
with torch.no_grad():
    for encoded_text, lengths, labels in validation_loader:
        model = model.eval()
        encoded_text, lengths, labels = encoded_text.to(device), lengths.to(device), labels.to(device)
        y_pred = model(encoded_text, lengths)
        loss = criterion(y_pred, labels)
        epoch_loss.append(loss.item())
        acc = accuracy_score(labels.detach().cpu(), y_pred.argmax(1).detach().cpu())
        epoch_acc.append(acc)

  # The following two lines were incorrectly indented
    avg_loss = sum(epoch_loss) / len(epoch_loss)
    avg_acc = sum(epoch_acc) / len(epoch_acc)

    print(f"Epoch: {n}, Validation Loss: {avg_loss:.4f}; Validation Acc: {avg_acc:.4f}")
    losses["validation"].append(avg_loss)
    accuracies["validation"].append(avg_acc)

Epoch: 1, Validation Loss: 0.8527; Validation Acc: 0.5992
