<h1 align="center">Sentiment Analysis using LSTM</h1>

### Import

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import numpy as np
import re

### Load CSV

In [4]:
data=pd.read_csv(r"E:\Me\coding\IIT Guwahati Internship\Codes\archive\IMDB Dataset.csv")

In [5]:
df = pd.DataFrame(data) #converting to dataframe object

### Creating a simple tokeniser function

In [22]:
def simple_tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())

tokenized_texts = [simple_tokenize(text) for text in df['review']]
all_words = [word for sent in tokenized_texts for word in sent]
vocab = ['<PAD>', '<UNK>'] + [word for word, freq in Counter(all_words).items() if freq >= 1]
word2idx = {word: idx for idx, word in enumerate(vocab)}

def encode_text(tokens, max_len=20):
    ids = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]
    ids = ids[:max_len] + [word2idx['<PAD>']] * (max_len - len(ids))
    return ids

df['encoded'] = tokenized_texts
df['encoded'] = df['encoded'].apply(lambda x: encode_text(x))

In [23]:
label_enc = LabelEncoder()
df['label_id'] = label_enc.fit_transform(df['sentiment'])

### Data Splitting

In [24]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(np.array(X), dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_test, y_train, y_test = train_test_split(df['encoded'], df['label_id'], test_size=0.2, random_state=42)
train_dataset = TextDataset(list(X_train), list(y_train))
test_dataset = TextDataset(list(X_test), list(y_test))

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)


In [25]:
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        return self.fc(hidden[-1])

model = SentimentModel(vocab_size=len(vocab), embed_dim=50, hidden_dim=64, num_classes=len(label_enc.classes_))

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 12695.5052
Epoch 2, Loss: 10351.1392
Epoch 3, Loss: 8784.1330
Epoch 4, Loss: 7199.8778
Epoch 5, Loss: 5614.0160


### Model Evaluation

In [27]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, preds = torch.max(outputs, 1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Test Accuracy: {correct/total * 100:.2f}%")

Test Accuracy: 71.43%


### References

1. https://github.com/bentrevett/pytorch-sentiment-analysis.git
2. https://www.geeksforgeeks.org/deep-learning/how-to-use-pytorch-for-sentiment-analysis-on-textual-data/
3. https://youtu.be/rsy5Ragmso8?feature=shared
4. https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews