In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim

# Custom Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: indexable collection of review strings.
        labels: indexable collection of class ids, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` whose result can be indexed
            with ``'input_ids'`` and ``'attention_mask'``.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is driven by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, flattened token ids / mask and the label tensor."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]

        # Pad/truncate every sample to max_len and ask for PyTorch tensors.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        item = {'text': sample_text}
        for field in ('input_ids', 'attention_mask'):
            # The tokenizer output is flattened to a 1-D tensor per field.
            item[field] = torch.flatten(encoded[field])
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Define the model: frozen BERT embeddings fed into an LSTM classifier
class SentimentClassifier(nn.Module):
    """Sentiment classifier: frozen BERT token features -> LSTM -> linear head.

    Args:
        bert: encoder invoked as ``bert(input_ids=..., attention_mask=...)``;
            element ``[0]`` of its output is used as the per-token features and
            ``bert.config.hidden_size`` as the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability used between LSTM layers and before the
            linear head.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        self.lstm = nn.LSTM(
            bert.config.hidden_size,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # nn.LSTM only applies dropout *between* stacked layers and warns
            # when a non-zero value is given for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``output_dim`` logits per input sequence.

        Args:
            input_ids: token ids, batch-first.
            attention_mask: same layout as ``input_ids``; non-zero marks real
                tokens. Assumes padding sits at the end of each row (the
                default for BERT tokenizers) - TODO confirm for other tokenizers.
        """
        # BERT is used as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            embedded = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]

        # Pack the batch so the LSTM stops at each sequence's real length.
        # Without this the forward direction's final state is read after the
        # padding positions and the backward direction starts from them.
        # clamp(min=1) guards against an all-zero mask row, which packing rejects.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        if self.lstm.bidirectional:
            # Last two entries of h_n: top layer, forward and backward direction.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        output = self.fc(hidden)
        return output

# Load data
def _read_reviews(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed.

    Assumes the file holds one review per line - TODO confirm against the data.
    The file is opened with the platform default encoding, as before.
    """
    # The context manager closes the handle; the bare open(...).readlines()
    # used previously left both files open.
    with open(path, 'r') as handle:
        stripped_lines = (line.strip() for line in handle)
        # Blank lines would otherwise become labelled, empty "reviews".
        return [line for line in stripped_lines if line]


positive_reviews = _read_reviews('TrainingDataPositive.txt')
negative_reviews = _read_reviews('TrainingDataNegative.txt')

# Label convention: 1 = positive, 0 = negative.
texts = positive_reviews + negative_reviews
labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)

# Parameters
SEED = 42             # seeds torch's global RNG so a fresh run repeats the same split and initial weights
MAX_LEN = 128         # tokenizer max_length: every review is padded/truncated to this many tokens
BATCH_SIZE = 16
EPOCHS = 4
HIDDEN_DIM = 256      # LSTM hidden size per direction
OUTPUT_DIM = 2        # number of classes (labels are 0 / 1)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.3
TRAIN_FRACTION = 1.0  # share of the data used for training; 1.0 leaves the validation split empty

# Seed before anything random happens (random_split, LSTM/linear initialisation).
# The training DataLoader's shuffle is expected to draw from the same global RNG.
torch.manual_seed(SEED)

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Create datasets and dataloaders
dataset = SentimentDataset(texts, labels, tokenizer, MAX_LEN)

train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# NOTE: with TRAIN_FRACTION = 1.0 this loader yields no batches.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize model, loss function, optimizer
model = SentimentClassifier(bert_model, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)





In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

for epoch in range(EPOCHS):
    model.train()
    # SentimentClassifier.forward runs BERT under torch.no_grad(), so BERT is
    # never updated. Keep it in eval mode so its internal dropout does not add
    # noise to features that cannot adapt to it.
    model.bert.eval()
    running_loss = 0.0
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the global `labels` list built during data
        # loading is not silently replaced by a tensor.
        batch_labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Report the current batch loss every 10 batches.
        if (batch_idx + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{EPOCHS}, Batch {batch_idx + 1}/{len(train_loader)}, Loss: {loss.item()}')

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    avg_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Average Loss: {avg_loss}')



Epoch 1/4, Batch 10/703, Loss: 0.6691676378250122
Epoch 1/4, Batch 20/703, Loss: 0.6562215089797974
Epoch 1/4, Batch 30/703, Loss: 0.6106436848640442
Epoch 1/4, Batch 40/703, Loss: 0.5863316059112549
Epoch 1/4, Batch 50/703, Loss: 0.6195782423019409
Epoch 1/4, Batch 60/703, Loss: 0.54873126745224
Epoch 1/4, Batch 70/703, Loss: 0.5734463930130005
Epoch 1/4, Batch 80/703, Loss: 0.5607689023017883
Epoch 1/4, Batch 90/703, Loss: 0.5474369525909424
Epoch 1/4, Batch 100/703, Loss: 0.5574567317962646
Epoch 1/4, Batch 110/703, Loss: 0.5955451726913452
Epoch 1/4, Batch 120/703, Loss: 0.40213385224342346
Epoch 1/4, Batch 130/703, Loss: 0.41625481843948364
Epoch 1/4, Batch 140/703, Loss: 0.6221076846122742
Epoch 1/4, Batch 150/703, Loss: 0.5139080286026001
Epoch 1/4, Batch 160/703, Loss: 0.6183882355690002
Epoch 1/4, Batch 170/703, Loss: 0.5331366658210754
Epoch 1/4, Batch 180/703, Loss: 0.41678279638290405
Epoch 1/4, Batch 190/703, Loss: 0.4235105812549591
Epoch 1/4, Batch 200/703, Loss: 0.58927

In [None]:

# Evaluation on test set
# Reads the 'review' (text) and 'class' (label) columns of the CSV.
# Assumes 'class' holds integer class ids matching the training labels - TODO confirm.
test_df = pd.read_csv('TestReviews.csv')
test_texts = test_df['review'].tolist()
test_labels = test_df['class'].tolist()

test_dataset = SentimentDataset(test_texts, test_labels, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model.eval()
# Start from a tensor (not the int 0) so `correct.double()` below also works
# when the test loader yields no batches.
correct = torch.zeros((), dtype=torch.long, device=device)
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the global `labels` list is not overwritten.
        batch_labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit in each row.
        _, preds = torch.max(outputs, dim=1)
        correct += torch.sum(preds == batch_labels)
        total += batch_labels.size(0)

# With an empty test set this evaluates to NaN instead of raising.
accuracy = correct.double() / total
print(f'Test Accuracy: {accuracy.item()}')

Test Accuracy: 0.9409858828974774
