In [1]:
import os
import random
import numpy as np
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.metrics import confusion_matrix, accuracy_score
from tqdm import tqdm

# Set the seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Define a custom dataset
class MovieReviewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        content = row['Content']
        sentiment = row['Sentiment']
        label = 1 if sentiment == 'Positive' else 0

        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Ensure the label is a long for CrossEntropyLoss
        }

# Load the dataset from CSV files
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')
test_df = pd.read_csv('test_data.csv')

print(train_df.head())

train_indices = train_df.sample(frac=1, random_state=200).index
train_df = train_df.loc[train_indices]

print(train_df.head())

# Create data loaders
max_len = 512
batch_size = 16

train_dataset = MovieReviewsDataset(train_df, tokenizer, max_len)
val_dataset = MovieReviewsDataset(val_df, tokenizer, max_len)
test_dataset = MovieReviewsDataset(test_df, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



                                             Content Sentiment
0  synopsis : the president of a company wants to...  Negative
1  " take a number , fill out a form , and wait y...  Positive
2  here 's a concept -- jean - claude van damme g...  Negative
3  originally launched in 1978 , this popular fil...  Positive
4  it shows that america remains ambivalent over ...  Positive
                                                Content Sentiment
366   susan granger 's review of " osmosis jones " (...  Positive
1325  for a movie about disco - era excess , " 54 " ...  Positive
133   kate ( jennifer aniston ) is having some probl...  Negative
1419  i 'm not quite sure what to say about mars att...  Positive
1258  saw an advanced screening of the movie sniper ...  Negative


In [2]:
# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, hidden_dim, output_dim, n_layers, dropout_prob):
        super(RNNModel, self).__init__()
        self.rnn = nn.LSTM(bert_model.config.hidden_size, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout_prob, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.softmax = nn.Softmax(dim=1)  # Add a softmax activation

    def forward(self, bert_embeddings):
        packed_output, (hidden, cell) = self.rnn(bert_embeddings)
        output = self.fc(self.dropout(hidden[-1]))
        return self.softmax(output)  # Apply softmax activation

# Instantiate the model
hidden_dim = 128
output_dim = 2  # Two outputs, one for each class
n_layers = 1
dropout_prob = 0.3

model = RNNModel(hidden_dim, output_dim, n_layers, dropout_prob)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00033960197625562883)

# Training function
def train_epoch(model, data_loader, criterion, optimizer, device, bert_model):
    model.train()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(data_loader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)  # Ensure labels are long for CrossEntropyLoss

        # Get BERT embeddings
        with torch.no_grad():
            bert_outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            bert_embeddings = bert_outputs.last_hidden_state

        optimizer.zero_grad()
        outputs = model(bert_embeddings)
        loss = criterion(outputs, labels)
        losses.append(loss.item())
        preds = torch.argmax(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loss.backward()
        optimizer.step()

    conf_matrix = confusion_matrix(all_labels, all_preds)
    accuracy = correct_predictions.double() / len(data_loader.dataset)
    avg_loss = sum(losses) / len(losses)
    return accuracy, avg_loss, conf_matrix

# Evaluation function
def eval_model(model, data_loader, criterion, device, bert_model):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)  # Ensure labels are long for CrossEntropyLoss

            # Get BERT embeddings
            bert_outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            bert_embeddings = bert_outputs.last_hidden_state

            outputs = model(bert_embeddings)
            loss = criterion(outputs, labels)
            losses.append(loss.item())
            preds = torch.argmax(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    conf_matrix = confusion_matrix(all_labels, all_preds)
    accuracy = correct_predictions.double() / len(data_loader.dataset)
    avg_loss = sum(losses) / len(losses)
    return accuracy, avg_loss, conf_matrix

class EarlyStopping:
    def __init__(self, patience=5, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_acc = 0.0
        self.early_stop = False

    def __call__(self, train_acc, model):
        if train_acc > self.best_acc:
            self.best_acc = train_acc
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

# Training loop with early stopping
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
bert_model = bert_model.to(device)  # Move BERT model to device
num_epochs = 40
best_val_acc = 0.0  # Initialize the best validation accuracy to zero
early_stopping = EarlyStopping(patience=5, verbose=True)  # Initialize early stopping

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    train_acc, train_loss, train_conf_matrix = train_epoch(model, train_loader, criterion, optimizer, device, bert_model)
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print('Train Confusion Matrix:')
    print(train_conf_matrix)

    val_acc, val_loss, val_conf_matrix = eval_model(model, val_loader, criterion, device, bert_model)
    print(f'Val loss {val_loss:.4f} accuracy {val_acc:.4f}')
    print('Validation Confusion Matrix:')
    print(val_conf_matrix)

    # Save the model if the validation accuracy is the best we've seen so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'Saved best model at epoch {epoch + 1}')

    # Check early stopping
    early_stopping(train_acc, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Save the model at the last epoch
torch.save(model.state_dict(), 'last_epoch_model.pth')

# Load the best model before evaluating on the test set
model.load_state_dict(torch.load('best_model.pth'))

# Evaluate on test data using the best model
test_acc_best, test_loss_best, test_conf_matrix_best = eval_model(model, test_loader, criterion, device, bert_model)
print(f'Best Model Test loss {test_loss_best:.4f} accuracy {test_acc_best:.4f}')
print('Best Model Test Confusion Matrix:')
print(test_conf_matrix_best)

# Load the model from the last epoch
model.load_state_dict(torch.load('last_epoch_model.pth'))

# Evaluate on test data using the last epoch model
test_acc_last, test_loss_last, test_conf_matrix_last = eval_model(model, test_loader, criterion, device, bert_model)
print(f'Last Epoch Model Test loss {test_loss_last:.4f} accuracy {test_acc_last:.4f}')
print('Last Epoch Model Test Confusion Matrix:')
print(test_conf_matrix_last)



Epoch 1/40
----------


                                                                                

Train loss 0.6735 accuracy 0.5881
Train Confusion Matrix:
[[437 363]
 [296 504]]


                                                                                

Val loss 0.6071 accuracy 0.7500
Validation Confusion Matrix:
[[90 10]
 [40 60]]
Saved best model at epoch 1
Epoch 2/40
----------


                                                                                

Train loss 0.6081 accuracy 0.6850
Train Confusion Matrix:
[[558 242]
 [262 538]]


                                                                                

Val loss 0.5517 accuracy 0.7450
Validation Confusion Matrix:
[[88 12]
 [39 61]]
Epoch 3/40
----------


                                                                                

Train loss 0.5555 accuracy 0.7431
Train Confusion Matrix:
[[635 165]
 [246 554]]


                                                                                

Val loss 0.5850 accuracy 0.6800
Validation Confusion Matrix:
[[51 49]
 [15 85]]
Epoch 4/40
----------


                                                                                

Train loss 0.5384 accuracy 0.7619
Train Confusion Matrix:
[[607 193]
 [188 612]]


                                                                                

Val loss 0.5503 accuracy 0.7400
Validation Confusion Matrix:
[[83 17]
 [35 65]]
Epoch 5/40
----------


                                                                                

Train loss 0.5180 accuracy 0.7900
Train Confusion Matrix:
[[634 166]
 [170 630]]


                                                                                

Val loss 0.5754 accuracy 0.7250
Validation Confusion Matrix:
[[71 29]
 [26 74]]
Epoch 6/40
----------


                                                                                

Train loss 0.4815 accuracy 0.8269
Train Confusion Matrix:
[[662 138]
 [139 661]]


                                                                                

Val loss 0.5337 accuracy 0.7650
Validation Confusion Matrix:
[[86 14]
 [33 67]]
Saved best model at epoch 6
Epoch 7/40
----------


                                                                                

Train loss 0.4992 accuracy 0.8069
Train Confusion Matrix:
[[644 156]
 [153 647]]


                                                                                

Val loss 0.5629 accuracy 0.7250
Validation Confusion Matrix:
[[89 11]
 [44 56]]
EarlyStopping counter: 1 out of 5
Epoch 8/40
----------


                                                                                

Train loss 0.4556 accuracy 0.8569
Train Confusion Matrix:
[[690 110]
 [119 681]]


                                                                                

Val loss 0.5788 accuracy 0.7150
Validation Confusion Matrix:
[[73 27]
 [30 70]]
Epoch 9/40
----------


                                                                                

Train loss 0.4310 accuracy 0.8856
Train Confusion Matrix:
[[705  95]
 [ 88 712]]


                                                                                

Val loss 0.5785 accuracy 0.7150
Validation Confusion Matrix:
[[76 24]
 [33 67]]
Epoch 10/40
----------


                                                                                

Train loss 0.4011 accuracy 0.9187
Train Confusion Matrix:
[[732  68]
 [ 62 738]]


                                                                                

Val loss 0.5587 accuracy 0.7400
Validation Confusion Matrix:
[[77 23]
 [29 71]]
Epoch 11/40
----------


                                                                                

Train loss 0.4779 accuracy 0.8337
Train Confusion Matrix:
[[728  72]
 [194 606]]


                                                                                

Val loss 0.5987 accuracy 0.6950
Validation Confusion Matrix:
[[60 40]
 [21 79]]
EarlyStopping counter: 1 out of 5
Epoch 12/40
----------


                                                                                

Train loss 0.4750 accuracy 0.8387
Train Confusion Matrix:
[[690 110]
 [148 652]]


                                                                                

Val loss 0.5985 accuracy 0.6950
Validation Confusion Matrix:
[[71 29]
 [32 68]]
EarlyStopping counter: 2 out of 5
Epoch 13/40
----------


                                                                                

Train loss 0.3921 accuracy 0.9244
Train Confusion Matrix:
[[740  60]
 [ 61 739]]


                                                                                

Val loss 0.6007 accuracy 0.6800
Validation Confusion Matrix:
[[61 39]
 [25 75]]
Epoch 14/40
----------


                                                                                

Train loss 0.3675 accuracy 0.9525
Train Confusion Matrix:
[[765  35]
 [ 41 759]]


                                                                                

Val loss 0.5931 accuracy 0.6950
Validation Confusion Matrix:
[[74 26]
 [35 65]]
Epoch 15/40
----------


                                                                                

Train loss 0.3498 accuracy 0.9669
Train Confusion Matrix:
[[779  21]
 [ 32 768]]


                                                                                

Val loss 0.6439 accuracy 0.6450
Validation Confusion Matrix:
[[92  8]
 [63 37]]
Epoch 16/40
----------


                                                                                

Train loss 0.5084 accuracy 0.8019
Train Confusion Matrix:
[[724  76]
 [241 559]]


                                                                                

Val loss 0.6036 accuracy 0.6800
Validation Confusion Matrix:
[[65 35]
 [29 71]]
EarlyStopping counter: 1 out of 5
Epoch 17/40
----------


                                                                                

Train loss 0.3734 accuracy 0.9494
Train Confusion Matrix:
[[763  37]
 [ 44 756]]


                                                                                

Val loss 0.6037 accuracy 0.6750
Validation Confusion Matrix:
[[59 41]
 [24 76]]
EarlyStopping counter: 2 out of 5
Epoch 18/40
----------


                                                                                

Train loss 0.3512 accuracy 0.9669
Train Confusion Matrix:
[[773  27]
 [ 26 774]]


                                                                                

Val loss 0.5927 accuracy 0.7100
Validation Confusion Matrix:
[[71 29]
 [29 71]]
EarlyStopping counter: 3 out of 5
Epoch 19/40
----------


                                                                                

Train loss 0.3459 accuracy 0.9731
Train Confusion Matrix:
[[779  21]
 [ 22 778]]


                                                                                

Val loss 0.6203 accuracy 0.6800
Validation Confusion Matrix:
[[67 33]
 [31 69]]
Epoch 20/40
----------


                                                                                

Train loss 0.3396 accuracy 0.9769
Train Confusion Matrix:
[[783  17]
 [ 20 780]]


                                                                                

Val loss 0.6199 accuracy 0.6750
Validation Confusion Matrix:
[[69 31]
 [34 66]]
Epoch 21/40
----------


                                                                                

Train loss 0.3336 accuracy 0.9825
Train Confusion Matrix:
[[788  12]
 [ 16 784]]


                                                                                

Val loss 0.6104 accuracy 0.6900
Validation Confusion Matrix:
[[69 31]
 [31 69]]
Epoch 22/40
----------


                                                                                

Train loss 0.3318 accuracy 0.9825
Train Confusion Matrix:
[[789  11]
 [ 17 783]]


                                                                                

Val loss 0.6203 accuracy 0.6750
Validation Confusion Matrix:
[[67 33]
 [32 68]]
EarlyStopping counter: 1 out of 5
Epoch 23/40
----------


                                                                                

Train loss 0.3350 accuracy 0.9800
Train Confusion Matrix:
[[785  15]
 [ 17 783]]


                                                                                

Val loss 0.6222 accuracy 0.6750
Validation Confusion Matrix:
[[63 37]
 [28 72]]
EarlyStopping counter: 2 out of 5
Epoch 24/40
----------


                                                                                

Train loss 0.3309 accuracy 0.9838
Train Confusion Matrix:
[[788  12]
 [ 14 786]]


                                                                                

Val loss 0.6170 accuracy 0.6750
Validation Confusion Matrix:
[[71 29]
 [36 64]]
Epoch 25/40
----------


                                                                                

Train loss 0.3286 accuracy 0.9856
Train Confusion Matrix:
[[790  10]
 [ 13 787]]


                                                                                

Val loss 0.6255 accuracy 0.6750
Validation Confusion Matrix:
[[61 39]
 [26 74]]
Epoch 26/40
----------


                                                                                

Train loss 0.3279 accuracy 0.9856
Train Confusion Matrix:
[[790  10]
 [ 13 787]]


                                                                                

Val loss 0.6163 accuracy 0.6750
Validation Confusion Matrix:
[[71 29]
 [36 64]]
EarlyStopping counter: 1 out of 5
Epoch 27/40
----------


                                                                                

Train loss 0.3276 accuracy 0.9862
Train Confusion Matrix:
[[791   9]
 [ 13 787]]


                                                                                

Val loss 0.6222 accuracy 0.6750
Validation Confusion Matrix:
[[63 37]
 [28 72]]
Epoch 28/40
----------


                                                                                

Train loss 0.3273 accuracy 0.9862
Train Confusion Matrix:
[[791   9]
 [ 13 787]]


                                                                                

Val loss 0.6132 accuracy 0.7000
Validation Confusion Matrix:
[[68 32]
 [28 72]]
EarlyStopping counter: 1 out of 5
Epoch 29/40
----------


                                                                                

Train loss 0.3267 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6151 accuracy 0.6950
Validation Confusion Matrix:
[[67 33]
 [28 72]]
Epoch 30/40
----------


                                                                                

Train loss 0.3266 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6149 accuracy 0.6950
Validation Confusion Matrix:
[[67 33]
 [28 72]]
EarlyStopping counter: 1 out of 5
Epoch 31/40
----------


                                                                                

Train loss 0.3266 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6116 accuracy 0.6950
Validation Confusion Matrix:
[[67 33]
 [28 72]]
EarlyStopping counter: 2 out of 5
Epoch 32/40
----------


                                                                                

Train loss 0.3265 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6136 accuracy 0.6950
Validation Confusion Matrix:
[[67 33]
 [28 72]]
EarlyStopping counter: 3 out of 5
Epoch 33/40
----------


                                                                                

Train loss 0.3265 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6145 accuracy 0.6950
Validation Confusion Matrix:
[[67 33]
 [28 72]]
EarlyStopping counter: 4 out of 5
Epoch 34/40
----------


                                                                                

Train loss 0.3265 accuracy 0.9869
Train Confusion Matrix:
[[792   8]
 [ 13 787]]


                                                                                

Val loss 0.6140 accuracy 0.6900
Validation Confusion Matrix:
[[67 33]
 [29 71]]
EarlyStopping counter: 5 out of 5
Early stopping


                                                                                

Best Model Test loss 0.5453 accuracy 0.7300
Best Model Test Confusion Matrix:
[[89 11]
 [43 57]]


                                                                                

Last Epoch Model Test loss 0.5403 accuracy 0.7650
Last Epoch Model Test Confusion Matrix:
[[82 18]
 [29 71]]




In [3]:
import torch
from transformers import BertTokenizer, BertModel

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
model = RNNModel(hidden_dim, output_dim, n_layers, dropout_prob)
model.load_state_dict(torch.load('last_epoch_model.pth'))
model.eval()

# Inference function
def predict(text, tokenizer, bert_model, model, device):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        bert_outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state

        outputs = model(bert_embeddings)
        _, prediction = torch.max(outputs, dim=1)

    return prediction.item()

# Perform inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
bert_model = bert_model.to(device)

text = "That movie is terrific"
prediction = predict(text, tokenizer, bert_model, model, device)

# Map prediction to sentiment
sentiment = "Positive" if prediction == 1 else "Negative"
print(f'Text: "{text}"')
print(f'Predicted Sentiment: {sentiment}')




Text: "That movie is terrific"
Predicted Sentiment: Positive
