<a href="https://colab.research.google.com/github/LLKruczek/MeetingSchedulerProject_Public/blob/main/Test_Movie_Reviews_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch



In [2]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer


In [23]:
class BertBiLSTM(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', hidden_dim=256, num_labels=2):
        super(BertBiLSTM, self).__init__()

        # Load BERT model and tokenizer
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Freeze BERT parameters to avoid training them
        for param in self.bert.parameters():
            param.requires_grad = False

        # BiLSTM layer
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        # Fully connected layer for classification
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask):
        # Extract embeddings from BERT
        with torch.no_grad():  # Ensure we don't backprop through BERT
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = bert_outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_dim)

        # Pass through BiLSTM layer
        lstm_out, _ = self.lstm(last_hidden_state)  # Shape: (batch_size, seq_len, hidden_dim * 2)

        # Use the output from the last LSTM cell (can also use other pooling strategies)
        lstm_out = lstm_out[:, -1, :]  # Shape: (batch_size, hidden_dim * 2)

        # Classifier layer
        logits = self.classifier(lstm_out)  # Shape: (batch_size, num_labels)

        return logits


In [4]:
# Define device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate model and tokenizer
model = BertBiLSTM().to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [6]:
from google.colab import files
uploaded = files.upload()

Saving imdb_master.csv to imdb_master.csv


In [7]:
import pandas as pd

df = pd.read_csv('imdb_master.csv', encoding='ISO-8859-1')

In [17]:
reviews=df["review"].head(1000)

In [9]:
print(reviews.head())

0    Once again Mr. Costner has dragged out a movie...
1    This is an example of why the majority of acti...
2    First of all I hate those moronic rappers, who...
3    Not even the Beatles could write songs everyon...
4    Brass pictures (movies is not a fitting word f...
Name: review, dtype: object


In [25]:
encodings = tokenizer(
    reviews.tolist(),           # Convert the column to a list
    padding=True,                    # Pad sequences to the same length
    truncation=True,                 # Truncate sequences longer than max_length
    max_length=32,                  # Define max_length based on your model's limit
    return_tensors="pt"              # Return as PyTorch tensors
)

In [26]:
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

In [27]:
import os
os.environ["TORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    logits = model(input_ids, attention_mask)
    predictions = torch.argmax(logits, dim=1)
    print(predictions)

tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [33]:
labels = df['label'].head(1000).replace({'pos': 1, 'neg': 0})
labels_tensor = torch.tensor(labels)
labels_tensor=labels_tensor.to(device)

  labels = df['label'].head(1000).replace({'pos': 1, 'neg': 0})


In [35]:
# Compare predictions to true labels
correct = (predictions == labels_tensor).sum().item()  # Count number of correct predictions
total = labels_tensor.size(0)  # Total number of samples

# Calculate accuracy
accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 88.30%
