In [1]:
# Imports
import os
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Device configuration
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# torch.cuda.empty_cache()
# torch.cuda.reset_max_memory_cached()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Train set
train_pos_folder = './data/aclImdb/train/pos'
train_neg_folder = './data/aclImdb/train/neg'

train_pos_sentences = [open(os.path.join(train_pos_folder, f)).read().strip() for f in os.listdir(train_pos_folder)]
train_neg_sentences = [open(os.path.join(train_neg_folder, f)).read().strip() for f in os.listdir(train_neg_folder)]

train_df = pd.DataFrame({
    'text': train_pos_sentences + train_neg_sentences,
    'label': [1] * len(train_pos_sentences) + [0] * len(train_neg_sentences)  # 1 for positive, 0 for negative
})

# Test set
test_pos_folder = './data/aclImdb/test/pos'
test_neg_folder = './data/aclImdb/test/neg'

test_pos_sentences = [open(os.path.join(test_pos_folder, f)).read().strip() for f in os.listdir(test_pos_folder)]
test_neg_sentences = [open(os.path.join(test_neg_folder, f)).read().strip() for f in os.listdir(test_neg_folder)]

test_df = pd.DataFrame({
    'text': test_pos_sentences + test_neg_sentences,
    'label': [1] * len(test_pos_sentences) + [0] * len(test_neg_sentences)  # 1 for positive, 0 for negative
})

train_df.head()

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [4]:
# Check for NaN values in columns
print('train NANs:', train_df['text'].isna().sum(), train_df['label'].isna().sum())
print('test NANs:', test_df['text'].isna().sum(), test_df['label'].isna().sum())

# Check labels
train_unique_values = train_df['label'].unique()
test_unique_values = test_df['label'].unique()
print('Check labels:', train_unique_values, test_unique_values)

# Check max length
train_max_words = train_df['text'].apply(lambda x: len(x.split())).max()
test_max_words = test_df['text'].apply(lambda x: len(x.split())).max()
print('max_words:', train_max_words, test_max_words)

train NANs: 0 0
test NANs: 0 0
Check labels: [1 0] [1 0]
max_words: 2470 2278


In [5]:
# Dataset
class IMDBDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')    # as pytorch tensors

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor([label], dtype=torch.float)
        }

In [6]:
# Make torch DataLoader
batch_size = 32

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create data loaders
train_dataset = IMDBDataset(train_df, tokenizer, max_len=512)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = IMDBDataset(test_df, tokenizer, max_len=512)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)



In [7]:
# Check tensors
for i, (train_batch, test_batch) in enumerate(zip(train_loader, test_loader)):
    if i == 30:
        break
    train_input_ids = train_batch['input_ids']
    train_attention_mask = train_batch['attention_mask']
    train_labels = train_batch['labels']
    
    test_input_ids = test_batch['input_ids']
    test_attention_mask = test_batch['attention_mask']
    test_labels = test_batch['labels']

    print(i, train_input_ids.shape, train_attention_mask.shape, train_labels.shape)
    print(i, test_input_ids.shape, test_attention_mask.shape, test_labels.shape)

0 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
0 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
1 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
1 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
2 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
2 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
3 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
3 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
4 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
4 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
5 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
5 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
6 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
6 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
7 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
7 torch.Si

In [8]:
# Define hyperparmaters
num_epochs = 3
learning_rate = 1e-5

In [9]:
# Define the BertForSentenceClassification model
class BertForSentenceClassification(nn.Module):
    def __init__(self, bert_model):
        super(BertForSentenceClassification, self).__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        return outputs

In [10]:
# Apply and check the model

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Create an instance of the custom model
model = BertForSentenceClassification(bert_model)
model.to(device=device)

BertForSentenceClassification(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [11]:
# Basic check
input_ids = torch.randint(0, 100, (32, 512)).to(device)  # random input IDs
attention_mask = torch.ones((32, 512)).to(device)  # random attention mask
output = model(input_ids, attention_mask)
print(output.shape)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


torch.Size([32, 1])


In [12]:
# Define the optimizer and loss function
criterion = nn.BCELoss()

# # Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [13]:
losses = []
for epoch in range(1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        print(f"batch: {i}", end="\r")
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        print(input_ids.shape, attention_mask.shape, labels.shape)
        
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    # Append the current loss to the list
    losses.append(loss.item())

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Plot the loss curve
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Curve')
plt.show()

In [None]:
model.eval()
test_loss = 0
test_metrics = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device=device)
        attention_mask = batch['attention_mask'].to(device=device)
        labels = batch['labels'].to(device=device)
        
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Calculate metrics (e.g., accuracy, F1-score)
        _, preds = torch.max(outputs, dim=1)
        accuracy = (preds == labels).sum().item() / len(labels)
        test_metrics.append(accuracy)

test_loss /= len(test_loader)
test_metrics = np.mean(test_metrics)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_metrics:.4f}')