In [7]:
# Imports
import os
import pandas as pd
# from transformers import BertTokenizer, BertModel, AdamW
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score

In [8]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [9]:
def _read_reviews(folder):
    """Return the stripped contents of every file in ``folder``.

    Files are visited in sorted filename order so the result is deterministic
    (``os.listdir`` makes no ordering promise), and each file handle is closed
    as soon as it has been read.
    """
    reviews = []
    for filename in sorted(os.listdir(folder)):
        # Explicit encoding so decoding does not depend on the platform default
        # (assumes the review files are UTF-8 -- TODO confirm).
        with open(os.path.join(folder, filename), encoding='utf-8') as review_file:
            reviews.append(review_file.read().strip())
    return reviews


# Train set
train_pos_folder = './data/aclImdb/train/pos'
train_neg_folder = './data/aclImdb/train/neg'

train_pos_sentences = _read_reviews(train_pos_folder)
train_neg_sentences = _read_reviews(train_neg_folder)

train_df = pd.DataFrame({
    'text': train_pos_sentences + train_neg_sentences,
    'label': [1] * len(train_pos_sentences) + [0] * len(train_neg_sentences)  # 1 for positive, 0 for negative
})

# Test set
test_pos_folder = './data/aclImdb/test/pos'
test_neg_folder = './data/aclImdb/test/neg'

test_pos_sentences = _read_reviews(test_pos_folder)
test_neg_sentences = _read_reviews(test_neg_folder)

test_df = pd.DataFrame({
    'text': test_pos_sentences + test_neg_sentences,
    'label': [1] * len(test_pos_sentences) + [0] * len(test_neg_sentences)  # 1 for positive, 0 for negative
})

train_df.head()

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [10]:
# Missing-value counts for the text and label columns of each split
print('train NANs:', *train_df[['text', 'label']].isna().sum())
print('test NANs:', *test_df[['text', 'label']].isna().sum())

# Distinct label values present in each split
train_unique_values = pd.unique(train_df['label'])
test_unique_values = pd.unique(test_df['label'])
print('Check labels:', train_unique_values, test_unique_values)

# Longest review per split, measured in whitespace-separated words
train_max_words = train_df['text'].map(lambda review: len(review.split())).max()
test_max_words = test_df['text'].map(lambda review: len(review.split())).max()
print('max_words:', train_max_words, test_max_words)

train NANs: 0 0
test NANs: 0 0
Check labels: [1 0] [1 0]
max_words: 2470 2278


In [11]:
# Dataset
class IMDBDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one review per item on demand.

    Args:
        data: DataFrame read by position: column 0 is the review text and
            column 1 is the label.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded review and its label for row ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, and ``'labels'`` as a float tensor holding one value.
        """
        text = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')    # as pytorch tensors

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # float label wrapped in a list so each item carries a length-1 tensor
            'labels': torch.tensor([label], dtype=torch.float)
        }

In [12]:
# Make torch DataLoader
batch_size = 32
max_len = 512  # token length every review is padded/truncated to, shared by both splits

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create data loaders
train_dataset = IMDBDataset(train_df, tokenizer, max_len=max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation data is read in a fixed order: shuffling brings no benefit outside
# training and would make batches impossible to line up with rows of test_df.
test_dataset = IMDBDataset(test_df, tokenizer, max_len=max_len)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [13]:
# Check tensors: print the shapes of the first ten train/test batch pairs
for i, (train_batch, test_batch) in enumerate(zip(train_loader, test_loader)):
    if i == 10:
        break

    # Pull the three tensors out of each batch dict in one step
    train_input_ids, train_attention_mask, train_labels = (
        train_batch[key] for key in ('input_ids', 'attention_mask', 'labels'))
    test_input_ids, test_attention_mask, test_labels = (
        test_batch[key] for key in ('input_ids', 'attention_mask', 'labels'))

    print(i, train_input_ids.shape, train_attention_mask.shape, train_labels.shape)
    print(i, test_input_ids.shape, test_attention_mask.shape, test_labels.shape)

0 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
0 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
1 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
1 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
2 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
2 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
3 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
3 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
4 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
4 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
5 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
5 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
6 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
6 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
7 torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 1])
7 torch.Si

In [14]:
from transformers import BertModel

In [15]:
class BertBinaryClassifier(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head."""

    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super().__init__()
        # Pretrained encoder loaded by checkpoint name
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        # Regularization applied to the pooled representation
        self.dropout = nn.Dropout(0.3)
        # One output unit: a single raw logit per example
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw (pre-sigmoid) logit per example."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output -> dropout -> linear head
        return self.fc(self.dropout(encoded.pooler_output))

In [16]:
# Build the classifier with its default checkpoint name ('bert-base-uncased').
model = BertBinaryClassifier()

In [24]:
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

def train_model(model, dataloader, epochs=3, lr=2e-5, device=None):
    """Fine-tune ``model`` as a binary classifier and print per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns raw logits with the same shape as the batch's labels.
        dataloader: Sized iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and float ``'labels'`` tensors.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to AdamW.
        device: Device to train on; when ``None``, CUDA is used if available
            and the CPU otherwise.

    Returns:
        None. One summary line (mean batch loss and accuracy) is printed per
        epoch.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()  # expects raw logits; applies the sigmoid internally

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        seen = 0

        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Keep running counts instead of copying every prediction to the
            # CPU; both predictions and labels are thresholded at 0.5.
            with torch.no_grad():
                predicted_positive = torch.sigmoid(logits) > 0.5
                correct += (predicted_positive == (labels > 0.5)).sum().item()
            seen += labels.numel()

        avg_loss = total_loss / len(dataloader)
        accuracy = correct / seen

        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')


In [25]:
# Fine-tune the classifier on the training loader with train_model's defaults.
# NOTE(review): the RuntimeError recorded below is a CUDA device-side assert, and
# the execution counts jump from In [16] to In [24], so it may come from CUDA
# state left behind by an earlier, since-removed cell rather than from this
# call -- TODO confirm by restarting the kernel and running all cells
# (optionally with CUDA_LAUNCH_BLOCKING=1, as the message suggests).
train_model(model, train_loader)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
