In [None]:
import tarfile

with tarfile.open('C:/Users/kdarc/OneDrive/Desktop/M6W4/aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall(path='C:/Users/kdarc/OneDrive/Desktop/M6W4')


In [1]:
# M6W4.

# Import required libraries.
import os
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'



# Prepare the data.
class IMDbDataset(Dataset):

    def __init__(self, dir_path, tokenizer):
        self.tokenizer = tokenizer
        self.positive_samples = []
        self.negative_samples = []



# Load positive samples.
        for filename in os.listdir(os.path.join(dir_path, "pos")):
            with open(os.path.join(dir_path, "pos", filename), 'r', encoding="utf-8") as file:
                text = file.read()
                text = self._clean_text(text)
                self.positive_samples.append(text)


# Load negative samples.
        for filename in os.listdir(os.path.join(dir_path, "neg")):
            with open(os.path.join(dir_path, "neg", filename), 'r', encoding="utf-8") as file:
                text = file.read()
                text = self._clean_text(text)
                self.negative_samples.append(text)

        self.samples = self.positive_samples + self.negative_samples
        self.labels = [1] * len(self.positive_samples) + [0] * len(self.negative_samples)

    def _clean_text(self, text):
        
        text = re.sub(r'<.*?>', '', text)
        
        text = re.sub(r'http\S+|www\.\S+', '[URL]', text)
        
        text = re.sub(r'\d+', '[NUM]', text)
        return text

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return (self.samples[idx], self.labels[idx])


    
    
# Load tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


# Load dataset.
train_dataset = IMDbDataset(r"C:/Users/kdarc/OneDrive/Desktop/M6W4/aclImdb/train", tokenizer)
test_dataset = IMDbDataset(r"C:/Users/kdarc/OneDrive/Desktop/M6W4/aclImdb/test", tokenizer)


# Load to dataloader.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


# Select and load a pre trained model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)


# Fine tune the model.
optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(3):  # Number of epochs.
    model.train()
    for idx, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs = tokenizer(inputs, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        labels = torch.tensor(labels).unsqueeze(0).to(device)

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    preds = []
    true = []
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch
            inputs = tokenizer(inputs, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            outputs = model(**inputs)
            preds.extend(outputs.logits.argmax(-1).tolist())
            true.extend(labels)

            
            
            
# Evaluate the model.
    precision, recall, f1, _ = precision_recall_fscore_support(true, preds, average='binary')
    acc = accuracy_score(true, preds)

    print(f'After epoch {epoch+1}, accuracy: {acc}, f1: {f1}, precision: {precision}, recall: {recall}')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

After epoch 1, accuracy: 0.93396, f1: 0.9349564669266832, precision: 0.921058759605682, recall: 0.94928


  labels = torch.tensor(labels).unsqueeze(0).to(device)


After epoch 2, accuracy: 0.93308, f1: 0.9346714045843258, precision: 0.9129605614463345, recall: 0.95744


  labels = torch.tensor(labels).unsqueeze(0).to(device)


After epoch 3, accuracy: 0.93596, f1: 0.9361311684685044, precision: 0.9336357125805681, recall: 0.93864
