In [150]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

from tqdm.auto import tqdm

from transformers import BertTokenizer, AutoModelForSequenceClassification, get_scheduler

import torch
from torch.utils.data import Dataset, DataLoader

### CONFIGURATION

In [180]:
# Paths and pretrained checkpoint name
DATA_PATH = '../data/IMDB_Dataset.csv'
OUTPUT_PATH = '../output'
BERT_CHECKPOINT = 'bert-base-uncased'

# Tokenisation and optimisation hyper-parameters
MAX_LEN = 128  # reviews are truncated / padded to this many tokens
BATCH_SIZE = 16
NUM_CLASSES = 2
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5

# Seed torch's global RNG so stochastic steps (e.g. shuffling of the training
# DataLoader) are repeatable after Restart & Run All.
SEED = 20
torch.manual_seed(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


### UTILS

In [178]:
# For cleaning reviews
def clean_text(text):
    """Strip HTML tags from a review and normalise its whitespace.

    Tags are replaced by a space rather than deleted, so words separated only
    by markup (``good<br />movie``) are not fused together. Whitespace is
    collapsed *after* tag removal so the gaps left behind by removed tags
    disappear as well.

    Args:
        text: Raw review string.

    Returns:
        The review without ``<...>`` tags, with every run of whitespace
        reduced to a single space and no leading/trailing whitespace.
    """
    # [^>] also matches newlines, so a tag broken across lines is still removed
    text = re.sub(r'<[^>]*>', ' ', text)
    # split() with no argument drops every kind of whitespace run
    return " ".join(text.split())


# Class for custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        review: Indexable collection of review texts (list, array, pandas Series, ...).
        target: Indexable collection of integer class labels, aligned with ``review``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=..., truncation=True,
            padding='max_length')`` whose result exposes ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Length every encoded review is truncated / padded to.
        clean_text: Optional callable applied to the raw text before tokenization.
    """

    def __init__(self, review, target, tokenizer, max_len, clean_text=None):
        self.clean_text = clean_text
        self.review = review
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    @staticmethod
    def _item_at(sequence, idx):
        """Return the element at position ``idx`` of ``sequence``."""
        # pandas objects look up labels with [], which fails (or returns the
        # wrong row) unless the index happens to be 0..n-1; .iloc is positional.
        positional = getattr(sequence, 'iloc', None)
        return sequence[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        y = torch.tensor(self._item_at(self.target, idx), dtype=torch.long)
        X = str(self._item_at(self.review, idx))
        if self.clean_text:
            X = self.clean_text(X)

        encoded_X = self.tokenizer(
            X,
            return_tensors='pt',
            max_length=self.max_len,
            truncation=True,
            padding='max_length'
        )

        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_len == 1.
        return {'input_ids': encoded_X['input_ids'].squeeze(0),
                'attention_mask': encoded_X['attention_mask'].squeeze(0),
                'labels': y}



# Training loop for one epoch
def train_epoch(model, dataloader, optimizer, scheduler, device, progress_bar):
    """Run one optimisation pass over ``dataloader``.

    For every batch: reset gradients, move the tensors to ``device``, run the
    forward pass, backpropagate ``outputs.loss``, then step the optimizer and
    the scheduler and advance ``progress_bar`` by one.

    Args:
        model: Called as ``model(**batch)``; its output must expose ``loss`` and ``logits``.
        dataloader: Iterable of dicts of tensors that include a ``'labels'`` entry.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        progress_bar: Object with an ``update(n)`` method.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of 0-dim tensors, each the
        unweighted mean of the per-batch values.
    """
    model.train()

    batch_losses, batch_accuracies = [], []
    for raw_batch in dataloader:
        optimizer.zero_grad()

        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        result = model(**inputs)

        result.loss.backward()
        optimizer.step()
        scheduler.step()

        # Fraction of samples in this batch whose arg-max class equals the label
        predicted = result.logits.argmax(dim=1)
        n_correct = (predicted == inputs['labels']).sum()
        batch_accuracies.append(n_correct / len(predicted))
        batch_losses.append(result.loss.item())

        progress_bar.update(1)

    mean_loss = torch.tensor(batch_losses, dtype=torch.float).mean()
    mean_accuracy = torch.tensor(batch_accuracies).mean()
    return mean_loss, mean_accuracy


# Evaluation loop
def eval_epoch(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Called as ``model(**batch)``; its output must expose ``loss`` and ``logits``.
        dataloader: Iterable of dicts of tensors that include a ``'labels'`` entry.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of 0-dim float tensors, each the
        unweighted mean of the per-batch values.
    """
    losses = []
    accuracies = []

    model.eval()
    # Under no_grad no autograd graph is recorded, so calling backward() on the
    # loss would raise a RuntimeError; evaluation must not backpropagate anyway.
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Fraction of samples whose arg-max class equals the label
            preds = torch.argmax(outputs.logits, dim=1)
            acc = torch.sum(preds == batch['labels']) / len(preds)
            # .item() keeps plain Python numbers instead of device tensors
            accuracies.append(acc.item())
            losses.append(outputs.loss.item())

    mean_loss = torch.tensor(losses, dtype=torch.float).mean()
    mean_accuracy = torch.tensor(accuracies, dtype=torch.float).mean()
    return mean_loss, mean_accuracy

### DATA PREPARATION

In [24]:
# Load the reviews CSV, then discard exact duplicate rows
data = pd.read_csv(DATA_PATH)
data = data.drop_duplicates()

print(f'Numbers of samples: {len(data)}')
data.head()

Numbers of samples: 49582


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
# Encode the sentiment labels as integers: negative -> 0, positive -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: a bare
# `else 1` fallback would turn every already-encoded label into 1.
label_map = {'negative': 0, 'positive': 1, 0: 0, 1: 1}
encoded_sentiment = data['sentiment'].map(label_map)

# Values missing from the mapping become NaN; fail loudly instead of silently
# treating them as positive.
assert encoded_sentiment.notna().all(), 'unexpected value(s) in the sentiment column'

data['sentiment'] = encoded_sentiment.astype('int64')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [90]:
# Stratified 70 / 15 / 15 split into train, validation and test sets

train_df, test_val_df = train_test_split(
    data,
    test_size=0.3,
    stratify=data['sentiment'],
    random_state=20,
)

# Halve the held-out 30 % into validation and test
val_df, test_df = train_test_split(
    test_val_df,
    test_size=0.5,
    stratify=test_val_df['sentiment'],
    random_state=20,
)

# Give every split a fresh 0..n-1 index
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

for split_name, split_df in (('train', train_df), ('validation', val_df), ('test', test_df)):
    print(f'Number of samples in {split_name} set: {len(split_df)}')

Number of samples in train set: 34707
Number of samples in validation set: 7437
Number of samples in test set: 7438


In [42]:
# Load the pretrained tokenizer that belongs to BERT_CHECKPOINT
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=BERT_CHECKPOINT
)

In [130]:
# One DataLoader per split; only the training split is shuffled
dfs = {'train': train_df, 'val': val_df, 'test': test_df}
dataloaders = {}

for split_name, split_df in dfs.items():
    split_dataset = CustomDataset(
        split_df['review'],
        split_df['sentiment'],
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        clean_text=clean_text,
    )
    dataloaders[split_name] = DataLoader(
        split_dataset,
        batch_size=BATCH_SIZE,
        shuffle=(split_name == 'train'),
    )

In [137]:
# Sanity check: fetch the first training batch and show the shape of each tensor
for batch in dataloaders['train']:
    batch_shapes = {name: tensor.shape for name, tensor in batch.items()}
    print(batch_shapes)
    break

{'input_ids': torch.Size([16, 128]), 'attention_mask': torch.Size([16, 128]), 'labels': torch.Size([16])}


### TRAINING

In [175]:
# SETUP

# Pretrained checkpoint loaded as a sequence classifier with NUM_CLASSES labels,
# moved to the target device
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    num_labels=NUM_CLASSES,
)
model.to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Linear schedule without warm-up, spanning every batch of every epoch
num_training_steps = len(dataloaders['train']) * NUM_EPOCHS
scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\ADMIN/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-u

In [151]:
# Smoke test: one forward pass on the batch fetched by the batch-loading check.
# The tensors must live on the same device as the model, otherwise the forward
# pass fails when running on CUDA.
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)


tensor(0.8174, grad_fn=<NllLossBackward0>) torch.Size([16, 2])


In [179]:
# Training, evaluation

progress_bar = tqdm(range(num_training_steps))
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

best_accuracy = 0
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, dataloaders['train'], optimizer, scheduler, device, progress_bar)
    print(f'Train Loss: {train_loss :.4f} | Accuracy: {train_acc*100 :.2f}')

    # The validation loader is registered under 'val'; there is no 'eval' key
    val_loss, val_acc = eval_epoch(model, dataloaders['val'], device)
    print(f'Eval Loss: {val_loss :.4f} | Accuracy: {val_acc*100 :.2f}')

    # Store plain floats so the history is easy to plot or serialise
    history['train_loss'].append(float(train_loss))
    history['train_acc'].append(float(train_acc))

    history['val_loss'].append(float(val_loss))
    history['val_acc'].append(float(val_acc))

    # Keep the checkpoint with the best validation accuracy, written to the
    # directory configured in OUTPUT_PATH
    if val_acc > best_accuracy:
        model.save_pretrained(OUTPUT_PATH)
        best_accuracy = val_acc

    print('-'*50)

        

  0%|          | 0/10850 [00:00<?, ?it/s]

KeyboardInterrupt: 