In [1]:
import gc
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split


reviews = pd.read_csv("IMDB Dataset.csv")
print(reviews.shape)
reviews.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
reviews["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [3]:
reviews["label"] = 1
reviews.loc[reviews["sentiment"] == "negative", "label"] = 0

In [4]:
def clean_review(review):
    html_tag = re.compile('<.*?>')
    cleaned_review = re.sub(html_tag, "", review).split()
    return " ".join(cleaned_review)

print("## before cleaning")
text = reviews.review[0]
print(text[:200])

print("\n## after cleaning")
cleaned_text = clean_review(text)
print(cleaned_text[:200])

## cleaning the review column
reviews["cleaned_review"] = reviews["review"].apply(lambda x: clean_review(x))

## before cleaning
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo

## after cleaning
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was it


In [5]:
X_train, X_test, y_train, y_test = train_test_split(reviews["cleaned_review"], reviews["label"], 
                                                    test_size = .98,
                                                    random_state = 13)

In [6]:
y_train.value_counts()

label
1    534
0    466
Name: count, dtype: int64

In [7]:
y_test[:1000].value_counts()

label
0    501
1    499
Name: count, dtype: int64

In [8]:
BERT_MODEL = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [9]:
class BertSentimentClassifier(torch.nn.Module):
    def __init__(self, model_name):
        super(BertSentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.cls_head = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.loss_fn = torch.nn.BCELoss()

    def forward(self, input_ids, attention_mask, token_type_ids, labels = None):
        bert_output = self.bert(input_ids = input_ids,
                                attention_mask = attention_mask,
                                token_type_ids = token_type_ids)
        logits = self.cls_head(bert_output.pooler_output)
        probs = torch.nn.functional.sigmoid(logits).squeeze(-1)
        loss = None
        if labels is not None:
            loss = self.loss_fn(probs, labels)
        return loss, probs

In [10]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = BertSentimentClassifier(BERT_MODEL).to(device)

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-5)
scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps = 10,
                num_training_steps = 100)

In [12]:
total_records = 0
correct_records = 0
batch_size = 5
for i in range(0, len(X_test[:1000]), batch_size):
    batch_data = tokenizer(X_test[i:i+batch_size].tolist(), return_tensors = "pt",
                      padding = True, truncation = True).to(device)
    batch_y = torch.FloatTensor(y_test[i:i+batch_size].tolist()).to(device)
    _, logits = model(input_ids = batch_data.input_ids,
                         attention_mask = batch_data.attention_mask,
                         token_type_ids = batch_data.token_type_ids,
                         labels = batch_y)
    total_records += batch_size
    correct_records += torch.sum((1 * logits >= 0.5) == batch_y).item()

    torch.cuda.empty_cache()
    _ = gc.collect()
    
accuracy = correct_records / total_records
print(f"accuracy: {accuracy}")

accuracy: 0.443


In [13]:
epochs = 3
losses = []
batch_size = 20
model.train()
for epoch in range(epochs):
    in_losses = []
    print(f"epoch: {epoch}")
    total_records = 0
    correct_records = 0
    for i in range(0, len(X_train), batch_size):
        batch_data = tokenizer(X_train[i:i+batch_size].tolist(), return_tensors = "pt",
                          padding = True, truncation = True).to(device)
        batch_y = torch.FloatTensor(y_train[i:i+batch_size].tolist()).to(device)
        optimizer.zero_grad()
        loss, logits = model(input_ids = batch_data.input_ids,
                             attention_mask = batch_data.attention_mask,
                             token_type_ids = batch_data.token_type_ids,
                             labels = batch_y)
        loss.backward()
        optimizer.step()
        scheduler.step()
        in_losses.append(loss)
        total_records += batch_size
        correct_records += torch.sum((1 * logits >= 0.5) == batch_y).item()

        torch.cuda.empty_cache()
        _ = gc.collect()
        
    epoch_loss = sum(in_losses) / len(in_losses)
    losses.append(epoch_loss)
    accuracy = correct_records / total_records
    print(f"train loss: {epoch_loss}, accuracy: {accuracy}")

epoch: 0
train loss: 0.5525103211402893, accuracy: 0.695
epoch: 1
train loss: 0.23345758020877838, accuracy: 0.921
epoch: 2
train loss: 0.15327207744121552, accuracy: 0.957


In [14]:
model.eval()
total_records = 0
correct_records = 0
batch_size = 5
for i in range(0, len(X_test[:1000]), batch_size):
    batch_data = tokenizer(X_test[i:i+batch_size].tolist(), return_tensors = "pt",
                      padding = True, truncation = True).to(device)
    batch_y = torch.FloatTensor(y_test[i:i+batch_size].tolist()).to(device)
    _, logits = model(input_ids = batch_data.input_ids,
                         attention_mask = batch_data.attention_mask,
                         token_type_ids = batch_data.token_type_ids,
                         labels = batch_y)
    total_records += batch_size
    correct_records += torch.sum((1 * logits >= 0.5) == batch_y).item()

    torch.cuda.empty_cache()
    _ = gc.collect()
    
accuracy = correct_records / total_records
print(f"accuracy: {accuracy}")

accuracy: 0.897
