In [1]:
import torch
from torch import tensor, nn, optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from torch.optim.lr_scheduler import CosineAnnealingLR

In [5]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained("bert-base-uncased")

In [6]:
sum([p.numel() for p in bert.parameters()])

109482240

In [7]:
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
from datasets import load_dataset

ds = load_dataset("Q-b1t/IMDB-Dataset-of-50K-Movie-Reviews-Backup")

README.md:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

archive.zip:   0%|          | 0.00/27.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 50000
    })
})

In [10]:
x = 'review'
y = 'sentiment'

In [11]:
ds['train'][1][x], ds['train'][1][y]

('A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well 

In [8]:
def tokenization(example):
    tokenized =  tokenizer(example[x],
                return_tensors='pt',
                padding=True,
                truncation=True,)

    tokenized[y] = example[y]
    return tokenized
    
print("tokenizing")
ds = ds.map(tokenization, batched=True, batch_size=10000)

ds.set_format(type='torch',)

tokenizing


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
class MovieReviews(Dataset):
    def __init__(self, ds):
        self.ds = ds
        
    def __getitem__(self, i):
        yi = 1 if self.ds[i][y] == 'positive' else 0
        
        return self.ds[i]['input_ids'], tensor(yi, dtype=float), self.ds[i]['attention_mask']
    
    def __len__(self):
        return len(self.ds)

In [10]:
ds_train = MovieReviews(ds['train'])

In [11]:
ds_train[0]

(tensor([  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,  2044,
          3666,  2074,  1015, 11472,  2792,  2017,  1005,  2222,  2022, 13322,
          1012,  2027,  2024,  2157,  1010,  2004,  2023,  2003,  3599,  2054,
          3047,  2007,  2033,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
          1013,  1028,  1996,  2034,  2518,  2008,  4930,  2033,  2055, 11472,
          2001,  2049, 24083,  1998,  4895, 10258,  2378,  8450,  5019,  1997,
          4808,  1010,  2029,  2275,  1999,  2157,  2013,  1996,  2773,  2175,
          1012,  3404,  2033,  1010,  2023,  2003,  2025,  1037,  2265,  2005,
          1996,  8143, 18627,  2030,  5199,  3593,  1012,  2023,  2265,  8005,
          2053, 17957,  2007, 12362,  2000,  5850,  1010,  3348,  2030,  4808,
          1012,  2049,  2003, 13076,  1010,  1999,  1996,  4438,  2224,  1997,
          1996,  2773,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
          1028,  2009,  2003,  2170, 11472,  2004,  

In [12]:
class SentimentClassificationModel(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(768, 1)
        
    def forward(self, x, mask, labels = None):
        x = self.bert(x, mask)[1] # index 1 for pooled outputs
        x = self.fc(x)
        loss = None
        if labels is not None:
            return x, F.binary_cross_entropy_with_logits(x.view(-1), labels)
        else:
            return x

In [13]:
model = SentimentClassificationModel(bert)

dl_train = DataLoader(ds_train, batch_size=30)

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 2e-5
opt = optim.AdamW(model.parameters(), lr=lr)
sched = CosineAnnealingLR(opt, len(dl_train), lr*0.5)
model = model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
for xb, yb, mask in dl_train:
    opt.zero_grad()

    xb = xb.to(device).to(torch.long)
    yb = yb.to(device).to(torch.float)
    mask = mask.to(device)
    out = model(xb, mask, labels=yb)
    loss = out[1]
    logits = out[0]
    print(loss)
    loss.backward()
    with torch.no_grad():
        opt.step()
        sched.step()

tensor(0.7209, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6724, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6737, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6885, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6830, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6344, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6824, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.7101, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6171, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.7052, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6723, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.6392, device='cuda:0',


In [94]:
love = "I really love this movie!"
hate = "This movie is terrible!!!! I hate it!!!! Would never watch again"
tks_love = tokenizer(love, return_tensors='pt')
tks_hate = tokenizer(hate, return_tensors='pt')

In [95]:
model.eval()

out_love = model(tks_love.input_ids.to(device), tks_love.attention_mask.to(device))
out_love.sigmoid()

tensor([[0.9958]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [96]:
out_hate = model(tks_hate.input_ids.to(device), tks_hate.attention_mask.to(device))
out_hate.sigmoid()

tensor([[0.0078]], device='cuda:0', grad_fn=<SigmoidBackward0>)