In [None]:
pip install pytorch_transformers

Collecting pytorch_transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/176.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch_transformers)
  Downloading boto3-1.34.11-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from pytorch_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses (

In [None]:
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig, BertTokenizer, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from torch.utils.data import Dataset, DataLoader

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.nn.functional as F

In [None]:
class BertForSequenceClassification(nn.Module):
  """Pretrained BERT encoder followed by dropout and one linear classification layer.

  `forward` returns the raw logits of the linear layer; no loss is computed here.
  """

  def __init__(self, num_labels = 1):
    """Load the 'bert-base-uncased' encoder and build the classification head.

    Args:
      num_labels: number of output units of the linear classifier.
    """
    super(BertForSequenceClassification, self).__init__()
    self.num_labels = num_labels
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # Size the head from the configuration of the encoder that was actually loaded,
    # instead of relying on a notebook-global `config` defined in some other cell.
    bert_config = self.bert.config
    self.dropout = nn.Dropout(bert_config.hidden_dropout_prob) # dropout applied before the classifier
    self.classifier = nn.Linear(bert_config.hidden_size, num_labels)
    nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, token_type_ids = None, attention_mask = None, labels = None):
    """Run the encoder and classify its pooled output.

    Args:
      input_ids: tensor of token ids passed to the encoder.
      token_type_ids: optional tensor forwarded to the encoder.
      attention_mask: optional tensor forwarded to the encoder.
      labels: accepted for interface compatibility; not used by this method.

    Returns:
      The classifier logits computed from element 1 of the encoder output.
    """
    # Keyword arguments keep the call independent of the positional order of the
    # encoder's optional parameters.
    outputs = self.bert(input_ids,
                        token_type_ids = token_type_ids,
                        attention_mask = attention_mask)
    pooled_output = self.dropout(outputs[1])
    return self.classifier(pooled_output)

  def freeze_bert_encoder(self):
    """Disable gradients for every encoder parameter (the classifier is untouched)."""
    for param in self.bert.parameters():
      param.requires_grad = False

  def unfreeze_bert_encoder(self):
    """Re-enable gradients for every encoder parameter."""
    for param in self.bert.parameters():
      param.requires_grad = True



In [None]:
# Load the configuration published with the 'bert-base-uncased' checkpoint so that its
# values (hidden size, dropout probability, vocabulary size) describe the same
# pretrained weights the model loads, rather than hand-typed numbers.
config = BertConfig.from_pretrained('bert-base-uncased')

In [None]:
# One output unit: the single logit is later fed to a binary cross-entropy loss.
num_labels = 1
model = BertForSequenceClassification(num_labels)

100%|██████████| 433/433 [00:00<00:00, 1011210.26B/s]
100%|██████████| 440473133/440473133 [00:17<00:00, 25185611.61B/s]


In [None]:
def unpack_dataset():
  """Download and extract the IMDB review archive into the working directory.

  The call is idempotent: the archive is downloaded only if `aclImdb_v1.tar.gz`
  is missing, and extraction is skipped when an `aclImdb` directory already
  exists (delete that directory to force a fresh extraction).
  """
  import tarfile
  import urllib.request
  from pathlib import Path

  url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
  archive = Path("aclImdb_v1.tar.gz")
  if Path("aclImdb").is_dir():
    return
  if not archive.exists():
    # Download under a temporary name so an interrupted transfer never leaves a
    # truncated file that looks like a complete archive.
    partial = archive.with_name(archive.name + ".part")
    urllib.request.urlretrieve(url, str(partial))
    partial.replace(archive)
  with tarfile.open(str(archive), "r:gz") as tar:
    # Use the restrictive "data" extraction filter on interpreters that provide it.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    tar.extractall(**extract_kwargs)

In [None]:
# Fetch and extract the dataset archive into the current working directory.
unpack_dataset()

--2024-01-03 00:58:13--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-01-03 00:58:35 (3.66 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
from pathlib import Path
# Root of the extracted dataset; listing it is a quick check that extraction worked.
PATH = Path("./aclImdb/")
list(PATH.iterdir())

[PosixPath('aclImdb/imdb.vocab'),
 PosixPath('aclImdb/test'),
 PosixPath('aclImdb/README'),
 PosixPath('aclImdb/train'),
 PosixPath('aclImdb/imdbEr.txt')]

In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the model's encoder is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 899431.01B/s]


In [None]:
# Tokenize one positive training review and preview its first ten tokens.
path = PATH/"train/pos/0_9.txt"
z = tokenizer.tokenize(path.read_text())
z[:10]

['bro', '##m', '##well', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it']

In [None]:
# Map the tokens to vocabulary ids and preview the first ten.
ids = tokenizer.convert_tokens_to_ids(z)
ids[:10]

[22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012, 2009]

In [None]:
# Wrapping `ids` in a list adds a leading dimension of size 1 (a batch of one review).
tokens_tensor = torch.tensor([ids])

In [None]:
# Display the id tensor built above.
tokens_tensor

tensor([[22953,  2213,  4381,  2152,  2003,  1037,  9476,  4038,  1012,  2009,
          2743,  2012,  1996,  2168,  2051,  2004,  2070,  2060,  3454,  2055,
          2082,  2166,  1010,  2107,  2004,  1000,  5089,  1000,  1012,  2026,
          3486,  2086,  1999,  1996,  4252,  9518,  2599,  2033,  2000,  2903,
          2008, 22953,  2213,  4381,  2152,  1005,  1055, 18312,  2003,  2172,
          3553,  2000,  4507,  2084,  2003,  1000,  5089,  1000,  1012,  1996,
         25740,  2000,  5788, 13732,  1010,  1996, 12369,  3993,  2493,  2040,
          2064,  2156,  2157,  2083,  2037, 17203,  5089,  1005, 13433,  8737,
          1010,  1996,  9004, 10196,  4757,  1997,  1996,  2878,  3663,  1010,
          2035, 10825,  2033,  1997,  1996,  2816,  1045,  2354,  1998,  2037,
          2493,  1012,  2043,  1045,  2387,  1996,  2792,  1999,  2029,  1037,
          3076,  8385,  2699,  2000,  6402,  2091,  1996,  2082,  1010,  1045,
          3202,  7383,  1012,  1012,  1012,  1012,  

In [None]:
# Smoke test: one forward pass on the single example, before any training has been run.
logits = model(tokens_tensor)

In [None]:
# Display the raw classifier output for the example.
logits

tensor([[0.5463]], grad_fn=<AddmmBackward0>)

In [None]:
def text2ids(text, max_seq_length = 300):
  """Convert raw text into a fixed-length array of BERT token ids.

  The text is tokenized with the notebook-level `tokenizer`, truncated so it
  fits together with a leading [CLS] and a trailing [SEP] token (the input
  format BERT checkpoints are pretrained with), converted to ids and
  right-padded with the tokenizer's padding id.

  Args:
    text: raw string to encode.
    max_seq_length: length of the returned array; must be at least 2 so the
      two special tokens fit.

  Returns:
    np.ndarray holding exactly `max_seq_length` token ids.

  Raises:
    ValueError: if `max_seq_length` is smaller than 2.
  """
  if max_seq_length < 2:
    raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
  # Reserve two positions for the special tokens before truncating.
  tok_text = tokenizer.tokenize(text)[:max_seq_length - 2]
  tok_text = [tokenizer.cls_token] + tok_text + [tokenizer.sep_token]
  ids_text = tokenizer.convert_tokens_to_ids(tok_text)
  # Ask the tokenizer for its padding id instead of hard-coding 0.
  pad_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
  ids_text += [pad_id] * (max_seq_length - len(ids_text))
  return np.array(ids_text)

In [None]:
# Encode the example review to check the fixed-length output.
text2ids(path.read_text())

array([22953,  2213,  4381,  2152,  2003,  1037,  9476,  4038,  1012,
        2009,  2743,  2012,  1996,  2168,  2051,  2004,  2070,  2060,
        3454,  2055,  2082,  2166,  1010,  2107,  2004,  1000,  5089,
        1000,  1012,  2026,  3486,  2086,  1999,  1996,  4252,  9518,
        2599,  2033,  2000,  2903,  2008, 22953,  2213,  4381,  2152,
        1005,  1055, 18312,  2003,  2172,  3553,  2000,  4507,  2084,
        2003,  1000,  5089,  1000,  1012,  1996, 25740,  2000,  5788,
       13732,  1010,  1996, 12369,  3993,  2493,  2040,  2064,  2156,
        2157,  2083,  2037, 17203,  5089,  1005, 13433,  8737,  1010,
        1996,  9004, 10196,  4757,  1997,  1996,  2878,  3663,  1010,
        2035, 10825,  2033,  1997,  1996,  2816,  1045,  2354,  1998,
        2037,  2493,  1012,  2043,  1045,  2387,  1996,  2792,  1999,
        2029,  1037,  3076,  8385,  2699,  2000,  6402,  2091,  1996,
        2082,  1010,  1045,  3202,  7383,  1012,  1012,  1012,  1012,
        1012,  1012,

In [None]:
class ImdbDataset(Dataset):
  """Reviews under `<PATH>/<train>/pos` and `<PATH>/<train>/neg` with 1/0 labels.

  Files are read and encoded lazily in `__getitem__`; only the file lists and
  the label array are built up front.
  """

  def __init__(self, PATH, train = 'train'):
    """Index the review files of one split.

    Args:
      PATH: pathlib.Path of the dataset root.
      train: name of the split sub-directory (e.g. 'train' or 'test').
    """
    # Attribute name kept for compatibility; the directory holds text reviews.
    self.path_to_images = PATH/train
    # iterdir() yields entries in arbitrary order; sorting makes the
    # index -> sample mapping reproducible across runs and machines.
    self.pos_files = sorted((self.path_to_images/"pos").iterdir())
    self.neg_files = sorted((self.path_to_images/"neg").iterdir())
    self.files = self.pos_files + self.neg_files
    # Labels follow the order of `self.files`: ones for pos, then zeros for neg.
    self.y = np.concatenate((np.ones(len(self.pos_files), dtype = int),
                            np.zeros(len(self.neg_files), dtype = int)), axis = 0)

  def __getitem__(self, index):
    """Return `(token_ids, label)` for the review at `index`."""
    path = self.files[index]
    # Decode explicitly as UTF-8 rather than with the platform default encoding.
    x = text2ids(path.read_text(encoding = "utf-8"))
    return x, self.y[index]

  def __len__(self):
    """Number of reviews in the split."""
    return len(self.y)

In [None]:
# Training data comes from the "train" directory; the "test" directory is used for validation.
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [None]:
# Only the training loader shuffles; the validation loader keeps the dataset order.
batch_size = 10
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = batch_size)

In [None]:
# Fetch a single (token ids, label) pair directly from the dataset.
x, y = train_ds[0]

In [None]:
# Pull one batch from the training loader (this rebinds x and y to batched values).
x, y = next(iter(train_dl))

In [None]:
# Inspect the fourth sequence of the batch.
x[3]

tensor([ 2004,  6827,  1037,  2112,  1997,  2329,  3769,  3226,  2004,  1996,
        18446, 18750,  1998,  2508,  5416,  1010,  3460,  2040,  2001,  1037,
         5294,  2718,  2005,  2656,  2086,  1006,  3699,  1011,  2960,  1007,
         1010,  2437,  2009,  2028,  1997,  1996,  6493,  2770,  2694,  3065,
         1999,  1996,  2088,  1006,  2087, 28172,  2024,  5341,  2000,  2031,
         2702,  3692,  1007,  1012,  3488,  2000,  2128, 27927,  1996,  2186,
         2020,  2467,  2006,  1996,  4035,  1005,  1055, 11376,  1010,  1998,
         2044,  1037, 28616,  9289, 19879,  3064,  1006,  2025,  2000,  5254,
         2137,  3550,  1007,  2694,  3185,  2550,  2011,  4419,  3478,  2000,
         5425,  1996,  3894,  1997,  1996,  2434,  2544,  1010,  2178,  3157,
         2086,  1006,  5021,  4335, 11867, 21511,  1998,  6579,  7163,  1011,
         2186, 26206,  1007,  2020,  3223,  2077,  1996,  7209,  2051,  2935,
         2071,  2709,  7919,  1010, 14571,  1997, 10251,  3213, 

In [None]:
# NOTE(review): this cell repeats an earlier cell verbatim; it rebuilds the same two datasets.
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [None]:
# NOTE(review): repeats an earlier cell verbatim; it rebuilds the same two loaders.
batch_size = 10
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = batch_size)

In [None]:
# NOTE(review): repeats an earlier cell verbatim.
x, y = train_ds[0]

In [None]:
# NOTE(review): repeats an earlier cell verbatim; the batch differs because the loader shuffles.
x, y = next(iter(train_dl))

In [None]:
# Inspect the fourth sequence of the freshly drawn batch.
x[3]

tensor([ 2023,  2003,  2382,  3371,  2265,  2055,  2028,  8257,  1012,  1996,
         8257,  1010,  5430,  3549,  2024,  2025,  5845,  7199,  1012,  5292,
         3270,  3270,   999,   999,   999,  2002,  2064,  1005,  1056, 13764,
         1037,  3042,  2138,  2002,  2003,  1037,  5430,  2386,  1012,  5430,
         3549,  2024,  2025,  2004,  6047,  2004,  2529,  9552,  1012,  2821,
        15333,  9351,  1010,  2216,  5430,  3549,  2024,  2061,  4895, 28793,
        24158, 22723,  3064,  1012,  2045,  2003,  2053,  8562,  1999,  2023,
         2265,  1012,  2027,  2064,  2069,  2448,  2125,  2023,  2028,  8257,
         2005,  2061,  2146,  1998,  2027,  2525,  2031,  2007,  1996, 16216,
        11261, 12698,  1012,  2023,  2265,  2515,  2025, 10107,  1037,  2051,
        10453,  2006,  2120,  1056,  1012,  1058,  1012,  1026,  7987,  1013,
         1028,  1026,  7987,  1013,  1028,  2023,  2265,  5363,  2000,  2524,
         2000,  2022,  6057,  1010,  2021,  2009,  2074,  3475, 

In [None]:
def train_model(model, optimizer, num_epochs = 25):
  """Train `model` on the notebook-level `train_dl` and report metrics every epoch.

  After each epoch the model is evaluated with `eval_model` and one line with
  the mean training loss, validation loss and validation accuracy is printed.

  Args:
    model: module mapping a batch of token ids to one logit per example.
    optimizer: optimizer already constructed over the model's parameters.
    num_epochs: number of full passes over `train_dl`.
  """
  # Follow whatever device the model lives on instead of assuming CUDA, so the
  # same loop also runs on CPU-only machines.
  device = next(model.parameters()).device
  for epoch in range(num_epochs):
    model.train()  # eval_model switches to eval mode, so re-enable training mode each epoch
    running_loss = 0.0
    for x, y in train_dl:
      x = x.to(device)
      # Targets must be float and shaped like the logits for the BCE loss.
      y = y.float().unsqueeze(1).to(device)
      optimizer.zero_grad()
      logits = model(x)
      loss = F.binary_cross_entropy_with_logits(logits, y)
      loss.backward()
      optimizer.step()
      # The loss is a batch mean; weight it by batch size to average per example.
      running_loss += loss.item() * x.size(0)
    epoch_loss = running_loss / len(train_ds)
    val_loss, accuracy = eval_model(model)
    print("train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}".format(
        epoch_loss, val_loss, accuracy
    ))

In [None]:
def eval_model(model):
  """Evaluate `model` on the notebook-level `valid_dl`.

  Puts the model in eval mode and leaves it there; callers that continue
  training must call `model.train()` again.

  Args:
    model: module mapping a batch of token ids to one logit per example.

  Returns:
    Tuple `(mean_loss, accuracy)` of Python floats, both averaged over
    `len(valid_ds)` examples.
  """
  model.eval()
  # Follow the model's device instead of assuming CUDA.
  device = next(model.parameters()).device
  running_loss = 0.0
  correct = 0
  # No gradients are needed for evaluation; skipping the autograd graph saves
  # memory and time.
  with torch.no_grad():
    for x, y in valid_dl:
      x = x.to(device)
      y = y.float().unsqueeze(1).to(device)
      logits = model(x)
      loss = F.binary_cross_entropy_with_logits(logits, y)
      # A positive logit corresponds to a predicted probability above 0.5.
      y_pred = logits > 0
      # Accumulate as a Python int so the counter is exact and stays off the device.
      correct += (y_pred.float() == y).sum().item()
      running_loss += loss.item() * x.size(0)
  accuracy = correct / len(valid_ds)
  epoch_loss = running_loss / len(valid_ds)
  return epoch_loss, accuracy

In [None]:
# Move the model's parameters to the GPU; this cell requires a CUDA device.
model = model.cuda()

In [None]:
# Two parameter groups: the pretrained encoder is updated with a learning rate ten
# times smaller than the one used for the freshly initialized classifier.
lrlast = 0.0001
lrmain = 0.00001
optimizer = optim.Adam(
    [
        {"params": model.bert.parameters(), "lr": lrmain},
        {"params": model.classifier.parameters(), "lr": lrlast},
    ]
)

In [None]:
# Fine-tune for two epochs; one metrics line is printed per epoch.
train_model(model, optimizer, num_epochs =2)

train loss: 0.166, valid loss 0.214 accuracy 0.920
train loss: 0.097, valid loss 0.217 accuracy 0.922
