In [13]:
!pip install sacremoses
!pip install datasets
!clear

[H[2J

In [14]:
from transformers import BertModel, BertTokenizer
import datasets
import torch
import pandas

In [15]:
# Tokenizer training
# https://ai.plainenglish.io/bert-pytorch-implementation-prepare-dataset-part-1-efd259113e5a
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to("cuda")

BATCH_SIZE = 32

In [16]:
model = BertModel.from_pretrained("bert-base-uncased").to("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

csv = pandas.read_csv("TrainReviews.csv").sample(frac=1).reset_index(drop=True)
train_csv = csv[0:3000].reset_index(drop=True)
valid_csv = csv[3000:3870].reset_index(drop=True)

class ReviewDataset(torch.utils.data.Dataset):
  def __init__(self, pandas_array):
    tokenized = tokenizer.batch_encode_plus(
        pandas_array["review"],
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors="pt",
    )

    self.tokenized = tokenized
    self.classes = pandas_array["class"]

  def __getitem__(self, idx):
    return self.tokenized["input_ids"][idx], self.tokenized["attention_mask"][idx], torch.tensor(self.classes[idx])

  def __len__(self):
    return len(self.classes)

train_dataset = ReviewDataset(train_csv)
valid_dataset = ReviewDataset(valid_csv)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
class Classifier(torch.nn.Module):
  def __init__(self, bert_model):
    super(Classifier, self).__init__()

    for param in bert_model.parameters():
      param.requires_grad = False

    self.bert_model = bert_model

    self.linear = torch.nn.Linear(768, 1)
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, x, attention_mask):
    y = x.clone().detach()
    x = self.bert_model(x, attention_mask=attention_mask)
    x = self.linear(x.pooler_output)
    x = self.sigmoid(x)

    return x

In [18]:
classifier = Classifier(model).to("cuda")
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)
criterion = torch.nn.BCELoss()

num_epochs = 2
for epoch in range(num_epochs):
  classifier.train()
  for i, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
    labels = labels.float().to("cuda").view(-1, 1)
    input_ids = input_ids.to("cuda")
    attention_mask = attention_mask.to("cuda")

    out = classifier(input_ids, attention_mask=attention_mask)

    optimizer.zero_grad()
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()

    if i%10 == 0:
      print(f"Epoch: {epoch+1}/{num_epochs}; Step: {i}/{len(train_dataloader)}; Loss: {loss}")

  with torch.no_grad():
    loss_sum = 0
    acc_sum = 0

    classifier.eval()
    for i, (input_ids, attention_mask, labels) in enumerate(valid_dataloader):
      labels = labels.float().to("cuda").view(-1, 1)
      input_ids = input_ids.to("cuda")
      attention_mask = attention_mask.to("cuda")

      out = classifier(input_ids, attention_mask=attention_mask)
      loss = criterion(out, labels)
      loss_sum += loss
      acc_sum += torch.sum(torch.round(out) == labels) / labels.shape[0]

    avg_loss = loss_sum/len(valid_dataloader)
    avg_acc = acc_sum/len(valid_dataloader)
    print(f"###### VALIDATION ###### Loss: {avg_loss}; Accuracy: {avg_acc}")


Epoch: 1/2; Step: 0/94; Loss: 0.7262423038482666
Epoch: 1/2; Step: 10/94; Loss: 0.6578466892242432
Epoch: 1/2; Step: 20/94; Loss: 0.5404446125030518
Epoch: 1/2; Step: 30/94; Loss: 0.7021533250808716
Epoch: 1/2; Step: 40/94; Loss: 0.5734280347824097
Epoch: 1/2; Step: 50/94; Loss: 0.451293021440506
Epoch: 1/2; Step: 60/94; Loss: 0.5314268469810486
Epoch: 1/2; Step: 70/94; Loss: 0.6016862392425537
Epoch: 1/2; Step: 80/94; Loss: 0.45121335983276367
Epoch: 1/2; Step: 90/94; Loss: 0.565631628036499
###### VALIDATION ###### Loss: 0.5253691077232361; Accuracy: 0.7150298357009888
Epoch: 2/2; Step: 0/94; Loss: 0.5361878871917725
Epoch: 2/2; Step: 10/94; Loss: 0.5617011189460754
Epoch: 2/2; Step: 20/94; Loss: 0.6187691688537598
Epoch: 2/2; Step: 30/94; Loss: 0.4766705632209778
Epoch: 2/2; Step: 40/94; Loss: 0.502291202545166
Epoch: 2/2; Step: 50/94; Loss: 0.5353785157203674
Epoch: 2/2; Step: 60/94; Loss: 0.5627908706665039
Epoch: 2/2; Step: 70/94; Loss: 0.48338472843170166
Epoch: 2/2; Step: 80/94