In [1]:
import torch
print(torch.__version__)
!pip install transformers

1.3.1


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [0]:
from Bert_DataFetcher import BertDataSet
from Capsule_Bert_model import BertCapsuleNet

In [0]:
CUDA_LAUNCH_BLOCKING="1"

ds = BertDataSet('train.txt')
ds_loader = DataLoader(ds, batch_size=16, shuffle=True)

criterion = nn.CrossEntropyLoss()

lr = 2e-3  # 2e-3
max_grad_norm = 1.0
num_training_steps = len(ds.data_x) * 10
num_warmup_steps = 0
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1


device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )


model = BertCapsuleNet(freeze_bert=False)
model = model.to(device)

# optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 )
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

In [5]:
# train

num_epoch = 12

model.train()

for e in range(num_epoch):
    running_loss = 0.0
    for i, data in enumerate(ds_loader):
        X, Y, attn_msks = data
        X = X.to(device)
        Y = Y.to(device)
        attn_msks = attn_msks.to(device)
        outputs = model((X, attn_msks))
        loss = criterion(outputs, Y)
        running_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # print statistics
        if i % 200 == 199:
            print('%d epoch: %d Done, loss = %f' % (e, i, running_loss / 200.0))
            running_loss = 0.0

print("------Finish training------")

0 epoch: 199 Done, loss = 0.511645
0 epoch: 399 Done, loss = 0.465977
0 epoch: 599 Done, loss = 0.459275
0 epoch: 799 Done, loss = 0.458863
0 epoch: 999 Done, loss = 0.442527
1 epoch: 199 Done, loss = 0.429207
1 epoch: 399 Done, loss = 0.427520
1 epoch: 599 Done, loss = 0.431351
1 epoch: 799 Done, loss = 0.427016
1 epoch: 999 Done, loss = 0.422343
2 epoch: 199 Done, loss = 0.408386
2 epoch: 399 Done, loss = 0.406893
2 epoch: 599 Done, loss = 0.424082
2 epoch: 799 Done, loss = 0.418728
2 epoch: 999 Done, loss = 0.417575
3 epoch: 199 Done, loss = 0.396948
3 epoch: 399 Done, loss = 0.401970
3 epoch: 599 Done, loss = 0.409090
3 epoch: 799 Done, loss = 0.407245
3 epoch: 999 Done, loss = 0.413367
4 epoch: 199 Done, loss = 0.397560
4 epoch: 399 Done, loss = 0.407979
4 epoch: 599 Done, loss = 0.395746
4 epoch: 799 Done, loss = 0.384986
4 epoch: 999 Done, loss = 0.396809
5 epoch: 199 Done, loss = 0.375542
5 epoch: 399 Done, loss = 0.362099
5 epoch: 599 Done, loss = 0.386741
5 epoch: 799 Done, l

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
torch.save(model.state_dict(), "./gdrive/My Drive/freeze_bert_capsule.pt")

In [14]:
# test

model = BertCapsuleNet(freeze_bert=False)
model.load_state_dict(torch.load("./gdrive/My Drive/freeze_bert_capsule.pt"))
model = model.to(device)

test_ds = BertDataSet('test.txt')
test_ds_loader = DataLoader(test_ds, batch_size=16, shuffle=False)

correct = 0
total = 0

# matrix used for computing f1 score
y_true = None
y_pred = None

model.eval()

with torch.no_grad():
    for i, data in enumerate(test_ds_loader):
        X, labels, attn_msks = data
        X = X.to(device)
        labels = labels.to(device)
        attn_msks = attn_msks.to(device)
        outputs = model((X, attn_msks))
        _, predicted = torch.max(outputs.data, 1)  # predicted shape: [batch_size, 1]
        total += labels.size(0)  # labels shape: [batch_size, 1]
        correct += (predicted == labels).sum().item()
        if i == 0:
            y_true = labels
            y_pred = predicted
        else:
            y_true = torch.cat((y_true, labels), 0)
            y_pred = torch.cat((y_pred, predicted), 0)
    print('F1 score: ', f1_score(y_true.cpu().numpy(), y_pred.cpu().numpy()))
    print('Precision score: ', precision_score(y_true.cpu().numpy(), y_pred.cpu().numpy()))
    print('Recall score: ', recall_score(y_true.cpu().numpy(), y_pred.cpu().numpy()))
    print('Accuracy score: ', accuracy_score(y_true.cpu().numpy(), y_pred.cpu().numpy()))

F1 score:  0.7523277467411545
Precision score:  0.7651515151515151
Recall score:  0.73992673992674
Accuracy score:  0.8358024691358025
