In [None]:
!pip install pytorch_pretrained_bert
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 4.3 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.20.44-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 35.0 MB/s 
Collecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.1 MB/s 
[?25hCollecting botocore<1.24.0,>=1.23.44
  Downloading botocore-1.23.44-py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 44.5 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.8-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 45.2 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 46.8 MB/s 
[?25hInstalling c

In [None]:
# Colab only: mount Google Drive and switch into the project folder so that the
# relative paths used by later cells (gold_data/..., result.txt) resolve there.
from google.colab import drive 
drive.mount('/content/drive')
%cd /content/drive/My Drive/CS_project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/CS_project


In [None]:
from read_sentences import read_file

# Merge the eval and test splits into a single held-out file.
filenames = ['gold_data/eval.conll', 'gold_data/test.conll']
with open('gold_data/testeval.conll', 'w') as outfile:
    for fname in filenames:
        last_line = "\n"
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
                last_line = line
        # If a source file lacks a final newline, terminate its last line so it
        # is not fused with the first line of the next file.
        if not last_line.endswith("\n"):
            outfile.write("\n")

train_sent, train_tagged_sent = read_file("gold_data/train.conll")
eval_sent, eval_tagged_sent = read_file("gold_data/testeval.conll")

# Tag inventory comes from the training split only. sorted() makes the
# tag -> index mapping identical across kernel restarts; iterating a bare set of
# strings depends on per-process hash randomisation.
tags = sorted(set(word_tag[1] for sent in train_tagged_sent for word_tag in sent))
# Index 0 is reserved for the padding label.
tags = ["<pad>"] + tags

tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Tokenizer for the 'bert-base-cased' checkpoint; do_lower_case=False keeps the
# original capitalisation of the input words.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 292623.18B/s]


In [None]:
class SMTDataset(data.Dataset):
    """Tagged sentences prepared for token-level classification with BERT.

    Every sentence is wrapped in "[CLS]" / "[SEP]" (both labelled "<pad>").
    Items are tokenised lazily in ``__getitem__`` using the module-level
    ``tokenizer`` and ``tag2idx``.
    """

    def __init__(self, tagged_sents):
        """Store words and tags of each sentence.

        Args:
            tagged_sents: iterable of sentences, each a sequence whose items
                hold the word at index 0 and its tag at index 1.
        """
        sents, tags_li = [], []
        for sent in tagged_sents:
            words = [word_tag[0] for word_tag in sent]
            tags = [word_tag[1] for word_tag in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Tokenise sentence ``idx``.

        Returns:
            A tuple ``(words, x, is_heads, tags, y, seqlen)`` where ``words`` and
            ``tags`` are space-joined strings, ``x`` are word-piece ids,
            ``is_heads`` marks the first piece of every word with 1, ``y`` are
            tag ids (non-head pieces get the "<pad>" id) and ``seqlen`` is the
            number of word pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx]

        x, y = [], []
        is_heads = []
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer can return no pieces for a word; without a
                # placeholder the word would have a label and a head flag but
                # no input id, breaking the length invariant below.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)
            is_head = [1] + [0]*(len(tokens) - 1)

            # Only the first piece of a word carries the real tag.
            t = [t] + ["<pad>"] * (len(tokens) - 1)
            yy = [tag2idx[each] for each in t]

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        seqlen = len(y)

        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

In [None]:
def pad(batch):
    """Collate dataset samples into one batch, right-padding ids with 0.

    Args:
        batch: list of ``(words, x, is_heads, tags, y, seqlen)`` samples.

    Returns:
        ``(words, x, is_heads, tags, y, seqlens)`` where ``x`` and ``y`` are
        ``torch.LongTensor`` padded to the largest ``seqlen`` in the batch and
        the remaining entries are plain Python lists, one item per sample.
    """
    words, token_ids, is_heads, tags, tag_ids, seqlens = (
        list(column) for column in zip(*batch)
    )
    longest = max(seqlens)

    def _right_pad(ids):
        # Builds a new list, so the sample's own list is left untouched.
        return ids + [0] * (longest - len(ids))

    padded_x = [_right_pad(ids) for ids in token_ids]
    padded_y = [_right_pad(ids) for ids in tag_ids]

    return (
        words,
        torch.LongTensor(padded_x),
        is_heads,
        tags,
        torch.LongTensor(padded_y),
        seqlens,
    )

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    """BERT encoder followed by a linear layer that scores every tag per token."""

    def __init__(self, vocab_size=None):
        """Load 'bert-base-cased' and create the classification layer.

        Args:
            vocab_size: number of output classes (size of the tag vocabulary).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # 768 is the input width expected from the encoder's last layer.
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        """Score every token of ``x``.

        Args:
            x: LongTensor of word-piece ids, padded with 0.
            y: LongTensor of tag ids aligned with ``x``.

        Returns:
            ``(logits, y, y_hat)``: per-token tag scores, the labels moved to the
            module's device, and the arg-max tag id for each token.
        """
        # Follow the device this module's weights live on instead of the global
        # `device`: under nn.DataParallel each replica sits on a different GPU
        # and must keep its inputs there.
        target_device = self.fc.weight.device
        x = x.to(target_device)
        y = y.to(target_device)

        # Id 0 is what `pad` fills with; mask those positions so real tokens do
        # not attend to padding.
        attention_mask = (x != 0).long()

        # Gradients flow through BERT only in training mode. The train/eval mode
        # of self.bert already follows this module's own .train()/.eval().
        with torch.set_grad_enabled(self.training):
            encoded_layers, _ = self.bert(x, attention_mask=attention_mask)
            enc = encoded_layers[-1]

        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, updating ``model`` after every batch.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields ``(words, x, is_heads, tags, y, seqlens)`` batches.
        optimizer: optimiser holding the model's parameters.
        criterion: loss taking ``(N, C)`` logits and ``(N,)`` targets.

    The loss is printed every 10th step; nothing is returned.
    """
    model.train()
    for step, (_words, token_ids, _is_heads, _tags, labels, _seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        scores, labels, _ = model(token_ids, labels)

        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        flat_scores = scores.view(-1, scores.size(-1))
        flat_labels = labels.view(-1)

        loss = criterion(flat_scores, flat_labels)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator):
    """Tag every batch in ``iterator``, write ``result.txt`` and report metrics.

    Note: the name shadows the built-in ``eval``; it is kept because callers
    use it.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields ``(words, x, is_heads, tags, y, seqlens)`` batches.

    Side effects:
        Overwrites ``result.txt`` in the working directory with one
        ``word gold predicted`` line per word and a blank line after each
        sentence, and prints accuracy and macro precision / recall / F-score.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    # get results and save; gold/predicted tags are also kept in memory so the
    # metrics do not have to be parsed back out of the file
    gold_tags, pred_tags = [], []
    with open("result.txt", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # keep one prediction per word: the one made on its first piece
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            # [1:-1] drops the [CLS] / [SEP] positions
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                gold_tags.append(t)
                pred_tags.append(p)
            fout.write("\n")

    # calculate metrics
    y_true = np.array([tag2idx[tag] for tag in gold_tags])
    y_pred = np.array([tag2idx[tag] for tag in pred_tags])

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="macro"),
        "recall": recall_score(y_true, y_pred, average="macro"),
        "f1": f1_score(y_true, y_pred, average="macro"),
    }

    print("Accuracy: ", metrics["accuracy"])
    print("Precision: ", metrics["precision"])
    print("Recall: ", metrics["recall"])
    print("F-score: ", metrics["f1"])

    return metrics

In [None]:
# Build the tagger with one output class per entry in tag2idx, move it to the
# selected device, then wrap it in DataParallel.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

100%|██████████| 404400730/404400730 [00:30<00:00, 13172885.31B/s]


In [None]:
# Wrap both splits so sentences are tokenised on demand.
train_dataset = SMTDataset(train_tagged_sent)
eval_dataset = SMTDataset(eval_tagged_sent)

# Only the training loader shuffles; `pad` collates samples into padded tensors.
train_iter = data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=1,
    collate_fn=pad,
)
test_iter = data.DataLoader(
    dataset=eval_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=1,
    collate_fn=pad,
)

# Targets equal to 0 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# One training pass over train_iter, then evaluation on the merged eval+test
# split; eval() also (re)writes result.txt, which the cells below read.
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 4.287162780761719
step: 10, loss: 2.1621651649475098
step: 20, loss: 0.8451504111289978
step: 30, loss: 0.8115382194519043
step: 40, loss: 0.6449341177940369
step: 50, loss: 0.31196263432502747
step: 60, loss: 0.38537073135375977
step: 70, loss: 0.29861655831336975
step: 80, loss: 0.26663848757743835
step: 90, loss: 0.259267657995224
step: 100, loss: 0.2070050835609436
step: 110, loss: 0.12061232328414917
step: 120, loss: 0.08054866641759872
step: 130, loss: 0.05581691861152649
step: 140, loss: 0.0645441934466362
step: 150, loss: 0.18107633292675018
step: 160, loss: 0.1619790643453598
step: 170, loss: 0.19197355210781097
step: 180, loss: 0.09833307564258575
step: 190, loss: 0.28944987058639526
step: 200, loss: 0.09460791945457458
step: 210, loss: 0.06915973871946335
step: 220, loss: 0.055473748594522476
step: 230, loss: 0.2306976169347763
step: 240, loss: 0.10406902432441711
step: 250, loss: 0.17786699533462524
step: 260, loss: 0.11540894210338593
step: 270, loss: 0.0595

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TypeError: ignored

In [None]:
# Preview the first 250 lines of the predictions; the context manager closes
# the file handle instead of leaving it to the garbage collector.
with open('result.txt', 'r') as result_file:
    result_preview = result_file.read().splitlines()[:250]
result_preview

In [None]:
from collections import Counter, OrderedDict
import matplotlib.pylab as plt

# Gold tags with fewer misclassified words than this are left out of the chart.
MIN_ERRORS_TO_PLOT = 4

words, correct_class, wrong_class = [], [], []

# Each non-blank line of result.txt is "word gold_tag predicted_tag".
with open("result.txt", "r") as result_file:
    for line in result_file:
        fields = line.split()
        # Skip sentence separators and any line without all three fields
        # (indexing fields[2] on a shorter line would raise IndexError).
        if len(fields) >= 3 and fields[1] != fields[2]:
            words.append(fields[0])
            correct_class.append(fields[1])
            wrong_class.append(fields[2])

correct_counts = Counter(correct_class)
wrong_counts = Counter(wrong_class)

# Gold tags ordered from most to least frequently misclassified.
correct_counts = dict(sorted(correct_counts.items(), key=lambda item: item[1], reverse=True))

x = [] # keys
y = [] # values
for key, value in correct_counts.items():
    if value >= MIN_ERRORS_TO_PLOT:
        x.append(key)
        y.append(value)
plt.bar(x, y, color = 'green')

# Print each count inside its bar, halfway up.
for i in range(len(x)):
    plt.text(i, y[i]//2, y[i], ha = 'center', fontsize = 8, fontweight='bold')


plt.title('Occurences of the Incorrectly Classified Semantic Tags by BERT', fontsize = 14, fontweight='bold', fontfamily='serif')
plt.xlabel('Semantic tag', fontweight='bold', fontfamily='serif', fontsize = 12)
plt.ylabel('Number of occurences', fontweight='bold', fontfamily='serif', fontsize = 12)
#plt.savefig("incorrectly_classified_baseline.png")
plt.show()
