### In Colab

In [1]:
# Mount Google Drive under the relative directory ./drive so the dataset can be
# opened below with a 'drive/MyDrive/...' path. force_remount=True asks Colab to
# mount again even when a mount from an earlier run is still present.
from google.colab import drive

drive.mount(mountpoint='drive', force_remount=True)

Mounted at drive


In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=bcccb85987f049640feaf2b4a14650dee4ff7608b1b9a61c0e33c8f2747f20e6
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


### Imports

In [3]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from seqeval.metrics import classification_report

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Read Dataset

In [4]:
# Read the whole file first so the handle is closed before parsing starts.
with open('drive/MyDrive/Experiments/SNLP2/week_07/all_data.data', 'rt', encoding='utf8') as fr:
    all_data = fr.read().split('\n')

# Each non-blank line holds exactly six tab-separated columns: the token, its
# BIO label, and four columns that are ignored here. A blank line ends a sentence.
all_labels = set()  # every distinct label string seen in the file
X, Y, xx, yy = [], [], [], []  # X/Y: finished sentences and label sequences; xx/yy: the one being built
# The extra '' sentinel flushes the final sentence even when the file does not
# end with a blank line.
for line in all_data + ['']:
    if line.strip():
        w, label, _, _, _, _ = line.split('\t')
        all_labels.add(label)
        xx.append(w.lower())  # tokens are lower-cased before being stored
        yy.append(label)
    elif xx:
        # Only flush when something was collected, so runs of blank lines (or a
        # trailing newline) never create empty sentences.
        X.append(xx)
        Y.append(yy)
        xx, yy = [], []
assert len(X) == len(Y)
assert X, 'no sentences were read from the data file'
print(f'data documents: {len(X)}\n'
      f'sent: {X[0]}\n'
      f'labels: {Y[0]}')

data documents: 1312
sent: ['analysis', 'of', 'the', 'efficacy', 'of', 'diet', 'and', 'short-term', 'probiotic', 'intervention', 'on', 'depressive', 'symptoms', 'in', 'patients', 'after', 'bariatric', 'surgery', ':', 'a', 'randomized', 'double-blind', 'placebo', 'controlled', 'pilot', 'study', '.']
labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DDF', 'I-DDF', 'O', 'B-human', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [5]:
# Show how many distinct labels were collected, followed by the label set itself.
label_summary = (len(all_labels), all_labels)
print(f'labels: {label_summary}')

labels: (27, {'B-DDF', 'B-food', 'B-gene', 'B-human', 'B-anatomical%location', 'B-biomedical%technique', 'I-bacteria', 'B-animal', 'B-drug', 'I-human', 'B-statistical%technique', 'I-food', 'O', 'B-chemical', 'I-statistical%technique', 'B-microbiome', 'B-dietary%supplement', 'I-animal', 'I-microbiome', 'I-dietary%supplement', 'B-bacteria', 'I-DDF', 'I-gene', 'I-biomedical%technique', 'I-chemical', 'I-anatomical%location', 'I-drug'})


In [6]:
# Sort the labels before numbering them: iterating a set of strings yields an
# order that can differ between interpreter runs, which would give every kernel
# restart (and every reloaded checkpoint) a different label <-> id mapping.
label2id = {label: i for i, label in enumerate(sorted(all_labels))}
# Inverse lookup used to turn predicted ids back into label strings.
id2label = {i: label for label, i in label2id.items()}

### Split into Train and Test

In [7]:
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps the
# split identical from run to run.
splits = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = splits

print(len(X_train), len(X_test))

1049 263


In [8]:
# Collect every distinct token that occurs in the training sentences.
train_words = set()
for sentence in X_train:
    train_words.update(sentence)

# Index 0 is reserved for '<UNK>'; the remaining tokens follow in sorted order.
vocab = ['<UNK>'] + sorted(train_words)
vocab2id = {word: idx for idx, word in enumerate(vocab)}
id2vocab = {idx: word for word, idx in vocab2id.items()}

print(f'size of vocab: {len(vocab)}')

size of vocab: 4167


### Encode the input

In [9]:
unk_id = vocab2id['<UNK>']

# Training tokens are looked up directly: the vocabulary was built from X_train,
# so every one of them has an id.
X_train_tensors = []
for train_sent in X_train:
    train_ids = [vocab2id[w] for w in train_sent]
    X_train_tensors.append(torch.tensor(train_ids, dtype=torch.long))

# Test tokens that never appeared in training fall back to the '<UNK>' id.
X_test_tensors = []
for test_sent in X_test:
    test_ids = [vocab2id.get(w, unk_id) for w in test_sent]
    X_test_tensors.append(torch.tensor(test_ids, dtype=torch.long))

print(len(X_train_tensors), len(X_test_tensors))

1049 263


In [10]:
def _encode_tags(tag_sequences):
    """Turn each sequence of label strings into a 1-D long tensor of label ids."""
    encoded = []
    for tags in tag_sequences:
        encoded.append(torch.tensor([label2id[label] for label in tags], dtype=torch.long))
    return encoded


Y_train_tensors = _encode_tags(Y_train)
Y_test_tensors = _encode_tags(Y_test)

print(len(Y_train_tensors), len(Y_test_tensors))

1049 263


### GRU Network, Loss Function and Optimizer

In [11]:
class Net(nn.Module):
    """Token classifier: embedding -> (optionally bidirectional) GRU -> linear layer.

    The GRU is created with PyTorch's default ``batch_first=False``, so inputs
    are laid out sequence-first.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        emb_size: embedding dimension fed to the GRU.
        hidden_size: GRU hidden size per direction; also the classifier input size.
        output_size: number of classes scored for every token.
        bidirectional: whether the GRU runs in both directions.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, emb_size, hidden_size, output_size, bidirectional, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, bidirectional=bidirectional, num_layers=num_layers)
        # The two directions are summed (not concatenated) in forward(), so the
        # classifier always sees hidden_size features.
        self.clf1 = nn.Linear(hidden_size, output_size)

    def forward(self, X, h0=None):
        """Score every token of the input.

        Args:
            X: long tensor of token ids, shaped ``(seq_len,)`` for a single
                sentence or ``(seq_len, batch)`` for a batch.
            h0: optional initial hidden state as produced by ``init_hidden``;
                when omitted the GRU starts from zeros.

        Returns:
            Unnormalised class scores with the same leading dimensions as ``X``
            and a trailing dimension of ``output_size``.
        """
        e = self.embedding(X)
        o, _ = self.gru(e, h0)
        if self.bidirectional:
            # The GRU concatenates both directions along the feature (last)
            # axis. Slicing on that axis, rather than on dim 1, keeps this
            # correct for batched input as well as for a single sentence.
            forward_out = o[..., :self.hidden_size]
            backward_out = o[..., self.hidden_size:]
            o = forward_out + backward_out
        return self.clf1(o)

    def init_hidden(self, batch_size=None):
        """Build a zero initial hidden state on the same device as the model.

        Args:
            batch_size: ``None`` (default) for a single unbatched sentence, or
                the batch size for ``(seq_len, batch)`` input.

        Returns:
            Zeros shaped ``(num_directions * num_layers, hidden_size)`` when
            ``batch_size`` is ``None``, otherwise
            ``(num_directions * num_layers, batch_size, hidden_size)``.
        """
        num_directions = 2 if self.bidirectional else 1
        if batch_size is None:
            shape = (num_directions * self.num_layers, self.hidden_size)
        else:
            shape = (num_directions * self.num_layers, batch_size, self.hidden_size)
        # Follow the parameters' dtype/device so the state is usable without an
        # extra .to(device) by the caller (calling .to(device) is still harmless).
        reference = self.embedding.weight
        return torch.zeros(shape, dtype=reference.dtype, device=reference.device)

In [52]:
### hyperparameters
SEED = 42  # same value as the random_state used for the train/test split
# Seed PyTorch before the model is built so weight initialisation is repeatable
# across kernel restarts. NOTE(review): some GPU kernels can still be
# non-deterministic; this makes runs repeatable, not necessarily bit-identical.
torch.manual_seed(SEED)

input_size = len(vocab2id)      # one embedding row per vocabulary entry
emb_size = 512
hidden_size = 2 * emb_size      # GRU hidden size per direction
output_size = len(label2id)     # one score per label
lr = 5e-5
epochs = 50
bidirectional = True
num_layers = 2
model_name = 'gru-2-bidirectional-50-epochs'  # printed above the evaluation report

net = Net(input_size=input_size, emb_size=emb_size, hidden_size=hidden_size, output_size=output_size,
          bidirectional=bidirectional, num_layers=num_layers).to(device)

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(net.parameters(), lr=lr)

### Training Loop

In [53]:
net.train()

# One optimiser step per sentence (effective batch size of 1), visiting the
# training sentences in their stored order every epoch.
for epoch in tqdm(range(epochs)):
    epoch_loss = 0
    for train_tensor, train_y in zip(X_train_tensors, Y_train_tensors):
        train_tensor, train_y = train_tensor.to(device), train_y.to(device)
        h0 = net.init_hidden().to(device)  # fresh zero state for every sentence
        opt.zero_grad()
        # Call the module itself rather than .forward() so nn.Module hooks run.
        pred_y = net(train_tensor, h0)
        loss = loss_fn(pred_y, train_y)
        loss.backward()
        opt.step()

        epoch_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError if the training set is empty.
    print(f'loss: {epoch_loss / max(len(X_train_tensors), 1)}')

  2%|▏         | 1/50 [00:07<05:59,  7.35s/it]

loss: 0.7496948129576484


  4%|▍         | 2/50 [00:14<05:53,  7.37s/it]

loss: 0.39466586012439403


  6%|▌         | 3/50 [00:22<05:47,  7.39s/it]

loss: 0.23160258515242327


  8%|▊         | 4/50 [00:29<05:39,  7.37s/it]

loss: 0.12765127075418786


 10%|█         | 5/50 [00:36<05:31,  7.36s/it]

loss: 0.07134179751611461


 12%|█▏        | 6/50 [00:44<05:23,  7.34s/it]

loss: 0.045913834554866216


 14%|█▍        | 7/50 [00:51<05:16,  7.35s/it]

loss: 0.0314870713486375


 16%|█▌        | 8/50 [00:58<05:07,  7.33s/it]

loss: 0.020457778742154757


 18%|█▊        | 9/50 [01:06<05:00,  7.34s/it]

loss: 0.018015495952164208


 20%|██        | 10/50 [01:13<04:53,  7.35s/it]

loss: 0.025374433941813615


 22%|██▏       | 11/50 [01:20<04:45,  7.33s/it]

loss: 0.012846277904585089


 24%|██▍       | 12/50 [01:28<04:39,  7.34s/it]

loss: 0.007466400224460987


 26%|██▌       | 13/50 [01:35<04:33,  7.39s/it]

loss: 0.007702782444238884


 28%|██▊       | 14/50 [01:43<04:26,  7.41s/it]

loss: 0.01832464520867258


 30%|███       | 15/50 [01:50<04:19,  7.40s/it]

loss: 0.005214582849382954


 32%|███▏      | 16/50 [01:57<04:11,  7.39s/it]

loss: 0.006298498384222797


 34%|███▍      | 17/50 [02:05<04:03,  7.39s/it]

loss: 0.0067421471860531585


 36%|███▌      | 18/50 [02:12<03:56,  7.39s/it]

loss: 0.00898928512096208


 38%|███▊      | 19/50 [02:19<03:48,  7.37s/it]

loss: 0.003857944685370622


 40%|████      | 20/50 [02:27<03:40,  7.37s/it]

loss: 0.004010026915144427


 42%|████▏     | 21/50 [02:34<03:34,  7.38s/it]

loss: 0.015778552776647112


 44%|████▍     | 22/50 [02:42<03:26,  7.37s/it]

loss: 0.006083397026474627


 46%|████▌     | 23/50 [02:49<03:18,  7.37s/it]

loss: 0.005411266621175443


 48%|████▊     | 24/50 [02:56<03:11,  7.36s/it]

loss: 0.004929966003781173


 50%|█████     | 25/50 [03:04<03:04,  7.37s/it]

loss: 0.006653786444024039


 52%|█████▏    | 26/50 [03:11<02:56,  7.36s/it]

loss: 0.005332385280294105


 54%|█████▍    | 27/50 [03:18<02:49,  7.35s/it]

loss: 0.006056766038978878


 56%|█████▌    | 28/50 [03:26<02:41,  7.35s/it]

loss: 0.004149066562389773


 58%|█████▊    | 29/50 [03:33<02:33,  7.33s/it]

loss: 0.008548971299728283


 60%|██████    | 30/50 [03:40<02:26,  7.34s/it]

loss: 0.002448337845103605


 62%|██████▏   | 31/50 [03:48<02:19,  7.34s/it]

loss: 0.0011776849203472977


 64%|██████▍   | 32/50 [03:55<02:12,  7.34s/it]

loss: 0.006282248211500154


 66%|██████▌   | 33/50 [04:02<02:04,  7.34s/it]

loss: 0.007511104479502741


 68%|██████▊   | 34/50 [04:10<01:57,  7.32s/it]

loss: 0.0034331130774306923


 70%|███████   | 35/50 [04:17<01:49,  7.33s/it]

loss: 0.0012267688984266023


 72%|███████▏  | 36/50 [04:24<01:42,  7.33s/it]

loss: 0.0018869273985151066


 74%|███████▍  | 37/50 [04:32<01:35,  7.34s/it]

loss: 0.004139483552249796


 76%|███████▌  | 38/50 [04:39<01:28,  7.35s/it]

loss: 0.011884942344811444


 78%|███████▊  | 39/50 [04:46<01:20,  7.33s/it]

loss: 0.0020120834770610166


 80%|████████  | 40/50 [04:54<01:13,  7.34s/it]

loss: 0.0001123191552567579


 82%|████████▏ | 41/50 [05:01<01:06,  7.35s/it]

loss: 5.6691700863324885e-05


 84%|████████▍ | 42/50 [05:08<00:58,  7.33s/it]

loss: 3.884518752900005e-05


 86%|████████▌ | 43/50 [05:16<00:51,  7.34s/it]

loss: 2.6906760777856968e-05


 88%|████████▊ | 44/50 [05:23<00:44,  7.34s/it]

loss: 1.8316915033400815e-05


 90%|█████████ | 45/50 [05:30<00:36,  7.34s/it]

loss: 1.2227952254647946e-05


 92%|█████████▏| 46/50 [05:38<00:29,  7.34s/it]

loss: 8.001243780447093e-06


 94%|█████████▍| 47/50 [05:45<00:21,  7.33s/it]

loss: 5.144249355240337e-06


 96%|█████████▌| 48/50 [05:52<00:14,  7.34s/it]

loss: 3.2608907681972655e-06


 98%|█████████▊| 49/50 [06:00<00:07,  7.33s/it]

loss: 2.045058253977783e-06


100%|██████████| 50/50 [06:07<00:00,  7.35s/it]

loss: 1.2716008066697665e-06





### Inference

In [54]:
net.eval()

# Predict one label string per token for every test sentence. Gradients are not
# needed here, so the whole loop runs under no_grad().
Y_pred = []
with torch.no_grad():
    for test_tensor in tqdm(X_test_tensors):
        h0 = net.init_hidden().to(device)
        # Call the module itself rather than .forward() so nn.Module hooks run.
        pred_y = net(test_tensor.to(device), h0)

        # argmax over the class dimension, then a single transfer to Python ints
        # instead of one .item() call per token.
        pred_ids = torch.argmax(pred_y, dim=1).tolist()
        Y_pred.append([id2label[_id_] for _id_ in pred_ids])


100%|██████████| 263/263 [00:00<00:00, 527.89it/s]


In [56]:
# Entity-level precision/recall/F1 per type. zero_division=0 reports 0.0 for a
# type with no predicted (or no true) entities without raising the
# undefined-metric warning; the numbers are the same as with the default.
print(f'model: {model_name}\n{classification_report(Y_test, Y_pred, zero_division=0)}')

model: gru-2-bidirectional-50-epochs
                       precision    recall  f1-score   support

                  DDF       0.75      0.75      0.75       234
  anatomical%location       0.79      0.68      0.73        22
               animal       0.69      0.58      0.63        19
             bacteria       0.72      0.68      0.70        84
 biomedical%technique       0.64      0.28      0.39        32
             chemical       0.61      0.48      0.54        52
   dietary%supplement       0.76      0.67      0.71        42
                 drug       0.67      0.50      0.57         8
                 food       0.00      0.00      0.00         4
                 gene       0.33      0.11      0.17         9
                human       0.74      0.75      0.75        85
           microbiome       0.74      0.80      0.77        74
statistical%technique       0.50      0.60      0.55         5

            micro avg       0.73      0.67      0.70       670
            macr

  _warn_prf(average, modifier, msg_start, len(result))
