### In Colab

In [None]:
# from google.colab import drive
# drive.mount('drive', force_remount=True)

Mounted at drive


In [None]:
# !pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=5b9e112bb2fc280011ba7d2afb1c14a2b9c2c50052e49334e5ae5cd50873e4ab
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


### Imports

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from seqeval.metrics import f1_score
from copy import deepcopy

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Read Dataset

In [None]:
# Read the tab-separated corpus: one token per line, six columns per line,
# with a blank line closing each document.
with open('data/all_data.data', 'rt', encoding='utf8') as fr:
    all_data = fr.read().split('\n')

all_labels = set()

# X / Y collect the finished documents; xx / yy buffer the one being read.
X, Y, xx, yy = [], [], [], []
for line in all_data:
    if line.strip():
        # Column 0 is the token and column 1 its tag; the other four columns
        # are not used in this notebook.
        w, label, _, _, _, _ = line.split('\t')
        all_labels.add(label)
        xx.append(w.lower())
        yy.append(label)
    elif xx:
        # Blank line: close the current document. Skipping the flush when the
        # buffer is empty keeps repeated blank lines from adding empty documents.
        X.append(xx)
        Y.append(yy)
        xx, yy = [], []

# The file may end without a final blank line; keep the last document as well.
if xx:
    X.append(xx)
    Y.append(yy)

assert len(X) == len(Y)

print(f'data documents: {len(X)}\n'
      f'sent: {X[0]}\n'
      f'labels: {Y[0]}')

data documents: 1312
sent: ['analysis', 'of', 'the', 'efficacy', 'of', 'diet', 'and', 'short-term', 'probiotic', 'intervention', 'on', 'depressive', 'symptoms', 'in', 'patients', 'after', 'bariatric', 'surgery', ':', 'a', 'randomized', 'double-blind', 'placebo', 'controlled', 'pilot', 'study', '.']
labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DDF', 'I-DDF', 'O', 'B-human', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Show how many distinct tags were collected, followed by the tags themselves.
label_summary = (len(all_labels), all_labels)
print(f'labels: {label_summary}')

labels: (27, {'B-dietary%supplement', 'I-food', 'B-statistical%technique', 'O', 'I-bacteria', 'B-DDF', 'B-bacteria', 'I-chemical', 'B-microbiome', 'I-DDF', 'I-biomedical%technique', 'B-anatomical%location', 'I-animal', 'B-drug', 'I-drug', 'B-human', 'I-statistical%technique', 'B-biomedical%technique', 'B-food', 'B-animal', 'I-gene', 'I-dietary%supplement', 'B-gene', 'I-human', 'I-anatomical%location', 'B-chemical', 'I-microbiome'})


In [None]:
# Sort before enumerating: a set of strings has no stable iteration order
# across interpreter sessions, so unsorted ids could differ from run to run and
# stop matching the output indices of a previously saved model.
label2id = {label: idx for idx, label in enumerate(sorted(all_labels))}
id2label = {idx: label for label, idx in label2id.items()}

### Split into Train, Validation and Test

In [None]:
# Hold out 20% of the documents for testing, then take 10% of what is left
# as the validation set. Both splits use the same fixed random_state.
X_rest, X_test, Y_rest, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_rest, Y_rest, test_size=.1, random_state=42)

split_sizes = (len(X_train), len(X_test), len(X_val))
print(*split_sizes)

944 263 105


In [None]:
# The vocabulary is built from the training split only; the two special
# entries come first, followed by the training tokens in sorted order.
train_tokens = {token for sentence in X_train for token in sentence}
vocab = ['<UNK>', '<PAD>'] + sorted(train_tokens)

vocab2id = {token: idx for idx, token in enumerate(vocab)}
id2vocab = {idx: token for token, idx in vocab2id.items()}

print(f'size of vocab: {len(vocab)}')

size of vocab: 3984


### Encode the input

In [None]:
class NERDataset(Dataset):
    """Token-classification dataset yielding fixed-length id tensors.

    Each item is a pair ``(token_ids, label_ids)`` of 1-D ``torch.long``
    tensors of length ``max_seq_length``. Longer sentences are truncated;
    shorter ones are padded with the ``'<PAD>'`` token and the ``'O'`` label.

    Tensors are returned on the CPU. Moving a batch to the accelerator is the
    job of the consuming loop (one transfer per batch instead of two per
    sentence), and it keeps the dataset usable with DataLoader worker
    processes and independent of any module-level ``device`` variable.

    Args:
        X: sequence of sentences, each a list of token strings.
        Y: sequence of tag lists aligned with ``X``.
        vocab2id: token -> id mapping; must contain ``'<UNK>'`` and ``'<PAD>'``.
        label2id: tag -> id mapping; must contain ``'O'``.
        max_seq_length: length every returned sequence is cut or padded to.
    """

    def __init__(self, X, Y, vocab2id, label2id, max_seq_length):
        self.X, self.Y = X, Y
        self.max_seq_length = max_seq_length
        self.vocab2id = vocab2id
        self.label2id = label2id

    def __len__(self):
        """Return the number of sentences."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return the padded/truncated ``(token_ids, label_ids)`` pair for ``idx``.

        Raises:
            KeyError: if a tag is missing from ``label2id``.
        """
        # Slicing copies, so the stored sentences are never modified.
        tokens = list(self.X[idx][: self.max_seq_length])
        labels = list(self.Y[idx][: self.max_seq_length])

        tokens += ['<PAD>'] * (self.max_seq_length - len(tokens))
        labels += ['O'] * (self.max_seq_length - len(labels))

        unk_id = self.vocab2id['<UNK>']
        token_ids = [self.vocab2id.get(token, unk_id) for token in tokens]
        # Index instead of .get(): an unknown tag should fail loudly with its
        # name rather than become None and break tensor construction.
        label_ids = [self.label2id[label] for label in labels]

        # Explicit dtype keeps the result integral even for an empty sequence.
        tokens_tensor = torch.tensor(token_ids, dtype=torch.long)
        labels_tensor = torch.tensor(label_ids, dtype=torch.long)
        return tokens_tensor, labels_tensor


### Load Datasets

In [None]:
# Every split is padded/truncated to the same length and batched identically.
MAX_SEQ_LENGTH = 256
BATCH_SIZE = 16

train_dataset = NERDataset(X=X_train, Y=Y_train, vocab2id=vocab2id, label2id=label2id,
                           max_seq_length=MAX_SEQ_LENGTH)
val_dataset = NERDataset(X=X_val, Y=Y_val, vocab2id=vocab2id, label2id=label2id,
                         max_seq_length=MAX_SEQ_LENGTH)
test_dataset = NERDataset(X=X_test, Y=Y_test, vocab2id=vocab2id, label2id=label2id,
                          max_seq_length=MAX_SEQ_LENGTH)

# Only the training data is shuffled; evaluation order is kept fixed so that
# validation and test runs visit the sentences in the same order every time.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Save Datasets

- Persist the prepared loaders and mappings so that the preprocessing does not have to be repeated, which matters more as the dataset grows.

In [None]:
# Bundle the three DataLoader objects with the vocabulary and label mappings
# and write them to a single file.
# NOTE(review): the loader objects themselves are serialized, so reading this
# file back presumably needs the NERDataset class to be defined first and a
# full (not weights-only) torch.load -- confirm in the inference notebook.
loaders_bundle = {
    'train': train_loader,
    'val': val_loader,
    'test': test_loader,
    'vocab2id': vocab2id,
    'id2vocab': id2vocab,
    'label2id': label2id,
    'id2label': id2label,
}
torch.save(loaders_bundle, 'data/loaders.pt')

### LSTM Network

In [None]:
class MyLSTM(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear layer scoring every token.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        emb_size: embedding dimension.
        hidden_size: LSTM hidden size per direction.
        output_size: number of scores produced per token (number of labels).
        bidirectional: whether the LSTM runs in both directions.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, emb_size, hidden_size, output_size, bidirectional, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_directions = 2 if bidirectional else 1
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hidden_size, bidirectional=bidirectional, num_layers=num_layers, batch_first=True)
        # The LSTM output concatenates both directions along the last axis,
        # hence hidden_size * num_directions input features.
        self.clf1 = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, X):
        """Return per-token scores for a batch of token ids.

        Args:
            X: integer tensor of token ids, batch first.

        Returns:
            Tensor with ``X``'s leading dimensions and ``output_size`` as the
            last dimension.
        """
        e = self.embedding(X)
        # Zero initial states created from `e`, so they live on the same device
        # and have the same dtype as the activations no matter where the model
        # was moved (no dependency on a module-level `device`).
        state_shape = (self.num_directions * self.num_layers, X.shape[0], self.hidden_size)
        h0 = e.new_zeros(state_shape)
        c0 = e.new_zeros(state_shape)
        # o holds the hidden state of every time step; the final (h, c) states
        # are not needed for token classification.
        o, _ = self.lstm(e, (h0, c0))

        return self.clf1(o)

### GRU Network

In [None]:
class MyGRU(nn.Module):
    """Embedding -> (bi)directional GRU -> linear layer scoring every token.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        emb_size: embedding dimension.
        hidden_size: GRU hidden size per direction.
        output_size: number of scores produced per token (number of labels).
        bidirectional: whether the GRU runs in both directions.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, emb_size, hidden_size, output_size, bidirectional, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_directions = 2 if bidirectional else 1
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, bidirectional=bidirectional, num_layers=num_layers, batch_first=True)
        # Both directions are kept concatenated along the last axis, so the
        # classifier takes hidden_size * num_directions input features.
        self.clf1 = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, X):
        """Return per-token scores for a batch of token ids.

        Args:
            X: integer tensor of token ids, batch first.

        Returns:
            Tensor with ``X``'s leading dimensions and ``output_size`` as the
            last dimension.
        """
        e = self.embedding(X)
        # Zero initial state created from `e`, so it lives on the same device
        # and has the same dtype as the activations no matter where the model
        # was moved (no dependency on a module-level `device`).
        h0 = e.new_zeros((self.num_directions * self.num_layers, X.shape[0], self.hidden_size))
        # o contains the hidden state of every time step; the final hidden
        # state is not needed for token classification.
        o, _ = self.gru(e, h0)
        return self.clf1(o)

### Model Initialization, Loss Function, Optimizer, Hyperparameters

In [None]:
# Seed PyTorch's global RNG before the model is created so that weight
# initialisation is repeatable from run to run (GPU kernels may still add
# small nondeterminism).
seed = 42
torch.manual_seed(seed)

# Sizes dictated by the data.
input_size = len(vocab2id)
output_size = len(label2id)

# Tunable hyperparameters.
emb_size = 1024
hidden_size = 2 * emb_size
lr = 0.001
epochs = 10
bidirectional = True
num_layers = 2

# Recurrent architecture to train: 'lstm' or 'gru'.
architecture = 'lstm'
model_classes = {'lstm': MyLSTM, 'gru': MyGRU}

# The checkpoint name is derived from the settings above so it cannot drift
# out of sync with them; for the values set here it is
# 'lstm-2-bidirectional-10-epochs'.
direction = 'bidirectional' if bidirectional else 'unidirectional'
model_name = f'{architecture}-{num_layers}-{direction}-{epochs}-epochs'

model = model_classes[architecture](
    input_size=input_size, emb_size=emb_size, hidden_size=hidden_size, output_size=output_size,
    bidirectional=bidirectional, num_layers=num_layers).to(device)

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=lr)

### Training Loop

In [None]:
# Train for `epochs` epochs. After each epoch the model is scored on the
# validation loader and the weights with the best micro-F1 are kept in
# `model_dict` for the save step.
best_f1, model_dict = 0.0, None
pad_id = vocab2id['<PAD>']

for epoch in tqdm(range(epochs), 'Training'):
    # Validation below switches to eval mode, so re-enable training mode here.
    model.train()
    epoch_loss = 0.0
    for Xs, Ys in train_loader:
        Xs, Ys = Xs.to(device), Ys.to(device)

        opt.zero_grad()

        # Call the module (not .forward) so registered hooks run, then flatten
        # batch and time so every token is one classification example.
        # NOTE: padded positions carry the 'O' label and still count in the loss.
        logits = model(Xs)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), Ys.reshape(-1))
        loss.backward()
        opt.step()

        epoch_loss += loss.item()

    print(f'loss: {epoch_loss / len(train_loader)}')

    model.eval()
    # Local names: the split cell's Y_val must not be overwritten here.
    val_true, val_pred = [], []
    with torch.no_grad():
        for Xs_val, Ys_val in val_loader:
            Xs_val, Ys_val = Xs_val.to(device), Ys_val.to(device)

            pred_ids = model(Xs_val).argmax(dim=-1)

            # seqeval expects one tag list per sentence; padding positions are
            # dropped so only real tokens are scored.
            keep = Xs_val != pad_id
            for gold_row, pred_row, keep_row in zip(Ys_val, pred_ids, keep):
                val_true.append([id2label[i] for i in gold_row[keep_row].tolist()])
                val_pred.append([id2label[i] for i in pred_row[keep_row].tolist()])

    f1 = f1_score(val_true, val_pred, average='micro')
    print(f'f1-micro: {f1}')

    # Always keep the first epoch's weights so model_dict is never left as
    # None, even if the F1 never rises above 0.
    if model_dict is None or f1 > best_f1:
        best_f1 = f1
        model_dict = deepcopy(model.state_dict())

Training:   0%|          | 0/10 [00:00<?, ?it/s]

loss: 0.1649706006302672


Training:  10%|█         | 1/10 [00:09<01:27,  9.71s/it]

f1-micro: 0.15028901734104047
loss: 0.0728221282489219


Training:  20%|██        | 2/10 [00:19<01:17,  9.63s/it]

f1-micro: 0.4341880341880342
loss: 0.04437654730627092


Training:  30%|███       | 3/10 [00:28<01:07,  9.59s/it]

f1-micro: 0.6061643835616437
loss: 0.0232091184973843


Training:  40%|████      | 4/10 [00:38<00:57,  9.57s/it]

f1-micro: 0.605095541401274
loss: 0.011754456110361773


Training:  50%|█████     | 5/10 [00:47<00:47,  9.57s/it]

f1-micro: 0.6473429951690821
loss: 0.004539991340252681


Training:  60%|██████    | 6/10 [00:57<00:38,  9.56s/it]

f1-micro: 0.690846286701209
loss: 0.0017989045393779495


Training:  70%|███████   | 7/10 [01:07<00:28,  9.56s/it]

f1-micro: 0.6879194630872484
loss: 0.000872275413465844


Training:  80%|████████  | 8/10 [01:16<00:19,  9.55s/it]

f1-micro: 0.7247863247863249
loss: 0.0004894064061819197


Training:  90%|█████████ | 9/10 [01:26<00:09,  9.55s/it]

f1-micro: 0.6921850079744817
loss: 0.00018888276938045144


Training: 100%|██████████| 10/10 [01:35<00:00,  9.57s/it]

f1-micro: 0.7234782608695651





### Save Model and Hyperparameters

In [None]:
# Checkpoint = best validation weights + everything needed to rebuild the
# model and to interpret its inputs and outputs.
save_dict = {
    # Fall back to the current weights if no best snapshot was recorded.
    'state_dict': model_dict if model_dict is not None else model.state_dict(),
    'input_size': input_size,
    'emb_size': emb_size,
    'hidden_size': hidden_size,
    'output_size': output_size,
    'lr': lr,
    'epochs': epochs,
    'bidirectional': bidirectional,
    'num_layers': num_layers,
    'model_name': model_name,
    'loss_fn': loss_fn.__class__.__name__,
    'opt': opt.__class__.__name__,
    # Extra keys: the model's input/output indices are only meaningful
    # together with the exact mappings used during training.
    'best_f1': best_f1,
    'vocab2id': vocab2id,
    'label2id': label2id,
}

torch.save(save_dict, f'models/{model_name}.pt')


### Inference
Go to the other notebook:

[rnn_sequence_labeling_inference.ipynb](rnn_sequence_labeling_inference.ipynb)