In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 26.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 58.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [3]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import json

In [33]:
class ChemProtDataset:
    def __init__(self, tokenizer, sentence, label, max_len, subj, obj):
        self.sentence = sentence
        self.subj = subj
        self.obj = obj
        self.label = label
        
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentence)
        
    def __getitem__(self, item):
        sentence = str(self.sentence[item])
        subj = str(self.subj[item])
        obj = str(self.obj[item])

        inputs = self.tokenizer.encode_plus(
            sentence,
            subj + " " + obj,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
        )
        
        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'label': torch.tensor(self.label[item], dtype=torch.long),

        } 
    
class REModel(nn.Module):
    def __init__(self):
        super(REModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.hidden_size = 768
        self.out = nn.Linear(self.hidden_size, 5)
        self.softmax = nn.Softmax(dim=1)    

            
    def forward(self, ids, mask, token_type_ids):
        _, outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.softmax(outputs)
    
    
def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        token_type_ids = d['token_type_ids']
        label = d['label']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        label = label.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 50 == 0:
            print(f'bi={bi}, loss={loss}')


def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_labels = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            label = d['label'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask, token_type_ids)
          
            fin_labels.append(label.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.vstack(fin_outputs), np.hstack(fin_labels)


def read_data(path):
    with open(path) as f:
        result = []
        for line in f:
            res = json.loads(line)
            text = res['text']
            subj = text[text.find("<<")+2:text.find(">>")]
            obj = text[text.find("[[")+12:text.find("]]")]
            
            res['subj'], res['obj'] = subj, obj
            res['label'] = LABEL_DICT[res['label']]
            result.append(res)
    return pd.DataFrame(result)

In [38]:
LABEL_DICT = {'UPREGULATOR': 0, 'ACTIVATOR': 0, 'INDIRECT-UPREGULATOR': 0,
              'DOWNREGULATOR': 1, 'INHIBITOR': 1, 'INDIRECT-DOWNREGULATOR': 1,
              'AGONIST': 2,'AGONIST-ACTIVATOR': 2,'AGONIST-INHIBITOR': 2,
              'ANTAGONIST': 3, 'SUBSTRATE': 4, 'PRODUCT-OF': 4, 'SUBSTRATE_PRODUCT-OF': 4}
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
EPOCHS = 4
SEED = 20
LEARNING_RATE = 3e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = REModel().to(device)

df_train = read_data('./drive/MyDrive/RE/data/chemprot/train.txt')
df_test = read_data('./drive/MyDrive/RE/data/chemprot/test.txt')

train_dataset = ChemProtDataset(
    sentence=df_train.text.values,
    label=df_train.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    subj = df_train.subj.values, 
    obj = df_train.obj.values
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True
)

test_dataset = ChemProtDataset(
    sentence=df_test.text.values,
    label=df_test.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    subj = df_test.subj.values, 
    obj = df_test.obj.values
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=True,
    drop_last=True
)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


for epoch in range(EPOCHS):
    train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
outputs, labels = eval_loop_fn(test_data_loader, model, device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

bi=0, loss=6.644169807434082
bi=50, loss=6.636192798614502
bi=100, loss=6.635920524597168
bi=150, loss=6.635893821716309
bi=200, loss=6.635881423950195
bi=250, loss=6.6358747482299805
bi=300, loss=6.635870933532715
bi=350, loss=6.635867595672607
bi=400, loss=6.635865688323975
bi=450, loss=6.6358642578125
bi=500, loss=6.635863304138184
bi=0, loss=6.635863304138184
bi=50, loss=6.635861873626709
bi=100, loss=6.635860919952393
bi=150, loss=6.635860443115234
bi=200, loss=6.635860443115234
bi=250, loss=6.635859966278076
bi=300, loss=6.635859489440918
bi=350, loss=6.635859489440918
bi=400, loss=6.635857582092285
bi=450, loss=6.635857582092285
bi=500, loss=6.635857582092285
bi=0, loss=6.635857582092285
bi=50, loss=6.635857582092285
bi=100, loss=6.635857582092285
bi=150, loss=6.635857582092285
bi=200, loss=6.635857105255127
bi=250, loss=6.635857105255127
bi=300, loss=6.635857105255127
bi=350, loss=6.635857105255127
bi=400, loss=6.635857105255127
bi=450, loss=6.635857105255127
bi=500, loss=6.635

In [39]:
from sklearn.metrics import f1_score
f1_score(np.argmax(outputs, axis=1), labels, average='micro')

0.4806805074971165