In [2]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from torch.nn.parallel import DataParallel
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import json
import os
from transformers import BertTokenizer

In [3]:
class ChemProtDataset:
    def __init__(self, tokenizer, sentence, label, max_len, subj, obj):
        self.sentence = sentence
        self.subj = subj
        self.obj = obj
        self.label = label
        
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentence)
        
    def __getitem__(self, item):
        sentence = str(self.sentence[item])
        subj = str(self.subj[item])
        obj = str(self.obj[item])

        inputs = self.tokenizer.encode_plus(
            sentence,
            subj + " " + obj,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )
        
        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'label': torch.tensor(self.label[item], dtype=torch.long),

        } 
    
class REModel(nn.Module):
    def __init__(self, model_path):
        super(REModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(model_path)
        self.hidden_size = 768
        self.out = nn.Linear(self.hidden_size, 5)
        self.softmax = nn.Softmax(dim=1)    

            
    def forward(self, ids, mask):
        _, outputs = self.bert(ids, attention_mask=mask, return_dict=False)
        return self.softmax(outputs)
    
    
def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        #token_type_ids = d['token_type_ids']
        label = d['label']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        #token_type_ids = token_type_ids.to(device, dtype=torch.long)
        label = label.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids, mask)

        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 50 == 0:
            print(f'bi={bi}, loss={loss}')


def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_labels = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            label = d['label'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask)
          
            fin_labels.append(label.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.vstack(fin_outputs), np.hstack(fin_labels)


def read_data(path):
    with open(path) as f:
        result = []
        for line in f:
            res = json.loads(line)
            text = res['text']
            subj = text[text.find("<<")+2:text.find(">>")]
            obj = text[text.find("[[")+12:text.find("]]")]
            
            res['subj'], res['obj'] = subj, obj
            res['label'] = LABEL_DICT[res['label']]
            result.append(res)
    return pd.DataFrame(result)

In [4]:
LABEL_DICT = {'UPREGULATOR': 0, 'ACTIVATOR': 0, 'INDIRECT-UPREGULATOR': 0,
              'DOWNREGULATOR': 1, 'INHIBITOR': 1, 'INDIRECT-DOWNREGULATOR': 1,
              'AGONIST': 2,'AGONIST-ACTIVATOR': 2,'AGONIST-INHIBITOR': 2,
              'ANTAGONIST': 3, 'SUBSTRATE': 4, 'PRODUCT-OF': 4, 'SUBSTRATE_PRODUCT-OF': 4}
MAX_LEN = 512
TRAIN_BATCH_SIZE = 64
EPOCHS = 24
SEED = 20
LEARNING_RATE = 3e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = REModel(model_path)
model = DataParallel(model)
model.to(device)
print(f'Using these GPUs: {model.device_ids}, with this model: {model_path}')

df_train = read_data('./RE_data/chemprot/train.txt')
df_test = read_data('./RE_data/chemprot/test.txt')

train_dataset = ChemProtDataset(
    sentence=df_train.text.values,
    label=df_train.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    subj = df_train.subj.values, 
    obj = df_train.obj.values
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=os.cpu_count()
)

test_dataset = ChemProtDataset(
    sentence=df_test.text.values,
    label=df_test.label.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    subj = df_test.subj.values, 
    obj = df_test.obj.values
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=True,
    drop_last=False,
    num_workers=os.cpu_count()
)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


for epoch in range(EPOCHS):
    train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
outputs, labels = eval_loop_fn(test_data_loader, model, device)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using these GPUs: [0, 1, 2, 3, 4, 5, 6, 7], with this model: bert-base-uncased
bi=0, loss=6.644034385681152
bi=50, loss=6.639397144317627
bi=0, loss=6.638356685638428
bi=50, loss=6.636747360229492
bi=0, loss=6.636565685272217
bi=50, loss=6.636210918426514
bi=0, loss=6.636209011077881
bi=50, loss=6.636039733886719
bi=0, loss=6.636013031005859
bi=50, loss=6.635954856872559
bi=0, loss=6.635944366455078
bi=50, loss=6.635910987854004
bi=0, loss=6.635923862457275
bi=50, loss=6.635875225067139
bi=0, loss=6.635883808135986
bi=50, loss=6.635865211486816
bi=0, loss=6.635855674743652
bi=50, loss=6.6358513832092285
bi=0, loss=6.635845184326172
bi=50, loss=6.635842323303223
bi=0, loss=6.63584041595459
bi=50, loss=6.635795593261719
bi=0, loss=6.635793685913086
bi=50, loss=6.635772228240967
bi=0, loss=6.6357855796813965
bi=50, loss=6.635782241821289
bi=0, loss=6.63577127456665
bi=50, loss=6.635758876800537
bi=0, loss=6.635778903961182
bi=50, loss=6.635756015777588
bi=0, loss=6.63576078414917
bi=50, l

In [5]:
from sklearn.metrics import f1_score, classification_report
f1_score(np.argmax(outputs, axis=1), labels, average='micro')

0.5260882098587489

In [6]:
print(classification_report(np.argmax(outputs, axis=1), labels, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.51      0.67      3278
           2       0.59      0.83      0.69       141
           3       0.15      0.90      0.26        50
           4       0.00      0.00      0.00         0

    accuracy                           0.53      3469
   macro avg       0.35      0.45      0.33      3469
weighted avg       0.97      0.53      0.67      3469



In [9]:
torch.cuda.empty_cache()

In [1]:
!nvidia-smi

Fri Apr  8 13:49:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000001:00:00.0 Off |                  Off |
| N/A   28C    P0    38W / 250W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000002:00:00.0 Off |                  Off |
| N/A   28C    P0    36W / 250W |      0MiB / 16384MiB |      2%      Default |
|       

In [24]:
labels = [test_dataset[i]['label'].detach().cpu().tolist() for i in range(len(test_dataset))]

In [11]:
pd.Series(labels).value_counts()

1    1667
0     667
4     644
3     293
2     198
dtype: int64

In [14]:
pd.Series(np.argmax(outputs, axis=1)).value_counts()

1    3462
2       7
dtype: int64

In [29]:
test_dataset[0].keys()

dict_keys(['ids', 'mask', 'token_type_ids', 'label'])

In [5]:
tok = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
tok.encode_plus(