In [1]:
"""Imports"""
import pandas as pd
import numpy as np
import random
import json
import csv
# importing the itertools library
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
data_fname = './data/test_data2.csv'
test_df = pd.read_csv(data_fname)

In [3]:
#entities_fname = "../data/test/entities_test.csv"
#entities_df = pd.read_csv(entities_fname,sep='\t')

In [4]:
#entityid2type ={ entity_ids: type for entity_ids, type in entities_df[['entity_ids','type']].values }

In [5]:
"""Load models &  resources"""
model_dir = './outputs/biobert_models'
model_fname = model_dir + '/model_epoch_10/'
print('where is the model?: ',model_fname)

where is the model?:  ./outputs/biobert_models/model_epoch_10/


In [6]:
label2id = {
    'Association': 0,
    'Positive_Correlation': 1,
    'Negative_Correlation': 2,
    'Bind': 3,
    'Cotreatment': 4,
    'Comparison':5,
    'Drug_Interaction':6,
    'Conversion':7,
    'Negative':8
      }

id2label = {}
for key,value in label2id.items():
    id2label[value] = key


In [7]:
def classify_relation(row):
    text = row['sentence'] + str('[SEP]') + row['entity_1_mention'] + str('[SEP]') + row['entity_2_mention']
    input_ids = torch.tensor(tokenizer.encode(text, 
                                              add_special_tokens=True,
                                              max_length=128)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

    input_ids= input_ids.to(device) 
    labels= labels.to(device)
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    logits = outputs[1]
    result = np.argmax(logits.cpu().numpy(),axis=1)[0]
    
    label = id2label[result]

    return label

In [8]:
# Instantiate the Bert tokenizer
tokenizer = BertTokenizer.from_pretrained(model_fname, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_fname,num_labels=len(label2id))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# send model to device
model.to(device);

In [9]:
test_df['type']= test_df.apply(lambda x: classify_relation(x), axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
test_df.head()

Unnamed: 0,abstract_id,sentence,entity_1_mention,entity_2_mention,entity_1_id,entity_2_id,type
0,1711760,"Under isoflurane anesthesia, the MCA of 14 spo...",hypertensive,isoflurane,D006973,D007530,Negative_Correlation
1,1711760,"Under isoflurane anesthesia, the MCA of 14 spo...",isoflurane,hypertensive,D007530,D006973,Negative_Correlation
2,1711760,The extent of neuronal injury was determined b...,"2,3,5-triphenyltetrazolium",neuronal injury,C009591,D009410,Association
3,1711760,The extent of neuronal injury was determined b...,neuronal injury,"2,3,5-triphenyltetrazolium",D009410,C009591,Association
4,1711760,The data indicate that phenylephrine-induced h...,phenylephrine,MCAO,D010656,D020244,Negative_Correlation


In [11]:
#Predict novelity

In [12]:
"""Load models &  resources"""
model_dir = 'outputs/models'
model_fname = model_dir + '/bert_model_epoch6/'
print('where is the model?: ',model_fname)

where is the model?:  outputs/models/bert_model_epoch6/


In [13]:
label2id = {
    'Novel': 0,
    'No': 1,
      }

id2label = {}
for key,value in label2id.items():
    id2label[value] = key


In [14]:
def classify_novelity(row):
    text = row['sentence'] + str('[SEP]') + row['entity_1_mention'] + str('[SEP]') + row['entity_2_mention']
    input_ids = torch.tensor(tokenizer.encode(text, 
                                              add_special_tokens=True,
                                              max_length=128)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

    input_ids= input_ids.to(device) 
    labels= labels.to(device)
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    logits = outputs[1]
    result = np.argmax(logits.cpu().numpy(),axis=1)[0]
    
    label = id2label[result]

    return label

In [15]:
# Instantiate the Bert tokenizer
tokenizer = BertTokenizer.from_pretrained(model_fname, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_fname,num_labels=len(label2id))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# send model to device
model.to(device);

In [16]:
test_df['novel']= test_df.apply(lambda x: classify_novelity(x), axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
test_df['id'] = test_df.index

In [18]:
submission_df = test_df[['id','abstract_id','type','entity_1_id','entity_2_id','novel']]

In [19]:
submission_df.head()

Unnamed: 0,id,abstract_id,type,entity_1_id,entity_2_id,novel
0,0,1711760,Negative_Correlation,D006973,D007530,No
1,1,1711760,Negative_Correlation,D007530,D006973,No
2,2,1711760,Association,C009591,D009410,No
3,3,1711760,Association,D009410,C009591,No
4,4,1711760,Negative_Correlation,D010656,D020244,No


In [20]:
len(submission_df)

8480

In [21]:
submission_df = submission_df[submission_df['type']!='Negative']

In [22]:
len(submission_df)

4729

In [23]:
submission_df.to_csv('./data/submission_26.csv',sep='\t',index=None)