In [1]:
"""Imports"""
import pandas as pd
import numpy as np
import random
import json
import csv
# importing the itertools library
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [45]:
data_fname = '../data/test_data2.csv'
test_df1 = pd.read_csv(data_fname)

In [46]:
test_df1

Unnamed: 0,abstract_id,sentences,entity_1_id,entity_2_id,entity_1_mention,entity_2_mention,type,kfold
0,15811908,Physicians and other healthcare professionals ...,D000077582,D063325,amisulpride,tiapride,Drug_Interaction,0
1,11009181,A number of small scale clinical trials have u...,D001058,D007980,apomorphine,levodopa,Negative_Correlation,0
2,28368423,"Our data show that in pre-malignant B cells, M...",11651,17869,Akt,MYC,Positive_Correlation,0
3,10788334,"Three BRCA1 abnormalities - 5382insC, C61G, an...",OMIM:604370,c|DEL|4153|A,BRCA1 abnormalities,4153delA,Positive_Correlation,0
4,18507837,This SNP is part of a large linkage disequilib...,25862,5225,USP49,PGC,Negative,0
...,...,...,...,...,...,...,...,...
7824,29183288,RESULTS: Arg1 ablation had no influence on the...,11846,20540,Arg1,arginine transporters,Positive_Correlation,4
7825,20621845,CONCLUSION: This study establishes a TAA model...,D002122,D017545,CaCl(2),TAA,Positive_Correlation,4
7826,17345627,Dysregulated expression of bone morphogenetic ...,D001848,9606,axial skeletal abnormalities,human,Negative,4
7827,17033686,"In summary, our findings demonstrate for the f...",D008850,10084,microphthalmia,PQBP1,Negative,4


In [26]:
data_fname = 'submission_10.csv'
test_df = pd.read_csv(data_fname, sep='\t')

In [27]:
test_df.head()

Unnamed: 0,id,abstract_id,type,entity_1_id,entity_2_id,novel
0,0,1711760,Positive_Correlation,D007530,D006973,Novel
1,1,1711760,Negative_Correlation,D006973,D007530,Novel
2,2,1711760,Association,C009591,D009410,Novel
3,4,1711760,Negative_Correlation,D007511,D010656,Novel
4,5,1711760,Negative_Correlation,D020244,D010656,Novel


In [28]:
test_df= test_df.merge(test_df1, on=['abstract_id','entity_1_id','entity_2_id'])

In [29]:
test_df= test_df.drop_duplicates(subset=['abstract_id','entity_1_id','entity_2_id'])

In [31]:
"""Load models &  resources"""
model_dir = 'outputs/models'
model_fname = model_dir + '/bert_model_epoch6/'
print('where is the model?: ',model_fname)

where is the model?:  outputs/models/bert_model_epoch6/


In [32]:
label2id = {
    'Novel': 0,
    'No': 1,
      }

id2label = {}
for key,value in label2id.items():
    id2label[value] = key


In [33]:
def classify_relation(row):
    text = row['sentence'] + str('[SEP]') + row['entity_1_mention'] + str('[SEP]') + row['entity_2_mention']
    input_ids = torch.tensor(tokenizer.encode(text, 
                                              add_special_tokens=True,
                                              max_length=128)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

    input_ids= input_ids.to(device) 
    labels= labels.to(device)
    with torch.no_grad():
        outputs = model(input_ids, labels=labels)
    logits = outputs[1]
    result = np.argmax(logits.cpu().numpy(),axis=1)[0]
    
    label = id2label[result]

    return label

In [34]:
# Instantiate the Bert tokenizer
tokenizer = BertTokenizer.from_pretrained(model_fname, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_fname,num_labels=len(label2id))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# send model to device
model.to(device);

In [35]:
test_df['novel']= test_df.apply(lambda x: classify_relation(x), axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [36]:
test_df.head()

Unnamed: 0,id,abstract_id,type,entity_1_id,entity_2_id,novel,sentence,entity_1_mention,entity_2_mention
0,0,1711760,Positive_Correlation,D007530,D006973,No,"Under isoflurane anesthesia, the MCA of 14 spo...",isoflurane,hypertensive
1,1,1711760,Negative_Correlation,D006973,D007530,No,"Under isoflurane anesthesia, the MCA of 14 spo...",hypertensive,isoflurane
2,2,1711760,Association,C009591,D009410,No,The extent of neuronal injury was determined b...,"2,3,5-triphenyltetrazolium",neuronal injury
3,4,1711760,Negative_Correlation,D007511,D010656,No,The data indicate that phenylephrine-induced h...,ischemic,phenylephrine
4,5,1711760,Negative_Correlation,D020244,D010656,Novel,The data indicate that phenylephrine-induced h...,MCAO,phenylephrine


In [37]:
test_df['id'] = test_df.index
submission_df = test_df[['id','abstract_id','type','entity_1_id','entity_2_id','novel']]
#submission_df['novel'] = 'Novel'

In [38]:
submission_df.sample(5)

Unnamed: 0,id,abstract_id,type,entity_1_id,entity_2_id,novel
10001,10001,22369755,Negative_Correlation,57152,207,Novel
3064,3064,16629641,Negative_Correlation,D006526,3439,Novel
3676,3676,17000021,Association,D003922,rs2476601,Novel
4021,4021,17391797,Association,10400,D010714,No
10207,10207,22729903,Association,6775,57819,Novel


In [39]:
len(submission_df)

2065

In [40]:
#submission_df.to_csv('../dataset/submission_4.csv',sep='\t',index=None)

In [41]:
submission_df = submission_df[submission_df['type']!='Negative']

In [42]:
len(submission_df)

2065

In [43]:
submission_df.to_csv('submission_25.csv',sep='\t',index=None)