In [7]:
import warnings
# Suppress every Python warning raised through the `warnings` module from here on.
# NOTE(review): this also hides deprecation / SettingWithCopy warnings from pandas
# in the cells below, which can mask real problems.
warnings.filterwarnings('ignore')

In [8]:
%%capture
%%bash
# Environment setup: fetch the BioBERT repo, pretrained weights, the dataset and
# the Python dependencies. All output (including errors) is hidden by %%capture,
# and the script does not stop on a failed step, so failures here are silent.
# NOTE(review): `git clone` fails if ./biobert already exists (e.g. on re-run).
git clone https://github.com/dmis-lab/biobert.git       #clone biobert repo
gdown --id 1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD            #download biobert embeddings
gdown --id 1ZEmMGZCBsfeeFrrJs2uUHK8mzzuNkzTR            #download medline dataset
tar -xzvf "biobert_v1.1_pubmed.tar.gz" -C "./biobert/"  #unzip biobert embeddings into repo
wait                                                    #no background jobs were started, so this returns immediately
cd biobert
gdown --id 1n7-fIN7hTCkTMCXyZZw9kBHkyYGdgdGC            #download medline_re.py (presumably the medline_run_re.py run by the training cell - confirm)
./download.sh                                           #download test datasets
wait                                                    #no background jobs were started, so this returns immediately
pip install -r requirements.txt                         #install required packages
pip install pandas                                      #install pandas
pip install scikit-learn                                #install scikit-learn
pip install transformers                               #install transformers

In [9]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import os
import spacy

In [10]:
def annotate_sentence(s):
    """
    Turn an inline-tagged sentence into one BIO label per word.

    The sentence is split on single spaces. Every token that starts with
    ``<...>`` is treated as a markup token; consecutive markup tokens are
    paired as (opening, closing). Words between a pair are labelled
    ``B-<tag>`` (first word) / ``I-<tag>`` (following words), where ``<tag>``
    is whatever ``reduce_tag`` returns for the opening tag name. All other
    words are labelled ``'O'`` and the markup tokens themselves are dropped.

    Input: <TREAT> Intravenous immune globulin </TREAT> for <DIS> recurrent spontaneous abortion </DIS> .
    Output: ['B-TREAT', 'I-TREAT', 'I-TREAT', 'O', 'B-DIS', 'I-DIS', 'I-DIS', 'O']
    """
    tokens = s.split(' ')

    # Record [position, tag name with '/' stripped] for every markup token.
    markers = []
    for pos, tok in enumerate(tokens):
        found = re.match('<(.*?)>', tok)
        if found:
            markers.append([pos, found.group(1).replace('/', '')])
    marker_positions = [marker[0] for marker in markers]

    # Markers at even positions open a span, the following odd one closes it.
    labelled = []
    for opener, closer in zip(markers[::2], markers[1::2]):
        entity = reduce_tag(opener[1])
        for offset, pos in enumerate(range(opener[0] + 1, closer[0]), 1):
            prefix = 'B' if offset == 1 else 'I'
            tokens[pos] = f"{prefix}-{entity}"
            labelled.append(pos)

    # Everything that is neither inside a span nor a markup token becomes 'O'.
    untouched = labelled + marker_positions
    for pos in range(len(tokens)):
        if pos not in untouched:
            tokens[pos] = 'O'

    # Delete markup tokens back-to-front so earlier positions stay valid.
    for pos in reversed(marker_positions):
        del tokens[pos]
    return tokens
    
def reduce_tag(x):
    """
    Collapse a raw tag name to its entity family.
    Input:
        Data: String (tag name, e.g. 'DISONLY', 'TREAT_PREV')
    Output:
        Data: 'DIS' if the name contains 'DIS', otherwise 'TREAT' if it
              contains 'TREAT', otherwise None
    """
    # 'DIS' is checked first, so a name containing both resolves to 'DIS'.
    if 'DIS' in x:
        return 'DIS'
    if 'TREAT' in x:
        return 'TREAT'
    return None
def clean_labels(data):
    """
    Drop rows labelled TO_SEE and merge the remaining raw labels into the
    consolidated classes NONE / OTHER / PREVENT / CURE / SIDE_EFF / NO_CURE.
    Labels that are not listed in the mapping are left unchanged.
    Input:
        Data: Pandas Dataframe with a 'Label' column
    Output:
        Data: new Pandas Dataframe (original index values are kept)
    """
    label_dict = {'NONE': 'NONE', 'DISONLY': 'OTHER', 'TREATONLY': 'OTHER', 'PREVENT': 'PREVENT', 'VAGUE': 'OTHER', 'TO_SEE': 'OTHER',
       'TREAT_FOR_DIS': 'CURE', 'SIDE_EFF': 'SIDE_EFF', 'TREAT_NO_FOR_DIS': 'NO_CURE'}
    kept_rows = data.loc[data['Label'].ne('TO_SEE')]
    return kept_rows.replace({'Label': label_dict})
# Read in Data: one list entry per line of the file, line endings included.
with open('sentences_with_roles_and_relations.txt', encoding="ISO-8859-1") as f:
    lines = list(f)

def clean_data(sentence):
    """
    Split a sentence on whitespace and remove every markup token.

    A markup token is any whitespace-separated token that starts with
    ``<...>`` (e.g. ``<TREAT>``, ``</DIS>``).

    Input:
        Data: String
    Output:
        Data: List of the remaining word tokens, in original order
    """
    # Build a new list instead of calling tokens.remove() inside a loop over
    # the same list: removing while iterating skips the element that follows
    # each removal, which left the second of two adjacent tags in place.
    return [token for token in sentence.split() if not re.match('<.*?>', token)]

def clean_data_2(tokens):
    """
    Remove every markup token (a token starting with ``<...>``) from a token list.

    Input:
        tokens: List of strings; modified in place
    Output:
        The same list object, with markup tokens removed
    """
    # Slice assignment keeps the in-place contract (callers holding a reference
    # to the list see the change) while avoiding list.remove() during
    # iteration, which skips the element after each removed one.
    tokens[:] = [token for token in tokens if not re.match('<.*?>', token)]
    return tokens

def add_pos_tags(sentence):
  """
  Return the ``pos_`` attribute of every item produced by ``sp(sentence)``.

  NOTE(review): ``sp`` is not defined in the visible part of the notebook;
  presumably a loaded spaCy pipeline - confirm it exists before calling.

  Input:
      sentence: String
  Output:
      List with one ``pos_`` value per parsed item, in order
  """
  parsed = sp(sentence)
  return [parsed[idx].pos_ for idx in range(0, len(parsed))]

def remove_labels(sentence):
  """
  Strip inline ``<...>`` markup from a sentence and tidy the spacing.

  Runs of spaces/tabs are collapsed to one space, a space before a period
  is removed, and leading/trailing whitespace is trimmed.

  Input:
      sentence: String
  Output:
      Cleaned String
  """
  untagged = re.sub('<.*?>', '', sentence)
  single_spaced = re.sub('[ \t]+', ' ', untagged)
  return single_spaced.replace(' .', '.').strip()

def replace_with_tags(sentence, annotation):
  """
  Rebuild a sentence with each annotated entity collapsed to a placeholder.

  Words labelled 'O' are kept, a 'B-<TAG>' label emits the placeholder
  '@<TAG>$', and any other label (e.g. 'I-<TAG>') emits nothing. The pieces
  are joined with spaces and the space before . , ! ? ; is removed.

  Input:
      sentence: List of word tokens
      annotation: List of labels, indexed in parallel with ``sentence``
  Output:
      String
  """
  pieces = []
  for position, tag in enumerate(annotation):
    if tag == 'O':
      pieces.append(sentence[position])
    elif tag[:1] == 'B':
      pieces.append(f'@{tag[2:]}$')
  text = ' '.join(pieces)
  # Re-attach punctuation to the preceding word, in this fixed order.
  for mark in ('.', ',', '!', '?', ';'):
    text = text.replace(' ' + mark, mark)
  return text


In [11]:
data_df = pd.DataFrame({'Data':lines})                                                  #Convert to Dataframe
data_df['Label'] = data_df['Data'].apply(lambda x: x.split('||')[1].replace('\n', ''))  #Split Label into new column - Y1 Input
data_df['Data'] = data_df['Data'].apply(lambda x: x.split('||')[0].strip())             #Split Data into new column

# Manually correct annotation mistake in source dataset.
# This is done BEFORE the derived columns are computed so that the correction
# actually reaches 'Clean Sentence', 'Data_Clean' and 'Annot', and through a
# single positional .iloc assignment on the frame: the chained form
# data_df['Data'].iloc[872] = ... may write to a temporary copy.
data_df.iloc[872, data_df.columns.get_loc('Data')] = '<TREATONLY> Primary thrombolytic treatment </TREATONLY> ( within 24 hours of diagnosis ) was given to 169 patients ( 23.5 % ) , whereas the remaining 550 patients were initially treated with <TREATONLY> heparin </TREATONLY> alone .'

data_df['Clean Sentence'] = data_df['Data'].apply(remove_labels)                        #Remove excess spaces in clean sentences
data_df['Data_Clean'] = data_df['Data'].apply(clean_data)                               #Remove Tags from Data into new column - X Input
data_df['Data_Clean'] = data_df['Data_Clean'].apply(clean_data_2)                       #Second pass over the token lists
data_df['Annot'] = data_df['Data'].apply(annotate_sentence)                             #Annotate Sentences - Y2 Input
data_df = clean_labels(data_df)                                                         #Consolidate labels into 6 classes
label_map = {'NONE':0,"OTHER":1,'PREVENT':2,'CURE':3,'SIDE_EFF':4,'NO_CURE':5}
data_df = data_df.replace({'Label':label_map})                                          #Label encode classes based on label_map
data_df['Tagged_sentence'] = data_df.apply(lambda x: replace_with_tags(x['Data_Clean'], x['Annot']), axis = 1)  #Collapse entities to @TAG$ placeholders

# Build the relation-extraction input: index is reset first, then NONE (0) and
# OTHER (1) rows are dropped, so the remaining index values keep their gaps.
bert_RE_input = data_df[['Tagged_sentence', 'Label']].reset_index(drop=True)
bert_RE_input.columns = ['sentence', 'label']
bert_RE_input = bert_RE_input[bert_RE_input['label'] != 0]
bert_RE_input = bert_RE_input[bert_RE_input['label'] != 1]

In [12]:
#Generate Test Train Split files

r_state = 2
# 80% of the rows go to train; the held-out 20% is then halved into dev and test.
train_df, test_df = train_test_split(bert_RE_input, test_size=0.2, random_state=r_state)
dev_df, test_df = train_test_split(test_df, test_size=0.5, random_state=r_state)

# train and test get a fresh 0..n-1 index (test's index is named 'index');
# dev keeps the index values inherited from bert_RE_input.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True).rename_axis('index')

# Class distribution of each split, in train / dev / test order.
for _split in (train_df, dev_df, test_df):
    print(_split['label'].value_counts())


3    665
2     46
4     27
5      3
Name: label, dtype: int64
3    84
2     9
Name: label, dtype: int64
3    81
2     8
4     3
5     1
Name: label, dtype: int64


In [13]:
%%bash
# Create the directory that will hold the train/dev/test TSV files.
# `-p` keeps the cell re-runnable (plain mkdir errors if the directory exists),
# and `&&` prevents creating the directory in the wrong place if `cd` fails.
cd biobert/datasets/RE && mkdir -p medline_input

In [14]:
#create tsv input file
# train / dev: tab-separated rows with neither a header line nor an index column.
train_df.to_csv('./biobert/datasets/RE/medline_input/train.tsv', sep='\t', header=False, index=False)
dev_df.to_csv('./biobert/datasets/RE/medline_input/dev.tsv', sep='\t', header=False, index=False)
# test: written with the header line and the index column (the pandas defaults,
# spelled out here). NOTE(review): presumably the format medline_run_re.py
# expects for its test file - confirm.
test_df.to_csv('./biobert/datasets/RE/medline_input/test.tsv', sep='\t', header=True, index=True)


In [15]:
# Label name -> integer id (same mapping as the one used to encode data_df).
label_map = {
    'NONE': 0,
    "OTHER": 1,
    'PREVENT': 2,
    'CURE': 3,
    'SIDE_EFF': 4,
    'NO_CURE': 5,
}

print(label_map)
# Class distribution of the full relation-extraction input before splitting.
print(bert_RE_input['label'].value_counts())

{'NONE': 0, 'OTHER': 1, 'PREVENT': 2, 'CURE': 3, 'SIDE_EFF': 4, 'NO_CURE': 5}
3    830
2     63
4     30
5      4
Name: label, dtype: int64


In [16]:
%%bash
# Run medline_run_re.py with training, evaluation and prediction enabled.
# Input data and outputs share the same medline_input directory.
cd biobert
export RE_DIR=./datasets/RE/medline_input
export OUTPUT_DIR=./datasets/RE/medline_input
export TASK_NAME=medline
export BIOBERT_DIR=./biobert_v1.1_pubmed
export model_dir=./biobert_v1.1_pubmed
python medline_run_re.py \
    --task_name=$TASK_NAME \
    --do_train=true \
    --do_eval=true \
    --do_predict=true \
    --vocab_file=$BIOBERT_DIR/vocab.txt \
    --bert_config_file=$BIOBERT_DIR/bert_config.json \
    --init_checkpoint=$BIOBERT_DIR/model.ckpt-1000000 \
    --max_seq_length=128 \
    --train_batch_size=8 \
    --learning_rate=4e-5 \
    --num_train_epochs=10.0 \
    --do_lower_case=false \
    --data_dir=$RE_DIR \
    --output_dir=$OUTPUT_DIR \
    --model_dir=$model_dir





W1202 05:19:15.465004 140040244045696 module_wrapper.py:139] From medline_run_re.py:907: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1202 05:19:15.465259 140040244045696 module_wrapper.py:139] From medline_run_re.py:907: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1202 05:19:15.465738 140040244045696 module_wrapper.py:139] From /content/biobert/modeling.py:92: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W1202 05:19:15.466421 140040244045696 module_wrapper.py:139] From medline_run_re.py:938: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for

In [17]:
#Results

# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded. This still binds the name `sklearn`.
import sklearn.metrics

# Label ids present in the test split (sorted) and their names; test_map maps
# the positional column 0..k-1 of the metrics table to the label name.
id_to_label = {idx: name for name, idx in label_map.items()}
test_labels = sorted(test_df['label'].unique())
test_map = dict(zip(range(len(test_labels)), [id_to_label[x] for x in test_labels]))
print('Train_df size', len(train_df))
print('\nLabel Map - ', label_map)
print('Train_df Distribution')
print(pd.DataFrame(train_df['label']).value_counts(sort=False))

results_pd = pd.read_csv('./biobert/datasets/RE/medline_input/test_results.tsv', sep='\t', header=None)
# Predicted class = position of the largest value in each result row.
# NOTE(review): assumes column j of test_results.tsv corresponds to label id j
# in label_map - confirm against medline_run_re.py.
y_pred = [x.index(max(x)) for x in results_pd.values.tolist()]
y_true = test_df['label'].tolist()
print('\nAccuracy', sklearn.metrics.accuracy_score(y_true,y_pred))
print('Confusion Matrix\n', sklearn.metrics.confusion_matrix(y_true,y_pred))
# NOTE(review): arguments are (y_pred, y_true) here, i.e. swapped relative to the
# printed matrix above, so conf_matrix is its transpose. Left as-is because code
# outside this cell may rely on it.
conf_matrix = sklearn.metrics.confusion_matrix(y_pred,y_true)

# `labels=test_labels` pins the metric columns to exactly the test-set classes,
# in the same order test_map uses. Without it the columns cover the union of
# true and predicted classes, and the positional rename below would mislabel
# them whenever a class is predicted that does not occur in the test set.
df = pd.DataFrame(
    sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, labels=test_labels)
).rename(index={0:'precision', 1:'recall', 2:'fscore', 3:'support'})
df = df.rename(columns=test_map)
print(df)

print('\nTest_df Distribution')
print(pd.DataFrame(y_true).value_counts(sort=False))


Train_df size 741

Label Map -  {'NONE': 0, 'OTHER': 1, 'PREVENT': 2, 'CURE': 3, 'SIDE_EFF': 4, 'NO_CURE': 5}
Train_df Distribution
label
2         46
3        665
4         27
5          3
dtype: int64

Accuracy 0.9354838709677419
Confusion Matrix
 [[ 6  1  1  0]
 [ 1 80  0  0]
 [ 0  2  1  0]
 [ 0  1  0  0]]
            PREVENT       CURE  SIDE_EFF  NO_CURE
precision  0.857143   0.952381  0.500000      0.0
recall     0.750000   0.987654  0.333333      0.0
fscore     0.800000   0.969697  0.400000      0.0
support    8.000000  81.000000  3.000000      1.0

Test_df Distribution
2     8
3    81
4     3
5     1
dtype: int64
