In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
def annotate_sentence(s):
    """Turn a role-tagged sentence into one BIO-style label per word.

    The sentence is split on single spaces. Any token that starts with a
    ``<...>`` marker is treated as a tag and is removed from the result.
    Markers are paired in order of appearance (1st with 2nd, 3rd with 4th, ...);
    the words strictly between a pair are labelled ``B-<kind>`` for the first
    word and ``I-<kind>`` for the rest, where ``<kind>`` is
    ``reduce_tag(name of the first marker in the pair)``. Every other word is
    labelled ``'O'``. A trailing marker without a partner labels nothing.

    Example input:
        <TREAT> Intravenous immune globulin </TREAT> for <DIS> recurrent spontaneous abortion </DIS> .
    Example output:
        ['B-TREAT', 'I-TREAT', 'I-TREAT', 'O', 'B-DIS', 'I-DIS', 'I-DIS', 'O']

    Input:
        s: str, space-separated sentence with inline role tags
    Output:
        list of str, one label per non-tag token
    """
    tokens = s.split(' ')

    # Collect (position, tag name with any '/' stripped) for every tag token.
    markers = []
    for pos, tok in enumerate(tokens):
        found = re.match('<(.*?)>', tok)
        if found:
            markers.append((pos, found.group(1).replace('/', '')))
    marker_positions = {pos for pos, _ in markers}

    # Start from all-'O' and overwrite the words enclosed by each marker pair.
    labels = ['O'] * len(tokens)
    for (start, name), (stop, _) in zip(markers[0::2], markers[1::2]):
        kind = reduce_tag(name)
        for offset, idx in enumerate(range(start + 1, stop)):
            prefix = 'B' if offset == 0 else 'I'
            labels[idx] = f"{prefix}-{kind}"

    # Drop the slots that belonged to the tag tokens themselves.
    return [label for idx, label in enumerate(labels) if idx not in marker_positions]
    
def reduce_tag(x):
    """Collapse a tag name to its coarse kind.

    Returns 'DIS' if x contains 'DIS', otherwise 'TREAT' if x contains
    'TREAT', otherwise None. 'DIS' is checked first, so a name containing
    both resolves to 'DIS'.
    """
    for kind in ('DIS', 'TREAT'):
        if kind in x:
            return kind
    return None

In [4]:
def clean_labels(data):
    """
    Drop rows labelled TO_SEE and merge the remaining labels into coarser classes.
    Labels that are not keys of the mapping are left unchanged. The input frame
    is not modified; the surviving rows keep their original index.
    Input:
        Data: Pandas Dataframe with a 'Label' column
    Output:
        Data: Pandas Dataframe
    """
    label_dict = {
        'NONE': 'NONE',
        'DISONLY': 'OTHER',
        'TREATONLY': 'OTHER',
        'PREVENT': 'PREVENT',
        'VAGUE': 'OTHER',
        'TO_SEE': 'OTHER',
        'TREAT_FOR_DIS': 'CURE',
        'SIDE_EFF': 'SIDE_EFF',
        'TREAT_NO_FOR_DIS': 'NO_CURE',
    }
    keep = data['Label'] != 'TO_SEE'
    return data[keep].replace({'Label': label_dict})

In [5]:
# Read the corpus; each line has the form "<role-tagged sentence>||<relation label>".
with open('sentences_with_roles_and_relations.txt', encoding = "ISO-8859-1") as f:
    lines = f.readlines()

data_df = pd.DataFrame({'Data': lines})
# Relation label (Y1 input): the text after '||', with the newline removed.
data_df['Label'] = data_df['Data'].apply(lambda x: x.split('||')[1].replace('\n', ''))
# Tagged sentence: the text before '||'.
data_df['Data'] = data_df['Data'].apply(lambda x: x.split('||')[0].strip())

# Overwrite the sentence at position 872 with its hand-corrected version BEFORE
# deriving anything from it, so Data_Clean and Annot are built from the corrected
# text rather than from the raw line. The write is a single positional assignment
# on the frame: the chained form data_df['Data'].iloc[872] = ... may write to a
# temporary object and leaves data_df unchanged under pandas Copy-on-Write.
data_df.iloc[872, data_df.columns.get_loc('Data')] = '<TREATONLY> Primary thrombolytic treatment </TREATONLY> ( within 24 hours of diagnosis ) was given to 169 patients ( 23.5 % ) , whereas the remaining 550 patients were initially treated with <TREATONLY> heparin </TREATONLY> alone .'

# Token list with tag markup stripped (X input).
# NOTE(review): a token that is only a tag becomes '' here but is removed entirely
# by annotate_sentence, so Data_Clean and Annot can differ in length - confirm that
# downstream code drops the empty strings.
data_df['Data_Clean'] = data_df['Data'].apply(lambda x: [re.sub('<.*?>', '', y) for y in x.split(' ')])
# Per-word entity labels (Y2 input).
data_df['Annot'] = data_df['Data'].apply(annotate_sentence)

# Remove TO_SEE rows and merge the remaining relation labels.
data_df = clean_labels(data_df)
print(data_df.shape)

(3580, 4)


0
