In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def annotate_sentence(s):
    #Annotates Sentence by replacing words with respective tags and the rest with Os
    #Example Input: <TREAT> Intravenous immune globulin </TREAT> for <DIS> recurrent spontaneous abortion </DIS> .
    #Example Output: ['B-TREAT', 'I-TREAT', 'I-TREAT', O, 'B-DIS', 'I-DIS', 'I-DIS', O]
    s = s.split(' ')
    subtag = []
    for x in range(len(s)):
        m = re.match('<(.*?)>', s[x])
        if m:
            subtag.append([x, m.group(1).replace('/', '')])
    subtag_index =  [[x[0], y[0], reduce_tag(x[1])] for x, y in zip(subtag[::2], subtag[1::2])]
    subtag_loc = [x[0] for x in subtag]
    replaced = []
    for x in subtag_index:
        for z,y in enumerate(range(x[0]+1, x[1]),1):
            s[y] = f"{'B' if z == 1 else 'I'}-{x[2]}"
            replaced.append(y)
    replaced_tags = replaced + subtag_loc
    retag = [y for y in [x for x in range(len(s))] if y not in replaced_tags]
    for x in retag:
        s[x] = 'O'
    for x in subtag_loc[::-1]:
        del s[x]
    return s
    
def reduce_tag(x):
    return 'DIS' if 'DIS' in x else 'TREAT' if 'TREAT' in x else None

In [3]:
with open('sentences_with_roles_and_relations.txt', encoding = "ISO-8859-1") as f:      #Read in Data
    lines = f.readlines()

data_df = pd.DataFrame({'Data':lines})                                                  #Convert to Dataframe
data_df['Label'] = data_df['Data'].apply(lambda x: x.split('||')[1].replace('\n', ''))  #Split Label into new column - Y1 Input
data_df['Data'] = data_df['Data'].apply(lambda x: x.split('||')[0].strip())             #Split Data into new column
data_df['Data_Clean'] = data_df['Data'].apply(lambda x: [re.sub('<.*?>', '', y ) for y in x.split(' ')]) #Remove Tags from Data into new column - X Input
data_df['Annot'] = data_df['Data'].apply(lambda x: annotate_sentence(x))                #Annotate Sentences - Y2 Input
print(data_df.shape)

(3655, 4)


In [4]:
data_df

Unnamed: 0,Data,Label,Data_Clean,Annot
0,All live births > or = 23 weeks at the Univers...,NONE,"[All, live, births, >, or, =, 23, weeks, at, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,The total cesarean rate was 14.4 % ( 344 of 23...,NONE,"[The, total, cesarean, rate, was, 14.4, %, (, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Abnormal presentation was the most common indi...,NONE,"[Abnormal, presentation, was, the, most, commo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,The `` corrected '' cesarean rate ( maternal-f...,NONE,"[The, ``, corrected, '', cesarean, rate, (, ma...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"Furthermore , when all deliveries were analyze...",NONE,"[Furthermore, ,, when, all, deliveries, were, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
3650,Special report : <TREAT> pressure-reducing sup...,TREAT_FOR_DIS,"[Special, report, :, , pressure-reducing, supp...","[O, O, O, B-TREAT, I-TREAT, I-TREAT, O, O, O, ..."
3651,<TREAT> Intravenous immune globulin </TREAT> f...,TREAT_FOR_DIS,"[, Intravenous, immune, globulin, , for, , rec...","[B-TREAT, I-TREAT, I-TREAT, O, B-DIS, I-DIS, I..."
3652,<TREAT> External counterpulsation </TREAT> for...,TREAT_FOR_DIS,"[, External, counterpulsation, , for, treatmen...","[B-TREAT, I-TREAT, O, O, O, B-DIS, I-DIS, I-DI..."
3653,<TREAT> Intra-articular hyaluronan injections ...,TREAT_FOR_DIS,"[, Intra-articular, hyaluronan, injections, , ...","[B-TREAT, I-TREAT, I-TREAT, O, O, O, B-DIS, O,..."


In [5]:
s = "based on the results of three well-designed studies demonstrating the vaccine 's safety , immunogenicity , and efficacy , the <TREAT_PREV> vaccine </TREAT_PREV> is safe and effective for active immunization of children < 2 years of age against <DIS_PREV> invasive disease caused by seven streptococcus pneumoniae serotypes </DIS_PREV> included in the vaccine."
print(annotate_sentence(s))


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TREAT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DIS', 'I-DIS', 'I-DIS', 'I-DIS', 'I-DIS', 'I-DIS', 'I-DIS', 'I-DIS', 'O', 'O', 'O', 'O']


In [6]:
x = 3651
s = data_df[x:x+1]['Data'][x]
print(s)
print(len(s.split(' ')))
print(s.split(' '))

<TREAT> Intravenous immune globulin </TREAT> for <DIS> recurrent spontaneous abortion </DIS> .
12
['<TREAT>', 'Intravenous', 'immune', 'globulin', '</TREAT>', 'for', '<DIS>', 'recurrent', 'spontaneous', 'abortion', '</DIS>', '.']


In [7]:
print(annotate_sentence(s))
print(len(annotate_sentence(s)))

['B-TREAT', 'I-TREAT', 'I-TREAT', 'O', 'B-DIS', 'I-DIS', 'I-DIS', 'O']
8
