### Raw Data Processing

In [1]:
# import libraries

import os
import io
import re
import csv
from string import punctuation
from time import time
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# download the 'punkt' tokenizer data for NLTK (reported as already up-to-date
# when a local copy exists); presumably required by sent_tokenize / word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Helper Functions

In [3]:
def process_ann(ann_file):
    """Helper function that reads a .ann file,
       strips out newline characters, splits the tab-delimited entries,
       and extracts information for identifying entities and relations
       in corresponding .txt file

       Input:
       ann_file = tab-delimited brat annotation file with the following format
                  NER: [entity_ID]<TAB>[label start_offset end_offset]<TAB>[entity]
                  RE:  [relation_ID]<TAB>[relation_type argument1 argument2]
                  blank lines and rows of any other kind are ignored

       Outputs:
       cleaned_offsets = list of tuples for labeling corresponding .txt file
                         format: (offset, label, entity ID)
                         text between entities is labelled 'O' with entity ID 'X';
                         entities that start while another entity is still open
                         are folded into that outer entity;
                         empty list if the file contains no entities
       relations = list of tuples for extracting relations from corresponding .txt file
                   format: (relation ID, relation_type, entity ID #1, entity ID #2)
                   IDs of folded entities are replaced by the ID of the outer entity"""

    with io.open(ann_file, 'r', encoding='utf-8', errors='ignore') as f:
        rows = [line.strip().split('\t') for line in f]

    # startswith() is simply False for the empty first field of a blank line,
    # whereas indexing its first character would raise IndexError
    ann = [x for x in rows if x[0].startswith('T')]
    rel = [x for x in rows if x[0].startswith('R')]

    # extract information for identifying entities
    # one start and one end event per entity: (offset, rank, kind, label, entity ID)
    events = []

    for x in ann:
        entity_id = x[0]
        span = x[1].split()
        label = span[0]
        start = int(span[1])
        end = int(span[2])

        events.append((start, 1, 'S', label, entity_id))
        # rank 0 puts the end of an entity before the start of another entity
        # at the same offset, so touching entities are never mistaken for
        # overlapping ones; a zero-length entity keeps its start-then-end order
        events.append((end, 0 if end > start else 1, 'E', label, entity_id))

    # sort is stable: remaining ties keep the order of the .ann file
    events.sort(key=lambda e: (e[0], e[1]))

    # walk the events and clean overlapping entries
    cleaned_offsets = []
    corrections = {}   # inner entity ID -> ID of the outer entity it was folded into

    hold = None        # offset tuple waiting to be emitted
    depth = 0          # number of entities currently open

    for offset, _, kind, label, entity_id in events:

        if kind == 'S':
            if depth == 0:
                # a new outermost entity: emit the preceding 'O' stretch (if any)
                if hold is not None:
                    cleaned_offsets.append(hold)
                hold = (offset, label, entity_id)
            else:
                # nested / overlapping entity: fold it into the open one
                corrections[entity_id] = hold[2]
            depth += 1

        elif depth > 0:
            depth -= 1
            if depth == 0:
                # outermost entity closed: emit it and open an 'O' stretch
                cleaned_offsets.append(hold)
                hold = (offset, 'O', 'X')

    if hold is not None:
        cleaned_offsets.append(hold)

    # extract information for identifying relations
    relations = []

    for r in rel:
        relation_id = r[0]
        fields = r[1].split()
        relation_type = fields[0]
        # arguments look like 'Arg1:T18'; keep what follows the role prefix
        entity1 = fields[1].split(':', 1)[-1]
        entity2 = fields[2].split(':', 1)[-1]

        entity1 = corrections.get(entity1, entity1)
        entity2 = corrections.get(entity2, entity2)

        relations.append((relation_id, relation_type, entity1, entity2))

    return cleaned_offsets, relations

In [4]:
def ann_chunker(txt_file, offsets):
    """Helper function that reads in a .txt file as one string,
       divides it based on the cleaned offsets from its .ann file
       and labels chunks with NER tags

       Inputs:
       txt_file = file that contains all the patent text
                  considered as one sentence in this task
       offsets = list of tuples for labeling corresponding .txt file
                 format: (offset, label, entity ID)
                 None entries are ignored; may be empty

       Output:
       ann_chunks = list of annotated chunks based on .ann file offsets
                    format: (chunk, label, entity ID)
                    the text before the first offset always comes first
                    (labelled 'O' / 'X', possibly empty), the text after the
                    last offset always comes last; empty chunks in between
                    are left out"""

    with io.open(txt_file, 'r', encoding='utf-8', errors='ignore') as text:
        full_text = text.read()

    offsets = [entry for entry in offsets if entry is not None]

    # nothing annotated: the whole text is a single unlabelled chunk
    if not offsets:
        return [(full_text, 'O', 'X')]

    ann_chunks = [(full_text[:offsets[0][0]], 'O', 'X')]
    last = len(offsets) - 1

    for i, entry in enumerate(offsets):
        start, label, entity_id = entry[0], entry[1], entry[2]

        if i < last:
            chunk = full_text[start:offsets[i + 1][0]]
            # test the chunk text itself; the (chunk, label, entity ID) tuple
            # is never empty, so testing it would keep every empty chunk
            if chunk:
                ann_chunks.append((chunk, label, entity_id))

        else:
            ann_chunks.append((full_text[start:], label, entity_id))

    return ann_chunks

In [5]:
def relation_input(snippet_id, rel_tup, chunks):
    """Helper function that creates one input snippet for BERT SRE

    Inputs:
    snippet_id = filename of snippet
    rel_tup = tuple from relations list generated by process_ann()
          format: (relation ID, relation_type, entity ID #1, entity ID #2)
    chunks = list of annotated chunks from ann_chunker()
          format: (chunk, label, entity ID)

    Output:
    rel_input = input snippet ready for BERT SRE, as a list of three fields
                format: [snippet_id-relation_id, relation_type, cleaned snippet with ner markers]
                the markers [E1]...[/E1], [E2]...[/E2] are numbered in the
                order in which the two entities appear in chunks"""

    # unpack relation_tup
    relation_id = rel_tup[0]
    relation_type = rel_tup[1]
    entity_list = [rel_tup[2], rel_tup[3]]

    new_id = snippet_id + '-' + relation_id

    # characters removed from every chunk
    punct = r'[,/()":\-\[\]\']'

    # build cleaned snippet with ner markers
    snippets = []
    marker = 1

    for chunk, label, entity in chunks:

        # clean chunk: remove punctuation,
        # word tokenize if not an entity
        # split by whitespace if entity
        if label == 'O':
            nopunct = re.sub(punct, '', chunk.strip())
            processed_chunk = [token
                               for sentence in sent_tokenize(nopunct)
                               for token in word_tokenize(sentence)]

        else:
            # split() breaks on any whitespace run, so tabs or line breaks
            # inside an entity cannot end up in the single-line snippet
            processed_chunk = re.sub(punct, '', chunk).split()

        # add ner markers before and after entities in relation
        if entity in entity_list:
            snippets.append(f'[E{marker}]')
            snippets.extend(processed_chunk)
            snippets.append(f'[/E{marker}]')
            marker += 1

        else:
            snippets.extend(processed_chunk)

    # join snippet chunks to one clean snippet
    cleaned_snippet = ' '.join(snippets)

    return [new_id, relation_type, cleaned_snippet]

In [16]:
def generate_re_files(filepaths, output_path):
    """Helper function that reads .txt and corresponding .ann files from a path
       and generates csv file with snippets ready for BERT SRE (one snippet per line)

       Inputs:
       filepaths = filepaths (folder + filename, but no extension) for .txt and .ann files
                   the last four characters of each path are used as snippet id
       output_path = filepath (folder + filename, but no extension) for output file
                     '.csv' is appended; fields are tab-delimited

       Prints the elapsed time when done; returns nothing"""

    start = time()

    snippets = []

    for path in filepaths:

        snippet_id = path[-4:]

        cleaned_offsets, relations = process_ann(f'{path}.ann')
        chunks = ann_chunker(f'{path}.txt', cleaned_offsets)

        # one output row per annotated relation
        snippets.extend(relation_input(snippet_id, tup, chunks) for tup in relations)

    # newline='' lets the csv module control line endings itself (otherwise
    # an extra '\r' is written on platforms that translate '\n'); utf-8 matches
    # the encoding used when the file is read back
    with io.open(f'{output_path}.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(snippets)

    end = time() - start
    print(f'Finished in {end:.3f} seconds')

#### Test the Helper Functions

In [6]:
# path stem (folder + filename, no extension) of the document used to try out the helpers
test_path = '../raw_data/sample_ee/0015'

In [7]:
# read the whole .txt file into one string and display it
# (errors='ignore' drops bytes that are not valid utf-8)
with io.open(f'{test_path}.txt', 'r', encoding='utf-8', errors='ignore') as text:
    full_text = text.read()
full_text

'[EXAMPLE 12]\nSynthesis of compound I-044\nStep 1 Synthesis of compound 44a\n(R)-Isopropylidene glycerol (3.71 mL, 30.0 mmol) was dissolved in tetrahydrofuran (50.0 mL). The solution was cooled to 0°C. Triphenylphosphine (8.66 g, 33.0 mmol), N-hydroxyphthalimide (5.38 g, 33.0 mmol), and a 2.7 mol/L solution of dimethyl azodicarboxylate in toluene (12.22 mL, 33.0 mmol) were added to the solution. The mixture was stirred at room temperature for 30 minutes. The reaction mixture was concentrated under reduced pressure. Methanol was added to the residue. The mixture was stirred. The obtained solid was collected by filtration, washed by methanol, and dried to afford the compound 44a (1.78 g, yield 21%).'

In [8]:
# read the .ann file as a list of tab-split rows and display the first three
with io.open(f'{test_path}.ann', 'r', encoding='utf-8', errors='ignore') as text:
    ann = [x.strip().split('\t') for x in text.readlines()] # no filtering: every row of the file is kept
ann[:3]

[['T0', 'TEMPERATURE 423 439', 'room temperature'],
 ['T1', 'YIELD_OTHER 684 690', '1.78 g'],
 ['T2', 'YIELD_PERCENT 698 701', '21%']]

In [9]:
# parse the annotation file into cleaned offsets and relation tuples
test_offsets, test_relations = process_ann(f'{test_path}.ann')

In [10]:
# first three offsets, format: (offset, label, entity ID)
test_offsets[:3]

[(9, 'EXAMPLE_LABEL', 'T13'), (11, 'O', 'X'), (35, 'OTHER_COMPOUND', 'T3')]

In [11]:
# first three relations, format: (relation ID, relation_type, entity ID #1, entity ID #2)
test_relations[:3]

[('R0', 'ARG1', 'T18', 'T9'),
 ('R1', 'ARGM', 'T18', 'T1'),
 ('R2', 'ARGM', 'T18', 'T2')]

In [12]:
# cut the document text into labelled chunks at the cleaned offsets
trial_sentence = ann_chunker(f'{test_path}.txt', test_offsets)

In [13]:
# first three chunks, format: (chunk, label, entity ID)
trial_sentence[:3]

[('[EXAMPLE ', 'O', 'X'),
 ('12', 'EXAMPLE_LABEL', 'T13'),
 (']\nSynthesis of compound ', 'O', 'X')]

In [14]:
# build the snippet for the first relation, passing '0000' as the snippet id,
# and display the resulting [id, relation_type, marked-up text] list
trial_snippet = relation_input('0000', test_relations[0], trial_sentence)
trial_snippet

['0000-R0',
 'ARG1',
 'EXAMPLE 12 Synthesis of compound I044 Step 1 Synthesis of compound 44a RIsopropylidene glycerol 3.71 mL 30.0 mmol was dissolved in tetrahydrofuran 50.0 mL . The solution was cooled to 0°C . Triphenylphosphine 8.66 g 33.0 mmol Nhydroxyphthalimide 5.38 g 33.0 mmol and a 2.7 molL solution of dimethyl azodicarboxylate in toluene 12.22 mL 33.0 mmol were added to the solution . The mixture was stirred at room temperature for 30 minutes . The reaction mixture was concentrated under reduced pressure . Methanol was added to the residue . The mixture was stirred . The obtained solid was collected by filtration washed by methanol and dried to [E1] afford [/E1] the compound [E2] 44a [/E2] 1.78 g yield 21% .']

In [17]:
# end-to-end check on the single test document: write ../raw_data/test.csv,
# read it back and display its first two lines
generate_re_files([test_path], '../raw_data/test')
with io.open('../raw_data/test.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
output[:2]

Finished in 0.070 seconds


['0015-R0\tARG1\tEXAMPLE 12 Synthesis of compound I044 Step 1 Synthesis of compound 44a RIsopropylidene glycerol 3.71 mL 30.0 mmol was dissolved in tetrahydrofuran 50.0 mL . The solution was cooled to 0°C . Triphenylphosphine 8.66 g 33.0 mmol Nhydroxyphthalimide 5.38 g 33.0 mmol and a 2.7 molL solution of dimethyl azodicarboxylate in toluene 12.22 mL 33.0 mmol were added to the solution . The mixture was stirred at room temperature for 30 minutes . The reaction mixture was concentrated under reduced pressure . Methanol was added to the residue . The mixture was stirred . The obtained solid was collected by filtration washed by methanol and dried to [E1] afford [/E1] the compound [E2] 44a [/E2] 1.78 g yield 21% .\n',
 '0015-R1\tARGM\tEXAMPLE 12 Synthesis of compound I044 Step 1 Synthesis of compound 44a RIsopropylidene glycerol 3.71 mL 30.0 mmol was dissolved in tetrahydrofuran 50.0 mL . The solution was cooled to 0°C . Triphenylphosphine 8.66 g 33.0 mmol Nhydroxyphthalimide 5.38 g 33.0

#### Process the Raw Data

In [18]:
# generate sample set
path_sample = '../raw_data/sample_ee'
# the set removes the duplicate stem shared by each .txt / .ann pair;
# sorted() fixes the file order so the rows of the csv come out in the same
# order on every run (the iteration order of a set of strings is not stable
# across kernel restarts); hidden files are skipped
filenames_sample = sorted({x[:4] for x in os.listdir(path_sample) if not x.startswith('.')})
filepath_sample = [f'{path_sample}/{x}' for x in filenames_sample]

output_sample = '../data/sre_em/sre_em_sample'
generate_re_files(filepath_sample, output_sample)

# read the file back and count its lines
with io.open(f'{output_sample}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
print(f'Number of sample snippets: {len(output)}')

Finished in 1.938 seconds
Number of sample snippets: 658


In [19]:
# generate filename list for train, dev, and test sets
# each list holds the unique 4-character filename stems of a folder (hidden
# files skipped); sorted() keeps the order identical from run to run, which
# the iteration order of a set of strings does not guarantee
path_train = '../raw_data/EE/ee_train'
filenames_train = sorted({x[:4] for x in os.listdir(path_train) if not x.startswith('.')})
print(f'Number of train files: {len(filenames_train)}')

path_dev = '../raw_data/EE/ee_dev'
filenames_dev = sorted({x[:4] for x in os.listdir(path_dev) if not x.startswith('.')})
print(f'Number of dev files: {len(filenames_dev)}')

path_test = '../raw_data/EE/ee_test'
filenames_test = sorted({x[:4] for x in os.listdir(path_test) if not x.startswith('.')})
print(f'Number of test files: {len(filenames_test)}')

path_test_ann = '../raw_data/EE/ee_test_ann'
filenames_test_ann = sorted({x[:4] for x in os.listdir(path_test_ann) if not x.startswith('.')})
print(f'Number of test .ann files: {len(filenames_test_ann)}')

Number of train files: 900
Number of dev files: 225
Number of test files: 9999
Number of test .ann files: 375


In [20]:
# check how many test .txt files match the .ann files
# sorted() gives the shared stems a reproducible order (a bare set does not)
intersect = sorted(set(filenames_test) & set(filenames_test_ann))
len(intersect)

375

In [21]:
# generate train set
filepath_train = [f'{path_train}/{x}' for x in filenames_train]

output_train = '../data/sre_em/sre_em_train'
generate_re_files(filepath_train, output_train)

# read the file back and count its lines
with io.open(f'{output_train}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
print(f'Number of train snippets: {len(output)}')

Finished in 54.522 seconds
Number of train snippets: 14310


In [22]:
# generate dev set
filepath_dev = [f'{path_dev}/{x}' for x in filenames_dev]

output_dev = '../data/sre_em/sre_em_dev'
generate_re_files(filepath_dev, output_dev)

# read the file back and count its lines
with io.open(f'{output_dev}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
print(f'Number of dev sentences: {len(output)}')

Finished in 11.255 seconds
Number of dev sentences: 3332


In [23]:
# generate test set
# only the test documents that also have an .ann file are used
filepath_test = [f'{path_test}/{x}' for x in intersect]

output_test = '../data/sre_em/sre_em_test'
generate_re_files(filepath_test, output_test)

# read the file back and count its lines; every element returned by
# readlines() is non-empty, so no filtering is needed to count them
with io.open(f'{output_test}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
print(f'Number of test sentences: {len(output)}')

Finished in 21.594 seconds
Number of test sentences: 5803
