### Raw Data Processing

In [2]:
# import libraries

import os
import io
import re
import csv
from itertools import chain
from string import punctuation
from time import time
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Helper Functions

In [4]:
def process_ann(ann_file):
    """Helper function that reads a .ann file,
       strips out newline characters, splits the tab-delimited entries,
       and extracts information for labeling corresponding .txt file
       
       Input:
       ann_file = tab-delimited brat annotation file with the following format
                  NER: [entity_ID]\t[label start_offset end_offset]\t[entity]
                  RE:  [relation_ID]\t[relation_type argument1 argument2]
       
       Outputs:
       cleaned_offsets = list of tuples for labeling corresponding .txt file
                         format: (offset, label, entity ID)
       corrections = dictionary of entity ID mappings for overlapping offsets"""
    
    with io.open(ann_file, 'r', encoding='utf-8', errors='ignore') as text:
        ann = [x.strip().split('\t') for x in text.readlines() if x.strip().split('\t')[0][0] == 'T']
    
    offsets = []
    
    for x in ann:
        entity_id = x[0]
        start = int(x[1].split()[1])
        end = int(x[1].split()[2])
        label = x[1].split()[0]
        
        offsets.append((start, 'S', label, entity_id))
        offsets.append((end, 'E', label, entity_id))
    
    sorted_offsets = sorted(offsets, key=lambda x:x[0])
    
    cleaned_offsets = []
    corrections = {}
    
    hold = None
    indicator = None
    
    for tup in sorted_offsets:
        
        if indicator == 'S':
            if tup[1] == 'E':
                cleaned_offsets.append(hold)
                hold = (tup[0], 'O', 'X')
                indicator = tup[1]
            elif tup[1] == 'S':
                corrections.update({tup[3]:hold[2]})
                indicator = '*'
        
        elif indicator == 'E':
            cleaned_offsets.append(hold)
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]
        
        elif indicator == '*':
            indicator = 'S'

        else:
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]
            
    cleaned_offsets.append(hold)
    
    return cleaned_offsets, corrections

In [5]:
def ann_chunker(txt_file, offsets):
    """Helper function that reads in a .txt file as one string,
       divides it based on the cleaned offsets from its .ann file
       and labels chunks with NER tags
       
       Inputs:
       txt_file = file that contains all the patent text
                  considered as one sentence in this task
       offsets = list of tuples for labeling corresponding .txt file
                 format: (offset, label, entity ID)
       
       Output:
       ann_chunks = list of annotated chunks based on .ann file offsets
                    format: (chunk, label)"""
    
    with io.open(txt_file, 'r', encoding='utf-8', errors='ignore') as text:
        full_text = text.read()
    
    start = 0
    end = offsets[0][0]
    label = 'O'
    
    ann_chunks = [(full_text[:end], label)]
    
    for i in range(len(offsets)):
        start = offsets[i][0]
        label = offsets[i][1]
        
        if i < len(offsets) - 1:
            end = offsets[i+1][0]
            term = [(full_text[start:end], label)]
            if term[0]:
                ann_chunks.extend(term)
        
        else:
            term = [(full_text[start:], label)]  
            ann_chunks.extend(term)
    
    return ann_chunks

In [6]:
def bio_labeler(chunks):
    """Helper function that further processes annotated chunks from ann_chunker()
       Tokenizes the chunks and applies BIO labels to each token
       
       Inputs:
       chunks = list of annotated chunks based on .ann file offsets
                format: (chunk, label)
       
       Output:
       bio_doc = document transformed into a list of tokens with bio labels"""
    
    bio_tokens = []
    
    for tup in chunks:
        chunk, label = tup
        
        if label == 'O':
            sentences = sent_tokenize(chunk.strip())
            if sentences:
                for s in sentences:
                    for x in word_tokenize(s):
                        bio_tokens.append([x, label])
                
        else:
            tokens = [x for x in chunk.split(' ')]
            for i in range(len(tokens)):
                if i == 0:
                    bio_tokens.append([tokens[i], 'B-' + label])
                else:
                    bio_tokens.append([tokens[i], 'I-' + label])
    
    return bio_tokens

In [7]:
def generate_ner_files(filepaths, output_path):
    """Helper function that reads .txt and corresponding .ann files from a path
       and generates csv file in CoNLL 2003 format (for use in NER task)
       
       Inputs:
       filepaths = filepaths (folder + filename, but no extension) for .txt and .ann files
       output_path = filepath (folder + filename, but no extension) for output file"""
    
    start = time()
    
    docs = []
    corrections = []
    
    for file in filepaths:
        
        docs.append([f'SNIPPET: {file[-4:]}'])
        corrections.append([f'SNIPPET: {file[-4:]}'])
        
        cleaned_offsets, file_corrections = process_ann(f'{file}.ann')
        corrections.append(file_corrections)
        
        chunks = ann_chunker(f'{file}.txt', cleaned_offsets)
        bio_tokens = bio_labeler(chunks)
        docs.extend(bio_tokens)
    
    with open(f'{output_path}.csv', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(docs)
    
    with open(f'{output_path}_corrections.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(corrections)
    
    end = time() - start
    print(f'Finished in {end:.3f} seconds')

In [22]:
def count_doc_length(filepaths):
    """Helper function that reads .txt and corresponding .ann files from a path
       and counts the number of tokens for each after processing
       
       Input:
       filepaths = filepaths (folder + filename, but no extension) for .txt and .ann files
       
       Output:
       doc_lengths = dictionary of document lengths"""
    
    doc_lengths = {}
    
    for file in filepaths:
        
        cleaned_offsets, file_corrections = process_ann(f'{file}.ann')
        chunks = ann_chunker(f'{file}.txt', cleaned_offsets)
        bio_tokens = bio_labeler(chunks)
        
        #doc_lengths.update({file[-4:]: len(chunks)})
        doc_lengths.update({file[-4:]: len(bio_tokens)})
    
    return doc_lengths

#### Test the Helper Functions

In [8]:
test_path = '../raw_data/sample_ee/0000'

In [9]:
with io.open(f'{test_path}.txt', 'r', encoding='utf-8', errors='ignore') as text:
    full_text = text.read()
full_text

'Example 194\n3-Isobutyl-5-methyl-1-(oxetan-2-ylmethyl)-6-[(2-oxoimidazolidin-1-yl)methyl]thieno[2,3-d]pyrimidine-2,4(1H,3H)-dione (racemate)\n813 mg (1.84 mmol) of the compound from Example 243A were dissolved in 40 ml of dioxane, and 461 mg (2.76 mmol) of CDI were added. The mixture was stirred at RT for 16 h. The reaction solution was then concentrated on a rotary evaporator. The residue was dissolved in 15 ml of DMSO and this solution was purified by means of preparative HPLC (Method 14). Combination of the product fractions and freeze-drying gave 383 mg (42% of theory) of the title compound'

In [10]:
with io.open(f'{test_path}.ann', 'r', encoding='utf-8', errors='ignore') as text:
    ann = [x.strip().split('\t') for x in text.readlines()] #if x.strip().split('\t')[0][0] == 'T']
ann[:3]

[['T0', 'OTHER_COMPOUND 417 421', 'DMSO'],
 ['T1', 'TIME 305 309', '16 h'],
 ['T2', 'REACTION_PRODUCT 585 599', 'title compound']]

In [11]:
test_offsets, test_corrections = process_ann(f'{test_path}.ann')

In [12]:
test_offsets[:3]

[(8, 'EXAMPLE_LABEL', 'T8'), (11, 'O', 'X'), (12, 'REACTION_PRODUCT', 'T6')]

In [13]:
test_corrections

{}

In [14]:
trial_sentence = ann_chunker(f'{test_path}.txt', test_offsets)

In [15]:
trial_sentence[:3]

[('Example ', 'O'), ('194', 'EXAMPLE_LABEL'), ('\n', 'O')]

In [16]:
trial_doc = bio_labeler(trial_sentence)
trial_doc[:3]

[['Example', 'O'],
 ['194', 'B-EXAMPLE_LABEL'],
 ['3-Isobutyl-5-methyl-1-(oxetan-2-ylmethyl)-6-[(2-oxoimidazolidin-1-yl)methyl]thieno[2,3-d]pyrimidine-2,4(1H,3H)-dione',
  'B-REACTION_PRODUCT']]

In [17]:
len(trial_doc)

100

In [18]:
generate_ner_files([test_path], '../raw_data/test')
with io.open('../raw_data/test.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
output[:3]

Finished in 0.051 seconds


['SNIPPET: 0000\n', 'Example\tO\n', '194\tB-EXAMPLE_LABEL\n']

In [19]:
output[0][:8]

'SNIPPET:'

In [20]:
len(output)

101

#### Process the Raw Data

In [23]:
# generate sample set
path_sample = '../raw_data/sample_ee'
filenames_sample = list({x[:4] for x in os.listdir(path_sample) if x[0] != '.'})
filepath_sample = [f'{path_sample}/{x}' for x in filenames_sample]

output_sample = '../data/sample/sample_ner'
generate_ner_files(filepath_sample, output_sample)

with io.open(f'{output_sample}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
check = [1 for x in output if x[:8] == 'SNIPPET:']
print(f'Number of train snippets: {len(check)}')

sample_lengths = count_doc_length(filepath_sample)
print(f'Minimum snippet length: {min(sample_lengths.values())}')
print(f'Maximum snippet length: {max(sample_lengths.values())}')

Finished in 0.579 seconds
Number of train snippets: 50
Minimum snippet length: 38
Maximum snippet length: 212


In [24]:
# generate filename list for train, dev, and test sets
path_train = '../raw_data/EE/ee_train'
filenames_train = list({x[:4] for x in os.listdir(path_train) if x[0] != '.'})
print(f'Number of train files: {len(filenames_train)}')

path_dev = '../raw_data/EE/ee_dev'
filenames_dev = list({x[:4] for x in os.listdir(path_dev) if x[0] != '.'})
print(f'Number of dev files: {len(filenames_dev)}')

path_test = '../raw_data/EE/ee_test'
filenames_test = list({x[:4] for x in os.listdir(path_test) if x[0] != '.'})
print(f'Number of test files: {len(filenames_test)}')

path_test_ann = '../raw_data/EE/ee_test_ann'
filenames_test_ann = list({x[:4] for x in os.listdir(path_test_ann) if x[0] != '.'})
print(f'Number of test .ann files: {len(filenames_test_ann)}')

Number of train files: 900
Number of dev files: 225
Number of test files: 9999
Number of test .ann files: 375


In [25]:
# check how many test .txt files match the .ann files
intersect = list(set(filenames_test) & set(filenames_test_ann))
len(intersect)

375

In [26]:
# generate train set
filepath_train = [f'{path_train}/{x}' for x in filenames_train]

output_train = '../data/ner/ee_ner_train'
generate_ner_files(filepath_train, output_train)

with io.open(f'{output_train}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
check = [1 for x in output if x[:8] == 'SNIPPET:']
print(f'Number of train sentences: {len(check)}')

train_lengths = count_doc_length(filepath_train)
print(f'Minimum snippet length: {min(train_lengths.values())}')
print(f'Maximum snippet length: {max(train_lengths.values())}')

Finished in 15.154 seconds
Number of train sentences: 900
Minimum snippet length: 32
Maximum snippet length: 1053


In [27]:
# how many documents are larger than BERT base and large?
train_large = [key for key, value in train_lengths.items() if value > 1022]
print(f'Snippets larger than BERT large: {len(train_large)}, {train_large}')

train_base = [key for key, value in train_lengths.items() if value > 510]
print(f'Snippets larger than BERT base: {len(train_base)}, {train_base}')

Snippets larger than BERT large: 1, ['0344']
Snippets larger than BERT base: 10, ['1122', '1123', '0929', '1307', '0729', '0311', '1378', '0110', '0532', '0344']


In [34]:
train_other = [key for key, value in train_lengths.items() if value > 900]
print(f'Snippets larger than BERT base: {len(train_other)}, {train_other}')

Snippets larger than BERT base: 3, ['1123', '0311', '0344']


In [29]:
# generate dev set
filepath_dev = [f'{path_dev}/{x}' for x in filenames_dev]

output_dev = '../data/ner/ee_ner_dev'
generate_ner_files(filepath_dev, output_dev)

with io.open(f'{output_dev}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
check = [1 for x in output if x[:8] == 'SNIPPET:']
print(f'Number of train sentences: {len(check)}')

dev_lengths = count_doc_length(filepath_dev)
print(f'Minimum snippet length: {min(dev_lengths.values())}')
print(f'Maximum snippet length: {max(dev_lengths.values())}')

Finished in 4.394 seconds
Number of train sentences: 225
Minimum snippet length: 33
Maximum snippet length: 542


In [30]:
# how many documents are larger than BERT base and large?
dev_large = [key for key, value in dev_lengths.items() if value > 1022]
print(f'Snippets larger than BERT large: {len(dev_large)}, {dev_large}')

dev_base = [key for key, value in dev_lengths.items() if value > 500]
print(f'Snippets larger than BERT base: {len(dev_base)}, {dev_base}')

Snippets larger than BERT large: 0, []
Snippets larger than BERT base: 1, ['0389']


In [31]:
# generate test set
filepath_test = [f'{path_test}/{x}' for x in intersect]

output_test = '../data/ner/ee_ner_test'
generate_ner_files(filepath_test, output_test)

with io.open(f'{output_test}.csv', 'r', encoding='utf-8', errors='ignore') as sample:
    output = sample.readlines()
check = [1 for x in output if x[:8] == 'SNIPPET:']
print(f'Number of train sentences: {len(check)}')

test_lengths = count_doc_length(filepath_test)
print(f'Minimum snippet length: {min(test_lengths.values())}')
print(f'Maximum snippet length: {max(test_lengths.values())}')

Finished in 6.896 seconds
Number of train sentences: 375
Minimum snippet length: 32
Maximum snippet length: 1009


In [32]:
# how many documents are larger than BERT base and large?
test_large = [key for key, value in test_lengths.items() if value > 1022]
print(f'Snippets larger than BERT large: {len(test_large)}, {test_large}')

test_base = [key for key, value in test_lengths.items() if value > 500]
print(f'Snippets larger than BERT base: {len(test_base)}, {test_base}')

Snippets larger than BERT large: 0, []
Snippets larger than BERT base: 3, ['7980', '6846', '1283']
