### Count NER document length after processing

In [40]:
# import libraries

import os
import io
import re
import csv
from itertools import chain
from string import punctuation
from time import time
from nltk.tokenize import sent_tokenize, word_tokenize

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import numpy as np

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Helper Functions

In [27]:
def process_ann(ann_file):
    r"""Read a brat .ann file and turn its entity lines into labeling offsets.

       Only lines whose ID starts with 'T' are used; every other line
       (relations, blank lines, ...) is skipped.

       Input:
       ann_file = tab-delimited brat annotation file with the following format
                  NER: [entity_ID]\t[label start_offset end_offset]\t[entity]
                  RE:  [relation_ID]\t[relation_type argument1 argument2]

       Outputs:
       cleaned_offsets = list of tuples for labeling corresponding .txt file
                         format: (offset, label, entity ID)
                         the position where an entity ends is recorded as
                         (end_offset, 'O', 'X'); the list is empty when the
                         file contains no entity lines
       corrections = dictionary mapping the ID of an entity that starts while
                     another entity is still open to the ID of that open entity"""

    with io.open(ann_file, 'r', encoding='utf-8', errors='ignore') as text:
        stripped_lines = (line.strip() for line in text)
        # startswith() is safe on blank lines, where indexing the first
        # character of the first field would raise IndexError
        ann = [line.split('\t') for line in stripped_lines if line.startswith('T')]

    offsets = []

    for fields in ann:
        entity_id = fields[0]
        span = fields[1].split()  # [label, start_offset, end_offset]
        label = span[0]
        start = int(span[1])
        end = int(span[2])

        # one start ('S') and one end ('E') boundary per entity
        offsets.append((start, 'S', label, entity_id))
        offsets.append((end, 'E', label, entity_id))

    # stable sort: boundaries sharing an offset keep their file order
    sorted_offsets = sorted(offsets, key=lambda x: x[0])

    cleaned_offsets = []
    corrections = {}

    # hold      = the boundary waiting to be written to cleaned_offsets
    # indicator = kind of the previous boundary: None (nothing seen yet),
    #             'S', 'E', or '*' (a second entity opened inside the held one)
    hold = None
    indicator = None

    for tup in sorted_offsets:

        if indicator == 'S':
            if tup[1] == 'E':
                # the held entity closes: emit its start, hold an 'O' marker
                cleaned_offsets.append(hold)
                hold = (tup[0], 'O', 'X')
                indicator = tup[1]
            elif tup[1] == 'S':
                # a second entity starts before the held one ended:
                # remember which entity it falls under and emit nothing
                corrections.update({tup[3]: hold[2]})
                indicator = '*'

        elif indicator == 'E':
            cleaned_offsets.append(hold)
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

        elif indicator == '*':
            # this boundary is consumed without being emitted
            # (presumably the end of the overlapping entity)
            indicator = 'S'

        else:
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

    # hold is still None when the file had no entity lines; do not emit it
    if hold is not None:
        cleaned_offsets.append(hold)

    return cleaned_offsets, corrections

In [28]:
def ann_chunker(txt_file, offsets):
    """Helper function that reads in a .txt file as one string,
       divides it at the given offsets and labels each chunk with an NER tag

       Inputs:
       txt_file = file that contains all the patent text
                  considered as one sentence in this task
       offsets = list of tuples for labeling corresponding .txt file
                 format: (offset, label, entity ID)
                 an empty list (or None entries) means "no entities"

       Output:
       ann_chunks = list of annotated chunks based on .ann file offsets
                    format: (chunk, label)
                    the text before the first offset is always the first
                    chunk (label 'O') and the text from the last offset on
                    is always the last chunk; empty chunks between two
                    offsets are left out"""

    with io.open(txt_file, 'r', encoding='utf-8', errors='ignore') as text:
        full_text = text.read()

    # drop None placeholders so a document without entities does not crash
    offsets = [entry for entry in offsets if entry is not None]
    if not offsets:
        return [(full_text, 'O')]

    # everything before the first annotated offset is unlabeled text
    ann_chunks = [(full_text[:offsets[0][0]], 'O')]

    # each offset labels the text up to the next offset
    for current, following in zip(offsets, offsets[1:]):
        chunk = full_text[current[0]:following[0]]
        # test the chunk text itself; a (chunk, label) tuple is always truthy
        if chunk:
            ann_chunks.append((chunk, current[1]))

    # the last offset labels the remainder of the document
    ann_chunks.append((full_text[offsets[-1][0]:], offsets[-1][1]))

    return ann_chunks

In [29]:
def bio_labeler(chunks):
    """Tokenize annotated chunks and attach a BIO label to every token

       Inputs:
       chunks = list of annotated chunks based on .ann file offsets
                format: (chunk, label)

       Output:
       bio_doc = list of [token, label] entries; an empty list [] is inserted
                 as a break marker after every '.' token of an 'O' chunk and
                 before an 'O' chunk that begins with a newline"""

    labeled = []

    for text, tag in chunks:

        if tag != 'O':
            # entity chunk: first token gets B-, all following tokens get I-
            for position, token in enumerate(word_tokenize(text)):
                prefix = 'I-' if position else 'B-'
                labeled.append([token, prefix + tag])
            continue

        # unlabeled chunk: a leading newline opens a new break
        if text.startswith('\n'):
            labeled.append([])

        for sentence in sent_tokenize(text.strip()):
            for token in word_tokenize(sentence):
                labeled.append([token, tag])
                if token == '.':
                    labeled.append([])

    return labeled

In [30]:
def count_doc_length(filepaths):
    """Measure the processed length of every document in a list of file stems

       Each stem is run through process_ann() -> ann_chunker() -> bio_labeler();
       the length is the number of entries bio_labeler() returns, i.e. the
       labeled tokens plus the empty break entries it inserts.

       Input:
       filepaths = filepaths (folder + filename, but no extension) for .txt and .ann files

       Output:
       doc_lengths = dictionary of document lengths,
                     keyed by the last four characters of each filepath"""

    doc_lengths = {}

    for stem in filepaths:
        # the overlap corrections returned by process_ann() are not needed here
        offsets, _corrections = process_ann(f'{stem}.ann')
        labeled_tokens = bio_labeler(ann_chunker(f'{stem}.txt', offsets))
        doc_lengths[stem[-4:]] = len(labeled_tokens)

    return doc_lengths

#### Count document lengths

In [64]:
# sample set
# every non-hidden file contributes its first four characters as a document stem;
# the set removes the duplicates coming from files that share a stem
path_sample = 'raw_data/sample_ee'
filenames_sample = list({
    entry[:4]
    for entry in os.listdir(path_sample)
    if not entry.startswith('.')
})
filepath_sample = [f'{path_sample}/{stem}' for stem in filenames_sample]

sample_lengths = count_doc_length(filepath_sample)

print(f'Number of files: {len(sample_lengths)}')
print(f'Minimum snippet length: {min(sample_lengths.values())}')
print(f'Maximum snippet length: {max(sample_lengths.values())}')

Number of files: 50
Minimum snippet length: 48
Maximum snippet length: 318


In [65]:
# train set
# every non-hidden file contributes its first four characters as a document stem;
# the set removes the duplicates coming from files that share a stem
path_train = 'raw_data/EE/ee_train'
filenames_train = list({
    entry[:4]
    for entry in os.listdir(path_train)
    if not entry.startswith('.')
})
filepath_train = [f'{path_train}/{stem}' for stem in filenames_train]

train_lengths = count_doc_length(filepath_train)

print(f'Number of files: {len(train_lengths)}')
print(f'Minimum snippet length: {min(train_lengths.values())}')
print(f'Maximum snippet length: {max(train_lengths.values())}')

Number of files: 900
Minimum snippet length: 36
Maximum snippet length: 1301


In [66]:
# how many documents are larger than BERT base and large?
# NOTE(review): the lengths are NLTK token counts (including the empty break
# entries), not WordPiece counts — confirm the 1022 / 500 cut-offs against the
# sequence limit of the model actually used
train_large = [
    doc_id
    for doc_id, n_tokens in train_lengths.items()
    if n_tokens > 1022
]
print(f'Snippets larger than BERT large: {len(train_large)}, {train_large}')

train_base = [
    doc_id
    for doc_id, n_tokens in train_lengths.items()
    if n_tokens > 500
]
print(f'Snippets larger than BERT base: {len(train_base)}, {train_base}')

Snippets larger than BERT large: 3, ['1123', '0344', '0311']
Snippets larger than BERT base: 11, ['1378', '1123', '0729', '0344', '0110', '1122', '1307', '0311', '0242', '0929', '0532']


In [72]:
# documents in the train set with more than 400 entries
# NOTE(review): the cut-off here is 400, but the printed label still reads
# "BERT base" (the 500 cut-off uses the same label) — confirm the intended wording
train_other = [key for key, value in train_lengths.items() if value > 400]
print(f'Snippets larger than BERT base: {len(train_other)}, {train_other}')

Snippets larger than BERT base: 17, ['1378', '0762', '1123', '1289', '0729', '0344', '0110', '1122', '0411', '0129', '1307', '0311', '0810', '0242', '0929', '0951', '0532']


In [68]:
# dev set
# every non-hidden file contributes its first four characters as a document stem;
# the set removes the duplicates coming from files that share a stem
path_dev = 'raw_data/EE/ee_dev'
filenames_dev = list({
    entry[:4]
    for entry in os.listdir(path_dev)
    if not entry.startswith('.')
})
filepath_dev = [f'{path_dev}/{stem}' for stem in filenames_dev]

dev_lengths = count_doc_length(filepath_dev)

print(f'Number of files: {len(dev_lengths)}')
print(f'Minimum snippet length: {min(dev_lengths.values())}')
print(f'Maximum snippet length: {max(dev_lengths.values())}')

Number of files: 225
Minimum snippet length: 43
Maximum snippet length: 652


In [69]:
# how many documents are larger than BERT base and large?
# NOTE(review): the lengths are NLTK token counts (including the empty break
# entries), not WordPiece counts — confirm the 1022 / 500 cut-offs against the
# sequence limit of the model actually used
dev_large = [
    doc_id
    for doc_id, n_tokens in dev_lengths.items()
    if n_tokens > 1022
]
print(f'Snippets larger than BERT large: {len(dev_large)}, {dev_large}')

dev_base = [
    doc_id
    for doc_id, n_tokens in dev_lengths.items()
    if n_tokens > 500
]
print(f'Snippets larger than BERT base: {len(dev_base)}, {dev_base}')

Snippets larger than BERT large: 0, []
Snippets larger than BERT base: 1, ['0389']


In [70]:
# test set
# only stems that appear in both the text folder and the annotation folder are processed
# NOTE(review): the file paths are built from path_test alone, so both the .txt
# and the .ann file of each stem are read from ee_test — confirm that is intended
path_test = 'raw_data/EE/ee_test'
filenames_test = list({
    entry[:4]
    for entry in os.listdir(path_test)
    if not entry.startswith('.')
})
path_test_ann = 'raw_data/EE/ee_test_ann'
filenames_test_ann = list({
    entry[:4]
    for entry in os.listdir(path_test_ann)
    if not entry.startswith('.')
})
intersect = list(set(filenames_test) & set(filenames_test_ann))
filepath_test = [f'{path_test}/{stem}' for stem in intersect]

test_lengths = count_doc_length(filepath_test)

print(f'Number of files: {len(test_lengths)}')
print(f'Minimum snippet length: {min(test_lengths.values())}')
print(f'Maximum snippet length: {max(test_lengths.values())}')

Number of files: 375
Minimum snippet length: 43
Maximum snippet length: 1155


In [71]:
# how many documents are larger than BERT base and large?
# NOTE(review): the lengths are NLTK token counts (including the empty break
# entries), not WordPiece counts — confirm the 1022 / 500 cut-offs against the
# sequence limit of the model actually used
test_large = [
    doc_id
    for doc_id, n_tokens in test_lengths.items()
    if n_tokens > 1022
]
print(f'Snippets larger than BERT large: {len(test_large)}, {test_large}')

test_base = [
    doc_id
    for doc_id, n_tokens in test_lengths.items()
    if n_tokens > 500
]
print(f'Snippets larger than BERT base: {len(test_base)}, {test_base}')

Snippets larger than BERT large: 1, ['7980']
Snippets larger than BERT base: 3, ['7980', '6846', '1283']
