### Raw Data EDA

In [1]:
# import libraries

import os
import io
import re
import csv
from string import punctuation
from time import time
from nltk.tokenize import sent_tokenize, word_tokenize

from statistics import *
import numpy as np

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Helper Functions

In [3]:
def process_ann(ann_file):
    """Helper function that reads a brat .ann file and extracts the
       information needed to label entities and relations in the
       corresponding .txt file.

       Input:
       ann_file = tab-delimited brat annotation file with the following format
                  NER: [entity_ID]\t[label start_offset end_offset]\t[entity]
                  RE:  [relation_ID]\t[relation_type argument1 argument2]
                  Blank lines are skipped.

       Outputs:
       cleaned_offsets = list of tuples for labeling corresponding .txt file
                         format: (offset, label, entity ID)
                         Gaps between entities are emitted as (offset, 'O', 'X').
                         Empty list when the file holds no entities.
       relations = list of tuples for extracting relations from corresponding .txt file
                   format: (relation ID, relation_type, entity ID #1, entity ID #2)
                   An entity that starts inside another entity is replaced by
                   the ID of the enclosing entity."""

    with io.open(ann_file, 'r', encoding='utf-8', errors='ignore') as f:
        # skip blank lines: an empty first field would break the T/R dispatch below
        text = [line.strip().split('\t') for line in f if line.strip()]

    ann = [x for x in text if x[0].startswith('T')]
    rel = [x for x in text if x[0].startswith('R')]

    # extract information for identifying entities
    offsets = []

    for x in ann:
        entity_id = x[0]
        fields = x[1].split()
        label = fields[0]
        start = int(fields[1])
        end = int(fields[2])

        offsets.append((start, 'S', label, entity_id))
        offsets.append((end, 'E', label, entity_id))

    # sort offsets and clean overlapping entries
    # At equal offsets an end marker must come before a start marker, otherwise
    # two adjacent entities (one ends where the next begins) look nested
    # whenever the later entity is listed first in the file.
    sorted_offsets = sorted(offsets, key=lambda x: (x[0], x[1] != 'E'))

    cleaned_offsets = []
    corrections = {}

    hold = None        # pending (offset, label, entity ID) not yet emitted
    indicator = None   # kind of the previous marker: 'S', 'E', or '*' (inside a nested entity)

    for tup in sorted_offsets:

        if indicator == 'S':
            if tup[1] == 'E':
                # entity closed: emit it and open an 'O' gap at its end
                cleaned_offsets.append(hold)
                hold = (tup[0], 'O', 'X')
                indicator = tup[1]
            elif tup[1] == 'S':
                # a second entity starts inside the open one: fold it into the outer entity
                corrections.update({tup[3]: hold[2]})
                indicator = '*'

        elif indicator == 'E':
            cleaned_offsets.append(hold)
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

        elif indicator == '*':
            # swallow the first end marker after a nested start; the next end closes the outer entity
            indicator = 'S'

        else:
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

    # hold stays None when the file has no entity lines; do not emit it
    if hold is not None:
        cleaned_offsets.append(hold)

    # extract information for identifying relations
    relations = []

    for r in rel:
        relation_id = r[0]
        fields = r[1].split()
        relation_type = fields[0]
        # arguments look like 'Arg1:T14'; keep what follows the role prefix
        entity1 = fields[1].partition(':')[2]
        entity2 = fields[2].partition(':')[2]

        entity1 = corrections.get(entity1, entity1)
        entity2 = corrections.get(entity2, entity2)

        relations.append((relation_id, relation_type, entity1, entity2))

    return cleaned_offsets, relations

#### Test the Helper Functions

In [4]:
test_path = '../raw_data/sample_ee/0003'

In [5]:
with io.open(f'{test_path}.txt', 'r', encoding='utf-8', errors='ignore') as text:
    full_text = text.read()
full_text

"Example 51-5 : Preparation of 2'-amino-6-(2-amino-6-morpholinopyrimidin-4-yl)-3'-fluoro-[2,4'-bipyridin]-5-ol (LXXVI)\n6-(2-Amino-6-morpholinopyrimidin-4-yl)-3'-fluoro-5-methoxy-[2,4'-bipyridin]-2'-amine (120 mg, 301.95 µmol) and pyridine hydrochloride (Pyridine HCl) (523.41 mg, 4.53 mmol) were stirred in a sealed tube at 170 °C for 30 min. The resulting mixture was cooled to room temperature, neutralized with 2 N NaOH solution to provide a solid. The solid was filtered, washed with diethylether and dried to give the title compound (72 mg, 62 %)."

In [6]:
# Peek at the raw annotation rows: every line is stripped and split on tabs.
with io.open(f'{test_path}.ann', 'r', encoding='utf-8', errors='ignore') as text:
    ann = [x.strip().split('\t') for x in text.readlines()]
ann[:3]

[['T0', 'TIME 334 340', '30 min'],
 ['T1', 'TEMPERATURE 378 394', 'room temperature'],
 ['T2', 'YIELD_PERCENT 545 549', '62 %']]

In [7]:
test_offsets, test_relations = process_ann(f'{test_path}.ann')

In [8]:
test_offsets[:3]

[(8, 'EXAMPLE_LABEL', 'T9'), (12, 'O', 'X'), (30, 'REACTION_PRODUCT', 'T7')]

In [9]:
test_relations[:3]

[('R0', 'ARG1', 'T14', 'T13'),
 ('R1', 'ARG1', 'T15', 'T12'),
 ('R2', 'ARGM', 'T15', 'T6')]

#### EDA on training data

In [10]:
path_train = '../raw_data/EE/ee_train'
# One id per annotation file (extension stripped, hidden files ignored).
# Sorted so the files are processed in the same order on every run;
# iterating a set of strings directly is not stable across interpreter sessions.
filenames_train = sorted({os.path.splitext(x)[0] for x in os.listdir(path_train)
                          if x.endswith('.ann') and not x.startswith('.')})
print(f'Number of train files: {len(filenames_train)}')

Number of train files: 900


In [11]:
# per-snippet statistics
positive_relations = []   # number of annotated relations
unique_triggers = []      # number of distinct first arguments (trigger words)
entities = []             # number of entities, EXAMPLE_LABEL excluded
example_labels = []       # number of EXAMPLE_LABEL entities

# per-example-label statistic
example_check = []        # 1 if the example label is a second argument of a relation, else 0

# per-trigger statistics
pairs_per_trigger = []    # relations attached to the trigger
before_count = []         # linked entities located before the trigger
span_before = []          # distance (in entities) to the farthest linked entity before
after_count = []          # linked entities located after the trigger
span_after = []           # distance (in entities) to the farthest linked entity after

for file in filenames_train:

    offsets, relations = process_ann(f'{path_train}/{file}.ann')

    # how many positive relations per snippet?
    positive_relations.append(len(relations))

    # how many unique trigger words per snippet?
    arg1 = {rel[2] for rel in relations}
    unique_triggers.append(len(arg1))

    arg2 = {rel[3] for rel in relations}
    example = [off[2] for off in offsets if off[1] == 'EXAMPLE_LABEL']

    # how many example labels per snippet?
    example_labels.append(len(example))

    # is example label ever in a relation?
    example_check.extend(1 if ex in arg2 else 0 for ex in example)

    # how many pairs per trigger word?
    pairs_per_trigger.extend(
        sum(1 for rel in relations if rel[2] == trig) for trig in arg1
    )

    # what is the span?
    entity_order = [off[2] for off in offsets
                    if off[2] != 'X' and off[1] != 'EXAMPLE_LABEL']
    entities.append(len(entity_order))

    for trig in arg1:
        index = entity_order.index(trig)
        linked = [entity_order.index(rel[3]) for rel in relations if rel[2] == trig]

        earlier = [pos for pos in linked if pos < index]
        later = [pos for pos in linked if pos > index]

        if earlier:
            before_count.append(len(earlier))
            span_before.append(index - min(earlier))

        if later:
            after_count.append(len(later))
            span_after.append(max(later) - index)

In [12]:
print(f'Is example label ever in a relation? {set(example_check)}')
print(f'Max example labels in train set: {max(example_labels)}')

Is example label ever in a relation? {0}
Max example labels in train set: 2


In [13]:
print(f'Total positive relations in train set: {sum(positive_relations)}')
print()
print(f'Total unique trigger words in train set: {sum(unique_triggers)}')
print(f'Total number of entities in train set (excluding EXAMPLE_LABEL): {sum(entities)}')
print()
print(f'Median unique trigger words in train set: {median(unique_triggers):.1f}')
print(f'Average unique trigger words in train set: {mean(unique_triggers):.1f}')
print(f'Max unique trigger words in train set: {max(unique_triggers)}')
print()
print(f'Median pairs per trigger word in train set: {median(pairs_per_trigger):.1f}')
print(f'Average pairs per trigger word in train set: {mean(pairs_per_trigger):.1f}')
print(f'Max pairs per trigger word in train set: {max(pairs_per_trigger)}')
print()
print(f'Median entities in train set: {median(entities):.1f}')
print(f'Average entities in train set: {mean(entities):.1f}')
print(f'Max entities in train set: {max(entities)}')

Total positive relations in train set: 14310

Total unique trigger words in train set: 6852
Total number of entities in train set (excluding EXAMPLE_LABEL): 22297

Median unique trigger words in train set: 7.0
Average unique trigger words in train set: 7.6
Max unique trigger words in train set: 62

Median pairs per trigger word in train set: 2.0
Average pairs per trigger word in train set: 2.1
Max pairs per trigger word in train set: 13

Median entities in train set: 24.0
Average entities in train set: 24.8
Max entities in train set: 182


In [14]:
print('# of pairs   count')

# one row per possible number of pairs, from 1 up to the observed maximum
for n_pairs in range(1, max(pairs_per_trigger) + 1):
    print(f'{n_pairs:^10}   {pairs_per_trigger.count(n_pairs)}')

# of pairs   count
    1        2985
    2        1674
    3        1395
    4        410
    5        253
    6        92
    7        24
    8        14
    9        1
    10       2
    11       0
    12       0
    13       2


In [15]:
print(f'Median entities before trigger word in train set: {median(before_count):.1f}')
print(f'Average entities before trigger word in train set: {mean(before_count):.1f}')
print(f'Max entities before trigger word in train set: {max(before_count)}')
print()
print(f'Median entities after trigger word in train set: {median(after_count):.1f}')
print(f'Average entities after trigger word in train set: {mean(after_count):.1f}')
print(f'Max entities after trigger word in train set: {max(after_count)}')

Median entities before trigger word in train set: 2.0
Average entities before trigger word in train set: 1.8
Max entities before trigger word in train set: 8

Median entities after trigger word in train set: 2.0
Average entities after trigger word in train set: 1.8
Max entities after trigger word in train set: 13


In [16]:
more_than_before = [x for x in before_count if x > 5]
more_than_after = [x for x in after_count if x > 5]

print(f'Number of trigger words with more than 5 linked entities before: {len(more_than_before)}')
print(f'Number of trigger words with more than 5 linked entities after: {len(more_than_after)}')

Number of trigger words with more than 5 linked entities before: 17
Number of trigger words with more than 5 linked entities after: 16


In [17]:
print(f'Median span before trigger word in train set: {median(span_before):.1f}')
print(f'Average span before trigger word in train set: {mean(span_before):.1f}')
print(f'Max span before trigger word in train set: {max(span_before)}')
print()
print(f'Median span after trigger word in train set: {median(span_after):.1f}')
print(f'Average span after trigger word in train set: {mean(span_after):.1f}')
print(f'Max span after trigger word in train set: {max(span_after)}')

Median span before trigger word in train set: 2.0
Average span before trigger word in train set: 1.9
Max span before trigger word in train set: 8

Median span after trigger word in train set: 2.0
Average span after trigger word in train set: 1.8
Max span after trigger word in train set: 13


In [18]:
span_more_than_before = [x for x in span_before if x > 5]
span_more_than_after = [x for x in span_after if x > 5]

print(f'Number of trigger words with span more than 5 linked entities before: {len(span_more_than_before)}')
print(f'Number of trigger words with span more than 5 linked entities after: {len(span_more_than_after)}')

Number of trigger words with span more than 5 linked entities before: 18
Number of trigger words with span more than 5 linked entities after: 26


In [19]:
# Each span is the distance (in entities) from a trigger to its farthest linked
# entity on that side, so span - 5 is the number of entity positions that fall
# outside a +/- 5 window for that trigger.
# NOTE(review): not every position past the window has to hold a linked entity,
# so the total below reads as an upper bound on the relations lost, not an exact count.
before_leftout = [x-5 for x in span_more_than_before]
after_leftout = [x-5 for x in span_more_than_after]

total_leftout = sum(before_leftout) + sum(after_leftout)
fraction_leftout = total_leftout / sum(positive_relations)

print(f'Number of relations left out: {total_leftout}')
print(f'Fraction left out: {fraction_leftout:.3f}')

Number of relations left out: 77
Fraction left out: 0.005


#### EDA revisited with updated processing function

In [22]:
def process_ann2(ann_file, window=5):
    """Helper function that reads a brat .ann file and extracts the
       information needed to label entities and relations in the
       corresponding .txt file.
       Also adds negative relations: every (trigger, entity) pair within
       +/- `window` entities of a trigger word that is not annotated as a
       positive relation.

       Inputs:
       ann_file = tab-delimited brat annotation file with the following format
                  NER: [entity_ID]\t[label start_offset end_offset]\t[entity]
                  RE:  [relation_ID]\t[relation_type argument1 argument2]
                  Blank lines are skipped.
       window = number of entities considered on each side of a trigger
                when generating negative pairs (default 5)

       Outputs:
       cleaned_offsets = list of tuples for labeling corresponding .txt file
                         format: (offset, label, entity ID)
                         Gaps between entities are emitted as (offset, 'O', 'X').
                         Empty list when the file holds no entities.
       relations = list of tuples for extracting relations from corresponding .txt file
                   format: (relation ID, relation_type, entity ID #1, entity ID #2)
                   Positive relations come first, followed by negatives with
                   IDs 'N0', 'N1', ... and relation_type 'NONE'. Negatives are
                   numbered in document order of their trigger, so the IDs are
                   the same on every run."""

    with io.open(ann_file, 'r', encoding='utf-8', errors='ignore') as f:
        # skip blank lines: an empty first field would break the T/R dispatch below
        text = [line.strip().split('\t') for line in f if line.strip()]

    ann = [x for x in text if x[0].startswith('T')]
    rel = [x for x in text if x[0].startswith('R')]

    # extract information for identifying entities
    offsets = []

    for x in ann:
        entity_id = x[0]
        fields = x[1].split()
        label = fields[0]
        start = int(fields[1])
        end = int(fields[2])

        offsets.append((start, 'S', label, entity_id))
        offsets.append((end, 'E', label, entity_id))

    # sort offsets and clean overlapping entries
    # At equal offsets an end marker must come before a start marker, otherwise
    # two adjacent entities look nested whenever the later one is listed first.
    sorted_offsets = sorted(offsets, key=lambda x: (x[0], x[1] != 'E'))

    cleaned_offsets = []
    corrections = {}

    hold = None        # pending (offset, label, entity ID) not yet emitted
    indicator = None   # kind of the previous marker: 'S', 'E', or '*' (inside a nested entity)

    for tup in sorted_offsets:

        if indicator == 'S':
            if tup[1] == 'E':
                cleaned_offsets.append(hold)
                hold = (tup[0], 'O', 'X')
                indicator = tup[1]
            elif tup[1] == 'S':
                # a second entity starts inside the open one: fold it into the outer entity
                corrections.update({tup[3]: hold[2]})
                indicator = '*'

        elif indicator == 'E':
            cleaned_offsets.append(hold)
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

        elif indicator == '*':
            # swallow the first end marker after a nested start
            indicator = 'S'

        else:
            hold = (tup[0], tup[2], tup[3])
            indicator = tup[1]

    # hold stays None when the file has no entity lines; do not emit it
    if hold is not None:
        cleaned_offsets.append(hold)

    # extract information for identifying relations
    relations = []
    positives = set()   # (entity1, entity2) pairs, for fast membership tests

    # add positive relations
    for r in rel:
        relation_id = r[0]
        fields = r[1].split()
        relation_type = fields[0]
        # arguments look like 'Arg1:T14'; keep what follows the role prefix
        entity1 = fields[1].partition(':')[2]
        entity2 = fields[2].partition(':')[2]

        entity1 = corrections.get(entity1, entity1)
        entity2 = corrections.get(entity2, entity2)

        relations.append((relation_id, relation_type, entity1, entity2))
        positives.add((entity1, entity2))

    # negative relations
    # entities in document order, without 'O' gaps and example labels
    ordered = [(x[2], x[1]) for x in cleaned_offsets
               if (x[2] != 'X' and x[1] != 'EXAMPLE_LABEL')]
    entity_order = [entity for entity, _ in ordered]
    trigger_positions = [i for i, (_, label) in enumerate(ordered)
                         if label in ('WORKUP', 'REACTION_STEP')]
    trigger_indices = set(trigger_positions)

    negatives = []

    # walk triggers in document order so negative IDs are reproducible
    for index in trigger_positions:
        t = entity_order[index]

        # candidate positions: nearest-first before the trigger, then nearest-first after
        find_span = [index - (i + 1) for i in range(window)] + \
                    [index + (i + 1) for i in range(window)]

        for i in find_span:
            # stay inside the snippet and never pair a trigger with another trigger
            if i < 0 or i >= len(entity_order) or i in trigger_indices:
                continue
            pair = (t, entity_order[i])
            # only add to negatives if not in positives
            if pair not in positives:
                negatives.append(pair)

    # add negatives in the same format as positives
    for i, pair in enumerate(negatives):
        relations.append((f'N{i}', 'NONE', pair[0], pair[1]))

    return cleaned_offsets, relations

In [23]:
# number of generated negative relations (IDs starting with 'N') per snippet
negative_relations = []

for file in filenames_train:
    offsets, relations = process_ann2(f'{path_train}/{file}.ann')
    negative_relations.append(sum(1 for rel in relations if rel[0][0] == 'N'))

In [24]:
print(f'Total negative relations in train set: {sum(negative_relations)}')
print()
print(f'Median negative relations in train set: {median(negative_relations):.1f}')
print(f'Average negative relations in train set: {mean(negative_relations):.1f}')
print(f'Max negative relations in train set: {max(negative_relations)}')

Total negative relations in train set: 31511

Median negative relations in train set: 34.0
Average negative relations in train set: 35.0
Max negative relations in train set: 314


#### Truncated snippet lengths

In [25]:
def ann_chunker(txt_file, offsets):
    """Helper function that reads in a .txt file as one string,
       divides it based on the cleaned offsets from its .ann file
       and labels chunks with NER tags

       Inputs:
       txt_file = file that contains all the patent text
                  considered as one sentence in this task
       offsets = list of tuples for labeling corresponding .txt file
                 format: (offset, label, entity ID)
                 Each chunk runs from its own offset to the next tuple's offset.

       Output:
       ann_chunks = list of annotated chunks based on .ann file offsets
                    format: (chunk, label, entity ID)
                    The text before the first offset is always the first chunk,
                    labeled ('O', 'X'); the text from the last offset to the end
                    of the file is always the last chunk. Empty chunks between
                    those two are dropped. With no offsets at all, the whole
                    text is returned as a single ('O', 'X') chunk."""

    with io.open(txt_file, 'r', encoding='utf-8', errors='ignore') as text:
        full_text = text.read()

    # nothing annotated: the whole text is one unlabeled chunk
    if not offsets:
        return [(full_text, 'O', 'X')]

    # leading text before the first annotated offset
    ann_chunks = [(full_text[:offsets[0][0]], 'O', 'X')]

    last = len(offsets) - 1

    for i in range(len(offsets)):
        start, label, entity_id = offsets[i][:3]

        if i < last:
            chunk = full_text[start:offsets[i + 1][0]]
            # skip zero-width chunks (e.g. the gap between two adjacent entities)
            if chunk:
                ann_chunks.append((chunk, label, entity_id))

        else:
            ann_chunks.append((full_text[start:], label, entity_id))

    return ann_chunks

In [26]:
def relation_input(snippet_id, rel_tup, chunks):
    """Build one relation-classification input from an annotated snippet.

    Inputs:
    snippet_id = filename of snippet
    rel_tup = tuple from a relations list
              format: (relation ID, relation_type, entity ID #1, entity ID #2)
    chunks = list of (chunk, label, entity ID) tuples from ann_chunker()

    Output:
    list of three items:
        [snippet_id + '-' + relation ID,
         relation_type,
         cleaned snippet as one space-joined string]
    In the cleaned snippet the two entities of the relation are wrapped in
    [E1] ... [/E1] and [E2] ... [/E2]; the numbers follow the order in which
    the entities appear in the text, not their argument position."""

    relation_id, relation_type = rel_tup[0], rel_tup[1]
    marked_entities = (rel_tup[2], rel_tup[3])

    new_id = snippet_id + '-' + relation_id

    # characters removed from every chunk before tokenizing
    strip_pattern = r'[,/()":\-\[\]\']'

    pieces = []
    marker = 1

    for text, tag, entity in chunks:

        if tag == 'O':
            # free text: sentence-split, then word-tokenize each sentence
            cleaned = re.sub(strip_pattern, '', text.strip())
            words = [word
                     for sentence in sent_tokenize(cleaned)
                     for word in word_tokenize(sentence)]
        else:
            # entity text: split on single spaces only, dropping empty pieces
            cleaned = re.sub(strip_pattern, '', text)
            words = [word for word in cleaned.split(' ') if word]

        if entity not in marked_entities:
            pieces.extend(words)
            continue

        # wrap entities that take part in the relation in numbered markers
        pieces.append(f'[E{marker}]')
        pieces.extend(words)
        pieces.append(f'[/E{marker}]')
        marker += 1

    return [new_id, relation_type, ' '.join(pieces)]

In [27]:
# token count (split on single spaces) of every generated relation input
snippet_lengths = []

for file in filenames_train:

    snippet_id = file[-4:]

    cleaned_offsets, relations = process_ann2(f'{path_train}/{file}.ann')
    chunks = ann_chunker(f'{path_train}/{file}.txt', cleaned_offsets)

    snippet_lengths.extend(
        len(relation_input(snippet_id, rel_tup, chunks)[2].split(' '))
        for rel_tup in relations
    )

In [28]:
print(f'Median snippet length in train set: {median(snippet_lengths)}')
print(f'Average snippet length in train set: {mean(snippet_lengths):.1f}')
print(f'Max snippet length in train set: {max(snippet_lengths)}')

Median snippet length in train set: 132
Average snippet length in train set: 175.6
Max snippet length in train set: 969


In [29]:
print(f'Total number of snippets: {len(snippet_lengths)}')

# how many snippets exceed each candidate maximum length?
for threshold in (200, 250, 300, 400, 500):
    n_over = sum(1 for length in snippet_lengths if length > threshold)
    print(f'Number of snippets with length > {threshold}: {n_over}')

Total number of snippets: 45821
Number of snippets with length > 200: 7752
Number of snippets with length > 250: 4378
Number of snippets with length > 300: 3974
Number of snippets with length > 400: 3168
Number of snippets with length > 500: 2773
