In [1]:
import csv
import numpy as np
import pandas as pd

def preprocess_conll(data):

    """
    Quick preprocessing on the CONLL data so it can be combined
    with the Emerging Entities data (which doesn't need any
    preprocessing).

    Takes `data`, which is a list of strings read in from
    a CONLL data file, and returns a new list in which:

    * every line containing 'DOCSTART' has been dropped, and
    * every token line has been reduced to TOKEN<TAB>LABEL<NEWLINE>,
      built from its first and fourth whitespace-separated fields.

    Blank lines are kept as a bare newline. A line counts as blank when
    it holds no non-whitespace characters, so whitespace-only lines and
    an empty final line are treated as blank instead of crashing.

    Raises IndexError if a token line has fewer than four fields.
    """

    processed = []
    for line in data:
        # Remove DOCSTART lines to make CONLL data consistent
        # with the Emerging Entities dataset
        if 'DOCSTART' in line:
            continue

        # Split once and reuse the fields for both the blank-line test
        # and the token/label extraction.
        fields = line.split()
        if not fields:
            processed.append('\n')
            continue

        # Add appropriate tabbing and spacing to match EE data
        processed.append(fields[0] + '\t' + fields[3] + '\n')

    return processed

def _append_with_sentence_break(combined, lines):
    """Append `lines` to `combined` in place, ending `combined` with a blank line first."""
    if combined and lines:
        # Terminate a last line that lost its newline at end-of-file ...
        if not combined[-1].endswith('\n'):
            combined[-1] += '\n'
        # ... and make sure a blank line separates the two chunks, so the
        # last sentence of one file cannot fuse with the first of the next.
        if combined[-1].strip():
            combined.append('\n')
    combined.extend(lines)


def create_combined_en_dataset(dataset_path_list, combined_path):

    r"""
    Takes a dataset_path_list of English dataset files and a
    combined_path, which is a path string describing where to save
    the data.

    A path is treated as a CONLL file when it is exactly
    './data/en/CONLL2003/' followed by 'test.txt', 'train.txt' or
    'valid.txt'; those files are run through preprocess_conll. Every
    other path is read as-is (Emerging Entities format).

    The output contains all non-CONLL lines first, followed by all
    CONLL lines, regardless of the order of dataset_path_list. Several
    files of the same kind are concatenated, and either kind may be
    absent. A blank line is inserted between chunks when the preceding
    chunk does not already end with one.

    Each line should look like this: TOKEN\tLABEL\n.
    See example below.
    ['EU\tB-ORG\n',
    'rejects\tO\n',
    'German\tB-MISC\n',
    'call\tO\n',
    'to\tO\n',
    'boycott\tO\n',
    'British\tB-MISC\n',
    'lamb\tO\n',
    '.\tO\n',
    '\n', ...]

    Writes combined_path, prints a success message and returns None.
    """

    # Built once rather than on every loop iteration.
    conll_files = {'./data/en/CONLL2003/' + name
                   for name in ('test.txt', 'train.txt', 'valid.txt')}

    # Start from empty lists so a missing dataset kind is not a NameError.
    conll_data, ee_data = [], []
    for path in dataset_path_list:
        with open(path, 'r') as dataset:
            lines = dataset.readlines()

        if path in conll_files:
            _append_with_sentence_break(conll_data, preprocess_conll(lines))
        else:
            _append_with_sentence_break(ee_data, lines)

    # Combine the datasets: Emerging Entities first, then CONLL
    combined = []
    _append_with_sentence_break(combined, ee_data)
    _append_with_sentence_break(combined, conll_data)

    # Write out to specified path
    with open(combined_path, 'w+') as new:
        new.writelines(combined)

    # Print success message
    print('Combined {} and saved new dataset to {}.'
         .format(dataset_path_list, combined_path))

    return None


def map_to_standardized_labels(label):

    """
    Meant to be used w/ pd.apply().

    Collapses the differing label vocabularies of the source datasets
    (e.g. "person"/"location" spelled out vs. "PER"/"LOC") onto one
    shared set: LOC, PER, DIS, ORG and MISC.

    Matching is a case-insensitive substring test, tried in the order
    of the rule table below; the first rule that matches wins. The
    first two characters of the incoming label (the 'B-' or 'I-' part)
    are carried over unchanged. Missing values and the literal 'O' are
    returned as they came in; any other unmatched label becomes MISC.
    """

    # Missing values (blank rows) pass straight through.
    if pd.isna(label):
        return label

    lowered = label.lower()
    prefix = label[:2]  # keeps the 'B-' or 'I-' part of the label

    # (substrings to look for, standardized class), checked top to bottom
    rules = (
        (('loc',), 'LOC'),
        (('per',), 'PER'),
        (('problem', 'treatment', 'test'), 'DIS'),
        (('org', 'corp', 'group'), 'ORG'),
    )
    for needles, standardized in rules:
        for needle in needles:
            if needle in lowered:
                return prefix + standardized

    # Leftovers: 'O' stays 'O', everything else is mapped to MISC.
    if label == 'O':
        return label
    return prefix + 'MISC'


def standardize_labels_and_save(dataset_file_list):

    """
    Standardizes the labels for each dataset and saves the result
    next to the input, under the same base name with a '.txt'
    extension (so a '.txt' input is overwritten in place).

    For every path in `dataset_file_list` the file is read as a
    headerless table whose columns are separated by a single space or
    a tab, column 1 is passed through map_to_standardized_labels, and
    the table is written back space-separated without header, index
    or quoting. Blank input lines are kept as rows
    (skip_blank_lines=False).

    Prints the path that was written for each file and returns None.
    """

    # Local import: only needed here, to split off the file extension.
    import os

    for file in dataset_file_list:

        # `sep`, `quoting`, and skip_blank_lines args help preserve data structure
        data_df = pd.read_table(
                    file, header=None, skip_blank_lines=False,
                    sep=' |\t', quoting=csv.QUOTE_NONE, engine='python'
                    ).replace([None], np.nan)

        data_df[1] = data_df[1].apply(map_to_standardized_labels)

        # Strip whatever extension the input has instead of assuming it is
        # exactly four characters long, then report the path actually written.
        output_path = os.path.splitext(file)[0] + '.txt'
        data_df.to_csv(output_path, header=False, index=False,
                        sep=' ', quoting=csv.QUOTE_NONE)

        print(f'Saved standardized data to {output_path}.')

    return None


In [2]:
# Source directories of the two English corpora and the output directory.
conll_path = './data/en/CONLL2003/'
ee_path = './data/en/emerging_entities_17/'

combined_path = './data/en/'

# One row per split, in processing order:
# (split name, CONLL file name, Emerging Entities file name)
split_sources = [
    ("train", "train.txt", "wnut17train.conll"),
    ("valid", "valid.txt", "emerging.dev.conll"),
    ("test", "test.txt", "emerging.test.annotated"),
]

dataset_filenames = [row[0] + "_combined.txt" for row in split_sources]
dataset_file_list = [combined_path + fn for fn in dataset_filenames]

# Build the training, validation and test sets, one combined file each.
for split_row, combined_file in zip(split_sources, dataset_file_list):
    conll_file, ee_file = split_row[1], split_row[2]
    create_combined_en_dataset(
        [conll_path + conll_file, ee_path + ee_file],
        combined_file,
    )

# Standardize the labels on all 3 combined datasets
standardize_labels_and_save(dataset_file_list)

Combined ['./data/en/CONLL2003/train.txt', './data/en/emerging_entities_17/wnut17train.conll'] and saved new dataset to ./data/en/train_combined.txt.
Combined ['./data/en/CONLL2003/valid.txt', './data/en/emerging_entities_17/emerging.dev.conll'] and saved new dataset to ./data/en/valid_combined.txt.
Combined ['./data/en/CONLL2003/test.txt', './data/en/emerging_entities_17/emerging.test.annotated'] and saved new dataset to ./data/en/test_combined.txt.
Saved standardized data to ./data/en/train_combined.txt.
Saved standardized data to ./data/en/valid_combined.txt.
Saved standardized data to ./data/en/test_combined.txt.


In [3]:
def load_data(file_path):
    ''' Converts data from:
    word \\t label \\n word \\t label \\n \\n word \\t label
    to: [sentence, {'entities': [(start, end, label), (start, end, label)]}]

    Each line with at least two tab-separated fields is a token line:
    field 0 is the word and field 1 is its label. Every label other
    than 'O' gets the suffix "_MED". Any line with a single field
    (normally an empty line) ends the current sentence.

    A sentence is the words joined by single spaces. Each token whose
    suffixed label is exactly 'B_MED' or 'I_MED' contributes its own
    (start, end, label) tuple, where start/end are character offsets
    of that word inside the joined sentence. Sentences without any
    such entity are skipped. A sentence still pending at end of file
    (no trailing blank line) is kept rather than discarded.

    Returns (training_data, unique_labels): the list of
    [sentence, {'entities': [...]}] pairs and the distinct non-'O'
    suffixed labels in order of first appearance.
    '''
    training_data, unique_labels = [], []
    entities, words = [], []
    end = 0  # offset just past the previous word and its trailing space

    # `with` guarantees the handle is closed even if parsing raises.
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.strip("\n").split("\t")

            # lines with more than one field are words
            if len(fields) > 1:
                word, label = fields[0], fields[1]
                if label != 'O':
                    label = label + "_MED"
                words.append(word)
                start = end
                end += len(word) + 1  # length of the word + trailing space

                # one span per tagged token; end - 1 drops the trailing space
                if label in ('B_MED', 'I_MED'):
                    entities.append((start, end - 1, label))

                if label != 'O' and label not in unique_labels:
                    unique_labels.append(label)

            # single-field lines are breaks between sentences
            else:
                if entities:
                    training_data.append([" ".join(words), {'entities': entities}])
                # reset the counter and temporary lists
                entities, words = [], []
                end = 0

    # Flush the last sentence when the file has no trailing blank line.
    if entities:
        training_data.append([" ".join(words), {'entities': entities}])

    return training_data, unique_labels



# Corpus directories under ./data/Disease/, in the order they are merged.
disease_corpora = ['BC2GM', 'BC4CHEMD', 'BC5CDR-chem', 'BC5CDR-disease',
                   'JNLPBA', 'linnaeus', 'NCBI-disease', 's800']

# Every corpus ships the same three files; only the directory differs.
train_filenames = ['./data/Disease/' + corpus + '/train.tsv' for corpus in disease_corpora]
test_filenames = ['./data/Disease/' + corpus + '/test.tsv' for corpus in disease_corpora]
val_filenames = ['./data/Disease/' + corpus + '/train_dev.tsv' for corpus in disease_corpora]
def _merge_files(source_paths, target_path):
    """Write the contents of every file in `source_paths`, in order, to `target_path`.

    Two newline characters are written after each source file, so the
    next file's first line starts after a blank line.
    """
    with open(target_path, 'w') as merged:
        for source_path in source_paths:
            with open(source_path) as source:
                merged.write(source.read())
            merged.write("\n")
            merged.write("\n")


# Merge the per-corpus files into one file per split.
_merge_files(train_filenames, 'data/Disease/train.txt')
_merge_files(test_filenames, './data/Disease/test.txt')
_merge_files(val_filenames, './data/Disease/valid.txt')

# Parse each merged split and report how many sentences it yielded.
# Only the label set of the training split is kept.
TRAIN_DATA, LABELS = load_data("./data/Disease/train.txt")
print(len(TRAIN_DATA))
TEST_DATA, _ = load_data("./data/Disease/test.txt")
print(len(TEST_DATA))
VALID_DATA, _ = load_data("./data/Disease/valid.txt")
print(len(VALID_DATA))





# ADD -MED tags
def _add_med_suffix(path):
    """Rewrite the token file at `path` in place, tagging its labels with "-MED".

    Input lines with at least two tab-separated fields are written back
    as "WORD LABEL" (space-separated); every label other than 'O' gets
    the suffix "-MED". Lines with a single field are written back as
    empty lines. Fields beyond the second are dropped.

    NOTE: the input must still be tab-separated. Running this twice on
    the same file finds no tabs the second time, so every line is then
    treated as a single-field line and written out empty.
    """
    # Read everything and close the handle before reopening for writing.
    with open(path, 'r') as source:
        raw_lines = source.readlines()

    tagged_lines = []
    for raw_line in raw_lines:
        fields = raw_line.strip("\n").split("\t")
        if len(fields) > 1:
            word, label = fields[0], fields[1]
            if label != 'O':
                label = label + "-MED"
            tagged_lines.append(word + ' ' + label)
        else:
            # sentence break
            tagged_lines.append("")

    with open(path, 'w') as target:
        for tagged_line in tagged_lines:
            target.write(tagged_line)
            target.write("\n")


for med_path in ('./data/Disease/train.txt',
                 './data/Disease/test.txt',
                 './data/Disease/valid.txt'):
    _add_med_suffix(med_path)

44934
25688
70965


In [5]:
import nltk
from nltk.corpus import stopwords
dis_list = []
dis_dict = {}
Med_Bank = ["./data/Disease/train.txt", "./data/Disease/valid.txt", "./data/Disease/test.txt"]
stop_words = set(stopwords.words('english'))

# Build the medical word bank: every token that carries a non-'O' label,
# is purely alphabetic and is not an English stop word. `dis_dict` maps
# the token to its label; a later occurrence overwrites an earlier one.
for filenames in Med_Bank:
    with open(filenames) as f_in:
        for line in f_in:
            a = line.split()
            if not a:
                # blank / whitespace-only line
                continue
            if a[1] != "O" and a[0].isalpha() and a[0].lower() not in stop_words:
                dis_list.append(a[0])
                dis_dict[a[0]] = a[1]
print("Med_Bank created!")
samples = dis_list


def _relabel_en_with_med_bank(source_path, target_path, bank):
    """Copy `source_path` to `target_path`, replacing labels of tokens found in `bank`.

    Each output line is "TOKEN LABEL". The label is bank[TOKEN] when the
    token is a key of `bank`, otherwise the label already in the file.
    Lines that are empty after stripping, or that consist of a lone 'O',
    are written as empty lines.

    Raises IndexError for any other line with fewer than two tokens.
    """
    relabeled = []
    with open(source_path, "rt") as fin:
        for line in fin:
            if line.strip() in ('', 'O') or line.lower() == 'o':
                relabeled.append("\n")
                continue
            a = line.split()
            token, label = a[0], a[1]
            # Dict lookup instead of scanning the (duplicate-laden) token
            # list: the dict keys are exactly the tokens in that list.
            relabeled.append(token + " " + bank.get(token, label) + "\n")

    # Join once instead of growing one big string line by line.
    with open(target_path, "wt") as fout:
        fout.write("".join(relabeled))


for split in ("train", "valid", "test"):
    _relabel_en_with_med_bank(
        "./data/en/" + split + "_combined.txt",
        "./data/" + split + "_en.txt",
        dis_dict,
    )
    print(split + " processed!")

Med_Bank created
train processed!
valid processed!
test processed!


In [11]:
import nltk
from nltk.corpus import stopwords
en_list = []
en_dict = {}
En_Bank = ["./data/en/train_combined.txt", "./data/en/valid_combined.txt", "./data/en/test_combined.txt"]
stop_words = set(stopwords.words('english'))

# Build the English word bank: every token that carries a non-'O' label,
# is purely alphabetic and is not an English stop word. `en_dict` maps
# the token to its label; a later occurrence overwrites an earlier one.
for filenames in En_Bank:
    with open(filenames) as f_in:
        for raw_line in f_in:
            line = raw_line.rstrip()
            # Skip blank lines and lines that hold nothing but a lone 'O'.
            if not line or line.strip() == 'O' or line.lower() == 'o':
                continue
            a = line.split()
            if a[1] != "O" and a[0].isalpha() and a[0].lower() not in stop_words:
                en_list.append(a[0])
                en_dict[a[0]] = a[1]
print("En_Bank created!")
samples = en_list


def _relabel_med_with_en_bank(source_path, target_path, bank):
    """Copy `source_path` to `target_path`, replacing labels of tokens found in `bank`.

    Each output line is "TOKEN LABEL". The label is bank[TOKEN] when the
    token is a key of `bank`, otherwise the label already in the file.
    Lines that are empty after stripping, or that consist of a lone 'O',
    are written as empty lines.

    Raises IndexError for any other line with fewer than two tokens.
    """
    relabeled = []
    with open(source_path, "rt") as fin:
        for line in fin:
            if line.strip() in ('', 'O') or line.lower() == 'o':
                relabeled.append("\n")
                continue
            a = line.split()
            token, label = a[0], a[1]
            # Dict lookup instead of scanning the (duplicate-laden) token
            # list: the dict keys are exactly the tokens in that list.
            relabeled.append(token + " " + bank.get(token, label) + "\n")

    # Join once instead of growing one big string line by line.
    with open(target_path, "wt") as fout:
        fout.write("".join(relabeled))


for split in ("train", "valid", "test"):
    _relabel_med_with_en_bank(
        "./data/Disease/" + split + ".txt",
        "./data/" + split + "_med.txt",
        en_dict,
    )
    print(split + " processed!")

En_Bank created!
train processed!
valid processed!
test processed!


In [17]:
def _read_lines(path):
    """Return all lines of the text file at `path`, line endings included."""
    with open(path, 'r') as handle:
        return handle.readlines()


def _write_lines(path, lines):
    """Create or overwrite the text file at `path` with `lines`, written verbatim."""
    with open(path, 'w+') as handle:
        handle.writelines(lines)


# Relabelled medical splits
train_med_dir = './data/train_med.txt'
valid_med_dir = './data/valid_med.txt'
test_med_dir = './data/test_med.txt'
train_med_data = _read_lines(train_med_dir)
valid_med_data = _read_lines(valid_med_dir)
test_med_data = _read_lines(test_med_dir)

# Relabelled English splits
train_en_dir = './data/train_en.txt'
valid_en_dir = './data/valid_en.txt'
test_en_dir = './data/test_en.txt'
train_en_data = _read_lines(train_en_dir)
valid_en_data = _read_lines(valid_en_dir)
test_en_data = _read_lines(test_en_dir)

# Append each medical split to its English counterpart (in place, so the
# *_en_data lists hold the merged data afterwards).
for en_lines, med_lines in ((train_en_data, train_med_data),
                            (valid_en_data, valid_med_data),
                            (test_en_data, test_med_data)):
    en_lines.extend(med_lines)

# Final datasets: English lines first, then medical lines.
_write_lines('./data/train.txt', train_en_data)
_write_lines('./data/valid.txt', valid_en_data)
_write_lines('./data/test.txt', test_en_data)