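This notebook preprocesses the data in three stages: it converts the MedMentions corpus into sentence-level B/I/O labels, builds an embedding matrix and word index from the pre-trained PubMed word2vec vectors, and converts the NER benchmark corpora into padded index arrays.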

Reference: Gao S, Kotevska O, Sorokine A, Christian JB (2021) A pre-training and self-training approach for biomedical named entity recognition. PLoS ONE 16(2): e0246310. https://doi.org/10.1371/journal.pone.0246310

Code: https://code.ornl.gov/biomedner/biomedner

Preprocessing for MedMentions Dataset

In [None]:
pip install pysbd

Collecting pysbd
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[?25l[K     |████▋                           | 10 kB 14.8 MB/s eta 0:00:01[K     |█████████▏                      | 20 kB 20.7 MB/s eta 0:00:01[K     |█████████████▉                  | 30 kB 12.7 MB/s eta 0:00:01[K     |██████████████████▍             | 40 kB 10.2 MB/s eta 0:00:01[K     |███████████████████████         | 51 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████▋    | 61 kB 6.0 MB/s eta 0:00:01[K     |████████████████████████████████| 71 kB 3.2 MB/s 
[?25hInstalling collected packages: pysbd
Successfully installed pysbd-0.3.4


In [None]:
import re
import gzip
import pickle
import pysbd

In [None]:
# mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# gzip-compressed MedMentions corpus file on the mounted Drive (input of read() below)
medmentions_gz = '/content/drive/MyDrive/corpus_pubtator.txt.gz'

In [None]:
# echo the configured corpus path as a quick sanity check
medmentions_gz

'/content/drive/MyDrive/corpus_pubtator.txt.gz'

In [None]:
# English sentence segmenter, used below to split each abstract into sentences
# NOTE(review): clean=False presumably leaves the text unmodified, which the later
# per-sentence label slicing depends on — confirm against the pysbd documentation
seg = pysbd.Segmenter(language="en", clean=False)

In [None]:
def read(ifile):
    """Iterate over the documents stored in a gzip-compressed annotation file.

    The file is read line by line and decoded as UTF-8. Three kinds of lines
    are recognised:

    * ``<pmid>|t|<text>`` and ``<pmid>|a|<text>`` header lines,
    * tab-separated mention lines
      ``<pmid> <start> <end> <content> <tui[,tui...]> <cui>``,
    * blank lines, which terminate the current document.

    Lines that match none of these are ignored, so a malformed line cannot
    split a document in two. The last document is yielded even when the file
    does not end with a blank line, and blank lines that do not close a
    document (e.g. two in a row) yield nothing.

    Parameters
    ----------
    ifile : str
        Path of the gzip file to read.

    Yields
    ------
    dict
        One dict per document with the keys ``"pmid"`` (int), ``"t"`` and/or
        ``"a"`` (text of the corresponding header lines) and ``"mention"``,
        a list of dicts with the string fields ``"start"``, ``"end"``,
        ``"content"``, ``"cui"`` and the list ``"tui"``.
    """
    # [ta] instead of [t|a]: inside a character class '|' is a literal, not an
    # alternation; [0-9]+ keeps int() below from ever seeing an empty pmid
    HEADER = re.compile(r"(?P<pmid>[0-9]+)\|(?P<t>[ta])\|(?P<content>.*)")
    MENTIONS = re.compile(r"(?P<pmid>[0-9]*)\t(?P<start>[0-9]*)\t(?P<end>[0-9]*)\t(?P<content>.*)\t(?P<tui>(T.+|UnknownType))\t(?P<cui>C[0-9]+)")

    obj = {"mention": []}

    with gzip.open(ifile, 'rb') as fin:
        for raw in fin:
            line = raw.decode("utf-8")

            h = HEADER.match(line)
            if h:
                obj["pmid"] = int(h.group("pmid"))
                obj[h.group("t")] = h.group("content")
                continue

            m = MENTIONS.match(line)
            if m:
                obj["mention"].append({"start": m.group("start"),
                                       "end": m.group("end"),
                                       "content": m.group("content"),
                                       "tui": m.group("tui").split(","),
                                       "cui": m.group("cui")})
                continue

            if line.strip():
                # neither header, mention nor separator: skip the line
                continue

            # blank line: close the current document, if there is one
            if "pmid" in obj:
                yield obj
            obj = {"mention": []}

    # the file may end without a trailing blank line
    if "pmid" in obj:
        yield obj


In [None]:
# helper function for separating non-alphanumeric characters
def add_space(text):
    """Return ``text`` with every character that is neither alphanumeric nor a
    plain space surrounded by single spaces; all other characters are kept."""
    def pad(ch):
        keep = ch.isalnum() or ch == ' '
        return ch if keep else ' ' + ch + ' '

    return ''.join(pad(ch) for ch in text)

In [None]:
# save all data to a dictionary
# filled by the MedMentions loop below: key "<pmid>_<sentence index>" ->
# {'sentence': list of tokens, 'labels': list of 'B'/'I'/'O' tags}
data = {}

In [None]:
# iterate through medmentions gzip file
# For every document the abstract is turned into a parallel "pseudo text" of
# label codes, both are tokenised the same way, and the codes are mapped to
# B/I/O tags, one per token. The result is stored per sentence in `data`.
for obj in read(medmentions_gz):

    # a record without both a title and an abstract cannot be processed;
    # skip it instead of failing with a KeyError
    if 't' not in obj or 'a' not in obj:
        continue

    # load abstract
    title = obj['t']
    pmid = obj['pmid']
    abstract = obj['a']

    # mention offsets are shifted by the title length plus one character
    # (presumably the separator between title and abstract) to obtain
    # positions inside the abstract
    offset = len(title) + 1

    # label codes for each character in the abstract:
    #   '0' alphanumeric outside an entity, '#' any other visible character,
    #   whitespace kept as-is so that tokenisation stays aligned.
    # Using '#' (rather than the character itself) keeps a literal '¢' in the
    # text from being mistaken for the inside-entity code used below.
    filler = ['0' if c.isalnum() else (c if c.isspace() else '#') for c in abstract]

    # keep track of all named entities
    spans = []

    # iterate through each named entity
    for mention in obj["mention"]:
        start = int(mention['start']) - offset
        end = int(mention['end']) - offset
        if start < 0:
            # the mention starts before the abstract; only the abstract is labelled
            continue
        span = abstract[start:end]
        spans.append(span)

        # generate a label code for each character in the named entity
        # this is necessary because named entities with non-alphanumerics will confuse
        # the BERT tokenizer later
        codes = []
        for c in span:
            if c.isalnum():
                codes.append('1')
            elif c.isspace():
                # every kind of whitespace (tab, no-break space, ...) disappears
                # at split() time, so it must not receive a code of its own;
                # otherwise a label without a token shifts all later labels
                codes.append(' ')
            else:
                codes.append('¢')

        # ignore non-alphanumerics at the beginning of a named entity;
        # leading whitespace is stepped over so that the 'B' marker lands on
        # the first real token and is not glued to the preceding word
        for pos,c in enumerate(codes):
            if c == '¢':
                codes[pos] = '#'
            elif c == ' ':
                continue
            else:
                codes[pos] = '2'
                break

        # update label codes for named entity span in abstract
        filler[start:end] = codes

    # convert label codes into pseudo text and tokenise it exactly like the abstract
    filler = add_space(''.join(filler)).split()
    tokens = add_space(abstract).split()

    # convert pseudo text to B,I,O NER labels (decided by the first code of each token)
    labels = []
    for w in filler:
        if w[0] == '2':
            labels.append('B')
        elif w[0] == '1' or w[0] == '¢':
            labels.append('I')
        else:
            labels.append('O')

    # track any mismatches for debugging
    if len(labels) != len(tokens):
        print(labels,tokens,filler)
        print(len(labels),len(tokens),len(filler))

    # sentence boundary detection to convert abstract into individual sentence inputs
    sentences = [s.split() for s in seg.segment(' '.join(tokens))]
    sent_labels = []
    cursor = 0
    for s in sentences:
        sent_labels.append(labels[cursor:cursor+len(s)])
        cursor += len(s)

    # add cleaned text and NER labels to dictionary
    for sent_idx,(s,sent_lab) in enumerate(zip(sentences,sent_labels)):
        uid = str(pmid) + "_" + str(sent_idx)
        data[uid] = {'sentence': s, 'labels': sent_lab}


In [None]:
# save to disk: the complete sentence/label dictionary goes into one pickle file
file_name = '/content/drive/MyDrive/medmentions_ner.pkl'
with open(file_name, mode='wb') as pkl_file:
    pickle.dump(data, pkl_file)

Preprocessing Step for PubMed Word Embeddings

In [None]:
pip install gensim==3.8.3



In [None]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import logging
import pickle
from sklearn.preprocessing import LabelEncoder

In [None]:
# pre-trained PubMed word2vec vectors on the mounted Drive (loaded below with binary=True)
pubmed_w2v = '/content/drive/MyDrive/PubMed-w2v.bin'

In [None]:
#logging setup: INFO and above, prefixed with timestamp and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
#load data
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook
file_name = '/content/drive/MyDrive/medmentions_ner.pkl'
with open(file_name, mode='rb') as pkl_file:
    data = pickle.load(pkl_file)

In [None]:
#extract data: two parallel lists, in the iteration order of the dictionary
sentences = [sample['sentence'] for sample in data.values()]
labels = [sample['labels'] for sample in data.values()]


In [None]:
#find max, mean, and std length of sentences
sentence_lens = list(map(len, sentences))
sen_std = np.std(sentence_lens)
sen_mean = np.mean(sentence_lens)
sen_max = np.amax(sentence_lens)
sen_precentile = np.percentile(sentence_lens,99)
print(sen_std,sen_mean,sen_max,sen_precentile)

# cap every length at max_len, the width of the padded arrays built below
max_len = 100
sentence_lens = [min(n, max_len) for n in sentence_lens]

17.26698073028836 24.946385920804527 483 82.0


In [None]:
#convert labels into integers
# one row per sentence; positions beyond the sentence length stay at -1
label_codes = {'O': 0, 'B': 1, 'I': 2}
labels_notype = np.full((len(labels),max_len), -1.0)

for row,tags in enumerate(labels):

    #convert to label indices without type
    encoded = []
    for tag in tags[:max_len]:
        if tag not in label_codes:
            raise Exception('invalid code')
        encoded.append(label_codes[tag])

    labels_notype[row,:len(encoded)] = encoded


In [None]:
#load word2vec
# reads the vectors in binary word2vec format; the run log below reports a
# (2351706, 200) matrix, i.e. this load is slow and memory-hungry
model = KeyedVectors.load_word2vec_format(pubmed_w2v,binary=True)


2022-04-10 06:24:07,488 : INFO : loading projection weights from /content/drive/MyDrive/PubMed-w2v.bin
2022-04-10 06:24:47,420 : INFO : loaded (2351706, 200) matrix from /content/drive/MyDrive/PubMed-w2v.bin


In [None]:
#save all word embeddings to matrix
# Row layout: row 0 is never assigned to a word (the index arrays built later
# are zero-padded), rows 1..N hold the N pre-trained vectors and row N+1 is
# the vector for unknown words.
# `model` is already a KeyedVectors object, so its vocabulary is read directly;
# going through the `.wv` alias only triggers a deprecation warning in gensim 3.x.
n_words = len(model.vocab)
emb_dim = model.vector_size

# allocate the extra unknown-word row up front instead of copying the whole
# matrix with np.vstack afterwards
vocab = np.zeros((n_words+2,emb_dim))
word2idx = {}

for key,val in model.vocab.items():
    idx = val.index + 1
    vocab[idx,:] = model[key]
    word2idx[key] = idx

#add additional word embedding for unknown words (the last, all-zero row)
unk = n_words + 1

  
  """


In [None]:
#normalize embeddings
# done in place (out=vocab) so that no second copy of the matrix is allocated:
# shift by the global mean, then scale by 2.5 times the global standard deviation
np.subtract(vocab, vocab.mean(), out=vocab)
np.divide(vocab, vocab.std()*2.5, out=vocab)
# row 0 is reset to exactly zero after the shift
vocab[0] = 0

In [None]:
#convert words to indices
# one zero-padded row per sentence, truncated to max_len tokens
text_idx = np.zeros((len(sentences),max_len))
for row,sent in enumerate(sentences):
    # words missing from the vocabulary map to the `unk` index
    ids = [word2idx.get(word, unk) for word in sent[:max_len]]
    text_idx[row,:len(ids)] = ids


In [None]:
#save data
# word -> index dictionary
with open('/content/drive/MyDrive/Pubmed/word2idx.pkl', mode='wb') as pkl_file:
    pickle.dump(word2idx, pkl_file)

# embedding matrix, token index matrix and label matrix, written in this order
arrays_to_save = {
    '/content/drive/MyDrive/Pubmed/vocab.npy': vocab,
    '/content/drive/MyDrive/Pubmed/X_medmentions.npy': text_idx,
    '/content/drive/MyDrive/Pubmed/y_medmentions.npy': labels_notype,
}
for target,array in arrays_to_save.items():
    np.save(target, array)

In [None]:
# sentence lengths (already capped at max_len) that go with the padded arrays
with open('/content/drive/MyDrive/Pubmed/sentence_lens_medmentions.pkl', mode='wb') as pkl_file:
    pickle.dump(sentence_lens, pkl_file)

Data Preprocessing for NER Corpora

In [None]:
import os
import numpy as np
import pickle
import csv

In [None]:
# root folder of the NER corpora: one sub-folder per dataset, each holding <split>.tsv files
data_path = '/content/drive/MyDrive/NERdata'

In [None]:
# csv dialect
class TSV(csv.Dialect):
    """Tab-separated values without any quoting.

    Quote characters are ordinary text (QUOTE_NONE, no doubling), fields are
    separated by a tab, and malformed input raises an error (strict).
    """
    strict = True
    quoting = csv.QUOTE_NONE
    doublequote = False
    delimiter = '\t'
    lineterminator = '\n'

In [None]:
# datasets to test on
# each name must be a sub-folder of data_path
# NOTE(review): the saved output of the conversion cell below lists eight
# datasets (also BC5CDR-chem, BC5CDR-disease, JNLPBA, linnaeus) while this
# list has four — confirm which set is intended
datasets = [
            'BC2GM',
            'BC4CHEMD', 
            'NCBI-disease',
            's800'
           ]

In [None]:
# load dictionaries
with open('/content/drive/MyDrive/Pubmed/word2idx.pkl','rb') as f:
    word2idx = pickle.load(f)

# Word indices run from 1 to len(word2idx) (0 is reserved for padding) and the
# embedding matrix keeps the unknown-word vector in the row directly after the
# last word. len(word2idx) itself is the index of a real word, so the
# unknown-word index has to be one past it.
unk = len(word2idx) + 1
label2idx = {'O':0, 'B':1, 'I': 2}

In [None]:
#save data
# For every dataset and split: read the token/label TSV file, map tokens and
# labels to indices, pad or truncate each sentence to max_len and save the
# index matrices together with the (capped) sentence lengths.

# sentences are truncated / zero-padded to this many tokens
max_len = 50

for dataset in datasets:

    print('preparing',dataset)
    path = os.path.join(data_path,dataset)

    for split in ['train','train_dev','test']:

        all_sentences = []
        all_labels = []

        current_sentence = []
        current_labels = []

        # read in tsv; the with-block closes the file handle, and newline=''
        # is what the csv module asks for when it is given a file object
        with open(os.path.join(path,'%s.tsv' % split), newline='') as tsv_file:
            for row in csv.reader(tsv_file, dialect=TSV):

                if len(row) == 0:
                    # an empty row ends the current sentence; repeated empty
                    # rows must not produce empty sentences
                    if current_sentence:
                        all_sentences.append(current_sentence)
                        all_labels.append(current_labels)
                        current_sentence = []
                        current_labels = []
                else:
                    # first column is the token, second column its label
                    current_sentence.append(word2idx.get(row[0], unk))
                    current_labels.append(label2idx[row[1]])

        # keep the last sentence when the file does not end with an empty row
        if current_sentence:
            all_sentences.append(current_sentence)
            all_labels.append(current_labels)

        # pad everything to the correct max length
        doc_lens = [len(s[:max_len]) for s in all_sentences]

        # NOTE: padded label positions are 0, the same value as 'O'; the saved
        # sentence lengths are what tells real tokens from padding
        X = np.zeros((len(all_sentences),max_len), dtype=np.int32)
        y = np.zeros((len(all_labels),max_len), dtype=np.int32)

        for i,(l,s) in enumerate(zip(doc_lens,all_sentences)):
            X[i,:l] = s[:max_len]
        for i,(l,s) in enumerate(zip(doc_lens,all_labels)):
            y[i,:l] = s[:max_len]

        np.save("/content/drive/MyDrive/Pubmed/%s_X_%s.npy" % (dataset,split),X)
        np.save("/content/drive/MyDrive/Pubmed/%s_y_%s.npy" % (dataset,split),y)
        with open('/content/drive/MyDrive/Pubmed/%s_senlens_%s.pkl' % (dataset,split),'wb') as f:
            pickle.dump(doc_lens,f)


preparing BC2GM
preparing BC4CHEMD
preparing BC5CDR-chem
preparing BC5CDR-disease
preparing JNLPBA
preparing NCBI-disease
preparing linnaeus
preparing s800
