In [1]:
import json
import nltk
from nltk.corpus import stopwords
import re
from nltk import tokenize
from tqdm import tqdm, tqdm_notebook
from nltk.corpus import words
import glob
from sklearn.utils import shuffle

In [2]:
st = stopwords.words('english')
common_words = words.words()

### Annotation dataset

Labels

1 - GENE

2 - DISEASE

In [4]:
with open("list_genes.txt") as g:
    genes = set([ x.lower().strip() for x in g.readlines()])

In [5]:
st.append("ace")
st.append("large")
st.append("kit")
st.append("impact")
st.append("set")

In [6]:
genes_clean = []
for g in tqdm(genes):
    if g not in st and not g.isdigit() and g not in common_words:
        genes_clean.append(g)

100%|██████████| 16580/16580 [03:56<00:00, 70.86it/s]


In [7]:
len(genes_clean)

16278

In [8]:
len(genes_clean)

16278

In [9]:
with open("list_maladies.txt") as m:
    maladies = list(set([ x.lower().strip() for x in m.readlines()]))
    maladies.remove("disease")

Corpus

In [9]:
datas = open("asthma.json").readlines()
test = open("autism.json/autism.json").readlines()

### Create Training dataset by sentences

In [59]:
%%time
TRAIN_DATA_WORD=[]
train_i = 0
nb = 0
for line in tqdm(datas):
    text = json.loads(line)["ab"]
    train_i += 1
    
    detected_disease = []
    for maladie in maladies:
        if maladie in text:
            detected_disease.append(maladie.lower())
    
    for sentence in tokenize.sent_tokenize(text):
        if len(sentence) > 10:
            uhm = {"entities": []}
            
            for detect in detected_disease:
                if detect in sentence.lower():
                    pmz = [m.start() for m in re.finditer(r'\b%s\b' % re.escape(detect), sentence.lower())]
                    for qs in pmz:
                        uhm["entities"].append( (qs, qs+len(detect), "DISEASE")  )
            
            for n in tokenize.word_tokenize(sentence):
                if n.lower() in genes_clean:
                    pmz = [m.start() for m in re.finditer(r'\b%s\b' % re.escape(n.lower()), sentence.lower())]
                    for qs in pmz:
                        uhm["entities"].append( (qs, qs+len(n), "GENE")  )
            
            TRAIN_DATA_WORD.append( (sentence, uhm) )
    
    if train_i % 2000 == 0:
        with open("train/normal/train_set_"+str(nb)+".json", "w") as t:
            t.write(json.dumps(TRAIN_DATA_WORD))
        nb += 1
        TRAIN_DATA_WORD=[]

100%|████████████████████████████████████████████████████████████████████████████| 44975/44975 [41:47<00:00, 15.75it/s]


Wall time: 41min 47s


### Create test set

In [13]:
%%time
TEST_DATA_WORD=[]
test_i = 0
nb = 0
for line in tqdm(test):
    text = json.loads(line)["ab"]
    test_i += 1
    
    detected_disease = []
    for maladie in maladies:
        if maladie in text:
            detected_disease.append(maladie.lower())
    
    for sentence in tokenize.sent_tokenize(text):
        if len(sentence) > 10:
            uhm = {"entities": []}
            
            for detect in detected_disease:
                if detect in sentence.lower():
                    pmz = [m.start() for m in re.finditer(r'\b%s\b' % re.escape(detect), sentence.lower())]
                    for qs in pmz:
                        uhm["entities"].append( (qs, qs+len(detect), "DISEASE")  )
            
            for n in tokenize.word_tokenize(sentence):
                if n.lower() in genes_clean:
                    pmz = [m.start() for m in re.finditer(r'\b%s\b' % re.escape(n.lower()), sentence.lower())]
                    for qs in pmz:
                        uhm["entities"].append( (qs, qs+len(n), "GENE")  )
            
            TEST_DATA_WORD.append( (sentence, uhm) )
    
    if test_i % 2000 == 0:
        with open("test/normal/test_set_"+str(nb)+".json", "w") as t:
            t.write(json.dumps(TEST_DATA_WORD))
        nb += 1
        TEST_DATA_WORD=[]

100%|████████████████████████████████████████████████████████████████████████████| 16458/16458 [14:05<00:00, 19.47it/s]


Wall time: 14min 5s


### Create train set from NCBItestset_corpus

In [126]:
with open("NCBItrainset_corpus.txt") as g:
    NCBItest = g.readlines()

TEST_NCB = []
art = ["", {"entities":[]}]
text =""
for line in tqdm_notebook(NCBItest):
    
    if "|t|" in line:
        text = text+line.split("|t|")[1]
    
    if "|a|" in line:
        text = text+""+line.split("|a|")[1]
        art[0] = text.decode("utf-8")
        
        for n in tokenize.word_tokenize(text):
                if n.lower() in genes_clean:
                    pmz = [m.start() for m in re.finditer(r'\b%s\b' % re.escape(n.lower()), text.lower())]
                    for qs in pmz:
                        art[1]["entities"].append( (qs, qs+len(n), "GENE")  )
    
    if "Modifier" in line or "SpecificDisease" in line or "DiseaseClass" in line or "CompositeMention" in line:
        d = tokenize.word_tokenize(line)[:-1]
        entity = [int(d[1]), int(d[2]), "DISEASE"]
        art[1]["entities"].append(entity)
    if line == '\n':
        TEST_NCB.append(tuple(art))
        art = ["", {"entities":[]}]
        text=""

HBox(children=(IntProgress(value=0, max=6923), HTML(value=u'')))




In [115]:
TRAIN_DATA = []
files = glob.glob("train/normal/*.json")
for f in files:
    with open(f) as fl:
        js = json.load(fl)
        for j in js:
            TRAIN_DATA.append(tuple(j))

In [129]:
TRAIN_DATA_SAMPLE = shuffle(TRAIN_DATA)[0:1000]

In [130]:
TRAIN_NCBI = shuffle(TRAIN_DATA_SAMPLE + TEST_NCB)

In [131]:
with open("train/normal_NCBI/train_set_0.json", "w") as t:
            t.write(json.dumps(TRAIN_NCBI))

In [132]:
with open("train/normal_NCBI/train_set_NCBI_only.json", "w") as t:
            t.write(json.dumps(TEST_NCB))

In [124]:
# with open("test/normal_NCBI/test_set_NCBI_only.json", "w") as t:
#             t.write(json.dumps(TEST_NCB))

In [127]:
# alld = []
# allg = []
# for dg in TRAIN_DATA_WORD[0:1000]:
#     if dg[1]["entities"] :
#         for d  in dg[1]["entities"]:
#             if d[2] == "DISEASE":
#                 allg.append(dg[0][d[0]:d[1]])
#             if d[2] == "GENE":
# #                 print()
# #                 print(dg[0][d[0]:d[1]], d[2])
# #                 if dg[0][d[0]:d[1]] == "set":
# #                     print(dg)
#                 alld.append(dg[0][d[0]:d[1]])

In [133]:
# set(alld)