In [1]:
# import libraries
import time
from dateparser.search import search_dates
import re
import json
import requests
import pandas as pd 
import datefinder
from tqdm import tqdm
import numpy as np
import datetime
import pytz
import spacy
import multiprocessing
from nltk import tokenize
import concurrent.futures
import winprocess
from concurrent.futures import as_completed, ProcessPoolExecutor

In [2]:
# document from which dates need to be extracted
# NOTE(review): this string looks like a placeholder for the real report text -- confirm
doc = "Given medical Report"

In [28]:
# load the required spaCy pipelines and print how long loading took (in seconds)
start = time.time()
# med7 is the pipeline called by med_terms
med7 = spacy.load("en_core_med7_lg")
# nlp is the pipeline called by date_spacy and similarity_check_dates
nlp = spacy.load("en_core_web_lg")
end = time.time()
print(end - start)

12.371695041656494


In [4]:
# function to extract unique dates
def extract_unique_dates(doc):
    """Return the distinct date substrings that ``search_dates`` finds in ``doc``.

    Parameters
    ----------
    doc : str
        Text to scan for dates.

    Returns
    -------
    list
        The first element of every tuple returned by ``search_dates``, in
        order of first appearance and with repeats removed. Empty when
        nothing is found.
    """
    # search_dates yields None (not an empty list) when it finds no date,
    # which would otherwise make the iteration below raise TypeError.
    matches = search_dates(doc) or []
    # dict.fromkeys keeps first-seen order and removes repeats in one pass.
    return list(dict.fromkeys(match[0] for match in matches))

In [5]:
# function to find the start index of every occurrence of each entity
def find_starting_index(list_entity,doc):
    """Locate every occurrence of each entity in ``doc`` and return the start offsets.

    Parameters
    ----------
    list_entity : iterable
        Entities to look for. Each item is converted with ``str()`` before
        searching, so non-string items are searched by their text form.
    doc : str
        Text to search in.

    Returns
    -------
    list of list of int
        One inner list per entity (same order as ``list_entity``) holding the
        start offset of each occurrence; the inner list is empty when the
        entity does not occur.
    """
    start_index = []
    for entity in list_entity:
        # Escape the text so characters such as '.', '+', '(' or '*' are
        # matched literally instead of being interpreted as regex syntax.
        pattern = re.escape(str(entity))
        start_index.append([m.start() for m in re.finditer(pattern, doc)])
    return start_index

In [6]:
# function to find end index 
def find_ending_index(list_entity,doc):
    """Locate every occurrence of each entity in ``doc`` and return the end offsets.

    Parameters
    ----------
    list_entity : iterable
        Entities to look for. Each item is converted with ``str()`` before
        searching, so non-string items are searched by their text form.
    doc : str
        Text to search in.

    Returns
    -------
    list of list of int
        One inner list per entity (same order as ``list_entity``) holding the
        end offset (exclusive) of each occurrence; the inner list is empty
        when the entity does not occur.
    """
    end_index = []
    for entity in list_entity:
        # Escape the text so characters such as '.', '+', '(' or '*' are
        # matched literally instead of being interpreted as regex syntax.
        pattern = re.escape(str(entity))
        end_index.append([m.end() for m in re.finditer(pattern, doc)])
    return end_index

In [7]:
# function to get the dimensions of a nested list (length at each nesting level, following the first element)
def arr_dimen(a):
    """Return the length of ``a`` at each nesting level, following the first element.

    Parameters
    ----------
    a : object
        Possibly nested list. Anything that is not a list contributes no
        dimension.

    Returns
    -------
    list of int
        ``[len(a), len(a[0]), len(a[0][0]), ...]``; ``[]`` when ``a`` is not
        a list. An empty list contributes a ``0`` and ends the descent.
    """
    if not isinstance(a, list):
        return []
    if not a:
        # An empty list has no first element to descend into; indexing a[0]
        # here would raise IndexError.
        return [0]
    return [len(a)] + arr_dimen(a[0])

In [8]:
# function to find unique values from list and return in array
def unique(list1):
    """Return the sorted distinct values of ``list1`` as a NumPy array."""
    values = np.array(list1)
    distinct_values = np.unique(values)
    return distinct_values

In [9]:
# function to find unique values from list and return in list
def unique_list(list1):
    """Return the items of ``list1`` without repeats, keeping first-seen order.

    Membership is tested with ``in`` on a list, so items only need to support
    equality comparison (they do not have to be hashable).
    """
    kept = []
    for item in list1:
        if item in kept:
            # already collected earlier
            continue
        kept.append(item)
    return kept

In [10]:
# function to get mean of starting and ending index
def get_mean_distance(starting_index_list, ending_index_list):
    """Return the midpoint of every (start, end) pair.

    Parameters
    ----------
    starting_index_list, ending_index_list : list
        Either two flat lists of numbers, or two lists of lists (one inner
        list per entity). The input is treated as nested when the first
        element of ``starting_index_list`` is itself a list.

    Returns
    -------
    list
        Midpoints ``(start + end) / 2`` with the same nesting as the input.
        Empty input, or empty inner lists, yield empty lists instead of
        raising ``IndexError``.
    """
    def _midpoints(starts, ends):
        # Midpoint of each start/end pair of one flat list.
        return [(g + h) / 2 for g, h in zip(starts, ends)]

    # Decide on nesting without indexing into a possibly empty list.
    is_nested = (
        isinstance(starting_index_list, list)
        and len(starting_index_list) > 0
        and isinstance(starting_index_list[0], list)
    )
    if is_nested:
        return [
            _midpoints(starting_index_list[i], ending_index_list[i])
            for i in range(len(starting_index_list))
        ]
    return _midpoints(starting_index_list, ending_index_list)

In [11]:
# function to calculate distance between entity and date
def calculate_distance(index_date_list,index_entity_list,unique_list_dates):
    """Build a table of distances between one entity and every date occurrence.

    Parameters
    ----------
    index_date_list : list of list of number
        For each date (same order as ``unique_list_dates``), the positions of
        its occurrences.
    index_entity_list : list of number
        Position of the entity; only the first element is used.
    unique_list_dates : list
        Date labels, parallel to ``index_date_list``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` and ``Distance`` with one row per date occurrence;
        dates without any occurrence contribute no row.
    """
    labels = []
    gaps = []
    for position in tqdm(range(len(index_date_list))):
        # A date with no occurrence has an empty list and adds nothing.
        for date_position in index_date_list[position]:
            labels.append(unique_list_dates[position])
            gaps.append(abs(date_position - index_entity_list[0]))
    return pd.DataFrame({'Date': labels, 'Distance': gaps})

In [12]:
# function to sort the dates by distance (closest first)
def find_date_based_distance(df_with_date_distance):
    """Return a copy of the frame ordered by its ``Distance`` column, smallest first."""
    return df_with_date_distance.sort_values(by='Distance', ascending=True)

In [13]:
# function to perform basic data cleaning on medical report
def basic_clean(doc):
    """Replace bracket and asterisk characters with spaces and collapse whitespace.

    The characters ``[ ] { } ( ) *`` become spaces; every run of whitespace is
    then reduced to a single space and leading/trailing whitespace is removed.
    """
    # Collapsing whitespace once, after the substitution, gives the same text
    # as collapsing before and after: the substitution only introduces spaces.
    without_brackets = re.sub(r"[\[\*\]\{\}\(\)]", " ", doc)
    tokens = without_brackets.split()
    return " ".join(tokens)

In [14]:
# function to extract date manually from medical report
def extract_Date_manual(doc):
    """Extract candidate date strings from ``doc`` with ``datefinder``.

    Parameters
    ----------
    doc : str
        Text to scan.

    Returns
    -------
    list of str
        The source substrings reported by ``datefinder.find_dates`` (second
        element of each match when ``source=True``), de-duplicated in
        first-seen order, after dropping candidates that contain ``"."``,
        consist only of digits, or contain ``"of"`` or ``"at"``.

    Notes
    -----
    If ``calendar.IllegalMonthError`` is raised while matches are being
    produced, the matches collected so far are kept and "Caught it!" is
    printed.

    NOTE(review): the ``"of"`` / ``"at"`` tests are substring checks, so a
    candidate such as "Saturday ..." is also discarded -- confirm intended.
    """
    import calendar  # local import: only needed for the exception type below

    detected_sources = []
    try:
        # Matches are produced lazily, so a parsing error surfaces while
        # iterating; the loop itself therefore has to live inside the try.
        for match in datefinder.find_dates(doc, source=True):
            detected_sources.append(match[1])
    except calendar.IllegalMonthError:
        print("Caught it!")

    # Drop repeats while keeping first-seen order.
    unique_dates = list(dict.fromkeys(detected_sources))

    selected_Date = []
    for candidate in unique_dates:
        if "." in candidate or candidate.isdigit() or "of" in candidate or "at" in candidate:
            continue
        selected_Date.append(candidate)
    return selected_Date

In [15]:
# function to identify medical terms
def med_terms(doc):
    """Run the ``med7`` pipeline on ``doc`` and keep the medication-attribute entities.

    Returns a DataFrame with columns ``Entity`` and ``Tag`` restricted to the
    tags DOSAGE, DURATION, FORM, FREQUENCY, ROUTE and STRENGTH. The original
    row index is kept (it is not reset after filtering).
    """
    wanted_tags = ['DOSAGE', 'DURATION','FORM','FREQUENCY','ROUTE','STRENGTH']
    records = [(span.text, span.label_) for span in med7(doc).ents]
    all_entities = pd.DataFrame(records, columns =['Entity', 'Tag'])
    is_wanted = all_entities['Tag'].isin(wanted_tags)
    return all_entities[is_wanted]
 
# function to identify date using spacy     
def date_spacy(doc):
    """Run the ``nlp`` pipeline on ``doc`` and keep the entities tagged TIME or DATE.

    Returns a DataFrame with columns ``Entity`` and ``Tag``; the original row
    index is kept (it is not reset after filtering).
    """
    records = [(span.text, span.label_) for span in nlp(doc).ents]
    all_entities = pd.DataFrame(records, columns =['Entity', 'Tag'])
    is_temporal = all_entities['Tag'].isin(['TIME', 'DATE'])
    return all_entities[is_temporal]

# function to generate check list based on medical terms
def generate_check_string(Date_ext_med):
    """Join the ``Entity`` column of the given frame into one space-separated string."""
    entity_texts = Date_ext_med['Entity'].tolist()
    return ' '.join(entity_texts)

# function to get extracted date from spacy
def final_spacy_dates(Date_ext,check_string):
    """Return the ``Entity`` values whose text does not occur inside ``check_string``.

    The test is a plain substring check on ``str(entity)``.
    """
    kept = []
    for entity in Date_ext['Entity'].tolist():
        if str(entity) in check_string:
            # text also appears among the medical terms -> not treated as a date
            continue
        kept.append(entity)
    return kept

# function to return manual extracted dates
def final_manual_extracted_date(extract_date,check_string):
    """Return the items of ``extract_date`` whose text does not occur inside ``check_string``.

    The test is a plain substring check on ``str(entity)``.
    """
    kept = []
    for entity in extract_date:
        if str(entity) in check_string:
            # text also appears among the medical terms -> not treated as a date
            continue
        kept.append(entity)
    return kept

# function to perform similarity check between two dates
def similarity_check_dates(selected_date_1,selected_date,threshold=0.90):
    """Merge two lists of date strings, dropping near-duplicates from the first.

    Every item of ``selected_date_1`` is compared (via the ``nlp`` pipeline's
    ``similarity``) with every item of ``selected_date``. An item of
    ``selected_date_1`` is kept only if none of those similarities exceeds
    ``threshold``.

    Parameters
    ----------
    selected_date_1 : iterable
        Candidate dates to be added (converted with ``str()``).
    selected_date : list
        Reference dates; always kept, unchanged and first in the result.
    threshold : float, optional
        Similarity above which a candidate counts as a duplicate
        (default 0.90).

    Returns
    -------
    list
        ``selected_date`` followed by the surviving candidates as plain
        strings. Returning text rather than the parsed document objects keeps
        the list homogeneous for later string operations.
    """
    reference_docs = [nlp(str(entity)) for entity in selected_date]
    surviving = []
    for entity in selected_date_1:
        text = str(entity)
        candidate_doc = nlp(text)
        if any(candidate_doc.similarity(reference) > threshold for reference in reference_docs):
            continue
        # Keep the text itself, not the parsed document object.
        surviving.append(text)
    return list(selected_date) + surviving

In [16]:
# Find Date function which will extract all date from medical report
def find_date(doc):
    """Extract date expressions from a medical report.

    Steps: clean the text, collect medication-attribute entities (``med_terms``)
    and spaCy TIME/DATE entities (``date_spacy``), drop date candidates whose
    text occurs among the medication terms, add the ``datefinder`` candidates
    (``extract_Date_manual``) filtered the same way, and merge both lists with
    ``similarity_check_dates``. The elapsed time in seconds is printed.
    """
    cleaned_text = basic_clean(doc)
    started_at = time.time()

    medication_entities = med_terms(cleaned_text)
    spacy_entities = date_spacy(cleaned_text)
    medication_text = generate_check_string(medication_entities)

    dates_from_spacy = final_spacy_dates(spacy_entities, medication_text)
    dates_from_datefinder = final_manual_extracted_date(
        extract_Date_manual(cleaned_text), medication_text
    )
    merged_dates = similarity_check_dates(dates_from_datefinder, dates_from_spacy)

    finished_at = time.time()
    print(finished_at - started_at)
    return merged_dates

In [30]:
# extract the dates from the report; the bare name on the last line displays the result
R1 = find_date(doc)
R1

['2020-07-10 07/10/2020',
 'today',
 '71-year-old',
 '3 years',
 'the past month',
 '08/31/2020',
 '07/11/2020',
 '21:50:44',
 '2020-09-01',
 '2 weeks',
 '2 days',
 'today',
 '75mg daily 4',
 'April 2020',
 '30 years',
 '60s',
 '2020-09-04',
 '09/04/2020 11:53',
 '72 year old',
 'Two weeks ago',
 '08/30/2020',
 '08/31/2020',
 '5 PM',
 'Pregabalin lyrica 150 MG PO bedtime',
 '08/30/2020',
 '24 Hours',
 'age 74-Mother',
 'age 84',
 '2020/09/03',
 '2020/09/03',
 '18 mg',
 '2020/09/03',
 '2020/09/03',
 '2020/09/03',
 '2020/09/03',
 '15:06 - PT',
 'today',
 'today',
 'today',
 '09/04/2020 11:53 Record',
 '2020-09-08',
 '72 year old',
 '1st',
 'today',
 'April 2020',
 '30 years',
 '72 year old',
 'today',
 07/20/2021,
 on 08/31/2020,
 07/11/2020 21:50:44,
 8/30/35,
 1-4,
 on 9/01/35,
 4, 1/6,
 On 08/30/2020,
 On 08/31/2020,
 09/01/2020,
 2, 2/6,
 on -- -- 2020/09/04 09:13,
 2020/09/04 09:13,
 2020/09/03 15:06,
 2020/09/04 07:42,
 2020/09/03 15:09,
 on 09/01,
 09/04,
 on 09/04/2020 11:53,
 on

In [20]:
# function that calls the document tagger API of Orion Health
def _call_pipeline_n_grams_tagging(text, url="http://172.20.28.32:6010/pipeline/n_gram_tagging", timeout=60):
    """POST ``text`` to the n-gram tagging endpoint and return the decoded JSON reply.

    Parameters
    ----------
    text : str
        Document text; sent as the JSON body ``{"text": text}``.
    url : str, optional
        Endpoint to call. Defaults to the address originally hard-coded here.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout the request can block indefinitely.

    Returns
    -------
    object
        The response body parsed with ``json.loads``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        On connection problems or timeout.
    """
    body = json.dumps({"text": text})
    param = {"combine_sentences": "True", "threshold": "0.90", "grams_num": "3", "concepts_num": "3",
             "use_gpu_encoder": "True"}
    headers = {'content-type': 'application/json', 'accept': 'application/json'}
    response = requests.post(url, params=param, data=body, headers=headers,
                             allow_redirects=False, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    output = json.loads(response.text)
    return output
# send the cleaned report to the tagging service (performs an HTTP request)
tagging_out = _call_pipeline_n_grams_tagging(basic_clean(doc))

In [22]:
# function that finds all entities of a given concept type (e.g. disorder) in the tagger output
def find_disorder_entity(tagging_out,concept_type_string):
    """Collect the tagged entities whose first possible concept has the given type.

    Parameters
    ----------
    tagging_out : dict
        Tagger output; its ``'entities'`` list is scanned. Each entry is read
        through the keys ``'possible_concepts'``, ``'entity_content'``,
        ``'start_index'`` and ``'end_index'``.
    concept_type_string : str
        Concept type to keep, compared with
        ``possible_concepts[0]['concept_type']``.

    Returns
    -------
    tuple of (list, list, list)
        Entity texts, their start indexes and their end indexes, as three
        parallel lists.
    """
    entity = []
    entity_starting_index = []
    entity_ending_index = []
    for single_entity_info in tagging_out['entities']:
        concepts = single_entity_info.get('possible_concepts') or []
        if not concepts:
            # No candidate concept to inspect for this entity.
            continue
        if concepts[0]['concept_type'] == concept_type_string:
            entity.append(single_entity_info['entity_content'])
            entity_starting_index.append(single_entity_info['start_index'])
            entity_ending_index.append(single_entity_info['end_index'])
    return entity,entity_starting_index,entity_ending_index

In [17]:
# function that finds the closest date for each entity
def find_list_date_by_distance(entity_list, start_index_list, ending_index_list, doc,list_of_dates):
    """Pair every entity with the date that occurs closest to it in the text.

    Distance is measured between the midpoint of the entity span and the
    midpoint of each date occurrence found in the cleaned document.

    Parameters
    ----------
    entity_list : list
        Entity texts.
    start_index_list, ending_index_list : list
        Start and end offset of each entity, parallel to ``entity_list``.
    doc : str
        Report text; it is passed through ``basic_clean`` before dates are
        located in it.
    list_of_dates : list
        Date strings to search for in the cleaned text.

    Returns
    -------
    pandas.DataFrame
        Columns ``Entity``, ``Start_index``, ``End_index`` (each a one-element
        list), ``Date`` and ``Distance``; one row per entity. When no date
        occurrence is found at all, ``Date`` is ``None`` and ``Distance`` is
        NaN for that row instead of raising ``IndexError``.
    """
    columns = {'Entity': [], 'Start_index': [], 'End_index': [], 'Date': [], 'Distance': []}
    if len(entity_list) == 0:
        # Nothing to pair; same empty table the loop would have produced.
        return pd.DataFrame(columns)

    doc = basic_clean(doc)

    # The date positions do not depend on the entity, so compute them once
    # instead of once per loop iteration.
    start_index_date = find_starting_index(list_of_dates,doc)
    end_index_date = find_ending_index(list_of_dates,doc)
    mean_index_date = get_mean_distance(start_index_date,end_index_date)

    for position in range(len(entity_list)):
        start_index_entity = [start_index_list[position]]
        end_index_entity = [ending_index_list[position]]
        mean_index_entity = get_mean_distance(start_index_entity,end_index_entity)

        cal_dist = calculate_distance(mean_index_date,mean_index_entity,list_of_dates)
        nearest = find_date_based_distance(cal_dist).head(1)
        if nearest.empty:
            # No date occurrence was located in the text.
            closest_date, closest_distance = None, float('nan')
        else:
            closest_date = nearest['Date'].tolist()[0]
            closest_distance = nearest['Distance'].tolist()[0]

        columns['Entity'].append(entity_list[position])
        columns['Start_index'].append(start_index_entity)
        columns['End_index'].append(end_index_entity)
        columns['Date'].append(closest_date)
        columns['Distance'].append(closest_distance)

    return pd.DataFrame(columns)

In [21]:
# tag the cleaned report via the tagging service; the result feeds find_disorder_entity below
tagging_out = _call_pipeline_n_grams_tagging(basic_clean(doc))

In [23]:
# keep only the entities whose first possible concept is of type 'disorder', with their start/end indexes
disorder_entity,disorder_entity_starting_index,disorder_entity_ending_index = find_disorder_entity(tagging_out,'disorder')

In [29]:
# R1 holds the dates returned by find_date(doc) (no notebook-level `list_of_dates` exists).
# Every item is converted to text so that repeats collapse in unique_list and the
# text search inside find_list_date_by_distance receives plain strings.
result = find_list_date_by_distance(disorder_entity,disorder_entity_starting_index,disorder_entity_ending_index,doc,unique_list([str(date) for date in R1]))

100%|██████████| 61/61 [00:00<00:00, 196055.59it/s]
100%|██████████| 61/61 [00:00<00:00, 401022.80it/s]
100%|██████████| 61/61 [00:00<00:00, 146368.73it/s]
100%|██████████| 61/61 [00:00<00:00, 155533.46it/s]
100%|██████████| 61/61 [00:00<00:00, 154780.73it/s]
100%|██████████| 61/61 [00:00<00:00, 221901.60it/s]
100%|██████████| 61/61 [00:00<00:00, 250100.24it/s]
100%|██████████| 61/61 [00:00<00:00, 369729.11it/s]
100%|██████████| 61/61 [00:00<00:00, 156102.83it/s]
100%|██████████| 61/61 [00:00<00:00, 379040.81it/s]
100%|██████████| 61/61 [00:00<00:00, 139657.50it/s]
100%|██████████| 61/61 [00:00<00:00, 140270.04it/s]
100%|██████████| 61/61 [00:00<00:00, 219239.54it/s]
100%|██████████| 61/61 [00:00<00:00, 156389.09it/s]
100%|██████████| 61/61 [00:00<00:00, 200041.08it/s]
100%|██████████| 61/61 [00:00<00:00, 123540.58it/s]
100%|██████████| 61/61 [00:00<00:00, 146620.37it/s]
100%|██████████| 61/61 [00:00<00:00, 390614.57it/s]
100%|██████████| 61/61 [00:00<00:00, 183144.27it/s]
100%|███████

In [31]:
# closest date associated with each medical term (first 10 rows)
result.head(10)

Unnamed: 0,Entity,Start_index,End_index,Date,Distance
0,trigeminal neuralgia,[370],[390],71-year-old,56.5
1,Hypothyroidism,[793],[807],the past month,197.0
2,asthma,[826],[832],the past month,226.0
3,trigeminal neuralgia,[1344],[1364],on 08/31/2020,433.5
4,cuts,[1872],[1876],08/31/2020,85.0
5,TD,[2076],[2078],07/11/2020,8.0
6,TR,[2100],[2102],21:50:44,6.0
7,CHF,[2321],[2324],2 weeks,97.0
8,LE,[2608],[2610],8/30/35,61.5
9,Trigeminal neuralgia,[3116],[3136],today,133.5
