# Agenda - Sept 11th 
    - Lexie


- Data cleaning
    - EDA: exploratory data analysis
    - Build up-to-date medical dictionary based on our current answers by doctors
    

- Clustering - kmeans/LDA in topic modeling 
    - get similarity list of a group of answers
    

- Feature extraction 
    - drug names, treatment, disease... 
    - question level and user level
    
    
- Question analysis 
    - based on question setting, classify each question into different groups
    

- Next step: relation extraction
    - get to know more in text(answers)

In [5]:
import pandas as pd
import numpy as np
import pymysql
import re
pd.set_option('display.max_colwidth', -1)
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ##文本向量化
from sklearn.decomposition import LatentDirichletAllocation # LDA
import psycopg2 #postgresql
import pyLDAvis
import pyLDAvis.sklearn
import pymetamap
from pymetamap import MetaMap
import string
import psycopg2

In [6]:
import configparser
config = configparser.ConfigParser()
config.read('config.ini')
config.sections()

sos_db_section = config['SOS_DB']
host = sos_db_section['host']
user = sos_db_section['user']
password = sos_db_section['password']
db=sos_db_section['db']
port = int(sos_db_section.get('port', 3306))
db = pymysql.connect(host=host, user=user, password=password, db=db, port=port)

In [7]:
import_db_section = config['docdx_db_production']
host = import_db_section['host']
user = import_db_section['user']
password = import_db_section['password']
db_dx=import_db_section['db']
port = int(sos_db_section.get('port',5432))

conn_str = "host={} dbname={} user={} password={}".format(host, db_dx, user, password)
db_docdx = psycopg2.connect(conn_str)

## 1. Load Data 
- Docdx

In [206]:
#name: topic specialty
#spe: doctor's specialty 
query_docdx = """select CC.npi, BB.first_name, BB.last_name, BB.email, AA.created_at,topic.id, topic.title,topic.description, AA.text, DD.name, GG.spe from comment AA 
inner join public."user" BB 
on AA.created_by = BB.id 
inner join user_profile CC
on CC.user_id = AA.created_by 
inner join topic 
on topic.id = AA.topic_id
inner join 
(select specialty.name, topic_specialty.topic_id from topic_specialty
inner join specialty 
on topic_specialty.specialty_id = specialty.id) DD
on DD.topic_id = AA.topic_id
inner join (select CC.user_id user_id_,EE.name spe from public."user_profile" CC
inner join specialty EE
on EE.id = CC.specialty_id
inner join subspecialty FF
on FF.id = CC.subspecialty_id) GG
on GG.user_id_ = AA.created_by
where AA.reply_to_comment_id is null and AA.deleted_at is null 
order by AA.created_at asc"""
df_docdx = pd.read_sql(query_docdx, db_docdx)

In [207]:
len(df_docdx)

10313

### 1.1 Check valid data 
- test_topics
- topic with multiple specialties

In [208]:
print('Specialty names')
df_docdx.name.unique()

Specialty names


array(['Dermatology', 'Gastroenterology', 'OTO / ENT', 'Pulmonology',
       'Neurology', 'Oncology / Hematology', 'Rheumatology',
       'UNSPECIFIED / OTHER', 'Anesthesia / Pain', 'Internal Medicine',
       'Endocrinology / Diabetes', 'Family Practice', 'Cardiology',
       'Ophthalmology / Optometrists', 'Allergy / Immunology',
       'Nephrology', 'Radiology', 'Psychiatry', 'Urology',
       'Nurse Practitioners', 'Surgeons', 'Pediatrics'], dtype=object)

In [209]:
#prg_id in different types 
query_multiple_topic = """select topic_id, count(topic_id) from topic_specialty
group by topic_id
having count(topic_id) > 1"""
df_topic_multiple_count = pd.read_sql(query_multiple_topic, db_docdx)

In [11]:
df_topic_multiple_count

Unnamed: 0,topic_id,count
0,16,3
1,148,2
2,189,9
3,3,4
4,67,2
5,107,2
6,147,2
7,145,21


In [12]:
query_multiple_topic_check = """select topic.title,topic.description, specialty.name, topic_specialty.topic_id from topic_specialty
inner join specialty 
on topic_specialty.specialty_id = specialty.id
inner join topic
on topic.id = topic_specialty.topic_id
where topic_id in 
(select topic_id from topic_specialty
group by topic_id
having count(topic_id) > 1) 
order by topic_id """
df_topic_multiple = pd.read_sql(query_multiple_topic_check, db_docdx)

In [14]:
df_topic_multiple.head()

Unnamed: 0,title,description,name,topic_id
0,Test Question edit,This is just a test question so I can see if moderator can create a new question on production. Will delete it right away! edit,Anesthesia / Pain,3
1,Test Question edit,This is just a test question so I can see if moderator can create a new question on production. Will delete it right away! edit,Cardiology,3
2,Test Question edit,This is just a test question so I can see if moderator can create a new question on production. Will delete it right away! edit,Dermatology,3
3,Test Question edit,This is just a test question so I can see if moderator can create a new question on production. Will delete it right away! edit,Allergy / Immunology,3
4,Testteteteteqweqe,asdasddasdasdasdsasdadasdaedit,Anesthesia / Pain,16


In [210]:
#remove test topics
test_topic = [3,16,67,23,24,25,26,27,28,29]
df_docdx = df_docdx[~df_docdx['id'].isin(test_topic)]

In [211]:
#test topics removal 
len(df_docdx)

10306

- Pick one specialty in the multi-topic cases

In [212]:
#mapped to dataset 
def one_name(id):
    if id == 189:
        return 'Oncology / Hematology'
    elif id == 145:
        return 'Internal Medicine'
    elif id == 107:
        return 'Family Practice'
    elif id == 180:
        return 'Pulmonology'
    elif id == 147:
        return 'Gastroenterology'
    elif id == 148:
        return 'Oncology / Hematology' 
    else:
        return str(df_docdx[df_docdx['id'] == id].name.unique())[2:-2]

In [213]:
df_docdx['name'] = df_docdx['id'].apply(lambda x: one_name(x))

In [214]:
len(df_docdx)

10306

In [215]:
#remove dupe in the df_docdx 
#because we we get it from the database, every answer will have multiple specialties mapped 
df_docdx = df_docdx.drop_duplicates(subset = ['id', 'text','npi'])

In [216]:
#remove pattern
pattern_new = "\n"
df_docdx['text'] = df_docdx.text.apply(lambda x: re.sub(pattern_new, " ", x))

#remove punctuation 

def remove_punctuation(sentence: str) -> str:
    return sentence.translate(str.maketrans('', '', string.punctuation))

df_docdx['text_np'] = df_docdx['text'].apply(lambda x: remove_punctuation(x))

#change to lower case 
df_docdx['text_np'] = df_docdx['text_np'].str.lower()
#df_docdx['text'] = df_docdx['text'].str.lower()

In [221]:
#only the info we want
#not in a user level 
df_docdx_clean = df_docdx[['id','title','description','text_np','name','spe']]

In [222]:
df_docdx_clean.head(5)

Unnamed: 0,id,title,description,text_np,name,spe
0,44,Patient Case: would you confirm Blastic Plasmacytoid Dendritic Cell Neoplasm?,"A 65-year-old male patient presents with a 1-month history of multiple asymptomatic bruise-like lesions on his trunk and face. Additionally, he developed ecchymotic lesions on the trunk easily from scratching. He also suffered from several episodes of bleeding in the right nasal cavity for 1-2 months. He denied having any underlying systemic disease or malignancy. On physical examination, the following were noted: multiple 2-3 cm in diameter bluish to violaceous infiltrated patches or plaques scattered on his trunk and a few irregularly shaped violaceous plaques on his cheeks. Based on these symptoms, how would you proceed to either confirm or rule out a diagnosis of Blastic Plasmacytoid Dendritic Cell Neoplasm (BPDCN)? If patient is diagnosed with BPDCN, what would be the typical treatment approach for him?",refer to oncologist,Dermatology,Dermatology
1,44,Patient Case: would you confirm Blastic Plasmacytoid Dendritic Cell Neoplasm?,"A 65-year-old male patient presents with a 1-month history of multiple asymptomatic bruise-like lesions on his trunk and face. Additionally, he developed ecchymotic lesions on the trunk easily from scratching. He also suffered from several episodes of bleeding in the right nasal cavity for 1-2 months. He denied having any underlying systemic disease or malignancy. On physical examination, the following were noted: multiple 2-3 cm in diameter bluish to violaceous infiltrated patches or plaques scattered on his trunk and a few irregularly shaped violaceous plaques on his cheeks. Based on these symptoms, how would you proceed to either confirm or rule out a diagnosis of Blastic Plasmacytoid Dendritic Cell Neoplasm (BPDCN)? If patient is diagnosed with BPDCN, what would be the typical treatment approach for him?",patient needs a skin biopsy not sure how to treat though,Dermatology,Dermatology
2,44,Patient Case: would you confirm Blastic Plasmacytoid Dendritic Cell Neoplasm?,"A 65-year-old male patient presents with a 1-month history of multiple asymptomatic bruise-like lesions on his trunk and face. Additionally, he developed ecchymotic lesions on the trunk easily from scratching. He also suffered from several episodes of bleeding in the right nasal cavity for 1-2 months. He denied having any underlying systemic disease or malignancy. On physical examination, the following were noted: multiple 2-3 cm in diameter bluish to violaceous infiltrated patches or plaques scattered on his trunk and a few irregularly shaped violaceous plaques on his cheeks. Based on these symptoms, how would you proceed to either confirm or rule out a diagnosis of Blastic Plasmacytoid Dendritic Cell Neoplasm (BPDCN)? If patient is diagnosed with BPDCN, what would be the typical treatment approach for him?",i would biopsy a lesion chemotherapy can be used for treatment prognosis is poor,Dermatology,Dermatology
3,44,Patient Case: would you confirm Blastic Plasmacytoid Dendritic Cell Neoplasm?,"A 65-year-old male patient presents with a 1-month history of multiple asymptomatic bruise-like lesions on his trunk and face. Additionally, he developed ecchymotic lesions on the trunk easily from scratching. He also suffered from several episodes of bleeding in the right nasal cavity for 1-2 months. He denied having any underlying systemic disease or malignancy. On physical examination, the following were noted: multiple 2-3 cm in diameter bluish to violaceous infiltrated patches or plaques scattered on his trunk and a few irregularly shaped violaceous plaques on his cheeks. Based on these symptoms, how would you proceed to either confirm or rule out a diagnosis of Blastic Plasmacytoid Dendritic Cell Neoplasm (BPDCN)? If patient is diagnosed with BPDCN, what would be the typical treatment approach for him?",a skin biopsy labs and oncology consult asap,Dermatology,Dermatology
4,44,Patient Case: would you confirm Blastic Plasmacytoid Dendritic Cell Neoplasm?,"A 65-year-old male patient presents with a 1-month history of multiple asymptomatic bruise-like lesions on his trunk and face. Additionally, he developed ecchymotic lesions on the trunk easily from scratching. He also suffered from several episodes of bleeding in the right nasal cavity for 1-2 months. He denied having any underlying systemic disease or malignancy. On physical examination, the following were noted: multiple 2-3 cm in diameter bluish to violaceous infiltrated patches or plaques scattered on his trunk and a few irregularly shaped violaceous plaques on his cheeks. Based on these symptoms, how would you proceed to either confirm or rule out a diagnosis of Blastic Plasmacytoid Dendritic Cell Neoplasm (BPDCN)? If patient is diagnosed with BPDCN, what would be the typical treatment approach for him?",skin biopsy complete bloodwork refer to onc,Dermatology,Dermatology


## 2. K-means - Docdx

https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68

- Clustering 
    - Clustering is a method of unsupervised learning and is a common technique for statistical data analysis used in many fields.
    - In Data Science, we can use clustering analysis to gain some valuable insights from our data by seeing what groups the data points fall into when we apply a clustering algorithm. 
    
- Word2vec + kmeans


In [108]:
from sklearn.cluster import KMeans

In [78]:
#create a dictionary on base on question/answers with their mapped specialty 
#{id: [answer_1, answer_2....] }
answers_dict = dict()
for index, row in df_docdx_clean.iterrows():
    qid = row['id']
    answer = row['text_np']
    #print(row['text_np'])
    if qid in answers_dict :
        answers_dict[qid].append(answer)
    else : 
        answers_dict[qid] = [answer]

In [76]:
#create a dictionary: 
#{specialty: question_id}
topic_spec_dict = {}
for index, row in df_docdx_clean.drop_duplicates(subset = ['id', 'title','name']).iterrows():
    spec = row['name']
    qid = row['id']
    if spec in topic_spec_dict:
        topic_spec_dict[spec].append(qid)
    else: 
        topic_spec_dict[spec] = [qid]

In [168]:
def kmeans(specialty):
    
    ids_list = topic_spec_dict[specialty]
    
    words = dict() # Map each word to its frequency in the corpus
    tokens = dict() # Map each word to its SpaCy token 
    for id in ids_list: #every id 
        for a in answers_dict[id] : #every answer under that question 
            doc = nlp(a) 
            # Count frequency of nouns in the corpus
            for token in doc:
                if token.is_alpha and token.pos_ == 'NOUN' :
                    if token.text not in words :
                        words[token.text] = 1
                        tokens[token.text] = token
                    else :
                        words[token.text] += 1

# Limit to nouns that occur at least 20 times
    freqwords = [w for w in words if words[w] > 10]   

#remove 1 character tokens/words in the dictionary 
    one_char_list_1 = []
    for i in words:
        if len(i) == 1:
            one_char_list_1.append(i)
    #print(i)
    for i in one_char_list_1:
        if i in words: del words[i]

    one_char_list_2 = []
    for i in tokens:
        if len(i) == 1:
            one_char_list_2.append(i)
    for i in one_char_list_2:
        if i in tokens: del tokens[i]
        
    for i in freqwords:
        if len(i) == 1: freqwords.remove(i)
            
    return freqwords, words, tokens


def showClust(k,clust) :
    for i in range(k) :
        vals = []
        for j in range(len(clust)) :
            if clust[j] == i :
                vals.append(freqwords[j])
        print(i,vals)   
        print(' ---------- ')

In [114]:
#Specialty name
df_docdx.name.unique()

array(['Dermatology', 'Gastroenterology', 'Pulmonology', 'Neurology',
       'Oncology / Hematology', 'Rheumatology', 'UNSPECIFIED / OTHER',
       'Anesthesia / Pain', 'Internal Medicine',
       'Endocrinology / Diabetes', 'Family Practice', 'Cardiology',
       'Ophthalmology / Optometrists'], dtype=object)

In [171]:
def pick_spe_kmeans(spec): 
    print(spec)
    print(' ')
    freqwords, words, tokens = kmeans('Oncology / Hematology')

    # Compare the frequent nouns to all the other nouns
    simwords = dict()
    for word in freqwords :
        # Sort the nouns by how similar they are
        sortedwords = sorted(words, key=lambda w: tokens[word].similarity(tokens[w]), reverse=True)
        # Take the top five most similar words
        simwords[word] = sortedwords[:5]
        

    # Show some examples
    for w in freqwords[:20] :
        print(w,simwords[w])
    
    # Get the vectors for the most frequent words
    vectors = []
    for word in freqwords :
        vectors.append(tokens[word].vector)
    print(' ----------------------------------------------------------- ')
    # Cluster the words into a fixed number of clusters
    K = 8 
    km = KMeans(n_clusters=K, init='k-means++', n_init=10,max_iter=1000)
    km.fit(vectors)
    x = km.fit_predict(vectors)

    # Show the words that belong to each cluster
    #showClust(K,x)
    return showClust(K,x)

### Final function 

In [172]:
pick_spe_kmeans('Oncology / Hematology')

Oncology / Hematology
 
xrt ['xrt', 'disorder', 'rituxn', 'chemoradiotherapy', 'line']
hospice ['hospice', 'abastrozole', 'refinement', 'safety', 'ice']
cns ['cns', 'jaundice', 'research', 'usa', 'soif']
histology ['histology', 'sedation', 'region', 'usage', 'histopathology']
tumor ['tumor', 'tumour', 'term', 'mire', 'metabolic']
splenectomy ['splenectomy', 'lumpectomymastectomy', 'spleenectomy', 'immunotherapy', 'hospicepalliative']
scan ['scan', 'ercp', 'rev', 'serologic', 'adenopathy']
revlimid ['revlimid', 'plasma', 'cranial', 'pulmonologist', 'flor']
profile ['profile', 'schedule', 'range', 'agnoist', 'context']
colonoscopy ['colonoscopy', 'bronchoscopy', 'context', 'prognosis', 'routinehow']
lesions ['lesions', 'levels', 'adjustments', 'declines', 'regulations']
metastasis ['metastasis', 'death', 'modality', 'osteoporosis', 'vector']
bone ['bone', 'alpha', 'adjuvent', 'tumour', 'ife']
use ['use', 'deformity', 'form', 'presence', 'sign']
cbc ['cbc', 'polyp', 'claims', 'metastases'

In [174]:
pick_spe_kmeans('Pulmonology')

Pulmonology
 
xrt ['xrt', 'disorder', 'rituxn', 'chemoradiotherapy', 'line']
hospice ['hospice', 'abastrozole', 'refinement', 'safety', 'ice']
cns ['cns', 'jaundice', 'research', 'usa', 'soif']
histology ['histology', 'sedation', 'region', 'usage', 'histopathology']
tumor ['tumor', 'tumour', 'term', 'mire', 'metabolic']
splenectomy ['splenectomy', 'lumpectomymastectomy', 'spleenectomy', 'immunotherapy', 'hospicepalliative']
scan ['scan', 'ercp', 'rev', 'serologic', 'adenopathy']
revlimid ['revlimid', 'plasma', 'cranial', 'pulmonologist', 'flor']
profile ['profile', 'schedule', 'range', 'agnoist', 'context']
colonoscopy ['colonoscopy', 'bronchoscopy', 'context', 'prognosis', 'routinehow']
lesions ['lesions', 'levels', 'adjustments', 'declines', 'regulations']
metastasis ['metastasis', 'death', 'modality', 'osteoporosis', 'vector']
bone ['bone', 'alpha', 'adjuvent', 'tumour', 'ife']
use ['use', 'deformity', 'form', 'presence', 'sign']
cbc ['cbc', 'polyp', 'claims', 'metastases', 'cts']
r

## 3. Topic Modeling - LDA
https://zhuanlan.zhihu.com/p/31470216


## Agenda

Ideal Answers: Producing relevant, precise, non-repetitive and readable
summaries for biomedical questions

1. Spacy with medical text 
2. Topic Modeling in LDA（algo method)


__Background:__

A topic model is a kind of a probabilistic generative model that has been used widely in the field of computer science with a specific focus on text mining and information retrieval in recent years. Since this model was first proposed, it has received a lot of attention and gained widespread interest among researchers in many research fields. 


__Topic modeling__ is a useful method (in contrast to the traditional means of data reduction in bioinformatics) and enhances researchers’ ability to interpret biological information.

The majority of medical documents and electronic health records (EHRs) are in text format that poses a challenge for data processing and finding relevant documents. Looking for ways to automatically retrieve the enormous amount of health and medical knowledge has always been an intriguing topic. Powerful methods have been developed in recent years to make the text processing automatic. One of the popular approaches to retrieve information based on discovering the themes in health & medical corpora is topic modeling; however, this approach still needs new perspectives.
- do research with remedy parterner

https://arxiv.org/pdf/1705.00995.pdf






Topic modeling is family of techniques that can be used to describe and summarize the documents in a corpus according to a set of latent "topics". For this demo, we'll be using Latent Dirichlet Allocation or LDA, a popular approach to topic modeling.
In many conventional NLP applications, documents are represented a mixture of the individual tokens (words and phrases) they contain. In other words, a document is represented as a vector of token counts. 

There are two layers in this model — documents and tokens — and the size or dimensionality of the document vectors is the number of tokens in the corpus vocabulary. This approach has a number of disadvantages:
Document vectors tend to be large (one dimension for each token $\Rightarrow$ lots of dimensions)

They also tend to be very sparse. Any given document only contains a small fraction of all tokens in the vocabulary, so most values in the document's token vector are 0.

The dimensions are fully indepedent from each other — there's no sense of connection between related tokens, such as knife and fork.
LDA injects a third layer into this conceptual model. Documents are represented as a mixture of a pre-defined number of topics, and the topics are represented as a mixture of the individual tokens in the vocabulary. The number of topics is a model hyperparameter selected by the practitioner. LDA makes a prior assumption that the (document, topic) and (topic, token) mixtures follow Dirichlet probability distributions. This assumption encourages documents to consist mostly of a handful of topics, and topics to consist mostly of a modest set of the tokens.

In [195]:
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features = n_features,
                                stop_words = 'english',
                                max_df = 0.5,
                                min_df = 10)


tf = tf_vectorizer.fit_transform(df_docdx_clean.text_np)                        

In [196]:
#modeling
n_topics = 20
lda = LatentDirichletAllocation(n_components = n_topics, max_iter = 50,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

In [197]:
#fit the model 
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=20, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [198]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print('')
    print()

In [199]:
n_top_words = 40
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
pain therapy physical consider pt management nsaids injection look joint help center injections topical muscle patch gabapentin patient cause benefit tramadol low andor gene program counseling specialist fracture tylenol massage ortho point acupuncture tens exercise imaging need helpful major chronic

Topic #1:
check cbc consider referral labs need echo thyroid including exam work ultrasound cmp symptoms esr evaluation blood rule start history renal panel crp workup ro negative ekg tsh eval rate lab disease physical pelvic ana function evaluate dvt studies stress

Topic #2:
trial months response monitor type suggest tx regimen clinical consider replacement poor recurrent inhibitor cymbalta suspicious recommend 46 based hedgehog trigger cycles localized complete beneficial strongly suspect markers active vismodegib lesion concurrent year underlying current infection scale controlled total frequent

Topic #3:
risk screening patients care age high years patient screen discuss 50

Plot and graph

In [200]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### By Oncologist 

- 'Oncology / Hematology'

In [242]:
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features = n_features,
                                stop_words = 'english',
                                max_df = 0.5,
                                min_df = 10)


tf = tf_vectorizer.fit_transform(df_docdx_clean[df_docdx_clean['spe'] == 'Oncology / Hematology' ].text_np)          

#modeling
n_topics = 5
lda = LatentDirichletAllocation(n_components = n_topics, max_iter = 50,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

#fit the model 
lda.fit(tf)

n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
biopsy lung bone marrow colonoscopy ct pet colon bx scan need cancer petct cell nodules nodule transplant ultrasound recurrence negative

Topic #1:
consider therapy use treatment response patient inhibitor azedra alectinib radiation option options recommend brain splenectomy rituxan maintenance line parp control

Topic #2:
surgery staging ercp neoadjuvant biopsy chemo eus followed mri stent petct consult surgical disease resection chemotherapy evaluation gi ct diagnosis

Topic #3:
needs need center exam refer history check referral patient pain cmp cbc thalassemia pt physical dose major like iron beta

Topic #4:
chemotherapy adjuvant status chemo therapy need followed cycles radiation consider patient disease based recommend neoadjuvant cancer stage surgery xrt risk




In [243]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### By onocology questions 

In [244]:
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features = n_features,
                                stop_words = 'english',
                                max_df = 0.5,
                                min_df = 10)


tf = tf_vectorizer.fit_transform(df_docdx_clean[df_docdx_clean['name'] == 'Oncology / Hematology' ].text_np)          

#modeling
n_topics = 5
lda = LatentDirichletAllocation(n_components = n_topics, max_iter = 50,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

#fit the model 
lda.fit(tf)

n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
chemo status disease adjuvant need followed surgery chemotherapy therapy radiation consider neoadjuvant stage metastatic cycles staging mri patient months pet

Topic #1:
chemotherapy followed chemo recommend alectinib brain cancer based neoadjuvant mets cell gastric adjuvant stem resection start myeloma radiation surgery regimen

Topic #2:
consider therapy patient use inhibitor treatment response azedra option radiation options parp maintenance recommend clinical splenectomy rituxan patients line vismodegib

Topic #3:
biopsy lung marrow bone ct colonoscopy pet colon bx scan cancer need petct mammogram ultrasound nodules repeat diagnostic needs cbc

Topic #4:
ercp consult stent need refer center biopsy eus referral diagnosis oncology gi evaluation staging surgery resectable pancreatic tissue surgeon possible




In [245]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## 4. Feature Selection
- can calculate the highest frequency words used (quantitative) and place into categories
    - phsu: drug name
    - dsyn: Disease or Syndrome
    - topp: Therapeutic or Preventive Procedure - treatment
    - fndg: finding 
    - diap: Diagnostic Procedure
    
    
- in two ways:
    1. overall level: by each question id
    2. individual level: by each answer/doctor

#### Load pre-built dictiionary 

In [218]:
import pickle
#load pre-built dictionary 
with open("dict_all_sep_10.txt", "rb") as myFile:
    dict_all = pickle.load(myFile)

### 1. By each question

In [180]:
#built in function - by question_id
def freq_umls(question_id, sem_type, Count):
    
    df = df_docdx_clean[df_docdx_clean['id'] == question_id]
    len_id = len(df)
    freq_dict = {}
    #count total frequency of answers - mentioned times 

        
    for index, row in df.iterrows():
        #print(index)
        for i in set(row.text_np.split()):
            if i in dict_all.keys() and dict_all[i][0] == sem_type:
                if dict_all[i][1] not in freq_dict:
                    freq_dict[dict_all[i][1]] = 1
                else:
                    freq_dict[dict_all[i][1]] += 1
                        
    if Count == True:  
        return sorted(freq_dict.items(), key=lambda x: x[1], reverse = True)
    
    else: 
        #freq_dict_per = {k: "{:.2%}".format(v / len_id) for k, v in freq_dict.items()}
        freq_dict_per = {k: (v / len_id) for k, v in freq_dict.items()}
        return sorted(freq_dict_per.items(), key=lambda x: x[1], reverse = True) 

#### Example

In [183]:
#shoe in count
freq_umls(88, 'phsu', True)

[('Insulin', 199),
 ('Lantus', 78),
 ('Sugars', 44),
 ('NovoLog', 44),
 ('Glucose', 28),
 ('Levemir', 28),
 ('Metformin', 16),
 ('Insulin Glargine', 8),
 ('Pharmaceutical Preparations', 7),
 ('Januvia', 5),
 ('Victoza', 4),
 ('Humalog', 3),
 ('Hypoglycemic Agents', 3),
 ('Agonist', 3),
 ('Sulfonylurea Compounds', 2),
 ('Insulins', 2),
 ('Linagliptin', 2),
 ('other medicated shampoos in ATC', 2),
 ('Inulin', 2),
 ('Histidine', 1),
 ('Incretins', 1),
 ('pioglitazone', 1),
 ('Water', 1),
 ('insulin degludec', 1),
 ('Jardiance', 1),
 ('Trulicity', 1),
 ('Hydroxymethylglutaryl-CoA Reductase Inhibitors', 1),
 ('Acarbose', 1),
 ('Fatty acid glycerol esters', 1),
 ('Bydureon', 1),
 ('Farxiga', 1)]

In [184]:
#show in percentage
freq_umls(88, 'phsu', False)

[('Insulin', 0.796),
 ('Lantus', 0.312),
 ('Sugars', 0.176),
 ('NovoLog', 0.176),
 ('Glucose', 0.112),
 ('Levemir', 0.112),
 ('Metformin', 0.064),
 ('Insulin Glargine', 0.032),
 ('Pharmaceutical Preparations', 0.028),
 ('Januvia', 0.02),
 ('Victoza', 0.016),
 ('Humalog', 0.012),
 ('Hypoglycemic Agents', 0.012),
 ('Agonist', 0.012),
 ('Sulfonylurea Compounds', 0.008),
 ('other medicated shampoos in ATC', 0.008),
 ('Linagliptin', 0.008),
 ('Inulin', 0.008),
 ('Insulins', 0.008),
 ('Histidine', 0.004),
 ('Farxiga', 0.004),
 ('pioglitazone', 0.004),
 ('Water', 0.004),
 ('Jardiance', 0.004),
 ('Trulicity', 0.004),
 ('Hydroxymethylglutaryl-CoA Reductase Inhibitors', 0.004),
 ('Incretins', 0.004),
 ('Acarbose', 0.004),
 ('Fatty acid glycerol esters', 0.004),
 ('Bydureon', 0.004),
 ('insulin degludec', 0.004)]

### 2. by each user_id 
- key word extraction

In [185]:
def feature_extraction(string, semtype):
    key_words = []
    for i in string.split():
        if i in dict_all.keys() and dict_all[i][0] == semtype:
            key_words.append(dict_all[i][1])
    return set(key_words)

In [186]:
def question_extraction(question_id, semtype):
    df = df_docdx[df_docdx['id'] == question_id]
    df = df.reset_index(drop=True)
    df['extraction'] = df['text_np'].apply(lambda x: feature_extraction(x, semtype))
    return df 

#### example

In [205]:
question_extraction(88, 'phsu').head(10)

Unnamed: 0,npi,first_name,last_name,email,created_at,id,title,description,text,name,text_np,extraction
0,1760401000.0,Robert,Vyge,rvyge@msn.com,2018-07-23 10:40:54.092000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","I would check her HbA1C I would add a Basal insulin at nighttime such as Levemir or Toujeo starting at 10u sc It would be important to know her weight and if she was obese , I would consider adding a GLP-1 in the am instead of the nighttime basal insulin",Internal Medicine,i would check her hba1c i would add a basal insulin at nighttime such as levemir or toujeo starting at 10u sc it would be important to know her weight and if she was obese i would consider adding a glp1 in the am instead of the nighttime basal insulin,"{Levemir, Insulin}"
1,1780640000.0,Neal,Lakritz,nlakritz@massmed.org,2018-07-23 10:41:40.146000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","there are many options here. The easiest from the patient standpoint would be to switch to a mixed insulin. That way she would get a longer duration of coverage. For example, NovoLog 70/30 , with the dose titrated upwards. this would require no additional injections. Other options would be to increase her current novolog, or add a GLP1",Internal Medicine,there are many options here the easiest from the patient standpoint would be to switch to a mixed insulin that way she would get a longer duration of coverage for example novolog 7030 with the dose titrated upwards this would require no additional injections other options would be to increase her current novolog or add a glp1,"{NovoLog, Insulin}"
2,1386776000.0,Michael,Shanik,mshanikmd@hotmail.com,2018-07-23 10:46:55.837000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","Several options, likely needs basal insulin as well.",Internal Medicine,several options likely needs basal insulin as well,{Insulin}
3,1376568000.0,Randy,Shemer,shemer.r@comcast.net,2018-07-23 10:47:10.335000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?",Addition of a GLP-1 and possible an SGLT-2.,Internal Medicine,addition of a glp1 and possible an sglt2,{}
4,1013947000.0,Gary,Vigilante,gary.vigilante@uphs.upenn.edu,2018-07-23 10:51:09.195000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","1st of all, I would like to see her hemoglobin A1c. Hemoglobin A1c under 8 would be fine. Assuming that this level is above 8, I would most likely start Lantus insulin 15 units at bedtime. I would consider GLP 1 agent only if she was significantly obese as this is quite expensive.",Internal Medicine,1st of all i would like to see her hemoglobin a1c hemoglobin a1c under 8 would be fine assuming that this level is above 8 i would most likely start lantus insulin 15 units at bedtime i would consider glp 1 agent only if she was significantly obese as this is quite expensive,"{Lantus, Insulin}"
5,1912073000.0,Steven,Glasser,nglas2112@aol.com,2018-07-23 11:01:38.961000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?",Would start long acting insulin at night.,Internal Medicine,would start long acting insulin at night,{Insulin}
6,1053384000.0,david,schindler,schin98@juno.com,2018-07-23 11:14:52.211000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","I would check her HGBA1c, and start a long acting insulin at night. Recommend weight management if she is clinically obese.",Internal Medicine,i would check her hgba1c and start a long acting insulin at night recommend weight management if she is clinically obese,{Insulin}
7,1558353000.0,Marshall,Eidenberg,tridocm@yahoo.com,2018-07-23 11:21:02.053000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?","Several approaches are available. Could simply increase her novolog prior to meals to 20 units. Another option is to change to a 70/30 formulation and titration from there. Alternatively could add a long acting HS insulin like lantus, 10-15 units. Would check an A1C to determine how her control has been outside of these spot checks. Additionally, some level of diet modification, reduction in grains, potatoes, and sugars, with the right patient may be able to improve glycemic control.",Internal Medicine,several approaches are available could simply increase her novolog prior to meals to 20 units another option is to change to a 7030 formulation and titration from there alternatively could add a long acting hs insulin like lantus 1015 units would check an a1c to determine how her control has been outside of these spot checks additionally some level of diet modification reduction in grains potatoes and sugars with the right patient may be able to improve glycemic control,"{Lantus, NovoLog, Sugars, Insulin}"
8,1730367000.0,William,Smith,wsmithmd@gmail.com,2018-07-23 11:28:18.306000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?",I would definitely add basal insulin. Basaglar or Toujeo 15 units. HbA1c would be helpful for baseline. GLP-1 is also an option as is basal insulin/GLP-1 combination. Definitely needs diabetic education.,Internal Medicine,i would definitely add basal insulin basaglar or toujeo 15 units hba1c would be helpful for baseline glp1 is also an option as is basal insulinglp1 combination definitely needs diabetic education,{Insulin}
9,1386617000.0,James,Gallagher,dancinfrog@aol.com,2018-07-23 11:29:07.875000+00:00,88,78-year-old female newly taking insulin reporting high levels of blood glucose in AM,"78-year-old female recently started on insulin, in addition to her 1000 mg BID Metformin. She was started on 15 units Novolog insulin prior to meals due to her review of blood sugars at 300s prior to mealtimes. Patient notes now her am blood glucose is high 100s, usually 180 and her before meals blood sugars are low 200s. She has also started walking about 20 minutes per day, after lunch. What do you recommend?",You are making progress. Would titrate the dose of insulin and get detailed information about when the patient takes her insulin. Always emphasize life style choices as well. Good luck.,Internal Medicine,you are making progress would titrate the dose of insulin and get detailed information about when the patient takes her insulin always emphasize life style choices as well good luck,{Insulin}


### 3. by each specialty of doctor

In [223]:
df_docdx_clean.columns

Index(['id', 'title', 'description', 'text_np', 'name', 'spe'], dtype='object')

In [224]:
#built in function - by question_id
def freq_umls(spe, sem_type, Count):
    
    df = df_docdx_clean[df_docdx_clean['spe'] == spe]
    len_id = len(df)
    freq_dict = {} 
    #count total frequency of answers - mentioned times 

        
    for index, row in df.iterrows():
        #print(index)
        for i in set(row.text_np.split()):
            if i in dict_all.keys() and dict_all[i][0] == sem_type:
                if dict_all[i][1] not in freq_dict:
                    freq_dict[dict_all[i][1]] = 1
                else:
                    freq_dict[dict_all[i][1]] += 1
                        
    if Count == True:  
        return sorted(freq_dict.items(), key=lambda x: x[1], reverse = True)
    
    else: 
        #freq_dict_per = {k: "{:.2%}".format(v / len_id) for k, v in freq_dict.items()}
        freq_dict_per = {k: (v / len_id) for k, v in freq_dict.items()}
        return sorted(freq_dict_per.items(), key=lambda x: x[1], reverse = True) 

In [227]:
freq_umls('Oncology / Hematology', 'phsu', True)

[('alectinib', 57),
 ('Rituxan', 46),
 ('Histidine', 38),
 ('Immunoglobulins, Intravenous', 35),
 ('rituximab', 33),
 ('Pharmaceutical Preparations', 32),
 ('vismodegib', 27),
 ('Tamoxifen', 25),
 ('Erivedge', 23),
 ('Promacta', 23),
 ('Carboplatin', 21),
 ('Cisplatin', 20),
 ('Iron', 20),
 ('Charcoal', 20),
 ('sonidegib', 19),
 ('fostamatinib', 19),
 ('Xgeva', 18),
 ('Taxol', 18),
 ('olaparib', 17),
 ('Fluorouracil', 17),
 ('ceritinib', 12),
 ('Alimta', 11),
 ('Oxygen', 11),
 ('Prednisone', 11),
 ('romiplostim', 10),
 ('Insulin', 10),
 ('Zometa', 9),
 ('Revlimid', 9),
 ('Thalidomide', 9),
 ('Lynparza', 9),
 ('other medicated shampoos in ATC', 9),
 ('eltrombopag', 8),
 ('agonists', 8),
 ('Herceptin', 8),
 ('Velcade', 8),
 ('Immunoglobulins', 8),
 ('Avid', 8),
 ('Anti-Inflammatory Agents, Non-Steroidal', 7),
 ('Diuretics', 7),
 ('Dexamethasone', 6),
 ('Prolia', 6),
 ('Etoposide', 6),
 ('Abraxane', 6),
 ('Agonist', 5),
 ('Alcohols', 5),
 ('Lithium', 5),
 ('Catecholamines', 5),
 ('Avastin

In [228]:
freq_umls('Oncology / Hematology', 'phsu', False)

[('alectinib', 0.031879194630872486),
 ('Rituxan', 0.025727069351230425),
 ('Histidine', 0.021252796420581657),
 ('Immunoglobulins, Intravenous', 0.019574944071588368),
 ('rituximab', 0.018456375838926176),
 ('Pharmaceutical Preparations', 0.017897091722595078),
 ('vismodegib', 0.015100671140939598),
 ('Tamoxifen', 0.013982102908277404),
 ('Erivedge', 0.012863534675615212),
 ('Promacta', 0.012863534675615212),
 ('Carboplatin', 0.01174496644295302),
 ('Cisplatin', 0.011185682326621925),
 ('Iron', 0.011185682326621925),
 ('Charcoal', 0.011185682326621925),
 ('sonidegib', 0.010626398210290829),
 ('fostamatinib', 0.010626398210290829),
 ('Xgeva', 0.010067114093959731),
 ('Taxol', 0.010067114093959731),
 ('olaparib', 0.009507829977628635),
 ('Fluorouracil', 0.009507829977628635),
 ('ceritinib', 0.006711409395973154),
 ('Alimta', 0.006152125279642058),
 ('Prednisone', 0.006152125279642058),
 ('Oxygen', 0.006152125279642058),
 ('romiplostim', 0.005592841163310962),
 ('Insulin', 0.005592841163

## 5. Question analysis 

Provide a detailed analysis of __Syntactic Structure__

The structure was used to determine whether each question is __“polar”__ (a yes/no question) or __“non-polar”__ (a question using “what”, “how” or “when”)

In addition, a summary of each question was obtained by __analyzing multiple clauses__ to determine, verbs, subjects and objects. 

These summaries are shown as __lists of words__ in square brackets. 



A set of __rules__ was written to assign each summary to one out of 5 categories: 
- drug (seeking names or classes of medications)
- treatment (more general issues about therapy)
- diagnosis (tests and workup of patients)
- communication (interaction with patients and providers)
- time (seeking durations, frequencies or other temporal information)




Each separate question below is classified as __polar (seeking positive or negative response)__ or __non-polar (seeking a specific type of answer)__

The questions are assigned to one of the 5 categories to assist in __identifying the type of
information to be expected in the response__ 



In [204]:
# Return questions that match given target
def getQuestionType(target,answertype) :
    q = dict()
    for qid in answertype :
        atype = answertype[qid]
        question = questions[qid]
        if target in atype :
            q[qid] = question
    return q

# Show tokens from parsing some text
def showTokens(qid) :
    question = questions[qid]
    doc = nlp(question)
    for token in doc:
        print(token.i,token.tag_,token.text,token.dep_,token.head.i)

# Get root notes from parsing some text
def getRoots(doc) :
    roots = []
    for token in doc:
        if token.dep_ == 'ROOT' :
            roots.append(token)
    return roots

# Determine if a root is a question
def isquest(root,doc) :
    for child in root.children :
        if child.text == '?' : return True
    return False

# Get main clause
def getClause(root,doc) :
    wh = getWh(root,doc)
    head = getHead(root,doc)
    args = getArgs(root,doc)
    prep = getPrep(root,doc)
    comps = getComps(root,doc)
    words = wh + head + args + prep + comps
    return words

# Get head of clause
def getHead(root,doc) :
    head = []
    h = root.lemma_
    if h not in ['be','find','have','suppose','do','need','about'] :
        head.append(h)
    return head

# Find wh-words (what, how, when)
def getWh(root,doc) :
    wh = []
    for child in root.children :
        if child.tag_ in ['WP', 'WDT', 'WRB'] :
            wh.append(child.lemma_)
        elif child.dep_ in ['advmod'] :
            for grandchild in child.children :
                if grandchild.tag_ in ['WP', 'WDT', 'WRB'] :
                    wh.extend([grandchild.lemma_,child.lemma_])
    return wh

# Get prepositional phrase
def getPrep(root,doc) :
    prep = []
    for child in root.children :
        if child.dep_ == 'prep' :
            prep.append(child.lemma_)
            for grandchild in child.children :
                if grandchild.dep_ == 'pobj' :
                    c = grandchild.lemma_
                    prep.append(c)
    return prep

# Get arguments of clause            
def getArgs(root,doc) :
    args = []
    for child in root.children :
        if child.dep_ in ['nsubj', 'dobj', 'iobj', 'nsubjpass', 'attr'] :
            #args.append((child.dep_,child.lemma_))
            c = child.lemma_
            if c not in ['-PRON-','anyone','someone','anything','that','what','who'] :
                wh = getWh(child,doc)
                mods = getMods(child,doc)
                prep = getPrep(child,doc)
                args = wh + [c] + mods + prep
    return args

# Get modifiers of noun
def getMods(root,doc) :
    mods = []
    for child in root.children :
        if child.dep_ in ['amod','compound'] :
            c = child.lemma_
            if c not in ['good'] :
                mods.append(c)
    return mods

# Get complements of a given root 
def getComps(root,doc) :
    comps = []
    for token in doc : 
        if token.dep_ in ['xcomp', 'acomp', 'relcl', 'advcl', 'pcomp', 'csubj'] :
            if root.is_ancestor(token) :
                wh = getWh(token,doc)
                head = getHead(token,doc)
                args = getArgs(token,doc)
                prep = getPrep(token,doc)
                words = wh + prep + head + args 
                comps.extend(words)
    return comps

# Map words to a category of question
def getCat(words) :
    polarity = 'non-polar'
    
    if 'what' in words :
        i = words.index('what') 
        if words[i+1] == 'treatment' :
            if words[i+2] in ['drug','pharmacological'] :
                cat = 'drug'
            else :
                cat = 'treatment'
        elif words[i+1] in ['agent','prescription','analgesic'] :
            cat = 'drug'
        elif words[i+1] in ['treat','precaution','thinking','regimen','patient'] :
            cat = 'treatment'
        elif words[i+1] in ['factor','sign','target'] :
            cat = 'diagnosis'
        elif words[i+1] in ['period'] :
            cat = 'time'
        else :
            cat = words[i+1]
            
    elif 'how' == words[0] :
        if  words[1] in ['often','time']:
            cat ='time'
        elif  words[1] in ['interpret']:
            cat = 'diagnosis'
        elif  words[1] in ['receive']:
            cat = 'communication'
        elif  words[1] in ['present']:
            cat = 'communication'
        elif  words[1] in ['decide']:
            cat = 'treatment'
        else :
            cat =  'how'
            
    elif 'when' == words[0] :
        cat =  'time'
        
    else :
        polarity = 'polar'
        if words[0] in ['agent'] :
            cat = 'drug'
        elif words[0] in ['restart','experience','history','success','add','way',
                'suggest','advocate','regimen','contraindication',
                'manage','role','monitoring','eradication','difference'] :
            cat = 'treatment'
        elif words[0] in ['discuss'] :
            cat = 'communication'
        elif words[0] in ['explain','biopsy','gender','recommend'] :
            cat = 'diagnosis'
        elif  words[0] in ['point']:
            cat ='time'
        else : 
            cat = 'treatment'
            
    return polarity, cat

# Get asnwer types, questions and answers    
answertype = getAnswerTypes()
questions, answers = getQandA(answertype)

# Determine categories for each question
for qid in questions :
    question = questions[qid]
    print(qid,question)
    doc = nlp(question) # parse the question
    roots = getRoots(doc) # get the root nodes
    for r in roots :
        if not isquest(r,doc) : continue # Determine if question
        words = getClause(r,doc) # Extract list of important words
        print(words)
        polar, cat = getCat(words) # Determine polarity and category
        print(polar,cat)
    print()

NameError: name 'getAnswerTypes' is not defined

### Apply to this into a pipeline 

In [None]:
# This code provides a simple example of adding a UMLS pipeline to SpaCy.

import spacy
from spacy.tokens import Span
 
# Return UMLS semantic type for a given word 
# This function needs to be replaced by a real function that can map any term. 
def getUMLS(term) :
    #future dictionary 
    #umls = semtype_dict
    umls = {"The patient" : "podg", "treatment" : "topp", "a new drug" : "phsu" }
    if term in umls :
        return umls[term]
    else :
        return None
 
class UMLSPipeline(object):
    def __init__(self, nlp):
        Span.set_extension('umls', default=None)
 
    def __call__(self, doc):
        for chunk in doc.noun_chunks:
            sem = getUMLS(chunk.text)
            if sem != None:
                chunk._.set('umls', sem)
        return doc
 
 
nlp = spacy.load('en_core_web_md')  

# Create a UMLS pipeline and add to the existing ones
umls_pipeline = UMLSPipeline(nlp)

#customize the spacy nlp 
nlp.add_pipe(umls_pipeline, name='umls')
print(nlp.pipeline)

#doc = nlp("this is still in the acute phase and we cannot assume he will continue to have pain. the combination of 800 mg of ibuprofen and 650 mg of acetaminophen 3 times a day would be recommended. the use of oxycodone without acetaminophen 4 times a day whether the pain is elevated or not . the use of as needed increases the chances of chasing the pain which can increase the chances of addiction. if there is continued pain after surgeries after 6 months than a long-acting opioid may be indicated. ")
doc = nlp("The patient a new drug treatment with a new drug.")

#Show noun phrase chunks and their mapping to UMLS
for chunk in doc.noun_chunks:
    
    #remove a/the before the noun phrase
    #look up don't matched in every case 
    
        print(chunk.text,len(chunk.text),len(chunk),chunk._.umls) 

#chunk: how many words in the phrase
#chunk.text: how many charactors in the phrase 
 
#span : one or more tokens 
#multi-word tokens: span

#noun