In [203]:
import pandas as pd
import numpy as np
import glob
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/himanshujanbandhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/himanshujanbandhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/himanshujanbandhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [204]:
#base = 'Downloads/archive/'

files = glob.glob('Downloads/archive/plaintext_data/*.txt')

dataset = []

for file in files:
    data = open(file)
    dataset.append(data.read())
    
database = dataset
dataset = np.array(dataset, dtype='object')
dataset[0]

'CT HEAD WITHOUT CONTRAST AND CT CERVICAL SPINE WITHOUT CONTRAST\n\nREASON FOR EXAM: \n\n Motor vehicle collision.\n\nCT HEAD WITHOUT CONTRAST\n\nTECHNIQUE:\n\n  Noncontrast axial CT images of the head were obtained.\n\nFINDINGS: \n\n There is no acute intracranial hemorrhage, mass effect, midline shift, or extra-axial fluid collection.  The ventricles and cortical sulci are normal in shape and configuration.  The gray/white matter junctions are well preserved.  There is no calvarial fracture.  The visualized paranasal sinuses and mastoid air cells are clear.\n\nIMPRESSION: \n\n Negative for acute intracranial disease.\n\nCT CERVICAL SPINE\n\nTECHNIQUE:  \n\nNoncontrast axial CT images of the cervical spine were obtained.  Sagittal and coronal images were obtained.\n\nFINDINGS:\n\n  Straightening of the normal cervical lordosis is compatible with patient position versus muscle spasms.  No fracture or subluxation is seen.  Anterior and posterior osteophyte formation is seen at C5-C6.  N

In [205]:
#Preprocessing

#lowecase
def lowerCase(dataset):
    vec_lowerCase = np.vectorize(lambda x:x.lower())
    dataset = vec_lowerCase(dataset)
    return dataset
    
#remove stopwords
stop_words = set(stopwords.words('english'))
def remove_sw(dataset):
    for i in range(len(dataset)):
        word_tokens = word_tokenize(dataset[i])
        new_text = ''
        for word in word_tokens:
            if word not in stop_words:
                new_text = new_text + ' ' + word
        
        dataset[i] = new_text
    return dataset
        
#remove punctuation
def remove_punc(dataset):
    stripPunct = str.maketrans('', '', string.punctuation)
    vecTrans=np.vectorize(lambda x:x.translate(stripPunct))
    dataset = vecTrans(dataset)
    return dataset

#stemming
def stem(dataset):
    ps = PorterStemmer()
    for i in range(len(dataset)):
        word_tokens = word_tokenize(dataset[i])
        new_text = ''
        for word in word_tokens:
            new_text = new_text + ' ' + ps.stem(word)
        dataset[i] = new_text
    return dataset

#lemmetization
def lemmetization(dataset):
    lemmatizer = WordNetLemmatizer()
    for i in range(len(dataset)):
        word_tokens = word_tokenize(dataset[i])
        new_text = ''
        for word in word_tokens:
            new_text = new_text + ' ' + lemmatizer.lemmatize(word)
        dataset[i] = new_text
    return dataset




In [206]:
dataset = lemmetization(dataset)
dataset[0]

' CT HEAD WITHOUT CONTRAST AND CT CERVICAL SPINE WITHOUT CONTRAST REASON FOR EXAM : Motor vehicle collision . CT HEAD WITHOUT CONTRAST TECHNIQUE : Noncontrast axial CT image of the head were obtained . FINDINGS : There is no acute intracranial hemorrhage , mass effect , midline shift , or extra-axial fluid collection . The ventricle and cortical sulcus are normal in shape and configuration . The gray/white matter junction are well preserved . There is no calvarial fracture . The visualized paranasal sinus and mastoid air cell are clear . IMPRESSION : Negative for acute intracranial disease . CT CERVICAL SPINE TECHNIQUE : Noncontrast axial CT image of the cervical spine were obtained . Sagittal and coronal image were obtained . FINDINGS : Straightening of the normal cervical lordosis is compatible with patient position versus muscle spasm . No fracture or subluxation is seen . Anterior and posterior osteophyte formation is seen at C5-C6 . No abnormal anterior cervical soft tissue swelli

In [207]:
def preprocessing(dataset):
    dataset = lowerCase(dataset)
    dataset = remove_sw(dataset)
    dataset = remove_punc(dataset)
    dataset = stem(dataset)
    dataset = lemmetization(dataset)
    return dataset

dataset = preprocessing(dataset)
dataset[0]

' ct head without contrast ct cervic spine without contrast reason exam motor vehicl collis ct head without contrast techniqu noncontrast axial ct imag head obtain find acut intracrani hemorrhag mass effect midlin shift extraaxi fluid collect ventricl cortic sulcu normal shape configur graywhit matter junction well preserv calvari fractur visual parana sinu mastoid air cell clear impress neg acut intracrani diseas ct cervic spine techniqu noncontrast axial ct imag cervic spine obtain sagitt coron imag obtain find straighten normal cervic lordosi compat patient posit versu muscl spasm fractur sublux seen anterior posterior osteophyt format seen c5c6 abnorm anterior cervic soft tissu swell seen spinal compress note atlantoden interv normal larg retent cyst versu polyp within right maxillari sinu impress 1 straighten normal cervic lordosi compat patient posit versu muscl spasm 2 degen disk joint diseas c5c6 3 retent cyst versu polyp right maxillari sinu'

In [208]:
tfIdfVectorizer = TfidfVectorizer()
tfIdf = tfIdfVectorizer.fit_transform(dataset)
df = pd.DataFrame(tfIdf[0].T.todense(),index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))
print(tfIdf)

               TF-IDF
cervic       0.330977
versu        0.268890
ct           0.240240
lordosi      0.212856
straighten   0.182893
c5c6         0.182893
retent       0.163673
spine        0.162390
maxillari    0.160228
sinu         0.154972
spasm        0.154266
noncontrast  0.154266
contrast     0.150848
imag         0.147781
intracrani   0.142876
compat       0.135966
polyp        0.135966
head         0.130960
axial        0.130265
cyst         0.125412
fractur      0.122193
techniqu     0.108934
obtain       0.108838
seen         0.108171
graywhit     0.106428
  (0, 5402)	0.0601051404065591
  (0, 3561)	0.06161543613173908
  (0, 3390)	0.06649577045308773
  (0, 5885)	0.16022752389451314
  (0, 7849)	0.055765212880890765
  (0, 9574)	0.039485755014916656
  (0, 7153)	0.13596630218012906
  (0, 3297)	0.12541166857282082
  (0, 7787)	0.16367279437250576
  (0, 5533)	0.048533555380030666
  (0, 5267)	0.06722255289299817
  (0, 2099)	0.09509503547308419
  (0, 6516)	0.03509164953897528
  (0, 3038

In [209]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query):
    preprocessing(query)
    query_vec = tfIdfVectorizer.transform(query)
    results = cosine_similarity(tfIdf, query_vec)
    out = np.argsort(results, axis=0)
    #print(out)
    k=1
    for i in out[-10:][::-1]:
        print("result ", k, "=============================")
        print(database[i[0]])
        print("=============================")
        k+=1
    
search(["Recommend correlation with site of pain in addition to conservative management and followup imaging."])

EXAM:

  Nuclear medicine lymphatic scan.

REASON FOR EXAM: 

 Left breast cancer.

TECHNIQUE: 

 1.0 mCi of Technetium-99m sulfur colloid was injected within the dermis surrounding the left breast biopsy site at four locations.  A 16-hour left anterior oblique imaging was performed with and without shielding of the original injection site.

FINDINGS:  

There are two small foci of increased activity in the left axilla.  This is consistent with the sentinel lymph node.  No other areas of activity are visualized outside of the injection site and two axillary lymph nodes.

IMPRESSION:  

Technically successful lymph node injection with two areas of increased activity in the left axilla consistent with sentinel lymph node.
EXAM: 

 CT chest with contrast.

REASON FOR EXAM: 

 Pneumonia, chest pain, short of breath, and coughing up blood.

TECHNIQUE: 

 Postcontrast CT chest 100 mL of Isovue-300 contrast.

FINDINGS: 

 This study demonstrates a small region of coalescent infiltrates/consol