In [1]:
import numpy as np
import pandas as pd
import requests, re
from bs4 import BeautifulSoup
from bs4 import Tag, NavigableString, Comment


# Create a function to parse through web pages

def parse_page(_d):
    """Yield one record per sample link found in a browse page's listing table.

    The first cell of the ``<table id="Browse">`` element is scanned in
    document order. Only ``<a>`` elements and text nodes are considered, and
    text nodes equal to a single newline are ignored. Each link starts a new
    record; the first text node after a link becomes that record's ``'about'``.

    Args:
        _d: parsed page (e.g. a ``BeautifulSoup`` object) exposing ``find``.

    Yields:
        dict: ``{'title': <link text>, 'link': <href>}``, plus ``'about'``
        holding the text node that follows the link. A link directly followed
        by another link has no ``'about'`` key; a link that is the very last
        item gets ``'about': None``.

    Raises:
        AttributeError: if the page has no ``Browse`` table (``find`` returned
            ``None``) or the table has no first cell.
    """
    cell = _d.find('table', {'id': 'Browse'}).td
    items = [
        node for node in cell.contents
        if (isinstance(node, str) or node.name == 'a') and node != '\n'
    ]

    # Links and text are told apart with isinstance(..., str) rather than by
    # probing for a `.text` attribute: text nodes are str subclasses, and newer
    # Beautiful Soup releases appear to expose `.text` on them as well, which
    # makes an attribute probe unreliable.
    pending = None  # link record still waiting for its description text
    for node in items:
        if isinstance(node, str):
            if pending is not None:
                pending['about'] = node
                yield pending
                pending = None
            # Text with no link in front of it has nothing to describe: skip it.
        else:
            if pending is not None:
                # The previous link had no description of its own.
                yield pending
            pending = {'title': node.text, 'link': node['href']}

    if pending is not None:
        # Trailing link with nothing after it; the explicit None keeps the
        # record shape callers already get for a page that ends on a link.
        yield {**pending, 'about': None}
        

# Identify the web page to start at and create a parsed text list with parse_page


def _fetch_page(url):
    """Download ``url`` and return it parsed with BeautifulSoup's 'html.parser'.

    A timeout is set so a stalled connection cannot hang the notebook, and
    ``raise_for_status`` turns 4xx/5xx answers into ``requests.HTTPError``
    instead of letting an error page be parsed as if it were a listing.
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')


def _next_page_links(page):
    """Return the '>' (next page) anchors inside the 'Contrast' div, or [] if there are none."""
    pager = page.find('div', {'class': 'Contrast'})
    if pager is None:
        # No pagination block on this page: treat it as the last page.
        return []
    return [i for i in pager.find_all('a') if i.text == '>']


# `r` collects one list of link records per browse page.
d, r = _fetch_page('https://www.mtsamples.com/site/pages/browse.asp?type=82-Urology'), []
r.append(list(parse_page(d)))
_c = _next_page_links(d)


# Use a while loop to continuously iterate over the links as long as the "next" (>) button is present.
# At each iteration, scrape the next page and extract the headers with the link and description

while _c:
    d = _fetch_page(f'https://www.mtsamples.com{_c[0]["href"]}')
    r.append(list(parse_page(d)))
    _c = _next_page_links(d)
                          

In [2]:
# Specify medical specialty of web pages
# `r` holds one list of link records per scraped browse page. The assignment
# below gives that same list object (not a copy) a specialty-specific name.

urology_web_pages = r
# Number of browse pages that were scraped.
len(urology_web_pages)

16

In [3]:
# Display the scraped records: an outer list with one inner list per browse page,
# each inner list holding the dicts produced by parse_page.
urology_web_pages

[[{'title': 'Adrenalectomy & Umbilical Hernia Repair',
   'link': '/site/pages/sample.asp?Type=82-Urology&Sample=2464-Adrenalectomy & Umbilical Hernia Repair',
   'about': 'Laparoscopic hand-assisted left adrenalectomy and umbilical hernia repair.  Patient with a 5.5-cm diameter nonfunctioning mass in his right adrenal.'},
  {'title': 'Bilateral Vasovasostomy',
   'link': '/site/pages/sample.asp?Type=82-Urology&Sample=272-Bilateral Vasovasostomy',
   'about': 'Bilateral vasovasostomy surgery sample.'},
  {'title': 'BioArc Midurethral Sling',
   'link': '/site/pages/sample.asp?Type=82-Urology&Sample=1527-BioArc Midurethral Sling',
   'about': 'Cystoscopy, cystocele repair, BioArc midurethral sling.'},
  {'title': 'Bladder Biopsies & Fulguration',
   'link': '/site/pages/sample.asp?Type=82-Urology&Sample=2469-Bladder Biopsies & Fulguration',
   'about': 'Cystoscopy, bladder biopsies, and fulguration.  Bladder lesions with history of previous transitional cell bladder carcinoma, pathology

In [4]:
# Build the absolute URL of every medical record listed for this specialty:
# flatten the per-page record lists and prefix each relative link with the site root.

urology_url_list = [
    'https://www.mtsamples.com' + record['link']
    for page_records in urology_web_pages
    for record in page_records
]

len(urology_url_list)

158

In [5]:
# Print the full list of record URLs built above (one long line of output).
print(urology_url_list)

['https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=2464-Adrenalectomy & Umbilical Hernia Repair', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=272-Bilateral Vasovasostomy', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=1527-BioArc Midurethral Sling', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=2469-Bladder Biopsies & Fulguration', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=2292-Bladder Cancer', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=431-Bladder Instillation', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=1003-Bladder Laceration Closure', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=1471-Bladder Tumor', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sample=2750-Blood in Urine - ER Visit', 'https://www.mtsamples.com/site/pages/sample.asp?Type=82-Urology&Sa

In [6]:
def _extract_note_text(soup, separator=''):
    """Return the section headings and section bodies of one sample page as one string.

    Section headings are the ``<b>`` siblings that follow the first ``<hr>``
    after the page's ``<h1>`` and whose text is uppercase. The body of a
    section is every sibling between its heading and the next heading (or the
    first ``<div>``), with HTML comments skipped.

    Returns ``''`` when the page has no ``<h1>`` / following ``<hr>`` or no
    uppercase bold headings.
    """
    title_el = soup.find('h1')
    first_hr = title_el.find_next_sibling('hr') if title_el is not None else None
    if first_hr is None:
        # Page does not follow the expected <h1> ... <hr> layout.
        return ''

    # Titles are all bold and uppercase
    titles = [b for b in first_hr.find_next_siblings('b') if b.text.strip().isupper()]
    # Identity lookup: every heading is itself one of the siblings being
    # walked, so comparing object ids is enough and avoids comparing tag
    # contents for each sibling.
    title_ids = {id(b) for b in titles}

    pieces = []
    for t in titles:
        text_parts = []
        for s in t.next_siblings:
            # go until next title
            if id(s) in title_ids:
                break
            # Comment is checked before NavigableString because it is a subclass of it.
            if isinstance(s, Comment):
                continue
            if isinstance(s, Tag):
                if s.name == 'div':
                    break
                text_parts.append(s.text.strip())
            elif isinstance(s, NavigableString):
                text_parts.append(str(s).strip())

        pieces.append(t.text.strip())
        pieces.append('\n'.join(p for p in text_parts if p))

    # Empty pieces are dropped so a non-empty separator is never doubled.
    return separator.join(p for p in pieces if p)


def get_medical_notes(url_link, separator='', timeout=30):
    """Download sample pages and return their note text, one string per URL.

    The result is also stored in the module-level ``medical_notes`` variable,
    which is rebound to a fresh list at the start of every call.

    Args:
        url_link: iterable of page URLs; a single URL string is also accepted.
        separator: string placed between consecutive headings and bodies when
            they are merged into one note. The default ``''`` joins them
            directly (e.g. ``'ANESTHESIA:General.CLINICAL NOTE:...'``); pass
            ``' '`` or ``'\\n'`` to keep them apart.
        timeout: seconds passed to ``requests.get`` for each request.

    Returns:
        list[str]: note text in the same order as ``url_link``. A page with no
        recognisable sections contributes ``''`` so positions stay aligned
        with the input URLs.

    Raises:
        requests.HTTPError: if a page answers with a 4xx/5xx status.
    """
    global medical_notes
    medical_notes = []

    # A bare string would otherwise be iterated character by character.
    if isinstance(url_link, str):
        url_link = [url_link]

    # We will loop through each of the url links provided as input for the function
    for url in url_link:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Append to our medical_notes list
        medical_notes.append(_extract_note_text(soup, separator))

    return medical_notes

In [8]:
# Get and clean all medical notes for the specified medical specialty 
# get_medical_notes stores its result in the global `medical_notes`, so the
# call is made first and the global is read afterwards.

get_medical_notes(urology_url_list)

# Bind the result to a specialty-specific name (same list object, not a copy).
urology_medical_notes = medical_notes
len(urology_medical_notes)

158

In [9]:
# Display every scraped note (a list with one string per record URL).
urology_medical_notes

['PREOPERATIVE DIAGNOSES1.  Adrenal mass, right sided.\n2.  Umbilical hernia.POSTOPERATIVE DIAGNOSES1.  Adrenal mass, right sided.\n2.  Umbilical hernia.OPERATION PERFORMED:Laparoscopic hand-assisted left adrenalectomy and umbilical hernia repair.ANESTHESIA:General.CLINICAL NOTE:This is a 52-year-old inmate with a 5.5 cm diameter nonfunctioning mass in his right adrenal.  Procedure was explained including risks of infection, bleeding, possibility of transfusion, possibility of further treatments being required.  Alternative of fully laparoscopic are open surgery or watching the lesion.DESCRIPTION OF OPERATION:In the right flank-up position, table was flexed.  He had a Foley catheter in place.  Incision was made from just above the umbilicus, about 5.5 cm in diameter.  The umbilical hernia was taken down.  An 11 mm trocar was placed in the midline, superior to the GelPort and a 5 mm trocar placed in the midaxillary line below the costal margin.  A liver retractor was placed to this.\nTh

In [10]:
# Put the notes in a dataframe, one row per note, each labeled with its medical specialty

urology_df = pd.DataFrame(
    [('Urology', note) for note in urology_medical_notes],
    columns=['Medical Specialty', 'Medical Notes'],
)
print(len(urology_df))
urology_df.head()


158


Unnamed: 0,Medical Specialty,Medical Notes
0,Urology,"PREOPERATIVE DIAGNOSES1. Adrenal mass, right ..."
1,Urology,
2,Urology,PREOPERATIVE DIAGNOSIS:Stress urinary incontin...
3,Urology,PREOPERATIVE DIAGNOSIS:Bladder lesions with hi...
4,Urology,CHIEF COMPLAINT:Bladder cancer.HISTORY OF PRES...


In [11]:
import pickle

# Pickle the dataframe as a checkpoint.
# The `with` block closes the file even if pickle.dump raises, so a failed
# dump cannot leave the handle open.

with open("urology_df.pickle", "wb") as pickle_out:  # "wb": write in binary mode
    pickle.dump(urology_df, pickle_out)