In [112]:
import urllib2
from bs4 import BeautifulSoup, NavigableString
import requests as req
import string
import re
import pdb

In [33]:
class WebReader(object):
    """Download web pages and reduce them to plain, punctuation-free text.

    Results are cached per link (with any ``#fragment`` removed) in
    ``self._buffer`` so that a page is downloaded at most once per reader.
    """

    # Words longer than this many bytes are dropped by process().
    MAX_WORD_LENGTH = 30

    def __init__(self):
        # Maps pruned link -> processed page text.
        self._buffer = {}

    def fetch(self, origlink):
        """Return the processed text of ``origlink``, or None on failure.

        The link is pruned of its fragment before being used as cache key
        and download address. Any error while downloading, parsing or
        processing is reported on stdout and turned into a None result.
        """
        link = self.prune(origlink)
        if link in self._buffer:
            return self._buffer[link]
        try:
            page = urllib2.urlopen(link)
            try:
                handler = BeautifulSoup(page, 'html.parser')
            finally:
                # Release the connection even when parsing fails.
                page.close()
            text = handler.findAll(text=lambda node: isinstance(node, NavigableString))
            self._buffer[link] = self.process(text)
            return self._buffer[link]
        except Exception as err:
            # Catch Exception rather than everything, so that interrupting a
            # long scraping run (KeyboardInterrupt) is not swallowed. The
            # repr of the error is appended because not every failure is
            # caused by a bad link.
            print ("Unable to retrieve web-data for {} ! Please check your link! ({!r})".format(origlink, err))
            return None

    def prune(self, link):
        """Return ``link`` without its ``#fragment`` part."""
        return link.split('#')[0]

    def process(self, text):
        """Join text fragments and normalise them into space-separated words.

        ``text`` is an iterable of unicode strings. The joined text is
        handled as UTF-8 encoded bytes: ASCII punctuation is deleted, every
        remaining byte outside ``[A-Za-z0-9_]`` becomes a space, words longer
        than ``MAX_WORD_LENGTH`` bytes are dropped and runs of spaces are
        collapsed. The returned byte string may start or end with a single
        space.
        """
        data = u" ".join(text).encode('utf-8')
        # Byte literals make it explicit that everything below works on the
        # encoded bytes, not on unicode characters.
        data = data.translate(None, string.punctuation.encode('ascii'))
        data = re.sub(b"\\W", b" ", data.strip())
        filtered = [word for word in data.split(b" ")
                    if len(word) <= self.MAX_WORD_LENGTH]
        return re.sub(b" +", b" ", b" ".join(filtered))

    def batchFetch(self, links):
        """Fetch every link in ``links`` and return the whole cache dict."""
        for link in links:
            self.fetch(link)
        return self._buffer

    def fileFetchWithLabels(self, linkfile, trainfile, writemode='w'):
        """Write one ``text|***|label`` line to ``trainfile`` per fetched link.

        Each line of ``linkfile`` is split on commas: the first field is the
        link and the remaining fields, re-joined with commas, are the label.
        Links that cannot be fetched or yield no text are skipped.
        """
        with open(trainfile, writemode) as fw:
            with open(linkfile, 'r') as fp:
                for line in fp:
                    fields = line.strip().split(",")
                    if not fields[0]:
                        # Blank line or empty link column: nothing to fetch.
                        continue
                    data = self.fetch(fields[0])
                    label = ",".join(fields[1:])
                    if data:
                        fw.write("{}|***|{}\n".format(data, label))

In [126]:
class MedlineCustomReader(WebReader):
    """WebReader that also collects the pages linked from a page's main div.

    For this reader every ``self._buffer`` value is a list: the page's own
    processed text first, followed (once the page has been expanded) by the
    processed text of each absolute link found inside ``<div class="main">``.
    """

    def __init__(self):
        super(MedlineCustomReader, self).__init__()
        # Links whose child links have already been followed. A page first
        # seen as somebody's child is cached without being expanded.
        self._expanded = set()

    def _loadPage(self, link):
        """Download ``link`` and return it parsed; the response is closed."""
        page = urllib2.urlopen(link)
        try:
            return BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

    def _childTexts(self, handler, link):
        """Return the texts of the distinct absolute links in the main div."""
        div = handler.find('div', {'class': 'main'})
        if div is None:
            # Page without the expected container: it simply has no children.
            return []
        texts = []
        seen = set([link])  # never follow a link back to the page itself
        for clink in div.findAll('a'):
            href = clink.get('href')
            if not href:
                continue
            try:
                src = str(href)
            except UnicodeError:
                # On Python 2 str() fails for non-ASCII hrefs; skip that one
                # link instead of aborting the whole parent page.
                print ("Skipping non-ASCII link {!r}".format(href))
                continue
            # Require a real scheme; '^http*' accepted anything starting
            # with "htt".
            if not re.match(r'^https?://', src):
                continue
            childpruned = self.prune(src)
            if childpruned in seen:
                # The same target linked twice would yield duplicate rows.
                continue
            seen.add(childpruned)
            data = self.fetch(childpruned, False)
            if data:
                texts.append(data)
        return texts

    def fetch(self, origlink, beyond=True):
        """Return the text for ``origlink``, or None on failure.

        With ``beyond=True`` the result is the cached list
        ``[own text, child text, ...]``. With ``beyond=False`` only the
        page's own text (a single string) is returned and no child links are
        followed. The return type depends only on ``beyond``, never on
        whether the page was already cached.
        """
        link = self.prune(origlink)
        cached = self._buffer.get(link)
        if cached is not None and (not beyond or link in self._expanded):
            return cached if beyond else cached[0]
        try:
            handler = self._loadPage(link)
            if cached is None:
                text = handler.findAll(text=lambda node: isinstance(node, NavigableString))
                cached = [self.process(text)]
                self._buffer[link] = cached
            if not beyond:
                return cached[0]
            # Collect all children before touching the cache entry so that an
            # interrupted expansion does not leave a half-filled list behind.
            cached.extend(self._childTexts(handler, link))
            self._expanded.add(link)
            return cached
        except Exception as err:
            # Catch Exception rather than everything, so that interrupting a
            # long scraping run (KeyboardInterrupt) is not swallowed.
            print ("Unable to retrieve web-data for {} ! Please check your link! ({!r})".format(origlink, err))
            return None

    def fileFetchWithLabels(self, linkfile, trainfile, writemode='w'):
        """Write ``text|***|label`` lines for each link and its child pages.

        Each line of ``linkfile`` is split on commas: the first field is the
        link and the remaining fields, re-joined with commas, are the label.
        Every non-empty text returned by fetch() for that link is written on
        its own line with the same label. Links that cannot be fetched are
        skipped.
        """
        with open(trainfile, writemode) as fw:
            with open(linkfile, 'r') as fp:
                for line in fp:
                    fields = line.strip().split(",")
                    if not fields[0]:
                        # Blank line or empty link column: nothing to fetch.
                        continue
                    label = ",".join(fields[1:])
                    dataarr = self.fetch(fields[0])
                    if not dataarr:
                        # fetch() returns None on failure; len(None) used to
                        # raise TypeError here.
                        continue
                    for data in dataarr:
                        if data:
                            fw.write("{}|***|{}\n".format(data, label))

In [103]:
# Plain reader: fetches only the listed pages themselves, without following any links on them.
rdr = WebReader()

In [37]:
# Build the level-1 training file: one "text|***|label" line per link listed in the CSV.
rdr.fileFetchWithLabels(linkfile="train_level1.csv", trainfile="traindata")

Unable to retrive web-data for https://www.onhealth.com/content/1/gynecological_disorders_-_research ! Please check your link!


In [127]:
# Reader that additionally collects the pages linked from each page's <div class="main">.
rdr2 = MedlineCustomReader()

In [128]:
# Build the level-2 training file: each listed page and its linked pages share the row's label.
rdr2.fileFetchWithLabels(linkfile="train_level2.csv", trainfile="traindata2")

Unable to retrive web-data for https://familydoctor.org/condition/abdominal-aortic-aneurysm/?adfree=true ! Please check your link!
Unable to retrive web-data for https://es.familydoctor.org/condicion/aneurisma-aortico-abdominal/?adfree=true ! Please check your link!
Unable to retrive web-data for http://www.heart.org/HEARTORG/Conditions/Arrhythmia/AboutArrhythmia/Who-is-at-Risk-for-Atrial-Fibrillation-AF-or-AFib_UCM_423773_Article.jsp ! Please check your link!
Unable to retrive web-data for https://familydoctor.org/condition/arrhythmogenic-right-ventricular-dysplasia/?adfree=true ! Please check your link!
Unable to retrive web-data for https://es.familydoctor.org/condicion/displasia-ventricular-derecha-arritmogenica/?adfree=true ! Please check your link!
Unable to retrive web-data for https://familydoctor.org/condition/bacterial-endocarditis/?adfree=true ! Please check your link!
Unable to retrive web-data for https://es.familydoctor.org/condicion/endocarditis-bacteriana/?adfree=true !

In [86]:
# Exploration: download and parse a single topic page by hand to inspect its link structure.
# `link`, `page` and `handler` stay bound as notebook-level names after this cell.
link = "https://medlineplus.gov/aorticaneurysm.html"
page = urllib2.urlopen(link) 
handler = BeautifulSoup(page, 'html.parser')

In [99]:
# Exploration: list every absolute link inside the page's main content div
# together with its anchor text.
# NOTE: the loop rebinds the notebook-level name `link` (previously the page
# URL string) to the last anchor tag.
div = handler.find('div', {'class': 'main'})
childlinks = div.findAll('a')
for link in childlinks:
    src = str(link.get('href',None))
    # Require a real http:// or https:// prefix; the former pattern '^http*'
    # accepted any string starting with "htt".
    if re.match(r'^https?://', src):
        # Single-argument print gives the same "url text" line whether print
        # is a statement or a function.
        print(u"{} {}".format(src, link.text))

https://medlineplus.gov/aneurysms.html aneurysm
https://www.nhlbi.nih.gov/health/health-topics/topics/arm What Is an Aneurysm?
https://www.radiologyinfo.org/en/info.cfm?pg=abdominus Abdominal Ultrasound
https://www.radiologyinfo.org/sp/info.cfm?pg=abdominus Spanish
https://www.nhlbi.nih.gov/health/health-topics/topics/cct Chest CT Scan
https://www.nhlbi.nih.gov/health/health-topics/topics/cmri Chest MRI
https://medlineplus.gov/hearthealthtests.html Heart Health Tests: MedlinePlus Health Topic
https://medlineplus.gov/spanish/hearthealthtests.html Spanish
https://www.nhlbi.nih.gov/health/health-topics/topics/arm/diagnosis How Is an Aneurysm Diagnosed?
https://www.uspreventiveservicestaskforce.org/Home/GetFileByID/1874 Screening for Abdominal Aortic Aneurysm
http://www.texasheart.org/HIC/Topics/Proced/asurg.cfm Aneurysm Repair
http://www.texasheart.org/HIC/Topics_Esp/Proced/asurg_span.cfm Spanish
https://www.nhlbi.nih.gov/health/health-topics/topics/arm/treatment How Is an Aneurysm Treate

In [93]:
# Display every anchor found inside the main div, including in-page "#..." links.
childlinks

[<a href="#summary" title="Go to: Summary">Summary</a>,
 <a href="#cat_51" title="Go to: Start Here">Start Here</a>,
 <a href="#cat_92" title="Go to: Diagnosis and Tests">Diagnosis and Tests</a>,
 <a href="#cat_78" title="Go to: Treatments and Therapies">Treatments and Therapies</a>,
 <a href="#cat_47" title="Go to: Related Issues">Related Issues</a>,
 <a href="#cat_42" title="Go to: Specifics">Specifics</a>,
 <a href="#cat_46" title="Go to: Genetics">Genetics</a>,
 <a href="#cat_94" title="Go to: Videos and Tutorials">Videos and Tutorials</a>,
 <a href="#cat_79" title="Go to: Statistics and Research">Statistics and Research</a>,
 <a href="#cat_27" title="Go to: Clinical Trials">Clinical Trials</a>,
 <a href="#cat_59" title="Go to: Journal Articles">Journal Articles</a>,
 <a href="#cat_82" title="Go to: Reference Desk">Reference Desk</a>,
 <a href="#cat_83" title="Go to: Find an Expert">Find an Expert</a>,
 <a href="#cat_69" title="Go to: Patient Handouts">Patient Handouts</a>,
 <a nam