In [1]:
%%writefile scrape.py
#!/usr/bin/env python

""" This is a modified version of James Mills' original recipe. """

import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
import hashlib
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty

from bs4 import BeautifulSoup

class Link (object):

    def __init__(self, src, dst, link_type):
        self.src = src
        self.dst = dst
        self.link_type = link_type

    def __hash__(self):
        return hash((self.src, self.dst, self.link_type))

    def __eq__(self, other):
        return (self.src == other.src and
                self.dst == other.dst and
                self.link_type == other.link_type)
    
    def __str__(self):
        return self.src + " -> " + self.dst

class Crawler(object):

    def __init__(self, root, depth_limit, confine=None, exclude=[], locked=True, filter_seen=True):
        self.root = root
        self.host = urlparse.urlparse(root)[1]

        ## Data for filters:
        self.depth_limit = depth_limit # Max depth (number of hops from root)
        self.locked = locked           # Limit search to a single host?
        self.confine_prefix=confine    # Limit search to this prefix
        self.exclude_prefixes=exclude; # URL prefixes NOT to visit
                

        self.urls_seen = set()          # Used to avoid putting duplicates in queue
        self.urls_remembered = set()    # For reporting to user
        self.visited_links= set()       # Used to avoid re-processing a page
        self.links_remembered = set()   # For reporting to user
        
        self.num_links = 0              # Links found (and not excluded by filters)
        self.num_followed = 0           # Links followed.  

        # Pre-visit filters:  Only visit a URL if it passes these tests
        self.pre_visit_filters=[self._prefix_ok,
                                self._exclude_ok,
                                self._not_visited,
                                self._same_host]

        # Out-url filters: When examining a visited page, only process
        # links where the target matches these filters.        
        if filter_seen:
            self.out_url_filters=[self._prefix_ok,
                                     self._same_host]
        else:
            self.out_url_filters=[]

    def _pre_visit_url_condense(self, url):
        
        """ Reduce (condense) URLs into some canonical form before
        visiting.  All occurrences of equivalent URLs are treated as
        identical.

        All this does is strip the \"fragment\" component from URLs,
        so that http://foo.com/blah.html\#baz becomes
        http://foo.com/blah.html """

        base, frag = urlparse.urldefrag(url)
        return base

    ## URL Filtering functions.  These all use information from the
    ## state of the Crawler to evaluate whether a given URL should be
    ## used in some context.  Return value of True indicates that the
    ## URL should be used.
    
    def _prefix_ok(self, url):
        """Pass if the URL has the correct prefix, or none is specified"""
        return (self.confine_prefix is None  or
                url.startswith(self.confine_prefix))

    def _exclude_ok(self, url):
        """Pass if the URL does not match any exclude patterns"""
        prefixes_ok = [ not url.startswith(p) for p in self.exclude_prefixes]
        return all(prefixes_ok)
    
    def _not_visited(self, url):
        """Pass if the URL has not already been visited"""
        return (url not in self.visited_links)
    
    def _same_host(self, url):
        """Pass if the URL is on the same host as the root URL"""
        try:
            host = urlparse.urlparse(url)[1]
            return re.match(".*%s" % self.host, host) 
        except Exception, e:
            print >> sys.stderr, "ERROR: Can't process url '%s' (%s)" % (url, e)
            return False
            

    def crawl(self):

        """ Main function in the crawling process.  Core algorithm is:

        q <- starting page
        while q not empty:
           url <- q.get()
           if url is new and suitable:
              page <- fetch(url)   
              q.put(urls found in page)
           else:
              nothing

        new and suitable means that we don't re-visit URLs we've seen
        already fetched, and user-supplied criteria like maximum
        search depth are checked. """
        
        q = Queue()
        q.put((self.root, 0))

        while not q.empty():
            this_url, depth = q.get()
            
            #Non-URL-specific filter: Discard anything over depth limit
            if depth > self.depth_limit:
                continue
            
            #Apply URL-based filters.
            do_not_follow = [f for f in self.pre_visit_filters if not f(this_url)]
            
            #Special-case depth 0 (starting URL)
            if depth == 0 and [] != do_not_follow:
                print >> sys.stderr, "Whoops! Starting URL %s rejected by the following filters:", do_not_follow

            #If no filters failed (that is, all passed), process URL
            if [] == do_not_follow:
                try:
                    self.visited_links.add(this_url)
                    self.num_followed += 1
                    page = Fetcher(this_url)
                    page.fetch()
                    for link_url in [self._pre_visit_url_condense(l) for l in page.out_links()]:
                        if link_url not in self.urls_seen:
                            q.put((link_url, depth+1))
                            self.urls_seen.add(link_url)
                            
                        do_not_remember = [f for f in self.out_url_filters if not f(link_url)]
                        if [] == do_not_remember:
                                self.num_links += 1
                                self.urls_remembered.add(link_url)
                                link = Link(this_url, link_url, "href")
                                if link not in self.links_remembered:
                                    self.links_remembered.add(link)
                except Exception, e:
                    print >>sys.stderr, "ERROR: Can't process url '%s' (%s)" % (this_url, e)
                    #print format_exc()

class OpaqueDataException (Exception):
    def __init__(self, message, mimetype, url):
        Exception.__init__(self, message)
        self.mimetype=mimetype
        self.url=url
        

class Fetcher(object):
    
    """The name Fetcher is a slight misnomer: This class retrieves and interprets web pages."""

    def __init__(self, url):
        self.url = url
        self.out_urls = []

    def __getitem__(self, x):
        return self.out_urls[x]

    def out_links(self):
        return self.out_urls

    #def _addHeaders(self, request):
    #    request.add_header("User-Agent", AGENT)

    def _open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def fetch(self):
        request, handle = self._open()
        #self._addHeaders(request)
        if handle:
            try:
                data=handle.open(request)
                mime_type=data.info().gettype()
                url=data.geturl();
                if mime_type != "text/html":
                    raise OpaqueDataException("Not interested in files of type %s" % mime_type,
                                              mime_type, url)
                content = unicode(data.read(), "utf-8",
                        errors="replace")
                soup = BeautifulSoup(content)
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except OpaqueDataException, error:
                print >>sys.stderr, "Skipping %s, has type %s" % (error.url, error.mimetype)
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self:
                        self.out_urls.append(url)

def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    """for i, url in enumerate(page):
        print "%d. %s" % (i, url) """
    j = 1
    for i, url in enumerate(page):
        if url.find("http")>=0:
	        print "%d. %s" % (j, url)
	        j = j + 1

def parse_options():
    """parse_options() -> opts, args

    Parse any command-line options given returning both
    the parsed options and arguments.
    """

    parser = optparse.OptionParser()

    parser.add_option("-q", "--quiet",
            action="store_true", default=False, dest="quiet",
            help="Enable quiet mode")

    parser.add_option("-l", "--links",
            action="store_true", default=False, dest="links",
            help="Get links for specified url only")    

    parser.add_option("-d", "--depth",
            action="store", type="int", default=30, dest="depth_limit",
            help="Maximum depth to traverse")

    parser.add_option("-c", "--confine",
            action="store", type="string", dest="confine",
            help="Confine crawl to specified prefix")

    parser.add_option("-x", "--exclude", action="append", type="string",
                      dest="exclude", default=[], help="Exclude URLs by prefix")
    
    parser.add_option("-L", "--show-links", action="store_true", default=False,
                      dest="out_links", help="Output links found")

    parser.add_option("-u", "--show-urls", action="store_true", default=False,
                      dest="out_urls", help="Output URLs found")

    parser.add_option("-D", "--dot", action="store_true", default=False,
                      dest="out_dot", help="Output Graphviz dot file")
    


    opts, args = parser.parse_args()

    if len(args) < 1:
        parser.print_help(sys.stderr)
        raise SystemExit, 1

    if opts.out_links and opts.out_urls:
        parser.print_help(sys.stderr)
        parser.error("options -L and -u are mutually exclusive")

    return opts, args

class DotWriter:

    """ Formats a collection of Link objects as a Graphviz (Dot)
    graph.  Mostly, this means creating a node for each URL with a
    name which Graphviz will accept, and declaring links between those
    nodes."""

    def __init__ (self):
        self.node_alias = {}

    def _safe_alias(self, url, silent=False):

        """Translate URLs into unique strings guaranteed to be safe as
        node names in the Graphviz language.  Currently, that's based
        on the md5 digest, in hexadecimal."""

        if url in self.node_alias:
            return self.node_alias[url]
        else:
            m = hashlib.md5()
            m.update(url)
            name = "N"+m.hexdigest()
            self.node_alias[url]=name
            if not silent:
                print "\t%s [label=\"%s\"];" % (name, url)                
            return name


    def asDot(self, links):

        """ Render a collection of Link objects as a Dot graph"""
        
        print "digraph Crawl {"
        print "\t edge [K=0.2, len=0.1];"
        for l in links:            
            print "\t" + self._safe_alias(l.src) + " -> " + self._safe_alias(l.dst) + ";"
        print  "}"

        
    

def main():    
    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth_limit = opts.depth_limit
    confine_prefix=opts.confine
    exclude=opts.exclude

    sTime = time.time()

    print >> sys.stderr,  "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    if opts.out_urls:
        print "\n".join(crawler.urls_seen)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])
        
    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found:    %d" % crawler.num_links
    print >> sys.stderr, "Followed: %d" % crawler.num_followed
    print >> sys.stderr, "Stats:    (%d/s after %0.2fs)" % (
            int(math.ceil(float(crawler.num_links) / tTime)), tTime)

if __name__ == "__main__":
    main()

Overwriting scrape.py


In [3]:
!python scrape.py https://jacknorthrup.com/Documentation/

Crawling https://jacknorthrup.com/Documentation/ (Max Depth: 30)
ERROR: <urlopen error [Errno -2] Name or service not known>
Found:    0
Followed: 1
Stats:    (0/s after 0.01s)


In [79]:
import requests

page = requests.get("http://127.0.0.1:8081")
page.content


'<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n\n<style>\nbody {\npadding:30px;\nborder:1px solid gray;\nbackground-color: #fff3e6;\n}\nh2{\ntext-align:center;\n}\np{\nfont-size:25px;\ncolor:;\t\nmargin-top: 5px;\nmargin-left: auto;\nmargin-right: auto;\nwidth:80%;\nbackground-color: #fff3e6;\n\ntext-align:center;\npadding:5px;\nborder:1px solid gray;\n}\nhr{\nwidth:70%;\n}\nul{\nmargin-left:auto;\nmargin-right:auto;\ntext-align: center;\n}\ndiv.container {\n    width: 100%;\n    border: 1px solid gray;\n    background-color: #e6ffff;\n}\n\nheader, footer {\n    padding: 1em;\n    color: white;\n    background-color: black;\n    clear: left;\n    text-align: center;\n}\n\nnav {\n    float: left;\n    max-width: 160px;\n    margin: 0;\n    padding: 1em;\n}\n\nnav ul {\n    list-style-type: none;\n    padding: 0;\n}\n   \nnav ul a {\n    text-decoration: none;\n}\n\narticle {\n    margin-left: 170px;\n    border-left: 1px

In [80]:
import requests

page = requests.get("http://127.0.0.1:8081")
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
htmnew = soup.prettify()
htmnew

u'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n  <style>\n   body {\npadding:30px;\nborder:1px solid gray;\nbackground-color: #fff3e6;\n}\nh2{\ntext-align:center;\n}\np{\nfont-size:25px;\ncolor:;\t\nmargin-top: 5px;\nmargin-left: auto;\nmargin-right: auto;\nwidth:80%;\nbackground-color: #fff3e6;\n\ntext-align:center;\npadding:5px;\nborder:1px solid gray;\n}\nhr{\nwidth:70%;\n}\nul{\nmargin-left:auto;\nmargin-right:auto;\ntext-align: center;\n}\ndiv.container {\n    width: 100%;\n    border: 1px solid gray;\n    background-color: #e6ffff;\n}\n\nheader, footer {\n    padding: 1em;\n    color: white;\n    background-color: black;\n    clear: left;\n    text-align: center;\n}\n\nnav {\n    float: left;\n    max-width: 160px;\n    margin: 0;\n    padding: 1em;\n}\n\nnav ul {\n    list-style-type: none;\n    padding: 0;\n}\n   \nnav ul a {\n    text-decoration: none;\n}\n\narticle {\n    margin-left: 170px;\n    border-le

In [76]:
list(soup.children)


[u'html',
 u'\n',
 <html lang="en">\n<head>\n<meta content="text/html; charset=unicode-escape" http-equiv="Content-Type"/>\n<style>\nbody {\npadding:30px;\nborder:1px solid gray;\nbackground-color: #fff3e6;\n}\nh2{\ntext-align:center;\n}\np{\nfont-size:25px;\ncolor:;\t\nmargin-top: 5px;\nmargin-left: auto;\nmargin-right: auto;\nwidth:80%;\nbackground-color: #fff3e6;\n\ntext-align:center;\npadding:5px;\nborder:1px solid gray;\n}\nhr{\nwidth:70%;\n}\nul{\nmargin-left:auto;\nmargin-right:auto;\ntext-align: center;\n}\ndiv.container {\n    width: 100%;\n    border: 1px solid gray;\n    background-color: #e6ffff;\n}\n\nheader, footer {\n    padding: 1em;\n    color: white;\n    background-color: black;\n    clear: left;\n    text-align: center;\n}\n\nnav {\n    float: left;\n    max-width: 160px;\n    margin: 0;\n    padding: 1em;\n}\n\nnav ul {\n    list-style-type: none;\n    padding: 0;\n}\n   \nnav ul a {\n    text-decoration: none;\n}\n\narticle {\n    margin-left: 170px;\n    border-l

In [17]:
import bs4
from urllib2.Request import urlopen

ImportError: No module named Request

In [None]:
https://www.newegg.com/global/ph/Processors-Desktops/SubCategory/ID-343?nm_mc=KNC-GooglephAdwords&cm_mmc=KNC-GooglephAdwords-_-Sitelink-Philippines-_-CPUs-_-Global&gclid=Cj0KCQjw_JrMBRDPARIsACis1HyfF4COWw3RstUEDcNBdmXRwW0QcMpajSCs_bNagtlfsdMur7u3j6gaAur8EALw_wcB

In [67]:
import urllib2
import BeautifulSoup
newUrl ="http://jacknorthrup.com/Documentation/index.html"
page = urllib2.urlopen(newUrl)
#print (page)
#page
#soup = BeautifulSoup(page.content, 'html.parser')
page.status_code

AttributeError: addinfourl instance has no attribute 'status_code'

In [18]:
import urllib2
dir(urllib2)

['AbstractBasicAuthHandler',
 'AbstractDigestAuthHandler',
 'AbstractHTTPHandler',
 'BaseHandler',
 'CacheFTPHandler',
 'FTPHandler',
 'FileHandler',
 'HTTPBasicAuthHandler',
 'HTTPCookieProcessor',
 'HTTPDefaultErrorHandler',
 'HTTPDigestAuthHandler',
 'HTTPError',
 'HTTPErrorProcessor',
 'HTTPHandler',
 'HTTPPasswordMgr',
 'HTTPPasswordMgrWithDefaultRealm',
 'HTTPRedirectHandler',
 'HTTPSHandler',
 'OpenerDirector',
 'ProxyBasicAuthHandler',
 'ProxyDigestAuthHandler',
 'ProxyHandler',
 'Request',
 'StringIO',
 'URLError',
 'UnknownHandler',
 '__builtins__',
 '__doc__',
 '__file__',
 '__name__',
 '__package__',
 '__version__',
 '_cut_port_re',
 '_have_ssl',
 '_opener',
 '_parse_proxy',
 '_safe_gethostbyname',
 'addinfourl',
 'base64',
 'bisect',
 'build_opener',
 'ftpwrapper',
 'getproxies',
 'hashlib',
 'httplib',
 'install_opener',
 'localhost',
 'mimetools',
 'os',
 'parse_http_list',
 'parse_keqv_list',
 'posixpath',
 'proxy_bypass',
 'quote',
 'random',
 'randombytes',
 're',
 'r

In [16]:
import urllib2
from urllib2 import Request 
help(Request)

Help on class Request in module urllib2:

class Request
 |  Methods defined here:
 |  
 |  __getattr__(self, attr)
 |  
 |  __init__(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False)
 |  
 |  add_data(self, data)
 |  
 |  add_header(self, key, val)
 |  
 |  add_unredirected_header(self, key, val)
 |  
 |  get_data(self)
 |  
 |  get_full_url(self)
 |  
 |  get_header(self, header_name, default=None)
 |  
 |  get_host(self)
 |  
 |  get_method(self)
 |  
 |  get_origin_req_host(self)
 |  
 |  get_selector(self)
 |  
 |  get_type(self)
 |  
 |  has_data(self)
 |  
 |  has_header(self, header_name)
 |  
 |  has_proxy(self)
 |  
 |  header_items(self)
 |  
 |  is_unverifiable(self)
 |  
 |  set_proxy(self, host, type)



Successfully built PyDispatcher Twisted
Installing collected packages: PyDispatcher, zope.interface, constantly, incremental, 
attrs, Automat, hyperlink, Twisted, queuelib, cssselect, w3lib, parsel, pyasn1-modules, 
service-identity, scrapy
Successfully installed Automat-0.6.0 PyDispatcher-2.0.5 Twisted-17.5.0 
attrs-17.2.0 constantly-15.1.0 cssselect-1.0.1 hyperlink-17.3.0 incremental-17.5.0 
parsel-1.2.0 pyasn1-modules-0.0.11 queuelib-1.4.2 scrapy-1.4.0 service-identity-17.0.0 
w3lib-1.18.0 zope.interface-4.4.2


In [56]:
import requests

page = requests.get("http://jacknorthrup.com/Documentation/index.html")
page

print page

In [57]:
import urllib2
from bs4 import BeautifulSoup
web = "http://jacknorthrup.com/Documentation/index.html"
page = urllib2.urlopen(web)
soup = BeautifulSoup(page, "html.parser", from_encoding="gb18030")
print soup.prettify()

In [44]:
import urllib2
from urllib2 import Request 

html = urllib2.urlopen("http://jacknorthrup.com")
print(html.read())
#print (html)

In [52]:
#import libraries 
import urllib2  # urllib2 is used to fetch url(s) via urlopen()
from bs4 import BeautifulSoup   # when importing ‘Beautiful Soup’ don’t add 4.   
from datetime import datetime  # contains functions and classes for working with dates and times, separately and together

newUrl ="http://jacknorthrup.com/Documentation/"
page = urllib2.urlopen(newUrl)

soup = BeautifulSoup(page, "html.parser")
name_store = soup.find("h1" ,  attrs={"class": "name"} )
data_name = name_store.text.strip()
price_store = soup.find("div", attrs={"class": "price"})

price = price_store.text
print  data_name
print price
t2 = datetime() 
total = t2 - t1 
print  "scraping completed in ", total

AttributeError: 'NoneType' object has no attribute 'text'

In [29]:
import datetime
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import zope.interface

class BillBoardItem(Item):
    date = Field()
    song = Field()
    artist = Field()


BASE_URL = "http://www.billboard.com/charts/%s/hot-100"


class BillBoardSpider(BaseSpider):
    name = "billboard_spider"
    allowed_domains = ["billboard.com"]

    def __init__(self):
        date = datetime.date(year=1958, month=8, day=9)

        self.start_urls = []
        while True:
            if date.year >= 2013:
                break

            self.start_urls.append(BASE_URL % date.strftime('%Y-%m-%d'))
            date += datetime.timedelta(days=7)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        date = hxs.select('//span[@class="chart_date"]/text()').extract()[0]

        songs = hxs.select('//div[@class="listing chart_listing"]/article')
        for song in songs:
            item = BillBoardItem()
            item['date'] = date
            try:
                item['song'] = song.select('.//header/h1/text()').extract()[0]
                item['artist'] = song.select('.//header/p[@class="chart_info"]/a/text()').extract()[0]
            except:
                continue

            yield item

ImportError: No module named zope.interface

In [32]:
import scrapy
#help(scrapy)

ImportError: No module named zope.interface