# Scrapy

## Introduction to HTML

In [None]:
# <tag attrib="info">
#  ..contents..
# </tag>

# Sample page used to practise selectors: two <div> blocks, each wrapping a
# paragraph that contains a single hyperlink.
html = '''
<html>
  <head>
    <title>Website Title</title>
    <link rel="stylesheet" type="text/css" href="style.css">
  </head>
  <body>
    <div class="class1" id="div1">
      <p class="class2">
        Visit <a href="http://datacamp.com/">DataCamp</a>!
      </p>
    </div>
    <div class="class1 class3" id="div2">
      <p class="class2">
        Or search for it on <a href="http://www.google.com">Google</a>!
      </p>
    </div>
  </body>
</html>
'''

# Write the sample page to test.html. The context manager closes the file
# even if the write raises, which the manual open()/close() pair did not.
with open('test.html', 'w') as f:
    f.write(html)

## XPaths & Selectors

### XPaths

In [None]:
# XPath notation cheat sheet. Every assignment below overwrites `xpath`; the
# lines are independent examples, not steps of a single computation.

# /   = step exactly one generation down (direct children only)
# [n] = pick the n-th matching sibling (XPath positions start at 1)
xpath = '/html/body/div[2]/p'
# //  = search all future generations (descendants at any depth)
xpath = '//p'
# @   = refers to an attribute of the element
xpath = '//span[@class="span-class"]'

# All children of the body element
# * = wildcard, matches an element with any tag name
xpath = '/html/body/*'

# A p element nested two div levels below body
# (the HTML this path targets is not part of this file)
xpath = '/html/body/div/div/p'

# The p children of whichever element has id "div3"
xpath = '//*[@id="div3"]/p'

# p elements selected by class; [@class="..."] compares the whole attribute
# string, so this only matches class="class-1 class-2" exactly
xpath = '//p[@class="class-1 class-2"]'

# The href attribute of the a children of the p element with id "p2"
xpath = '//p[@id="p2"]/a/@href'

# contains(@attrib, "string-expr") is a substring test on the attribute value,
# so it also matches elements that carry additional classes
xpath = '//*[contains(@class, "class-1")]'
# The href attributes of all a elements whose class contains "package-snippet"
xpath = '//a[contains(@class,"package-snippet")]/@href'

### Selectors

In [None]:
import scrapy

html = '''
<html>
  <body>
    <div class="hello datacamp">
      <p>Hello World!</p>
    </div>
    <p>Enjoy DataCamp!</p>
  </body>
</html>
'''
# Wrap the HTML string in a Selector so it can be queried
sel = scrapy.Selector(text=html)

# Every p element: the full list, the first match only, and the second match
print(sel.xpath("//p").extract())
print(sel.xpath("//p").extract_first())
second_p = sel.xpath("//p")[1]
print(second_p.extract())

# SelectorList holding all div elements of the document
divs = sel.xpath('//div')
print(divs)

# Chained query: from each div, take its first p child
# (left as a bare expression so a notebook displays the result)
sel.xpath('//div').xpath('./p[1]')

In [None]:
# Import a scrapy Selector
import scrapy
# Import requests
import requests

# Download the page and keep the body of the reply as the HTML source
url = 'https://en.wikipedia.org/wiki/Web_scraping'
page = requests.get(url)
html = page.content
# Build the Selector object sel on top of that source
sel = scrapy.Selector(text=html)
# Report the expected count, then the count actually found;
# the XPath //* matches every element in the document
print("There are 1020 elements in the HTML document.")
all_elements = sel.xpath('//*')
print("You have found: ", len(all_elements))

## CSS Locators & Responses

### CSS Locators

In [None]:
# XPath <-> CSS Locator translation table:
#   "/"  (one generation)   corresponds to " > "
#   "//" (all generations)  corresponds to a blank space
#   "[N]"                   corresponds to ":nth-of-type(N)"
# xpath: /html/body/div          css: html > body > div
# xpath: //div/span//p           css: div > span p
# xpath: //div/p[2]              css: div > p:nth-of-type(2)
# xpath: /html/body//div/p[2]    css: html > body div > p:nth-of-type(2)

# Each assignment below overwrites `css` / `xpath`; the lines are independent
# examples, not steps of a single computation.

# p elements of class class1 that are children of the div whose id is uid
# ("#" selects by id, "." selects by class)
css = 'div#uid > p.class1'
# All elements, of any tag, that belong to class class1
css = '.class1'
# All children of the element whose id is uid
css = '#uid > *'

# Equivalent pair: every a element descending from the first span child of body
xpath = '/html/body/span[1]//a'
css = 'html > body > span:nth-of-type(1) a'

# Equivalent pair: h4 elements descending from a span child of div#uid
xpath = '//div[@id="uid"]/span//h4'
css = 'div#uid > span h4'

# Equivalent pair: href attribute of the a children of div#uid
# (CSS reaches attributes with ::attr(name))
xpath = '//div[@id="uid"]/a/@href'
css = 'div#uid > a::attr(href)'

# Equivalent pair: all text inside p#p3, including text of nested elements.
# The space before ::text is what extends the match to future generations.
xpath = '//p[@id="p3"]//text()'
css = 'p#p3 ::text'

In [None]:
import scrapy

html = '''
<html>
  <body>
    <div class="hello datacamp">
      <p>Hello World!</p>
    </div>
    <p>Enjoy DataCamp!</p>
    <p id="p-example">
      Hello world!
      Try <a href="http://www.datacamp.com">Datacamp</a> today!
    </p>
  </body>
</html>
'''

# Selector built from the HTML string above
sel = scrapy.Selector(text=html)

# p children of a div: first the SelectorList itself, then the extracted markup
print(sel.css('div > p'))
print(sel.css('div > p').extract())
# a children of the paragraph with id "p-example"
print(sel.css('p#p-example > a').extract())

# Text of p#p-example, each query written once as XPath and once as CSS.
# First pair: text directly inside the paragraph.
# Second pair: text of the paragraph and of everything nested in it.
text_queries = (
    ('//p[@id="p-example"]/text()', 'p#p-example::text'),
    ('//p[@id="p-example"]//text()', 'p#p-example ::text'),
)
for as_xpath, as_css in text_queries:
    print(sel.xpath(as_xpath).extract())
    print(sel.css(as_css).extract())

In [None]:
import scrapy

# Create a selector object from a secret website.
# `html` must already hold that page's source; it is not defined in this cell.
# Only `import scrapy` is in scope here, so the class has to be referenced as
# scrapy.Selector (a bare `Selector` raises NameError).
sel = scrapy.Selector(text=html)
# Select all hyperlinks that are children of div elements of class "course-block"
course_as = sel.css('div.course-block > a')
# Select the href attributes by chaining a CSS step onto course_as
hrefs_from_css = course_as.css('::attr(href)')
# Select the same href attributes by chaining an XPath step instead
hrefs_from_xpath = course_as.xpath('./@href')

### Responses

In [None]:
# `response` and `next_url` are not defined in this cell; they must already exist.

# Three ways to reach the span elements of class "bio" inside a div:
# an XPath query, a CSS Locator, and an XPath step chained with a CSS step.
response.xpath('//div/span[@class="bio"]').extract()
# 'div > span.bio' is CSS Locator notation, so it must be given to .css();
# passing it to .xpath() does not select the span elements.
response.css('div > span.bio').extract_first()
response.xpath('//div').css('span.bio').extract()

# Get the URL to the website loaded in response
this_url = response.url

# Get the title of the website loaded in response:
# locate the title element by XPath, then pull its text with a CSS step
this_title = response.xpath('/html/head/title')\
    .css('::text').extract_first()

# next_url is the string path of the next url
response.follow(next_url)

In [None]:
# CSS Locator for the hyperlink elements of class "course-block__link"
css_locator = 'a.course-block__link'

# Run the same locator against both objects
# (`response` and `sel` are not defined in this cell; they must already exist)
response_as = response.css(css_locator)
sel_as = sel.css(css_locator)

# Show at most the first two matches from each source, one under the other
nr, ns = len(response_as), len(sel_as)
shown = min(nr, ns, 2)
for position in range(1, shown + 1):
    from_response = response_as[position - 1]
    from_sel = sel_as[position - 1]
    print("Element %d from response: %s" % (position, from_response))
    print("Element %d from sel: %s" % (position, from_sel))
    print("")

In [None]:
# All div elements of class "course-block"
# (`response` is not defined in this cell; it must already exist)
divs = response.css('div.course-block')

# Work with the first of them only
first_div = divs[0]

# Text nodes of the h4 element(s) inside first_div; keep the first one
h4_text_nodes = first_div.css('h4::text')
h4_text = h4_text_nodes.extract_first()

# Report the result
print("The text from the h4 element is:", h4_text)

In [None]:
# `response` is expected to be loaded with the HTML from
# https://www.datacamp.com/courses/all (it is not defined in this cell)
course_divs = response.css('div.course-block')
print(len(course_divs))

# Look inside the first course block: how many direct children does it have?
first_div = course_divs[0]
children = first_div.xpath('./*')
print(len(children))

# Markup of the first of those children
first_child = children[0]
first_child_markup = first_child.extract()
print(first_child_markup)

# Course links, variant 1: a single CSS Locator
links = response.css('div.course-block > a::attr(href)').extract()

# Course links, variant 2: the same selection done stepwise
# (this overwrites `links` from variant 1)
course_divs = response.css('div.course-block')    # the course blocks
hrefs = course_divs.xpath('./a/@href')            # href of their a children
links = hrefs.extract()                           # the links as strings

# One link per line
for link in links:
    print(link)

In [None]:
# Text nodes of every h4 element in the response
# (`response` and `mystery` are not defined in this cell; they must already exist)
crs_title_els = response.css('h4::text')

# The same titles as plain strings
crs_titles = crs_title_els.extract()

# One title per line, each behind a ">>" marker
for title in crs_titles:
    print(">>", title)

# Direct children of the mystery element, and how many there are
mystery_children = mystery.xpath('./*')
how_many_kids = len(mystery_children)

# Report the count
print("The number of elements you selected was:", how_many_kids)

## Spiders

### Spider

In [None]:
# Import scrapy library
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the spider class
class SpiderClass(scrapy.Spider):
    """Minimal spider: request the DataCamp home page and save its raw HTML."""

    name = "spider"

    def print_msg(self, msg):
        """Print *msg* behind a fixed prefix naming start_requests."""
        print("Calling start_requests in SpiderClass prints out:", msg)

    def start_requests(self):
        """Print a greeting, then yield one request per start URL.

        Every request is handed to ``parse`` once its response arrives.
        """
        self.print_msg("Hello World!")
        for start_url in ('https://www.datacamp.com',):
            yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        """Write the body of *response*, as bytes, to DC_courses.html."""
        with open('DC_courses.html', 'wb') as out:
            out.write(response.body)

# NOTE(review): several cells in this file call process.start(); Scrapy runs
# on a Twisted reactor, which presumably cannot be started twice in the same
# Python process — confirm before running more than one such cell per kernel.

# Initiate a CrawlerProcess
process = CrawlerProcess()
# Tell the process which spider to use (the class itself, not an instance)
process.crawl(SpiderClass)
# Start the crawling process
process.start()

In [None]:
# Import scrapy library
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the spider class
class SpiderClass(scrapy.Spider):
    """Follow every course link on the DataCamp catalogue and log each URL reached."""

    name = "spider"

    # start_requests method
    def start_requests(self):
        """Yield the request for the course catalogue page."""
        urls = ['https://www.datacamp.com/courses/all']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    # parse method
    def parse(self, response):
        """Extract the course links from the catalogue and follow each one."""
        links = response.css('div.course-block > a::attr(href)').extract()
        for link in links:
            yield response.follow(url=link, callback=self.parse2)

    def parse2(self, response):
        """Append the URL of the followed course page to DC_links.csv.

        The original concatenated the Response object itself with the
        literal '/n', which raises TypeError; the page URL and a real
        newline are written instead.
        """
        filepath = 'DC_links.csv'
        # Append mode: this callback runs once per followed page, and 'w'
        # would truncate the file each time, keeping only the last entry.
        with open(filepath, 'a') as f:
            f.write(response.url + '\n')

# Run the spider: create the crawler process, register SpiderClass with it
# (the class itself, not an instance), then start crawling.
# NOTE(review): several cells in this file call process.start(); presumably
# only one of them can run per Python process — confirm.
process = CrawlerProcess()
process.crawl(SpiderClass)
process.start()

In [None]:
# Import the scrapy library
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DCspider(scrapy.Spider):
    """Spider that collects the course author names from a single page."""

    name = 'dcspider'

    def start_requests(self):
        """Yield the one request for ``url_short``, handled by ``parse``.

        ``url_short`` is not defined in this cell; it must exist as a global.
        """
        yield scrapy.Request(url=url_short, callback=self.parse)

    def parse(self, response):
        """Return the text of every p.course-block__author-name element, as a list."""
        # NOTE(review): Scrapy callbacks are documented to produce requests or
        # items; a list of plain strings may be rejected by the engine — confirm.
        author_name_nodes = response.css('p.course-block__author-name::text')
        return author_name_nodes.extract()

# Run the spider: create the crawler process, register DCspider with it,
# then start crawling.
# (The class is CrawlerProcess, as imported above; the misspelt
# "CrawlwerProcess" raised NameError.)
process = CrawlerProcess()
process.crawl(DCspider)
process.start()

In [None]:
# Import the scrapy library
import scrapy

# Create the Spider class
class DCdescr(scrapy.Spider):
    """Two-stage spider: follow each course link, then read that course's description."""

    name = 'dcdescr'

    def start_requests(self):
        """Yield the one request for ``url_short``, handled by ``parse``.

        ``url_short`` is not defined in this cell; it must exist as a global.
        """
        yield scrapy.Request(url=url_short, callback=self.parse)

    def parse(self, response):
        """First stage: follow every course link found in a course block."""
        for href in response.css('div.course-block > a::attr(href)').extract():
            yield response.follow(url=href, callback=self.parse_descr)

    def parse_descr(self, response):
        """Second stage: yield the first p.course__description text of the page."""
        # NOTE(review): Scrapy callbacks are documented to produce requests or
        # items; a bare string may be rejected by the engine — confirm.
        descr_nodes = response.css('p.course__description::text')
        yield descr_nodes.extract_first()

In [None]:
# Import scrapy library
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the spider class
class DC_Chapter_Spider(scrapy.Spider):
    """Crawl the DataCamp catalogue and record each course's chapter titles.

    Results are stored in the module-level dictionary ``dc_dict``
    as ``{course title: [chapter titles]}``.
    """

    name = "dc_chapter_spider"

    # start_requests method
    def start_requests(self):
        """Yield the request for the course catalogue page."""
        url = 'https://www.datacamp.com/courses/all'
        # The yield sits at the same level as the assignment above; the extra
        # indentation in the original was an IndentationError.
        yield scrapy.Request(url=url, callback=self.parse_front)

    # parse the front courses page
    def parse_front(self, response):
        """Follow the link of every course block on the catalogue page."""
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links (as a list of strings)
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    # parse course pages
    def parse_pages(self, response):
        """Store the chapter titles of one course page in ``dc_dict``."""
        # Direct to the course title text
        crs_title = response.xpath('//h1[contains(@class, "title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext

# Module-level store filled by the spider: parse_pages writes
# {course title: [chapter titles]} into it
dc_dict = {}

# Create the crawler process, register the spider class with it,
# then start the crawl
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

In [None]:
# Import scrapy
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DC_Chapter_Spider(scrapy.Spider):
    """Collect the chapter titles of every course linked from ``url_short``.

    Results go into the module-level dictionary ``dc_dict``
    as ``{course title: [chapter titles]}``.
    """

    name = "dc_chapter_spider"

    def start_requests(self):
        """Yield the one request for ``url_short``, handled by ``parse_front``.

        ``url_short`` is not defined in this cell; it must exist as a global.
        """
        yield scrapy.Request(url=url_short, callback=self.parse_front)

    def parse_front(self, response):
        """First stage: follow the link of every course block."""
        hrefs = response.css('div.course-block').xpath('./a/@href').extract()
        for href in hrefs:
            yield response.follow(url=href, callback=self.parse_pages)

    def parse_pages(self, response):
        """Second stage: record the stripped chapter titles under the stripped course title."""
        # Text of the first h1 whose class contains "title", whitespace removed
        title_nodes = response.xpath('//h1[contains(@class, "title")]/text()')
        course_title = title_nodes.extract_first().strip()
        # Text of every h4.chapter__title element, whitespace removed
        chapter_nodes = response.css('h4.chapter__title::text')
        dc_dict[course_title] = [raw.strip() for raw in chapter_nodes.extract()]

# The results dictionary lives **outside** of the Spider class;
# parse_pages writes {course title: [chapter titles]} into it
dc_dict = {}

# Create the CrawlerProcess that runs the spider and register the spider class
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
# Run the Spider
process.start()

In [None]:
# Import scrapy
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DC_Description_Spider(scrapy.Spider):
    """Collect the description of every course linked from ``url_short``.

    Results go into the module-level dictionary ``dc_dict``
    as ``{course title: course description}``.
    """

    name = "dc_description_spider"

    def start_requests(self):
        """Yield the one request for ``url_short``, handled by ``parse_front``.

        ``url_short`` is not defined in this cell; it must exist as a global.
        """
        yield scrapy.Request(url=url_short, callback=self.parse_front)

    def parse_front(self, response):
        """First stage: follow the link of every course block."""
        hrefs = response.css('div.course-block').xpath('./a/@href').extract()
        for href in hrefs:
            yield response.follow(url=href, callback=self.parse_pages)

    def parse_pages(self, response):
        """Second stage: record the stripped description under the stripped course title."""
        # Text of the first h1 whose class contains "title", whitespace removed
        title_nodes = response.xpath('//h1[contains(@class, "title")]/text()')
        course_title = title_nodes.extract_first().strip()
        # Text of the first p.course__description element, whitespace removed
        descr_nodes = response.css('p.course__description::text')
        course_descr = descr_nodes.extract_first().strip()
        # Fill in the dictionary
        dc_dict[course_title] = course_descr

# The results dictionary lives **outside** of the Spider class;
# parse_pages writes {course title: course description} into it
dc_dict = {}

# Create the CrawlerProcess that runs the spider and register the spider class
process = CrawlerProcess()
process.crawl(DC_Description_Spider)
# Run the Spider
process.start()

In [None]:
# Import scrapy
import scrapy
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class YourSpider(scrapy.Spider):
    """Single-page spider pairing course titles with course descriptions.

    Results go into the module-level dictionary ``dc_dict``
    as ``{course title: course description}``.
    """

    name = "yourspider"

    def start_requests(self):
        """Yield the one request for ``url_short``, handled by ``parse``.

        ``url_short`` is not defined in this cell; it must exist as a global.
        """
        yield scrapy.Request(url=url_short, callback=self.parse)

    def parse(self, response):
        """Extract titles and descriptions and store them pairwise in ``dc_dict``."""
        # Text of every h4 whose class contains "block__title"
        title_nodes = response.xpath('//h4[contains(@class, "block__title")]/text()')
        crs_titles = title_nodes.extract()
        # Text of every p whose class contains "block__description"
        descr_nodes = response.xpath('//p[contains(@class, "block__description")]/text()')
        crs_descrs = descr_nodes.extract()
        # Pair the two lists position by position; zip stops at the shorter
        # one, and a repeated title keeps its last description
        dc_dict.update(zip(crs_titles, crs_descrs))

# The results dictionary lives **outside** of the Spider class;
# parse writes {course title: course description} into it
dc_dict = {}

# Create the CrawlerProcess that runs the spider and register the spider class
process = CrawlerProcess()
process.crawl(YourSpider)
# Run the Spider
process.start()