<a href="https://colab.research.google.com/github/MengOonLee/Web_scraping/blob/master/Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial

In [None]:
%%writefile ./venv.sh
# Upgrade the packaging tool chain first, without using pip's download cache.
pip install --no-cache-dir --upgrade pip wheel build
# Install (or upgrade to) the latest Scrapy release.
pip install --no-cache-dir --upgrade scrapy
# Report any installed packages whose dependency requirements are not met.
pip check
# Remove the local dist directory (presumably stale build output -- confirm).
rm -rf dist

In [None]:
%%writefile pip_install.sh
#!/bin/bash
# Upgrade the packaging tools before anything else is installed.
pip install --no-cache-dir --upgrade pip wheel
# Data-analysis and plotting stack.
pip install --no-cache-dir --upgrade numpy pandas matplotlib seaborn
# Scraping framework used by the rest of the notebook.
pip install --no-cache-dir --upgrade scrapy

## XPaths & Selectors

### Attributes

`<tag attrib="attrib info">...</tag>`
  - `<div id="unique" class="non unique">...</div>`  
  - `<a href="https://...">...</a>`

`@`: attributes
  - `@id`, `@class`, `@href`
  
### XPaths notation

xpath = '//*[@id="uid"]/p[2]'  
- `/`: look forward one generation  
- `[]`: narrow on specific elements  
- `//`: look forward all generations  
- `*`: wildcard

xpath = '//*[contains(@class, "expr")]'

xpath = '//*/@class'

XPath: `<xpath-to-element>/@attr-name`  
xpath = '//div[@id="uid"]/a/@href'

In [None]:
# Selector object
# Import a scrapy Selector
from scrapy import Selector

# Import requests
import requests
url = 'https://en.wikipedia.org/wiki/Web_scraping'
# Download the page. The timeout stops a stalled connection from hanging the
# kernel, and raise_for_status() fails loudly on an HTTP error instead of
# silently parsing an error page.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
# Keep the HTML source of the page (the raw bytes of the response body)
html = http_response.content

# Create the Selector object sel from html
sel = Selector(text=html)

# Outputs the SelectorList:
sel.xpath('//p')
# out: [<Selector xpath='//p' data='<p>..</p>'>, ...]

# extract() returns the matched markup of every selected element as strings
sel.xpath('//p').extract()
# out: ['<p>...</p>', ...]

# extract_first() returns only the first match
sel.xpath('//p').extract_first()
# out: '<p>...</p>'

# Text extraction from all descendants ("future generations") of the element;
# "uid" is a placeholder id used for illustration
sel.xpath('//p[@id="uid"]//text()').extract()

## CSS & Response

### CSS Locator

CSS: Cascading Style Sheets  
`/` replaced by `>`
- XPath: `/html/body/div`  
- CSS: `html > body > div`

`//` replaced by a space (` `)
- XPath: `//div/span//p`  
- CSS: `div > span p`

`[N]` replaced by `:nth-of-type(N)`
- XPath: `//div/p[2]`  
- CSS: `div > p:nth-of-type(2)`

`<tag>.<class>`: find element by class  
`<tag>#<id>`: find element by id

CSS Locator: `<css-to-element>::attr(attr-name)`  
css_locator = 'div#uid > a::attr(href)'

In [None]:
from scrapy import Selector

# Create a selector from the html downloaded in the earlier Selector cell
sel = Selector(text=html)

# NOTE(review): the class/id names used below (course-block, class-1, uid)
# look like illustrative placeholders rather than names taken from the
# downloaded page -- confirm before expecting non-empty results.

# <p> elements that are direct children of a <div>; returns a SelectorList
sel.css('div > p')
# out: [<Selector xpath='...' data='<p>...</p>'>, ...]

# Same selection, extracted to a list of markup strings
sel.css('div > p').extract()
# out: ['<p>...</p>', ...]

# Hyperlink (<a>) children of every <div> that belongs to class course-block
sel.css('div.course-block > a')

# All elements, of any tag, whose class is class-1
sel.css('.class-1')

# Create the CSS Locator to all children of the element whose id is uid
sel.css('#uid > *')

# Text extraction from the element and all of its descendants
# (the space before ::text is what reaches the descendants)
sel.css('p#uid ::text').extract()

In [None]:
# XPaths Notation & CSS Locators
from scrapy import Selector

# Create a selector object from the html variable set in an earlier cell
# NOTE(review): the original comment called this "a secret website"; the
# course-block / p3 names below presumably belong to that page, not to the
# page downloaded earlier in this notebook -- confirm.
sel = Selector(text=html)

# Select all hyperlinks of div elements belonging to class "course-block"
course_as = sel.css('div.course-block > a')

# Selecting all href attributes chaining with css
# (the second call is applied to the SelectorList returned by the first)
hrefs_from_css = course_as.css('::attr(href)')

# Selecting all href attributes chaining with xpath
# (the leading "./" makes the path relative to each selected <a>)
hrefs_from_xpath = course_as.xpath('./@href')

# Create an XPath string to the desired text:
# the direct text of the <p> element whose id is "p3".
xpath = '//p[@id="p3"]/text()'
# Create a CSS Locator string to the desired text (same target as the XPath).
css_locator = 'p#p3::text'

## Response

- has all the tools of a Selector (`xpath`, `css`)  
- keeps track of the URL  
- lets you move from one site to another  

XPath:  
response.xpath('//div/span[@class="bio"]')  
CSS:  
response.css('div > span.bio')  
Chaining:  
response.xpath('//div').css('span.bio')  

`response.url`: keeps track of the URL  
`response.follow(next_url)`: follow a new link

In [None]:
# NOTE(review): `response` is not created anywhere in the visible notebook;
# presumably this snippet is meant to run inside a spider callback or a
# Scrapy shell session, where a response object is available -- confirm.

# Get the URL to the website loaded in response
this_url = response.url

# Get the title of the website loaded in response:
# locate the <title> element with XPath, then chain a CSS locator to take
# its text and keep the first match
this_title = response.xpath('/html/head/title')\
    .css('::text').extract_first()

## Scrapy

In [1]:
%%writefile ./myspider.py
import scrapy

class BlogSpider(scrapy.Spider):
    """Spider that starts at the Zyte blog index and emits post titles."""

    name = 'blogspider'
    start_urls = ['https://www.zyte.com/blog/']

    def parse(self, response):
        """Yield a ``{'title': ...}`` item per ``.oxy-post-title`` match,
        then a follow-up request per ``a.next`` link, handled by this
        same method.
        """
        post_titles = response.css('.oxy-post-title')
        yield from (
            {'title': post_title.css('::text').get()}
            for post_title in post_titles
        )

        pagination_links = response.css('a.next')
        yield from (
            response.follow(link, self.parse) for link in pagination_links
        )

Overwriting ./myspider.py


In [2]:
%%writefile ./run.sh
#!/bin/bash

# Run the standalone spider file; scraped items are appended to myspider.jl.
scrapy runspider myspider.py --output myspider.jl

Overwriting ./run.sh


In [3]:
%%writefile ./quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes author and text from the "humor" tag pages."""

    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        """Yield one item per ``div.quote`` block, then follow the "next"
        link when the page has one.
        """
        for quote_block in response.css('div.quote'):
            author = quote_block.xpath('span/small/text()').get()
            text = quote_block.css('span.text::text').get()
            yield {'author': author, 'text': text}

        next_href = response.css('li.next a::attr("href")').get()
        if next_href is None:
            # No "next" link on this page: nothing more to request.
            return
        yield response.follow(next_href, self.parse)

Overwriting ./quotes_spider.py


In [4]:
%%writefile ./run.sh
#!/bin/bash

# Run the standalone quotes spider; scraped items are appended to quotes.jl.
scrapy runspider quotes_spider.py --output quotes.jl

Overwriting ./run.sh


### Creating a project

In [5]:
%%writefile ./start_project.sh
#!/bin/bash

# Generate a new Scrapy project named "tutorial". Later cells write spiders
# into ./tutorial/tutorial/spiders/ and run `scrapy crawl` from ./tutorial.
scrapy startproject tutorial

Writing ./start_project.sh


### How to run spider

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that downloads two quotes.toscrape.com pages and saves each
    one to a local HTML file.
    """
    name = "quotes"

    def start_requests(self):
        """Yield the initial requests of the crawl.

        Scrapy looks this hook up by the exact name ``start_requests``. A
        method called ``start_request`` is never invoked, and because this
        spider defines no ``start_urls`` the crawl would then end without
        fetching anything.
        """
        urls = [
            'https://quotes.toscrape.com/page/1/',
            'https://quotes.toscrape.com/page/2/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the raw body of ``response`` to ``quotes-<page>.html``.

        ``<page>` is the second-to-last "/"-separated piece of the URL,
        e.g. ``1`` for ``.../page/1/``.
        """
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        # Binary mode: response.body is written exactly as received.
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')

In [None]:
%%bash
# Enter the project directory; stop if it is missing so the crawl is not
# attempted from the wrong place.
cd tutorial || exit 1
scrapy crawl quotes

### Shortcut to the start_requests method

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that saves two quotes.toscrape.com pages to local HTML files,
    listing its start pages in ``start_urls``.
    """

    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Write the raw body of ``response`` to ``quotes-<page>.html``."""
        url_pieces = response.url.split("/")
        # Second-to-last piece, e.g. "1" for ".../page/1/".
        page = url_pieces[-2]
        target = f'quotes-{page}.html'
        with open(target, 'wb') as handle:
            handle.write(response.body)

In [None]:
%%bash
# Enter the project directory; stop if it is missing so the crawl is not
# attempted from the wrong place.
cd tutorial || exit 1
scrapy crawl quotes

### Extracting data

In [None]:
%%bash
# Open a Scrapy shell on page 1 of the quotes site. The lines after the
# `scrapy shell` command are Python expressions meant for that shell's
# prompt (they use its `response` object); they are not bash commands.
# They select the page <title> text with CSS and with XPath, and apply
# regular expressions to it with .re().
# NOTE(review): inside a %%bash cell this depends on the Scrapy shell reading
# the remaining lines from standard input -- confirm it behaves as intended.
scrapy shell "https://quotes.toscrape.com/page/1/"
response.css('title::text').getall()
response.css('title::text').get()
response.css('title::text').re(r'Quotes.*')
response.css('title::text').re(r'Q\w+')
response.css('title::text').re(r'(\w+) to (\w+)')
response.xpath('//title/text()').getall()
response.xpath('//title/text()').get()

In [None]:
%%bash
# Open a Scrapy shell on the quotes site. The lines after the `scrapy shell`
# command are Python statements meant for that shell's prompt, not bash:
# first the text, author and tags of the first div.quote are selected, then
# a loop prints the same three fields for every div.quote on the page.
# NOTE(review): inside a %%bash cell this depends on the Scrapy shell reading
# the remaining lines from standard input -- confirm it behaves as intended.
scrapy shell "https://quotes.toscrape.com"
quote = response.css("div.quote")[0]
quote.css("span.text::text").get()
quote.css("small.author::text").get()
quote.css("div.tags a.tag::text").getall()

for quote in response.css("div.quote"):
    text = quote.css("span.text::text").get()
    author = quote.css("small.author::text").get()
    tags = quote.css("div.tags a.tag::text").getall()
    print(dict(text=text, author=author, tags=tags))

### Extracting data in spider

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that extracts text, author and tags from two listing pages."""

    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Yield one dict per ``div.quote`` element found in ``response``."""
        for quote_sel in response.css('div.quote'):
            text = quote_sel.css('span.text::text').get()
            author = quote_sel.css('small.author::text').get()
            tags = quote_sel.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

In [None]:
%%bash
# Enter the project directory; stop if it is missing so the crawl is not
# attempted from the wrong place.
cd tutorial || exit 1
scrapy crawl quotes

### Storing the scraped data

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# The feed is written relative to the current directory, so the file to
# clear is ./quotes.json (the old path tutorial/quotes.json pointed one
# level too deep). -O overwrites the output anyway; the rm only makes the
# clean start explicit.
rm -f quotes.json
scrapy crawl quotes -O quotes.json

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

## Following links

In [None]:
%%bash
# Open a Scrapy shell on the quotes site. The lines after the `scrapy shell`
# command are Python expressions meant for that shell's prompt, not bash.
# They show three ways to look at the "next" pagination link: the whole <a>
# element, its href via ::attr(href), and its href via the .attrib mapping.
# NOTE(review): inside a %%bash cell this depends on the Scrapy shell reading
# the remaining lines from standard input -- confirm it behaves as intended.
scrapy shell "https://quotes.toscrape.com"
response.css("li.next a").get()
response.css("li.next a::attr(href)").get()
response.css("li.next a").attrib["href"]

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows the "next" link by building
    an absolute URL and an explicit ``scrapy.Request``.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        """Yield one item per quote, then a request for the next page when
        the page links to one.
        """
        for quote_sel in response.css('div.quote'):
            yield dict(
                text=quote_sel.css('span.text::text').get(),
                author=quote_sel.css('small.author::text').get(),
                tags=quote_sel.css('div.tags a.tag::text').getall(),
            )

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # Last page: there is no "next" link to follow.
            return
        # The href may be relative, so resolve it against the page URL.
        absolute_url = response.urljoin(next_href)
        yield scrapy.Request(absolute_url, callback=self.parse)

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Supports relative URLs directly

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows the "next" link by passing
    the extracted href straight to ``response.follow``.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        """Yield one item per quote, then a follow-up request for the next
        page when the page links to one.
        """
        for quote_sel in response.css('div.quote'):
            text = quote_sel.css('span.text::text').get()
            author = quote_sel.css('small.author::text').get()
            tags = quote_sel.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # Last page: there is no "next" link to follow.
            return
        yield response.follow(next_href, callback=self.parse)

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows pagination by handing the
    href attribute selectors themselves to ``response.follow``.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per quote, then one follow-up request per
        selected "next" href.
        """
        for quote_sel in response.css('div.quote'):
            yield dict(
                text=quote_sel.css('span.text::text').get(),
                author=quote_sel.css('small.author::text').get(),
                tags=quote_sel.css('div.tags a.tag::text').getall(),
            )

        next_hrefs = response.css('li.next a::attr(href)')
        yield from (
            response.follow(href_sel, callback=self.parse)
            for href_sel in next_hrefs
        )

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows pagination by handing the
    selected ``<a>`` elements to ``response.follow``.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per quote, then one follow-up request per
        ``ul.pager li.next a`` element.
        """
        for quote_sel in response.css('div.quote'):
            text = quote_sel.css('span.text::text').get()
            author = quote_sel.css('small.author::text').get()
            tags = quote_sel.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        next_anchors = response.css('ul.pager li.next a')
        yield from (
            response.follow(anchor, callback=self.parse)
            for anchor in next_anchors
        )

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Create multiple requests from an iterable

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows pagination with
    ``response.follow_all`` on the selected anchors.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per quote, then every request produced by
        ``follow_all`` for the ``ul.pager li.next a`` elements.
        """
        for quote_sel in response.css('div.quote'):
            yield dict(
                text=quote_sel.css('span.text::text').get(),
                author=quote_sel.css('small.author::text').get(),
                tags=quote_sel.css('div.tags a.tag::text').getall(),
            )

        yield from response.follow_all(
            response.css('ul.pager li.next a'),
            callback=self.parse,
        )

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows pagination by giving
    ``response.follow_all`` a CSS expression directly.
    """

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per quote, then every request produced by
        ``follow_all`` for the ``ul.pager li.next a`` CSS expression.
        """
        for quote_sel in response.css('div.quote'):
            text = quote_sel.css('span.text::text').get()
            author = quote_sel.css('small.author::text').get()
            tags = quote_sel.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        next_requests = response.follow_all(
            css='ul.pager li.next a',
            callback=self.parse,
        )
        for next_request in next_requests:
            yield next_request

In [None]:
%%bash
# Enter the project directory; stop if it is missing.
cd tutorial || exit 1
# -o appends to an existing feed, so the previous ./quotes.jl must be removed
# or re-running the cell duplicates every item. The file lives in the current
# directory (the old path tutorial/quotes.jl pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

## More patterns

In [None]:
%%writefile ./tutorial/tutorial/spiders/author_spider.py
import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'author'
    
    start_urls = ['https://quotes.toscrape.com/']
    
    def parse(self, response):
        author_page_links = response.css('.author + a')