<a href="https://colab.research.google.com/github/MengOonLee/Web_scraping/blob/master/Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial

In [1]:
%%bash

# Upgrade the packaging tools first, then the analysis and scraping libraries.
pip install --no-cache-dir --quiet --upgrade pip wheel
pip install --no-cache-dir --quiet --upgrade numpy pandas matplotlib seaborn
pip install --no-cache-dir --quiet --upgrade scrapy
# Report any installed package whose requirements are broken.
pip check

No broken requirements found.


In [2]:
%%bash
# Create the tutorial project only once: `scrapy startproject` aborts with an
# error when Tutorial/scrapy.cfg already exists, which would make this cell
# fail on every re-run of the notebook.
if [ ! -f Tutorial/scrapy.cfg ]; then
    scrapy startproject Tutorial
fi

New Scrapy project 'Tutorial', using template directory '/home/meng/venv/lib/python3.8/site-packages/scrapy/templates/project', created in:
    /home/meng/work/Web_scraping/Tutorial

You can start your first spider with:
    cd Tutorial
    scrapy genspider example example.com


In [3]:
%%writefile ./Tutorial/venv.sh
# Refresh the packaging toolchain, then Scrapy, then check dependency consistency.
pip install --no-cache-dir --upgrade pip wheel build
pip install --no-cache-dir --upgrade scrapy
pip check

Writing ./Tutorial/venv.sh


## XPaths & Selectors

### Attributes

`<tag attrib="attrib info">...</tag>`
  - `<div id="unique" class="non unique">...</div>`  
  - `<a href="https://...">...</a>`

`@`: attributes
  - `@id`, `@class`, `@href`
  
### XPaths notation

xpath = '//*[@id="uid"]/p[2]'  
- `/`: look forward one generation  
- `[]`: narrow on specific elements  
- `//`: look forward all generations  
- `*`: wildcard

xpath = '//*[contains(@class, "expr")]'

xpath = '//*/@class'

XPath: `<xpath-to-element>/@attr-name`  
xpath = '//div[@id="uid"]/a/@href'

In [None]:
# Selector object
# Import a scrapy Selector
from scrapy import Selector

# Import requests
import requests

url = "https://en.wikipedia.org/wiki/Web_scraping"
# Download the page. The timeout (seconds) keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops on an HTTP error
# instead of silently parsing an error page.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
# html holds the raw HTML source (bytes) of the page
html = http_response.content

# Create the Selector object sel from html
sel = Selector(text=html)

# Outputs the SelectorList:
sel.xpath('//p')
# out: [<Selector xpath='//p' data='<p>..</p>'>, ...]

sel.xpath('//p').extract()
# out: ['<p>...</p>', ...]

sel.xpath('//p').extract_first()
# out: '<p>...</p>'

# Text extraction across all descendant generations of the selected element
sel.xpath('//p[@id="uid"]//text()').extract()

## CSS & Response

### CSS Locator

CSS: Cascading Style Sheets  
`/` replaced by `>`
- XPath: `/html/body/div`  
- CSS: `html > body > div`

`//` replaced by ` `
- XPath: `//div/span//p`  
- CSS: `div > span p`

`[N]` replaced by `:nth-of-type(N)`
- XPath: `//div/p[2]`  
- CSS: `div > p:nth-of-type(2)`

`<tag>.<class>`: find element by class  
`<tag>#<id>`: find element by id

CSS Locator: `<css-to-element>::attr(attr-name)`  
css_locator = 'div#uid > a::attr(href)'

In [None]:
from scrapy import Selector

# Create a selector from html (the page source downloaded in the earlier Selector cell)
sel = Selector(text=html)

# All p elements that are direct children of a div
sel.css('div > p')
# out: [<Selector xpath='...' data='<p>...</p>'>, ...]

# Same selection, extracted as a list of HTML strings
sel.css('div > p').extract()
# out: [<p>...</p>, ...]

# Hyperlinks (a) that are direct children of any div belonging to class course-block
sel.css('div.course-block > a')

# All elements, whatever their tag, belonging to class class-1
sel.css('.class-1')

# Create the CSS Locator to all children of the element whose id is uid
sel.css('#uid > *')

# Text extraction across all descendant generations: the space before ::text
# selects text nodes at any depth below p#uid
sel.css('p#uid ::text').extract()

In [None]:
# XPath notation & CSS Locators: the two can be chained on the same selection
from scrapy import Selector

# Create a selector object from html (defined in an earlier cell)
sel = Selector(text=html)

# Select all hyperlinks that are direct children of div elements belonging to class "course-block"
course_as = sel.css('div.course-block > a')

# Select the href attribute of every selected hyperlink by chaining with css
hrefs_from_css = course_as.css('::attr(href)')

# Select the same href attributes by chaining with xpath
# (the leading ./ makes the path relative to each selected element)
hrefs_from_xpath = course_as.xpath('./@href')

# Create an XPath string to the desired text: direct text of the p element whose id is p3.
xpath = '//p[@id="p3"]/text()'
# Create the equivalent CSS Locator string to the desired text.
css_locator = 'p#p3::text'

## Response

- has all the tools of a Selector (`xpath`, `css`)  
- keeps track of the URL it was loaded from  
- lets you move from one site to another  

XPath:  
response.xpath('//div/span[@class="bio"]')  
CSS:  
response.css('div > span.bio')  
Chaining:  
response.xpath('//div').css('span.bio')  

`response.url`: keeps track of the URL  
`response.follow(next_url)`: follow a new link

In [None]:
# `response` is normally handed to a spider callback by Scrapy and is not
# defined anywhere in this notebook. To make the cell runnable, wrap the page
# downloaded in the earlier Selector cell (url, html) in an HtmlResponse, which
# offers the same xpath() / css() / url interface.
from scrapy.http import HtmlResponse

response = HtmlResponse(url=url, body=html, encoding='utf-8')

# Get the URL to the website loaded in response
this_url = response.url

# Get the title of the website loaded in response
# (XPath selects the title element, the chained CSS locator takes its text)
this_title = response.xpath('/html/head/title')\
    .css('::text').extract_first()

## Scrapy

In [8]:
from scrapy import Selector
import requests

url = "https://www.zyte.com/blog/"
# Bound the wait with a timeout (seconds) and fail loudly on an HTTP error
# status rather than parsing an error page as if it were the blog index.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
html = http_response.content

sel = Selector(text=html)
# Text of every element carrying the oxy-post-title class
title = sel.css('.oxy-post-title')\
    .css('::text').extract()
print(f"title: {title}")

# href of each a.next link, read with a relative XPath
next_page = sel.css('a.next')\
    .xpath('./@href').extract()
print(f"next page: {next_page}")

title: ['5 Reasons to Attend Extract Summit 2022', '5 Reasons to Attend Extract Summit 2022', 'How to use Playwright with Zyte Smart Proxy Manager', 'How to use Selenium with Zyte Smart Proxy Manager', 'How to use Puppeteer with Zyte Smart Proxy Manager', 'How to avoid web scraping blocks and bans', 'How web scraping is utilized for used car data extraction', 'Scraping large e-commerce websites: A guide for large scale scraping', 'A developer’s guide to rotating proxies in Python\xa0', 'The importance of web scraping in data journalism', 'How can the travel industry benefit from data scraping?']
next page: ['/blog/page/2/']


In [5]:
%%writefile ./Tutorial/myspider.py
import scrapy

class BlogSpider(scrapy.Spider):
    """Emit the title of every post on the blog index and walk its pagination."""

    name = 'blogspider'
    start_urls = ["https://www.zyte.com/blog/"]

    def parse(self, response):
        """Yield one ``{'title': ...}`` item per ``.oxy-post-title`` element,
        then a follow-up request (handled by this same method) per ``a.next`` link.
        """
        for heading in response.css('.oxy-post-title'):
            item = {'title': heading.css('::text').get()}
            yield item

        for pager_link in response.css('a.next'):
            yield response.follow(pager_link, self.parse)

Writing ./Tutorial/myspider.py


In [7]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the standalone spider file and write its items to ./data/myspider.jl,
# replacing any previous contents of that file.
scrapy runspider myspider.py --overwrite-output ./data/myspider.jl

Writing ./Tutorial/run.sh


In [9]:
from scrapy import Selector
import requests

url = "https://quotes.toscrape.com/tag/humor/"
# Bound the wait with a timeout (seconds) and fail loudly on an HTTP error
# status rather than parsing an error page.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
html = http_response.content

sel = Selector(text=html)
# href of the link inside li.next (the pagination entry)
next_page = sel.css('li.next a::attr("href")').extract()
print(f"next page: {next_page}")

# Every quote block; the relative XPath / CSS below are evaluated inside each one
quote = sel.css('div.quote')
author = quote.xpath('span/small/text()').extract()
print(f"author: {author}")
text = quote.css('span.text::text').extract()
print(f"text: {text}")

next page: ['/tag/humor/page/2/']
author: ['Jane Austen', 'Steve Martin', 'Garrison Keillor', 'Jim Henson', 'Charles M. Schulz', 'Suzanne Collins', 'Charles Bukowski', 'Terry Pratchett', 'Dr. Seuss', 'George Carlin']
text: ['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', '“A day without sunshine is like, you know, night.”', '“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.”', '“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.”', "“All you need is love. But a little chocolate now and then doesn't hurt.”", "“Remember, we're madly in love, so it's all right to kiss me anytime you feel like it.”", '“Some people never go crazy. What truly horrible lives they must lead.”', '“The trouble with having an open mind, of course, is that people will insist on coming along and trying t

In [10]:
%%writefile ./Tutorial/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Collect author and text of each quote under the humor tag, page after page."""

    name = 'quotes'
    start_urls = ["https://quotes.toscrape.com/tag/humor/"]

    def parse(self, response):
        """Yield one ``{'author', 'text'}`` item per ``div.quote`` element and,
        when a next-page link is present, a request for it handled by this method.
        """
        for block in response.css('div.quote'):
            author = block.xpath('span/small/text()').get()
            text = block.css('span.text::text').get()
            yield {'author': author, 'text': text}

        next_href = response.css('li.next a::attr("href")').get()
        if next_href is None:
            # No li.next link was found: nothing further to request.
            return
        yield response.follow(next_href, self.parse)

Writing ./Tutorial/quotes_spider.py


In [11]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the standalone quotes spider and write its items to ./data/quotes.jl,
# replacing any previous contents of that file.
scrapy runspider quotes_spider.py --overwrite-output ./data/quotes.jl

Overwriting ./Tutorial/run.sh


In [13]:
%%writefile ./Tutorial/Tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Download two quote listing pages and save each raw HTML body to disk."""

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/"
    ]

    def parse(self, response):
        """Write ``response.body`` to ``./data/quotes-<page>.html``.

        ``<page>`` is the second-to-last ``/``-separated piece of the response
        URL, e.g. ``1`` for a URL ending in ``/page/1/``.
        """
        # Imported here so the written spider file needs no other change.
        from pathlib import Path

        page = response.url.split("/")[-2]
        filename = Path(f'./data/quotes-{page}.html')
        # Writing does not create missing directories, so make sure ./data
        # exists first; otherwise the first run fails with FileNotFoundError.
        filename.parent.mkdir(parents=True, exist_ok=True)
        filename.write_bytes(response.body)

Overwriting ./Tutorial/Tutorial/spiders/quotes_spider.py


In [14]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the project spider named "quotes"; no feed file is requested here.
scrapy crawl quotes

Overwriting ./Tutorial/run.sh


In [16]:
from scrapy import Selector
import requests

url = "https://quotes.toscrape.com/page/1/"
# Bound the wait with a timeout (seconds) and fail loudly on an HTTP error
# status rather than parsing an error page.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
html = http_response.content

sel = Selector(text=html)
# CSS: text node of the title element (still a SelectorList, not a string)
title = sel.css('title::text')
print(f"css title: {title}")
# .re() applies the regular expression to the selected text and returns the captured groups
print(title.re(r'(\w+) to (\w+)'))

# XPath: the same text node selected with an XPath expression
title = sel.xpath('//title/text()')
print(f"xpath title: {title}")

css title: [<Selector xpath='descendant-or-self::title/text()' data='Quotes to Scrape'>]
['Quotes', 'Scrape']
xpath title: [<Selector xpath='//title/text()' data='Quotes to Scrape'>]


In [25]:
from scrapy import Selector
import requests

url = "https://quotes.toscrape.com/"
# Bound the wait with a timeout (seconds) and fail loudly on an HTTP error
# status rather than parsing an error page.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
html = http_response.content

sel = Selector(text=html)

# Each div.quote is queried on its own, so the fields of one quote stay together
for quote in sel.css('div.quote'):
    text = quote.css('span.text::text').extract_first()
    author = quote.css('small.author::text').extract_first()
    # A quote can carry several tags, hence extract() (a list) instead of extract_first()
    tags = quote.css('div.tags a.tag::text').extract()
    print(dict(text=text, author=author, tags=tags))

{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
{'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor']}
{'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe', 'tags': ['be-yourself', 'inspirational']}


In [26]:
%%writefile ./Tutorial/Tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Extract text, author and tags of every quote on the two start pages."""

    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/'
    ]

    def parse(self, response):
        """Yield one item dict per ``div.quote`` element found in ``response``."""
        for block in response.css('div.quote'):
            quote_text = block.css('span.text::text').get()
            quote_author = block.css('small.author::text').get()
            quote_tags = block.css('div.tags a.tag::text').getall()
            yield dict(text=quote_text, author=quote_author, tags=quote_tags)

Overwriting ./Tutorial/Tutorial/spiders/quotes_spider.py


In [27]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the project spider named "quotes"; no feed file is requested here.
scrapy crawl quotes

Overwriting ./Tutorial/run.sh


### Storing the scraped data

In [29]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the "quotes" spider and store its items in ./data/quotes.json,
# replacing any previous contents of that file.
scrapy crawl quotes --overwrite-output ./data/quotes.json

Overwriting ./Tutorial/run.sh


In [30]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the "quotes" spider and append its items to ./data/quotes.jl.
scrapy crawl quotes --output ./data/quotes.jl

Overwriting ./Tutorial/run.sh


## Following links

In [None]:
%%bash
# `scrapy shell` opens an interactive Python console, so the selector
# expressions cannot follow it as separate bash commands. Pass them with -c,
# which evaluates the code against the fetched response, prints the result
# and exits. The tuple shows three ways of reaching the next-page link:
# the whole <a> element, its href via ::attr(), and its href via .attrib.
scrapy shell --nolog "https://quotes.toscrape.com" -c '(
    response.css("li.next a").get(),
    response.css("li.next a::attr(href)").get(),
    response.css("li.next a").attrib["href"],
)'

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
      'https://quotes.toscrape.com/page/1/'
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Supports relative URLs directly

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com/page/1/'
    ]
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com'
    ]
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com'
    ]
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        for a in response.css('ul.pager li.next a'):
            yield response.follow(a, callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Create multiple requests from an iterable

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com'
    ]
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        anchors = response.css('ul.pager li.next a')
        yield from response.follow_all(anchors, callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com'
    ]
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }
        yield from response.follow_all(css='ul.pager li.next a', callback=self.parse)

In [None]:
%%bash
# Work from the project root created by `scrapy startproject Tutorial`;
# stop if it is missing rather than running the next commands elsewhere.
cd Tutorial || exit 1
# -o appends to an existing feed, so remove the previous quotes.jl (the same
# path scrapy writes below) to avoid accumulating duplicate items.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

## More patterns

In [None]:
%%writefile ./tutorial/tutorial/spiders/author_spider.py
import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'author'
    
    start_urls = ['https://quotes.toscrape.com/']
    
    def parse(self, response):
        author_page_links = response.css('.author + a')