<a href="https://colab.research.google.com/github/MengOonLee/Web_scraping/blob/master/Tutorial/Scrapy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapy

In [None]:
# %pip installs into the environment of the running kernel; a plain `pip`
# inside %%bash may resolve to a different interpreter.
%pip install --no-cache-dir -qU scrapy
# requests is imported by the selector demos below but is not a Scrapy dependency.
%pip install --no-cache-dir -q requests

In [1]:
import scrapy
import requests

# Download the page with requests and hand the raw bytes to a Scrapy Selector,
# so the CSS/XPath expressions can be tried without running a spider.
url = "https://quotes.toscrape.com/tag/humor/"
# Without a timeout requests can block indefinitely on a stalled connection.
resp = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
resp.raise_for_status()
html = resp.content

sel = scrapy.Selector(text=html)
# `quote` holds every matching div; .get() below returns the first match only.
quote = sel.css("div.quote")
author = quote.xpath("span/small/text()").get()
print("Author:", author)
text = quote.css("span.text::text").get()
print("Text:", text)
# Relative href of the pagination link, or None when there is no next page.
next_page = sel.css("li.next a::attr(href)").get()
print("Next page:", next_page)

Author: Jane Austen
Text: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
Next page: /tag/humor/page/2/


## Spider

In [2]:
%%bash

# Wipe any earlier scaffold first, then generate a fresh project skeleton.
rm -rf ./tutorial
scrapy startproject tutorial

New Scrapy project 'tutorial', using template directory '/Work/venv/lib/python3.8/site-packages/scrapy/templates/project', created in:
    /Work/Web_scraping/Tutorial/tutorial

You can start your first spider with:
    cd tutorial
    scrapy genspider example example.com


In [3]:
%%writefile ./tutorial/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that collects author and text of the quotes tagged "humor"."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/tag/humor/"]

    def parse(self, response):
        """Yield one item per quote on the page, then a request for the next page.

        Each item is a dict with the keys ``author`` and ``text``. When the
        page has no ``li.next`` link, no follow-up request is produced.
        """
        for block in response.css("div.quote"):
            author = block.xpath("span/small/text()").get()
            text = block.css("span.text::text").get()
            yield {"author": author, "text": text}

        next_href = response.css("li.next a::attr(href)").get()
        if next_href is None:
            return
        yield response.follow(next_href, self.parse)

Writing ./tutorial/quotes_spider.py


In [4]:
%%bash

# Run the standalone spider file and overwrite the JSON-lines feed on every run.
scrapy runspider ./tutorial/quotes_spider.py --overwrite-output ./tutorial/data/quotes.jl

2023-09-10 23:08:57 [scrapy.utils.log] INFO: Scrapy 2.10.1 started (bot: scrapybot)
2023-09-10 23:08:57 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.8.10 (default, Mar 13 2023, 10:26:41) - [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.2 1 Aug 2023), cryptography 41.0.3, Platform Linux-6.2.0-31-generic-x86_64-with-glibc2.29
2023-09-10 23:08:57 [scrapy.addons] INFO: Enabled addons:
[]
2023-09-10 23:08:57 [scrapy.crawler] INFO: Overridden settings:
{'SPIDER_LOADER_WARN_ONLY': True}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-09-10 23:08:57 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2023-09-10 23:08:57 [scrapy.extensions.telnet] INFO: Telnet Password: a18899a3a7b0ecaa
2023-09-10 23:08:57 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.

In [5]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves the raw HTML of the first two listing pages to disk."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per hard-coded page URL, handled by ``parse``."""
        urls = [
            "https://quotes.toscrape.com/page/1/",
            "https://quotes.toscrape.com/page/2/"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``./data/quotes-<page>.html``.

        The path is relative to the directory the crawl is started from.
        """
        # URLs end in ".../page/<n>/", so the second-to-last segment is the page number.
        page = response.url.split("/")[-2]
        filename = f"./data/quotes-{page}.html"
        target = pathlib.Path(filename)
        # write_bytes does not create missing directories; make sure ./data exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(response.body)
        self.log(f"Saved file {filename}")

Writing ./tutorial/tutorial/spiders/quotes_spider.py


In [6]:
%%bash

# Stop if the project directory is missing, so the crawl never runs elsewhere.
cd ./tutorial || exit 1
scrapy crawl quotes

2023-09-10 23:09:52 [scrapy.utils.log] INFO: Scrapy 2.10.1 started (bot: tutorial)
2023-09-10 23:09:52 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.8.10 (default, Mar 13 2023, 10:26:41) - [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.2 1 Aug 2023), cryptography 41.0.3, Platform Linux-6.2.0-31-generic-x86_64-with-glibc2.29
2023-09-10 23:09:52 [scrapy.addons] INFO: Enabled addons:
[]
2023-09-10 23:09:52 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'tutorial',
 'FEED_EXPORT_ENCODING': 'utf-8',
 'NEWSPIDER_MODULE': 'tutorial.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['tutorial.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2023-09-10 23:09:52 [asyncio] DEBUG: Using selector: EpollSelector
2023-09-10 23:09:52 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelec

In [7]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves the raw HTML of the first two listing pages to disk.

    Uses the ``start_urls`` class attribute and the ``parse`` callback; no
    ``start_requests`` method is defined here.
    """

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/"
    ]

    def parse(self, response):
        """Write the response body to ``./data/quotes-<page>.html``.

        The path is relative to the directory the crawl is started from.
        """
        # URLs end in ".../page/<n>/", so the second-to-last segment is the page number.
        page = response.url.split("/")[-2]
        filename = f"./data/quotes-{page}.html"
        target = pathlib.Path(filename)
        # write_bytes does not create missing directories; make sure ./data exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(response.body)
        self.log(f"Saved file {filename}")

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [8]:
%%bash

# Stop if the project directory is missing, so the crawl never runs elsewhere.
cd ./tutorial || exit 1
scrapy crawl quotes

2023-09-10 23:23:22 [scrapy.utils.log] INFO: Scrapy 2.10.1 started (bot: tutorial)
2023-09-10 23:23:22 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.8.10 (default, Mar 13 2023, 10:26:41) - [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.2 1 Aug 2023), cryptography 41.0.3, Platform Linux-6.2.0-31-generic-x86_64-with-glibc2.29
2023-09-10 23:23:22 [scrapy.addons] INFO: Enabled addons:
[]
2023-09-10 23:23:22 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'tutorial',
 'FEED_EXPORT_ENCODING': 'utf-8',
 'NEWSPIDER_MODULE': 'tutorial.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['tutorial.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'}
2023-09-10 23:23:22 [asyncio] DEBUG: Using selector: EpollSelector
2023-09-10 23:23:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelec

## Extracting data

In [17]:
import scrapy
import requests

# Fetch the first listing page and wrap its bytes in a Selector.
url = "https://quotes.toscrape.com/page/1/"
html = requests.get(url).content

sel = scrapy.Selector(text=html)

# CSS: the <title> element, its text, then three regex extractions on that text.
title_text = sel.css("title::text")
print(sel.css("title"))
print(title_text.get())
for pattern in (r"Quotes.*", r"Q\w+", r"(\w+) to (\w+)"):
    print(title_text.re(pattern))

# XPath: the same element and its text.
print(sel.xpath("//title"))
print(sel.xpath("//title/text()").get())

[<Selector query='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
Quotes to Scrape
['Quotes to Scrape']
['Quotes']
['Quotes', 'Scrape']
[<Selector query='//title' data='<title>Quotes to Scrape</title>'>]
Quotes to Scrape


In [16]:
import scrapy
import requests

# Fetch the front page and wrap its bytes in a Selector.
url = "https://quotes.toscrape.com"
html = requests.get(url).content

sel = scrapy.Selector(text=html)

# `response` is not defined in this notebook; the parsed page is `sel`.
quote = sel.css("div.quote")[0]
# Only the last expression of a cell is displayed, so print the selector explicitly.
print(quote)
text = quote.css("span.text::text").get()
text

2023-09-10 23:54:20 [scrapy.utils.log] INFO: Scrapy 2.10.1 started (bot: scrapybot)
2023-09-10 23:54:20 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.8.10 (default, Mar 13 2023, 10:26:41) - [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.2 1 Aug 2023), cryptography 41.0.3, Platform Linux-6.2.0-31-generic-x86_64-with-glibc2.29
2023-09-10 23:54:20 [scrapy.addons] INFO: Enabled addons:
[]
2023-09-10 23:54:20 [scrapy.crawler] INFO: Overridden settings:
{'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
 'LOGSTATS_INTERVAL': 0}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-09-10 23:54:20 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2023-09-10 23:54:20 [scrapy.extensions.telnet] INFO: Telnet Password: 3dd7f2114abc3617
2023-09-10 23:54:21 [scrapy.middlewar

[s] Available Scrapy objects:
[s]   scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)
[s]   crawler    <scrapy.crawler.Crawler object at 0x7fbfd9247100>
[s]   item       {}
[s]   request    <GET https://quotes.toscrape.com>
[s]   response   <200 https://quotes.toscrape.com>
[s]   settings   <scrapy.settings.Settings object at 0x7fbfd92473d0>
[s]   spider     <DefaultSpider 'default' at 0x7fbfd8cb6d60>
[s] Useful shortcuts:
[s]   fetch(url[, redirect=True]) Fetch URL and update local objects (by default, redirects are followed)
[s]   fetch(req)                  Fetch a scrapy.Request and update local objects 
[s]   shelp()           Shell help (print this help)
[s]   view(response)    View response in a browser
In [1]: 
In [1]: 
In [2]: Out[2]: <Selector query="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>

In [3]: 
In [4]: Out[4]: '“The world as we have created 

In [None]:
from scrapy import Selector
import requests

# Fetch the first listing page and wrap its bytes in a Selector.
url = "https://quotes.toscrape.com/page/1/"
html = requests.get(url).content
sel = Selector(text=html)

# CSS: select the title text, show the selector list, then split it with a regex.
title = sel.css('title::text')
print("css title:", title)
words = title.re(r'(\w+) to (\w+)')
print(words)

# XPath: the same text node selected with an XPath expression.
title = sel.xpath('//title/text()')
print("xpath title:", title)

In [None]:
from scrapy import Selector
import requests

# Fetch the front page and wrap its bytes in a Selector.
url = "https://quotes.toscrape.com/"
html = requests.get(url).content

sel = Selector(text=html)

# Use .get()/.getall() like the spiders in this notebook, instead of the
# older .extract_first()/.extract() spellings.
for quote in sel.css('div.quote'):
    text = quote.css('span.text::text').get()
    author = quote.css('small.author::text').get()
    tags = quote.css('div.tags a.tag::text').getall()
    print(dict(text=text, author=author, tags=tags))

In [None]:
%%writefile ./Tutorial/Tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
      'https://quotes.toscrape.com/page/1/',
      'https://quotes.toscrape.com/page/2/'
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall()
            }

In [None]:
%%writefile ./tutorial/run.sh
#!/bin/bash

# Run from this script's own directory (the Scrapy project root), whatever the
# caller's working directory is.
cd "$(dirname "$0")" || exit 1

scrapy crawl quotes

### Storing the scraped data

In [None]:
%%writefile ./tutorial/run.sh
#!/bin/bash

# Run from this script's own directory (the Scrapy project root), whatever the
# caller's working directory is.
cd "$(dirname "$0")" || exit 1

# -O overwrites the output file on every run.
scrapy crawl quotes -O ./data/quotes.json

In [None]:
%%writefile ./tutorial/run.sh
#!/bin/bash

# Run from this script's own directory (the Scrapy project root), whatever the
# caller's working directory is.
cd "$(dirname "$0")" || exit 1

# -o appends to the output file, which is safe for the JSON-lines format.
scrapy crawl quotes -o ./data/quotes.jl

## Following links

In [None]:
%%bash
# Every line of a %%bash cell is executed by bash, so the Python expressions
# are passed to the Scrapy shell with -c, which evaluates the code against the
# fetched response, prints the result and exits.
scrapy shell --nolog "https://quotes.toscrape.com" -c '[response.css("li.next a").get(), response.css("li.next a::attr(href)").get(), response.css("li.next a").attrib["href"]]'

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and walks the pagination with explicit requests."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        """Yield a dict per quote, then a request for the next page if one is linked."""
        for block in response.css('div.quote'):
            yield dict(
                text=block.css('span.text::text').get(),
                author=block.css('small.author::text').get(),
                tags=block.css('div.tags a.tag::text').getall(),
            )

        relative_href = response.css('li.next a::attr(href)').get()
        if relative_href is None:
            return
        # Resolve the href against the current page URL before building the request.
        absolute_url = response.urljoin(relative_href)
        yield scrapy.Request(absolute_url, callback=self.parse)

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### `response.follow` supports relative URLs directly

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows the next-page href via ``response.follow``."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    @staticmethod
    def _to_item(block):
        """Build the ``text`` / ``author`` / ``tags`` dict for one quote selector."""
        return {
            'text': block.css('span.text::text').get(),
            'author': block.css('small.author::text').get(),
            'tags': block.css('div.tags a.tag::text').getall()
        }

    def parse(self, response):
        """Yield a dict per quote, then a follow-up request when a next link exists."""
        for block in response.css('div.quote'):
            yield self._to_item(block)

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            return
        # The href is handed to response.follow as extracted, without urljoin.
        yield response.follow(next_href, callback=self.parse)

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows next-page links given as attribute selectors."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield a dict per quote, then one request per matched ``href`` selector."""
        yield from (
            {
                'text': block.css('span.text::text').get(),
                'author': block.css('small.author::text').get(),
                'tags': block.css('div.tags a.tag::text').getall()
            }
            for block in response.css('div.quote')
        )
        # The selector itself (not an extracted string) is passed to response.follow.
        href_selectors = response.css('li.next a::attr(href)')
        for href_sel in href_selectors:
            yield response.follow(href_sel, callback=self.parse)

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and follows the pager's ``<a>`` elements directly."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield a dict per quote, then one request per matched pager anchor."""
        for block in response.css('div.quote'):
            text = block.css('span.text::text').get()
            author = block.css('small.author::text').get()
            tags = block.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        # Each matched <a> selector is passed to response.follow as-is.
        yield from (
            response.follow(anchor, callback=self.parse)
            for anchor in response.css('ul.pager li.next a')
        )

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Create multiple requests from an iterable

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and queues all pager anchors with ``follow_all``."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield a dict per quote, then every request built from the pager anchors."""
        for block in response.css('div.quote'):
            yield dict(
                text=block.css('span.text::text').get(),
                author=block.css('small.author::text').get(),
                tags=block.css('div.tags a.tag::text').getall(),
            )

        pager_links = response.css('ul.pager li.next a')
        for request in response.follow_all(pager_links, callback=self.parse):
            yield request

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and lets ``follow_all`` select the pager links by CSS."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield a dict per quote, then every request ``follow_all`` builds from the CSS query."""
        quote_blocks = response.css('div.quote')
        for block in quote_blocks:
            item = {
                'text': block.css('span.text::text').get(),
                'author': block.css('small.author::text').get(),
                'tags': block.css('div.tags a.tag::text').getall()
            }
            yield item

        # The CSS query is given to follow_all directly, with no separate selection step.
        next_requests = response.follow_all(
            css='ul.pager li.next a',
            callback=self.parse,
        )
        yield from next_requests

In [None]:
%%bash
# Stop if the project directory is missing, so nothing below runs elsewhere.
cd tutorial || exit 1
# The feed is written to ./quotes.jl inside the directory entered above, and -o
# appends, so that file (not tutorial/quotes.jl) must be removed first.
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

## More patterns

In [None]:
%%writefile ./tutorial/tutorial/spiders/author_spider.py
import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'author'

    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        author_page_links = response.css('.author + a')