<a href="https://colab.research.google.com/github/MengOonLee/Web_scraping/blob/master/Tutorial/Scrapy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapy

In [None]:
%%bash

# Install (or upgrade to the latest) Scrapy quietly, bypassing pip's cache.
pip install --no-cache-dir --quiet --upgrade scrapy

In [1]:
import scrapy
import requests

url = "https://quotes.toscrape.com/tag/humor/"
# Without a timeout `requests.get` can wait indefinitely, and without a status
# check an HTTP error page would be parsed as if it were the real listing.
page = requests.get(url, timeout=10)
page.raise_for_status()
html = page.content

sel = scrapy.Selector(text=html)
# `quote` holds every matching quote block; the `.get()` calls below return
# only the first match (or None), so a single author/text pair is printed.
quote = sel.css("div.quote")
author = quote.xpath("span/small/text()").get()
print("Author:", author)
text = quote.css("span.text::text").get()
print("Text:", text)
# href of the "next" pagination link, or None when no such link is found.
next_page = sel.css("li.next a::attr(href)").get()
print("Next page:", next_page)

Author: Jane Austen
Text: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
Next page: /tag/humor/page/2/


## Spider

In [2]:
%%bash

# Remove any project left over from a previous run so this cell can be
# re-executed, then generate a fresh Scrapy project skeleton.
project=tutorial
rm -rf "$project"
scrapy startproject "$project"

New Scrapy project 'tutorial', using template directory '/Work/venv/lib/python3.8/site-packages/scrapy/templates/project', created in:
    /Work/Web_scraping/Tutorial/tutorial

You can start your first spider with:
    cd tutorial
    scrapy genspider example example.com


In [3]:
%%writefile ./tutorial/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that collects author/text pairs starting from the humor tag page."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/tag/humor/"]

    def parse(self, response):
        """Yield one item per quote block, then a request for the next page.

        Each item is a dict with the keys "author" and "text". When the page
        has an `li.next a` link, its href is followed with this same method
        as the callback.
        """
        for block in response.css("div.quote"):
            author = block.xpath("span/small/text()").get()
            text = block.css("span.text::text").get()
            yield {"author": author, "text": text}

        href = response.css("li.next a::attr(href)").get()
        if href is None:
            # No pagination link found: nothing more to request.
            return
        yield response.follow(href, self.parse)

Writing ./tutorial/quotes_spider.py


In [None]:
%%bash

# Run the standalone spider file directly, without going through a project.
spider_file=./tutorial/quotes_spider.py
scrapy runspider "$spider_file"

In [5]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves the raw HTML of two listing pages to local files."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per hard-coded page URL, handled by `parse`."""
        for page_url in (
            "https://quotes.toscrape.com/page/1/",
            "https://quotes.toscrape.com/page/2/",
        ):
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Write the response body to `quotes-<page>.html` and log the name."""
        # For URLs shaped like ".../page/1/" the second-to-last "/"-separated
        # piece is the page number.
        page = response.url.split("/")[-2]
        filename = f"quotes-{page}.html"
        target = pathlib.Path(filename)
        target.write_bytes(response.body)
        self.log(f"Saved file {filename}")

Writing ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Stop right away if the project directory is missing; otherwise
# `scrapy crawl` would run outside the project it is meant to use.
cd ./tutorial || exit 1
scrapy crawl quotes

In [7]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves listing pages to disk, seeded through `start_urls`."""

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/",
    ]

    def parse(self, response):
        """Save the response body as `quotes-<page>.html` and log the name."""
        # The page number sits just before the trailing slash of the URL.
        url_parts = response.url.split("/")
        page = url_parts[-2]
        filename = f"quotes-{page}.html"
        pathlib.Path(filename).write_bytes(response.body)
        self.log(f"Saved file {filename}")

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Bail out if the project directory cannot be entered, so the crawl is
# never attempted from the wrong working directory.
cd ./tutorial || exit 1
scrapy crawl quotes

## Extracting data

In [9]:
import scrapy
import requests

url = "https://quotes.toscrape.com/page/1/"
# Bound the wait and fail loudly on HTTP errors instead of hanging or
# silently parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
html = page.content

sel = scrapy.Selector(text=html)

# CSS selection: the matched selector list, the first text node, then
# regex extraction (capture groups, when present, are what gets returned).
print(sel.css("title"))
print(sel.css("title::text").get())
print(sel.css("title::text").re(r"Quotes.*"))
print(sel.css("title::text").re(r"Q\w+"))
print(sel.css("title::text").re(r"(\w+) to (\w+)"))

# The same element and text selected with XPath.
print(sel.xpath("//title"))
print(sel.xpath("//title/text()").get())

[<Selector query='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
Quotes to Scrape
['Quotes to Scrape']
['Quotes']
['Quotes', 'Scrape']
[<Selector query='//title' data='<title>Quotes to Scrape</title>'>]
Quotes to Scrape


In [10]:
import scrapy
import requests

url = "https://quotes.toscrape.com"
# Bound the wait and fail loudly on HTTP errors instead of hanging or
# silently parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
html = page.content

sel = scrapy.Selector(text=html)

# Inspect the first quote block on its own.
quote = sel.css("div.quote")[0]
print(quote)
text = quote.css("span.text::text").get()
print(text)
author = quote.css("small.author::text").get()
print(author)
# `.getall()` returns every match as a list, unlike `.get()` (first only).
tags = quote.css("div.tags a.tag::text").getall()
print(tags)

# Apply the same three selectors to every quote block on the page.
for quote in sel.css("div.quote"):
    text = quote.css("span.text::text").get()
    author = quote.css("small.author::text").get()
    tags = quote.css("div.tags a.tag::text").getall()
    print(dict(text=text, author=author, tags=tags))

<Selector query="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Albert Einstein
['change', 'deep-thoughts', 'thinking', 'world']
{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
{'text': '“The person, be it gentleman or

In [11]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that extracts text, author and tags from two listing pages."""

    name = "quotes"
    start_urls = [f"https://quotes.toscrape.com/page/{n}/" for n in (1, 2)]

    def parse(self, response):
        """Yield a dict with "text", "author" and "tags" for each quote block."""
        for block in response.css("div.quote"):
            yield dict(
                text=block.css("span.text::text").get(),
                author=block.css("small.author::text").get(),
                tags=block.css("div.tags a.tag::text").getall(),
            )

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Stop if the project directory is missing rather than running the crawls
# (and writing the exports) from the wrong place.
cd ./tutorial || exit 1
# Output paths are relative to the project directory entered above.
# -O overwrites the target file; -o appends to it, so re-running this cell
# keeps adding records to quotes.jsonl.
scrapy crawl quotes -O ./tutorial/data/quotes.json
scrapy crawl quotes -o ./tutorial/data/quotes.jsonl

## Following links

In [13]:
import scrapy
import requests

url = "https://quotes.toscrape.com"
# Bound the wait and fail loudly on HTTP errors instead of hanging or
# silently parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
html = page.content

sel = scrapy.Selector(text=html)
# Named `next_page` rather than `next` so the builtin `next()` is not shadowed
# for the rest of the kernel session.
next_page = sel.css("li.next a::attr(href)").get()
print(next_page)
# Alternative: read the href through the `.attrib` mapping of the match.
next_page = sel.css("li.next a").attrib["href"]
print(next_page)

/page/2/
/page/2/


In [14]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that walks the listing pages by building absolute next-page URLs."""

    name = "quotes"
    # Trailing slash added so the start URL has the same form as the site's
    # own pagination links ("/page/2/") and the other spiders in this notebook.
    start_urls = [
        "https://quotes.toscrape.com/page/1/"
    ]

    def parse(self, response):
        """Yield one item per quote block, then a request for the next page.

        Each item is a dict with the keys "text", "author" and "tags".
        """
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
                "tags": quote.css("div.tags a.tag::text").getall()
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            # The href is relative; resolve it against the current page URL
            # before building the request explicitly.
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Abort when the project directory is unavailable so the crawl is not
# attempted from outside the Scrapy project.
cd ./tutorial || exit 1
scrapy crawl quotes

In [16]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that follows the relative next-page link via `response.follow`.

    The class was previously misspelled "QuatesSpider"; the spider is still
    launched through its `name` attribute, which is unchanged.
    """

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/"
    ]

    def parse(self, response):
        """Yield one item per quote block, then a request for the next page.

        Each item is a dict with the keys "text", "author" and "tags".
        """
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("span small::text").get(),
                "tags": quote.css("div.tags a.tag::text").getall()
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            # `response.follow` accepts the relative href directly, so no
            # explicit `urljoin` call is needed here.
            yield response.follow(next_page, callback=self.parse)

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Exit early if the project directory cannot be entered; the crawl only
# makes sense from inside the project.
cd ./tutorial || exit 1
scrapy crawl quotes

In [18]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that follows every pager link by passing href selectors along."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield one item per quote block, then a request per pager href.

        Each item is a dict with the keys "text", "author" and "tags".
        """
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("span small::text").get()
            tags = block.css("div.tags a.tag::text").getall()
            yield {"text": text, "author": author, "tags": tags}

        # The attribute selectors themselves (not extracted strings) are
        # handed to `response.follow`.
        pager_hrefs = response.css("ul.pager a::attr(href)")
        for href_selector in pager_hrefs:
            yield response.follow(href_selector, callback=self.parse)

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Do not run the crawl unless the project directory was entered successfully.
cd ./tutorial || exit 1
scrapy crawl quotes

In [20]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that follows every pager link by passing `<a>` selectors along."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield one item per quote block, then a request per pager anchor.

        Each item is a dict with the keys "text", "author" and "tags".
        """
        for block in response.css("div.quote"):
            yield dict(
                text=block.css("span.text::text").get(),
                author=block.css("span small::text").get(),
                tags=block.css("div.tags a.tag::text").getall(),
            )

        # The anchor selectors are passed to `response.follow` as they are.
        pager_anchors = response.css("ul.pager a")
        for anchor in pager_anchors:
            yield response.follow(anchor, callback=self.parse)

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# A failed `cd` would leave the shell outside the project, so stop here
# instead of letting `scrapy crawl` run in the wrong directory.
cd ./tutorial || exit 1
scrapy crawl quotes

In [22]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that follows all pager links with a single `follow_all` call."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield one item per quote block, then the requests for pager links.

        Each item is a dict with the keys "text", "author" and "tags".
        """
        for block in response.css("div.quote"):
            item = {
                "text": block.css("span.text::text").get(),
                "author": block.css("span small::text").get(),
                "tags": block.css("div.tags a.tag::text").getall(),
            }
            yield item

        pager_requests = response.follow_all(
            css="ul.pager a",
            callback=self.parse,
        )
        yield from pager_requests

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Stop immediately if the project directory is missing so the crawl never
# starts from an unrelated working directory.
cd ./tutorial || exit 1
scrapy crawl quotes

In [24]:
%%writefile ./tutorial/tutorial/spiders/author_spider.py
import scrapy

class AuthorSpider(scrapy.Spider):
    """Spider that visits author pages linked from the quote listings."""

    name = "author"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        """Yield requests for author pages first, then for the next listing page."""
        # Links that directly follow an `.author` element go to `parse_author`.
        yield from response.follow_all(
            response.css(".author + a"),
            callback=self.parse_author,
        )
        # The "next" pagination link is handled by this same method.
        yield from response.follow_all(
            response.css("li.next a"),
            callback=self.parse,
        )

    def parse_author(self, response):
        """Yield a single dict with the keys "name", "birthdate" and "bio"."""

        def first_stripped(query):
            # First match for the query with surrounding whitespace removed;
            # an empty string when nothing matches.
            return response.css(query).get(default="").strip()

        queries = {
            "name": "h3.author-title::text",
            "birthdate": ".author-born-date::text",
            "bio": ".author-description::text",
        }
        yield {field: first_stripped(query) for field, query in queries.items()}

Writing ./tutorial/tutorial/spiders/author_spider.py


In [None]:
%%bash

# Abort if the project directory cannot be entered; the `author` spider is
# only known inside the project.
cd ./tutorial || exit 1
scrapy crawl author

In [1]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider whose start URL can be narrowed to one tag via a `tag` attribute."""

    name = "quotes"

    def start_requests(self):
        """Yield the initial request, for the tag page when `self.tag` is set."""
        base_url = "https://quotes.toscrape.com/"
        tag = getattr(self, "tag", None)
        # Without a `tag` attribute the crawl starts from the site root.
        target = base_url if tag is None else base_url + "tag/" + tag
        yield scrapy.Request(target, self.parse)

    def parse(self, response):
        """Yield one item per quote block, then a request for the next page.

        Each item is a dict with the keys "text" and "author".
        """
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("small.author::text").get()
            yield {"text": text, "author": author}

        href = response.css("li.next a::attr(href)").get()
        if href is None:
            # No pagination link found: nothing more to request.
            return
        yield response.follow(href, self.parse)

Overwriting ./tutorial/tutorial/spiders/quotes_spider.py


In [None]:
%%bash

# Stop if the project directory is missing rather than crawling (and
# writing the export) from the wrong place.
cd ./tutorial || exit 1
# `-a tag=humor` passes a spider argument named `tag`; -O overwrites the
# output file, whose path is relative to the directory entered above.
scrapy crawl quotes -O ./tutorial/data/quotes-humor.json -a tag=humor