<a href="https://colab.research.google.com/github/MengOonLee/WebScrapy/blob/master/Tutorial/Scrapy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapy

In [None]:
%%bash
# Install (or upgrade to, -U) the latest Scrapy release quietly (-q),
# without using pip's download cache (--no-cache-dir).
pip install --no-cache-dir -qU scrapy

In [None]:
import scrapy
import requests

# Download the landing page with requests and parse it with a standalone
# Scrapy Selector (no spider or crawler involved).
url = "https://quotes.toscrape.com"

# requests has no default timeout, so without one this cell could hang
# forever on a stalled connection. raise_for_status() stops the cell on
# 4xx/5xx instead of silently parsing an error page.
http_response = requests.get(url, timeout=10)
http_response.raise_for_status()
html = http_response.content

response = scrapy.Selector(text=html)

# One dict per quote box: author name, quote text and the list of tags.
for quote in response.css('div.quote'):
    print({
        'author': quote.css('small.author::text').get(),
        'text': quote.xpath('span[@class="text"]/text()').get(),
        'tags': quote.css('div.tags a.tag::text').getall()
    })

# Relative href of the "next" pagination link; None when there is no such link.
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    print(next_page)

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Spider that starts at page 1 and follows the "next" pagination link."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield one dict per ``div.quote``, then a request for the next page.

        Each dict carries the quote ``text``, its ``author`` and the list of
        ``tags``. No follow-up request is yielded when the page has no
        ``li.next a`` link.
        """
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("span small::text").get()
            tags = block.css("div.tags a.tag::text").getall()
            yield {"text": text, "author": author, "tags": tags}

        next_href = response.css("li.next a::attr(href)").get()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

## Spider

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags
import re

def remove_unicode(text):
    """Return ``text`` reduced to ASCII with surrounding whitespace stripped.

    Accented letters are first split into base letter + combining mark
    (NFD), so the base letter survives the ASCII encoding ("é" -> "e")
    instead of being deleted. Characters with no canonical ASCII base,
    such as curly quotes, are still dropped.
    """
    # Local import keeps this helper self-contained within its cell.
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    return decomposed.encode('ascii', errors='ignore')\
        .decode().strip()

class QuotesItem(scrapy.Item):
    """Item with ``author`` and ``text`` fields, each reduced to a single value."""

    # Author: strip markup, keep the first non-empty value.
    author = scrapy.Field(input_processor=MapCompose(remove_tags),
                          output_processor=TakeFirst())
    # Text: strip markup, then pass through remove_unicode; keep the first value.
    text = scrapy.Field(input_processor=MapCompose(remove_tags, remove_unicode),
                        output_processor=TakeFirst())

class QuotesSpider(scrapy.Spider):
    """Spider that loads quotes from the "humor" tag pages into QuotesItem."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/tag/humor/"]

    def parse(self, response):
        """Yield a loaded QuotesItem per ``div.quote``, then follow the next page."""
        for block in response.css("div.quote"):
            loader = ItemLoader(item=QuotesItem(), selector=block)
            loader.add_xpath("author", "span/small/text()")
            loader.add_css("text", "span.text::text")
            yield loader.load_item()

        next_href = response.css("li.next a::attr(href)").get()
        if next_href is None:
            return
        yield response.follow(next_href, self.parse)

import multiprocessing


def _run_crawl():
    """Run QuotesSpider to completion, exporting items to ``items.jl``."""
    process = CrawlerProcess(
        settings={
            # Local-file feeds append by default, so re-running the cell would
            # pile duplicate items into items.jl; overwrite keeps runs reproducible.
            "FEEDS": {"items.jl": {"format": "jsonlines", "overwrite": True}}
        }
    )
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves the raw HTML of two listing pages to local files."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per hard-coded listing URL, handled by ``parse``."""
        page_urls = (
            "https://quotes.toscrape.com/page/1/",
            "https://quotes.toscrape.com/page/2/",
        )
        yield from (
            scrapy.Request(url=page_url, callback=self.parse)
            for page_url in page_urls
        )

    def parse(self, response):
        """Write the response body to ``quotes-<page>.html`` and log the filename."""
        # The second-to-last "/"-separated URL segment is the page number
        # for the ".../page/N/" URLs requested above.
        page = response.url.rsplit("/", 2)[-2]
        filename = f"quotes-{page}.html"
        target = pathlib.Path(filename)
        target.write_bytes(response.body)
        self.log(f"Saved file {filename}")

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pathlib

class QuotesSpider(scrapy.Spider):
    """Spider that saves two listing pages to disk, using ``start_urls``."""

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/",
    ]

    def parse(self, response):
        """Write the response body to ``quotes-<page>.html`` and log the filename."""
        # The second-to-last "/"-separated URL segment is the page number
        # for the ".../page/N/" URLs listed in start_urls.
        page = response.url.rsplit("/", 2)[-2]
        filename = f"quotes-{page}.html"
        target = pathlib.Path(filename)
        target.write_bytes(response.body)
        self.log(f"Saved file {filename}")

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

## Extracting data

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags
import re

def remove_unicode(text):
    """Return ``text`` reduced to ASCII with surrounding whitespace stripped.

    Accented letters are first split into base letter + combining mark
    (NFD), so the base letter survives the ASCII encoding ("é" -> "e")
    instead of being deleted. Characters with no canonical ASCII base,
    such as curly quotes, are still dropped.
    """
    # Local import keeps this helper self-contained within its cell.
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    return decomposed.encode("ascii", errors="ignore").decode().strip()

class QuotesItem(scrapy.Item):
    """Item with single-valued ``author`` and ``text`` plus a list of ``tags``."""

    # Author and text: strip markup, pass through remove_unicode, keep first value.
    author = scrapy.Field(input_processor=MapCompose(remove_tags, remove_unicode),
                          output_processor=TakeFirst())
    text = scrapy.Field(input_processor=MapCompose(remove_tags, remove_unicode),
                        output_processor=TakeFirst())
    # Tags: strip markup from each value and keep the whole list.
    tags = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=Identity())

class QuotesSpider(scrapy.Spider):
    """Spider that loads quotes from two listing pages into QuotesItem."""

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/",
    ]

    def parse(self, response):
        """Yield a loaded QuotesItem (author, text, tags) per ``div.quote``."""
        for block in response.css("div.quote"):
            loader = ItemLoader(item=QuotesItem(), selector=block)
            loader.add_css("author", "small.author::text")
            loader.add_css("text", "span.text::text")
            loader.add_css("tags", "div.tags a.tag::text")
            yield loader.load_item()

import multiprocessing


def _run_crawl():
    """Run QuotesSpider to completion, exporting items to ``items.jl``."""
    process = CrawlerProcess(
        settings={
            # Local-file feeds append by default, so re-running the cell would
            # pile duplicate items into items.jl; overwrite keeps runs reproducible.
            "FEEDS": {"items.jl": {"format": "jsonlines", "overwrite": True}}
        }
    )
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

## Following links

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags
import re

def remove_unicode(text):
    """Return ``text`` reduced to ASCII with surrounding whitespace stripped.

    Accented letters are first split into base letter + combining mark
    (NFD), so the base letter survives the ASCII encoding ("é" -> "e")
    instead of being deleted. Characters with no canonical ASCII base,
    such as curly quotes, are still dropped.
    """
    # Local import keeps this helper self-contained within its cell.
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    return decomposed.encode("ascii", errors="ignore").decode().strip()

class QuotesItem(scrapy.Item):
    """Item with single-valued ``author`` and ``text`` plus a list of ``tags``."""

    # Author and text: strip markup, pass through remove_unicode, keep first value.
    author = scrapy.Field(input_processor=MapCompose(remove_tags, remove_unicode),
                          output_processor=TakeFirst())
    text = scrapy.Field(input_processor=MapCompose(remove_tags, remove_unicode),
                        output_processor=TakeFirst())
    # Tags: strip markup from each value and keep the whole list.
    tags = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=Identity())

class QuotesSpider(scrapy.Spider):
    """Spider that loads quotes into QuotesItem and follows pagination manually."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1"]

    def parse(self, response):
        """Yield a QuotesItem per ``div.quote``, then a Request for the next page.

        The relative "next" href is turned into an absolute URL with
        ``response.urljoin`` before the Request is built.
        """
        for block in response.css("div.quote"):
            loader = ItemLoader(item=QuotesItem(), selector=block)
            loader.add_css("author", "small.author::text")
            loader.add_css("text", "span.text::text")
            loader.add_css("tags", "div.tags a.tag::text")
            yield loader.load_item()

        relative_href = response.css("li.next a::attr(href)").get()
        if relative_href is None:
            return
        absolute_url = response.urljoin(relative_href)
        yield scrapy.Request(absolute_url, callback=self.parse)

import multiprocessing


def _run_crawl():
    """Run QuotesSpider to completion, exporting items to ``items.jl``."""
    process = CrawlerProcess(
        settings={
            # Local-file feeds append by default, so re-running the cell would
            # pile duplicate items into items.jl; overwrite keeps runs reproducible.
            "FEEDS": {"items.jl": {"format": "jsonlines", "overwrite": True}}
        }
    )
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Spider that follows pager links given as ``href`` attribute selectors."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield a dict per ``div.quote``, then a request per ``ul.pager`` href."""
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("span small::text").get()
            tags = block.css("div.tags a.tag::text").getall()
            yield {"text": text, "author": author, "tags": tags}

        # response.follow is handed the attribute selector itself,
        # not an extracted string.
        yield from (
            response.follow(href_selector, callback=self.parse)
            for href_selector in response.css("ul.pager a::attr(href)")
        )

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Spider that follows pager links given as ``<a>`` element selectors."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield a dict per ``div.quote``, then a request per ``ul.pager a`` element."""
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("span small::text").get()
            tags = block.css("div.tags a.tag::text").getall()
            yield {"text": text, "author": author, "tags": tags}

        # response.follow is handed the <a> element selector directly.
        yield from (
            response.follow(anchor, callback=self.parse)
            for anchor in response.css("ul.pager a")
        )

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Spider that follows all pager links in one ``follow_all`` call."""

    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield a dict per ``div.quote``, then every request from ``follow_all``."""
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("span small::text").get()
            tags = block.css("div.tags a.tag::text").getall()
            yield {"text": text, "author": author, "tags": tags}

        pager_requests = response.follow_all(css="ul.pager a", callback=self.parse)
        for pager_request in pager_requests:
            yield pager_request

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule QuotesSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class AuthorSpider(scrapy.Spider):
    """Spider that visits author pages linked from the quote listings."""

    name = "author"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        """Yield requests for author pages first, then for the next listing page."""
        # Anchors that immediately follow an element with class "author".
        yield from response.follow_all(
            response.css(".author + a"), callback=self.parse_author
        )
        yield from response.follow_all(
            response.css("li.next a"), callback=self.parse
        )

    def parse_author(self, response):
        """Yield one dict with the author's ``name``, ``birthdate`` and ``bio``."""

        def first_match(css_query):
            # First match for the query, or "" when nothing matches; stripped.
            found = response.css(css_query).get(default="")
            return found.strip()

        author_name = first_match("h3.author-title::text")
        born_on = first_match(".author-born-date::text")
        biography = first_match(".author-description::text")
        yield {"name": author_name, "birthdate": born_on, "bio": biography}

import multiprocessing


def _run_crawl():
    """Create a CrawlerProcess, schedule AuthorSpider and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(AuthorSpider)
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Spider whose start URL can be narrowed to one tag via a ``tag`` attribute."""

    name = "quotes"

    def start_requests(self):
        """Yield the single start request.

        When the spider has a non-None ``tag`` attribute, the request targets
        that tag's listing; otherwise it targets the site root.
        """
        site_root = "https://quotes.toscrape.com/"
        wanted_tag = getattr(self, "tag", None)
        if wanted_tag is None:
            start_url = site_root
        else:
            start_url = site_root + "tag/" + wanted_tag
        yield scrapy.Request(start_url, self.parse)

    def parse(self, response):
        """Yield a text/author dict per ``div.quote``, then follow the next page."""
        for block in response.css("div.quote"):
            text = block.css("span.text::text").get()
            author = block.css("small.author::text").get()
            yield {"text": text, "author": author}

        next_href = response.css("li.next a::attr(href)").get()
        if next_href is None:
            return
        yield response.follow(next_href, self.parse)

import multiprocessing


def _run_crawl():
    """Run QuotesSpider with ``tag='humor'`` and block until it finishes."""
    process = CrawlerProcess()
    process.crawl(QuotesSpider, tag='humor')
    process.start()


# Twisted's reactor can be started only once per Python process, so calling
# process.start() directly in the kernel makes every later crawl cell fail
# with ReactorNotRestartable. Where the "fork" start method exists (Linux,
# Colab), run the crawl in a throwaway child so each cell gets a fresh reactor.
if "fork" in multiprocessing.get_all_start_methods():
    crawl_job = multiprocessing.get_context("fork").Process(target=_run_crawl)
    crawl_job.start()
    crawl_job.join()
    if crawl_job.exitcode != 0:
        raise RuntimeError(f"Crawl subprocess exited with code {crawl_job.exitcode}")
else:
    # No fork available (e.g. Windows): fall back to running in the kernel.
    _run_crawl()