# Scrapy

In [1]:
%%bash

# Upgrade the packaging tools first, then the libraries used by this notebook.
# --no-cache-dir keeps pip from using its download cache; --quiet trims the output.
pip install --no-cache-dir --quiet --upgrade pip wheel
pip install --no-cache-dir --quiet --upgrade numpy pandas
pip install --no-cache-dir --quiet --upgrade scrapy
# Report any installed packages whose dependency requirements are not satisfied.
pip check

No broken requirements found.


In [1]:
%%bash
# Create the tutorial project once.
# `scrapy startproject` exits with an error when the target directory already
# holds a scrapy.cfg, so skip it on re-runs to keep this cell idempotent.
if [ ! -f ScrapyTutorial/scrapy.cfg ]; then
    scrapy startproject ScrapyTutorial
fi

New Scrapy project 'ScrapyTutorial', using template directory '/Work/venv/lib/python3.8/site-packages/scrapy/templates/project', created in:
    /Work/Web_scraping/Tutorial/ScrapyTutorial

You can start your first spider with:
    cd ScrapyTutorial
    scrapy genspider example example.com


In [2]:
import scrapy
import requests
url = "https://www.zyte.com/blog/"
# Bound the wait and stop on HTTP errors instead of silently parsing an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
html = resp.content

sel = scrapy.Selector(text=html)
# NOTE(review): the recorded output of this cell is two empty lists, so these
# selectors presumably no longer match the page markup — verify against the live site.
title = sel.css('.oxy-post-title')\
    .css('::text').getall()
print(f"title: {title}")

next_page = sel.css('a.next')\
    .xpath('./@href').getall()
print(f"next page: {next_page}")

title: []
next page: []


In [3]:
%%writefile ./ScrapyTutorial/myspider.py
import scrapy

class BlogSpider(scrapy.Spider):
    """Spider named 'blogspider' that starts at https://www.zyte.com/blog/."""

    name = 'blogspider'
    start_urls = ["https://www.zyte.com/blog/"]

    def parse(self, response):
        """Yield a ``{'title': ...}`` item per ``.oxy-post-title`` match, then follow every ``a.next``."""
        # Items first: the first text node found under each matching element.
        headings = response.css('.oxy-post-title')
        yield from ({'title': heading.css('::text').get()} for heading in headings)

        # Then pagination: each matching anchor is handled by this same callback.
        pager_links = response.css('a.next')
        yield from (response.follow(link, self.parse) for link in pager_links)

Writing ./ScrapyTutorial/myspider.py


In [4]:
%%writefile ./ScrapyTutorial/run.sh
#!/bin/bash

# Run the standalone spider file and overwrite the feed at the given path.
scrapy runspider myspider.py --overwrite-output ./data/myspider.jl

Writing ./ScrapyTutorial/run.sh


In [6]:
%%bash
# Run the standalone spider file directly; no feed output option is passed here.
scrapy runspider ./ScrapyTutorial/myspider.py

2023-09-09 00:10:27 [scrapy.utils.log] INFO: Scrapy 2.10.1 started (bot: scrapybot)
2023-09-09 00:10:27 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.8.10 (default, Mar 13 2023, 10:26:41) - [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.2 1 Aug 2023), cryptography 41.0.3, Platform Linux-6.2.0-31-generic-x86_64-with-glibc2.29
2023-09-09 00:10:27 [scrapy.addons] INFO: Enabled addons:
[]
2023-09-09 00:10:27 [scrapy.crawler] INFO: Overridden settings:
{'SPIDER_LOADER_WARN_ONLY': True}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-09-09 00:10:27 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2023-09-09 00:10:27 [scrapy.extensions.telnet] INFO: Telnet Password: 7c25b7c4780f6c97
2023-09-09 00:10:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.

In [None]:
%%bash
# NOTE(review): a %%bash cell runs in its own shell process, so this `cd`
# presumably does not change the working directory seen by later cells — confirm.
cd ./ScrapyTutorial

In [9]:
from scrapy import Selector
import requests
url = "https://quotes.toscrape.com/tag/humor/"
# Bound the wait and stop on HTTP errors instead of silently parsing an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
html = resp.content

sel = Selector(text=html)
# href attribute of the anchor inside the "next" pager item.
next_page = sel.css('li.next a::attr("href")').getall()
print(f"next page: {next_page}")

# All quote containers; the two queries below are evaluated relative to them.
quote = sel.css('div.quote')
author = quote.xpath('span/small/text()').getall()
print(f"author: {author}")
text = quote.css('span.text::text').getall()
print(f"text: {text}")

next page: ['/tag/humor/page/2/']
author: ['Jane Austen', 'Steve Martin', 'Garrison Keillor', 'Jim Henson', 'Charles M. Schulz', 'Suzanne Collins', 'Charles Bukowski', 'Terry Pratchett', 'Dr. Seuss', 'George Carlin']
text: ['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', '“A day without sunshine is like, you know, night.”', '“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.”', '“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.”', "“All you need is love. But a little chocolate now and then doesn't hurt.”", "“Remember, we're madly in love, so it's all right to kiss me anytime you feel like it.”", '“Some people never go crazy. What truly horrible lives they must lead.”', '“The trouble with having an open mind, of course, is that people will insist on coming along and trying t

In [10]:
%%writefile ./Tutorial/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named 'quotes' that starts at the humor tag page of quotes.toscrape.com."""

    name = 'quotes'
    start_urls = ["https://quotes.toscrape.com/tag/humor/"]

    def parse(self, response):
        """Yield an author/text dict per ``div.quote``, then follow the next-page link if any."""
        for quote_block in response.css('div.quote'):
            author_name = quote_block.xpath('span/small/text()').get()
            quote_text = quote_block.css('span.text::text').get()
            yield {'author': author_name, 'text': quote_text}

        next_href = response.css('li.next a::attr("href")').get()
        if next_href is None:
            # No next-page link was found: nothing more to request.
            return
        yield response.follow(next_href, self.parse)

Writing ./Tutorial/quotes_spider.py


In [11]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the standalone spider file and overwrite the feed at the given path.
scrapy runspider quotes_spider.py --overwrite-output ./data/quotes.jl

Overwriting ./Tutorial/run.sh


In [13]:
%%writefile ./Tutorial/Tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that saves the raw HTML of two listing pages to ./data."""

    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/"
    ]

    def parse(self, response):
        """Write the response body to ``./data/quotes-<page>.html``.

        ``<page>` is the second-to-last ``/``-separated piece of the response
        URL (``1`` for ``.../page/1/``). The output directory is created when
        missing so the first run does not fail with ``FileNotFoundError``.
        """
        # Local import: this block cannot see or edit the file's import header.
        from pathlib import Path

        page = response.url.split("/")[-2]
        out_dir = Path('./data')
        out_dir.mkdir(parents=True, exist_ok=True)
        (out_dir / f'quotes-{page}.html').write_bytes(response.body)

Overwriting ./Tutorial/Tutorial/spiders/quotes_spider.py


In [14]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the spider whose name is "quotes"; no feed output option is passed.
# NOTE(review): `scrapy crawl` presumably has to be invoked from inside the
# Scrapy project — confirm where run.sh is executed.
scrapy crawl quotes

Overwriting ./Tutorial/run.sh


In [16]:
from scrapy import Selector
import requests

url = "https://quotes.toscrape.com/page/1/"
# Bound the wait and stop on HTTP errors instead of silently parsing an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
html = resp.content

sel = Selector(text=html)
# CSS: select the text node of <title>, then pull two words out of it with a regex.
title = sel.css('title::text')
print(f"css title: {title}")
print(title.re(r'(\w+) to (\w+)'))

# XPath: the same text node addressed with an XPath expression.
title = sel.xpath('//title/text()')
print(f"xpath title: {title}")

css title: [<Selector xpath='descendant-or-self::title/text()' data='Quotes to Scrape'>]
['Quotes', 'Scrape']
xpath title: [<Selector xpath='//title/text()' data='Quotes to Scrape'>]


In [25]:
from scrapy import Selector
import requests

url = "https://quotes.toscrape.com/"
# Bound the wait and stop on HTTP errors instead of silently parsing an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
html = resp.content

sel = Selector(text=html)

# One dict per quote container: first match for text/author, every match for tags.
for quote in sel.css('div.quote'):
    text = quote.css('span.text::text').get()
    author = quote.css('small.author::text').get()
    tags = quote.css('div.tags a.tag::text').getall()
    print(dict(text=text, author=author, tags=tags))

{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
{'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor']}
{'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe', 'tags': ['be-yourself', 'inspirational']}


In [26]:
%%writefile ./Tutorial/Tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that scrapes listing pages 1 and 2 of quotes.toscrape.com."""

    name = "quotes"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Yield a dict with text, author and tags for every ``div.quote`` in the response."""
        for card in response.css('div.quote'):
            item = {}
            item['text'] = card.css('span.text::text').get()
            item['author'] = card.css('small.author::text').get()
            # getall() keeps every tag; get() above keeps only the first match.
            item['tags'] = card.css('div.tags a.tag::text').getall()
            yield item

Overwriting ./Tutorial/Tutorial/spiders/quotes_spider.py


In [27]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Run the spider whose name is "quotes"; no feed output option is passed.
# NOTE(review): `scrapy crawl` presumably has to be invoked from inside the
# Scrapy project — confirm where run.sh is executed.
scrapy crawl quotes

Overwriting ./Tutorial/run.sh


### Storing the scraped data

In [29]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Crawl with the "quotes" spider and overwrite the JSON feed at the given path.
scrapy crawl quotes --overwrite-output ./data/quotes.json

Overwriting ./Tutorial/run.sh


In [30]:
%%writefile ./Tutorial/run.sh
#!/bin/bash

# Crawl with the "quotes" spider and append the items to the feed at the given path.
scrapy crawl quotes --output ./data/quotes.jl

Overwriting ./Tutorial/run.sh


## Following links

In [None]:
%%bash
# Open a Scrapy shell on the quotes site.
# NOTE(review): the three `response.css(...)` lines below are Python expressions
# meant for the Scrapy shell prompt, not bash commands; whether they reach that
# prompt from inside a %%bash cell is unclear — confirm, or type them interactively.
scrapy shell "https://quotes.toscrape.com"
response.css("li.next a").get()
response.css("li.next a::attr(href)").get()
response.css("li.next a").attrib["href"]

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that starts on page 1 and requests each next page explicitly."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        """Yield one item per ``div.quote``, then a ``scrapy.Request`` for the next page if linked."""
        quote_cards = response.css('div.quote')
        yield from (
            {
                'text': card.css('span.text::text').get(),
                'author': card.css('small.author::text').get(),
                'tags': card.css('div.tags a.tag::text').getall(),
            }
            for card in quote_cards
        )

        relative_next = response.css('li.next a::attr(href)').get()
        if relative_next is None:
            return
        # The href is joined with the response URL before building the request.
        absolute_next = response.urljoin(relative_next)
        yield scrapy.Request(absolute_next, callback=self.parse)

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### `response.follow` supports relative URLs directly

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that paginates by passing the raw href to ``response.follow``."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com/page/1/']

    def parse(self, response):
        """Yield one item per ``div.quote``, then follow the next-page href when present."""
        for block in response.css('div.quote'):
            quote_text = block.css('span.text::text').get()
            quote_author = block.css('small.author::text').get()
            quote_tags = block.css('div.tags a.tag::text').getall()
            yield {'text': quote_text, 'author': quote_author, 'tags': quote_tags}

        href = response.css('li.next a::attr(href)').get()
        if href is None:
            return
        # The href is handed to follow() as-is, without a urljoin() step.
        yield response.follow(href, callback=self.parse)

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that follows next-page links given as attribute selectors."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    @staticmethod
    def _as_item(quote):
        """Build the text/author/tags dict for one ``div.quote`` selector."""
        return {
            'text': quote.css('span.text::text').get(),
            'author': quote.css('small.author::text').get(),
            'tags': quote.css('div.tags a.tag::text').getall(),
        }

    def parse(self, response):
        """Yield one item per ``div.quote``, then follow every matched next-page href selector."""
        yield from map(self._as_item, response.css('div.quote'))

        # The selectors themselves (not extracted strings) are passed to follow().
        next_hrefs = response.css('li.next a::attr(href)')
        yield from (response.follow(href, callback=self.parse) for href in next_hrefs)

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that follows next-page links given as ``<a>`` element selectors."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per ``div.quote``, then follow every ``ul.pager li.next a`` anchor."""
        for entry in response.css('div.quote'):
            record = {}
            record['text'] = entry.css('span.text::text').get()
            record['author'] = entry.css('small.author::text').get()
            record['tags'] = entry.css('div.tags a.tag::text').getall()
            yield record

        # The anchor selectors are passed to follow() directly, with no href extraction here.
        pager_anchors = response.css('ul.pager li.next a')
        yield from (response.follow(anchor, callback=self.parse) for anchor in pager_anchors)

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

### Create multiple requests from an iterable

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that builds its pagination requests with ``response.follow_all``."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per ``div.quote``, then every request produced by ``follow_all``."""
        yield from (
            {
                'text': card.css('span.text::text').get(),
                'author': card.css('small.author::text').get(),
                'tags': card.css('div.tags a.tag::text').getall(),
            }
            for card in response.css('div.quote')
        )

        # follow_all() receives the whole selector list of pager anchors at once.
        for request in response.follow_all(response.css('ul.pager li.next a'), callback=self.parse):
            yield request

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

In [None]:
%%writefile ./tutorial/tutorial/spiders/quotes_spider.py
import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider named "quotes" that passes a CSS expression straight to ``response.follow_all``."""

    name = "quotes"
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        """Yield one item per ``div.quote``, then the requests for ``ul.pager li.next a``."""
        for quote_sel in response.css('div.quote'):
            text = quote_sel.css('span.text::text').get()
            author = quote_sel.css('small.author::text').get()
            tags = quote_sel.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        # The css= keyword hands the selector string to follow_all() instead of a selector list.
        pagination_requests = response.follow_all(
            css='ul.pager li.next a',
            callback=self.parse,
        )
        yield from pagination_requests

In [None]:
%%bash
# Enter the project directory; abort if it is missing so the cleanup below
# can never run in the wrong place.
cd tutorial || exit 1
# `-o` appends to an existing feed, so remove the previous output first.
# The feed is written to ./quotes.jl inside the directory entered above
# (the old `tutorial/quotes.jl` path pointed one level too deep).
rm -f quotes.jl
scrapy crawl quotes -o quotes.jl

## More patterns

In [None]:
%%writefile ./tutorial/tutorial/spiders/author_spider.py
import scrapy

class AuthorSpider(scrapy.Spider):
    name = 'author'
    
    start_urls = ['https://quotes.toscrape.com/']
    
    def parse(self, response):
        author_page_links = response.css('.author + a')