# Creating the Spider

- Required imports

In [17]:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess

- The part we will focus on: the actual spider

In [None]:
class SpiderClassName(scrapy.Spider):
    """Minimal spider template: fetch one page and save its raw body to disk."""

    name = "spider_name"

    def start_requests(self):
        """Yield the single seed request, handled by ``parse``."""
        yield scrapy.Request(
            url='https://www.datacamp.com/courses/all',
            callback=self.parse,
        )

    def parse(self, response):
        """Write the raw response body to ``DC_courses.html``."""
        # Binary mode: response.body is written as-is, without decoding.
        with open('DC_courses.html', 'wb') as out_file:
            out_file.write(response.body)

- Running the spider

In [None]:
# Initiate a CrawlerProcess
process = CrawlerProcess()

# Tell the process which spider to use. This must be a spider class that is
# actually defined in the notebook; `SpiderClassName` is the template above.
process.crawl(SpiderClassName)

# Start the crawling process
process.start()

- Example of a spider

In [None]:
from typing import Iterable
from scrapy.http import Request
import scrapy
from scrapy.crawler import CrawlerProcess


class DC_Chapter_Spider(scrapy.Spider):
    """Crawl the DataCamp course list and collect chapter titles per course.

    Results are stored in the module-level ``dc_dict`` mapping, keyed by the
    cleaned course title, with the list of cleaned chapter titles as value.
    ``dc_dict`` must exist before the crawl is started.
    """

    name = 'dc_chapter_spider'

    def start_requests(self):
        """Yield the seed request for the course listing page."""
        url = 'https://www.datacamp.com/courses/all'
        yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        """Follow every course link found inside a ``div.course-block``."""
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    def parse_pages(self, response):
        """Record the chapter titles of one course page in ``dc_dict``.

        Pages on which no course title can be located are skipped with a
        warning instead of failing on ``None.strip()``.
        """
        # Direct to the course title text
        crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
        # extract_first() returns None when the selector matched nothing
        crs_title_raw = crs_title.extract_first()
        if crs_title_raw is None:
            self.logger.warning(
                'No course title found on %s; skipping page', response.url
            )
            return
        # Clean the course title text
        crs_title_ext = crs_title_raw.strip()
        # Direct to the chapter title text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext
        
# Module-level result store: DC_Chapter_Spider.parse_pages writes
# {course title: [chapter titles]} into this dict, so it must be created
# before the crawl is started.
dc_dict = dict()

# Create the crawler process, register the spider class, and run the crawl.
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

In [19]:
class YuGiHo_Spider(scrapy.Spider):
    """Fetch the Blue-Eyes archetype cards from the YGOPRODeck API.

    The JSON response is expected to hold a list of card objects under the
    ``data`` key; the ``name`` and ``type`` of each card are written to
    ``cards_test.csv``.
    """

    name = 'YuGiHo_Spider'

    def start_requests(self):
        """Yield the single API request, handled by ``parse``."""
        url = 'https://db.ygoprodeck.com/api/v7/cardinfo.php?archetype=Blue-Eyes'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write one CSV row (name, type) per card in the JSON payload."""
        data = response.json()
        fieldnames = ['name', 'type']
        # Explicit UTF-8 so non-ASCII card names do not depend on the
        # platform default encoding; newline='' is required by the csv module.
        with open('cards_test.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(
                {'name': card['name'], 'type': card['type']}
                for card in data['data']
            )
        # No explicit close(): the with-block has already closed the file.


# Create the crawler process, register the spider class, and run the crawl.
# NOTE: the recorded output of this cell ends in ReactorNotRestartable.
# Twisted's reactor can be started only once per Python process, so if an
# earlier cell in this kernel already called process.start(), restart the
# kernel and run only the imports, this spider's class, and this cell.
process = CrawlerProcess()
process.crawl(YuGiHo_Spider)
process.start()




2023-10-29 18:44:15 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2023-10-29 18:44:15 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Windows-10-10.0.19042-SP0
2023-10-29 18:44:15 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-10-29 18:44:15 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-10-29 18:44:15 [scrapy.extensions.telnet] INFO: Telnet Password: afc2de888de7ecdc
2023-10-29 18:44:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.lo

ReactorNotRestartable: 