In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import HtmlXPathSelector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
import logging
from bs4 import BeautifulSoup

In [2]:
class JobItem(scrapy.Item):
    """Container for the data collected about a single job posting."""

    # Absolute URL of the job page; set in IndeedScrapping.parse.
    joburl = scrapy.Field()
    # Job title text; set in IndeedScrapping.parse_next_site.
    title = scrapy.Field()
    # Job description with the HTML markup stripped; set in parse_next_site.
    desc = scrapy.Field()
    # Salary text as found on the page; set in parse_next_site.
    salary = scrapy.Field()

In [None]:
class IndeedScrapping(scrapy.Spider):
    """Collect job postings from an Indeed search-results page.

    ``parse`` handles the search-results page: every job card that carries a
    link becomes a ``JobItem`` holding the absolute job URL, plus a follow-up
    request for that URL. ``parse_next_site`` handles the followed page and
    fills in the title, plain-text description and salary before yielding
    the item.
    """

    name = "IndeedScrapping"

    allowed_domains = ["indeed.com"]

    start_urls = [
        'https://www.indeed.com/jobs?q=software+engineer&l=Boston%2C+MA',
    ]

    # NOTE: there is deliberately no ``rules`` attribute. Rules are only
    # honoured by ``CrawlSpider``; on a plain ``scrapy.Spider`` they are
    # silently ignored. Links are followed explicitly in ``parse`` instead,
    # which also lets each request carry its partially-filled item.

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'DEPTH_LIMIT': 1,
        'FEED_FORMAT':'json',
        'FEED_URI': 'indeed_jobs.json',
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_next_site(self, response):
        """Fill in the job details for the item attached to ``response``.

        The item is taken from ``response.meta['item']``. When no item was
        attached, a new ``JobItem`` is created with ``joburl`` set to
        ``response.url`` so the callback still produces a usable record.

        Fields whose selector matches nothing are stored as ``None``. If the
        page contains several ``div.jobsearch-JobComponent`` elements, the
        values of the last one are kept. If it contains none, the item is
        yielded with only the fields it already had.

        Yields:
            The (possibly partially) populated ``JobItem``.
        """
        item = response.meta.get('item')
        if item is None:
            item = JobItem()
            item['joburl'] = response.url

        for job in response.css('div.jobsearch-JobComponent'):
            item['title'] = job.css('div.jobsearch-DesktopStickyContainer > h3.jobsearch-JobInfoHeader-title::text').get()

            desc_html = job.css('div.jobsearch-JobComponent-description').get()
            # BeautifulSoup(None, ...) raises TypeError, so only strip markup
            # when the description element was actually found.
            if desc_html is None:
                item['desc'] = None
            else:
                item['desc'] = BeautifulSoup(desc_html, "lxml").text

            item['salary'] = job.css('span.icl-u-xs-mr--xs::text').get()
        yield item

    def parse(self, response):
        """Yield a follow-up request for every job card that has a link.

        Each request carries a new ``JobItem`` (with ``joburl`` already set)
        in ``meta['item']`` and is handled by ``parse_next_site``. Cards
        without an ``href`` are skipped.
        """
        for job in response.css('div.jobsearch-SerpJobCard'):
            url = job.css('a::attr("href")').get()
            if url is None:
                continue

            item = JobItem()
            item['joburl'] = response.urljoin(url)
            yield scrapy.Request(
                item['joburl'],
                callback=self.parse_next_site,
                dont_filter=True,
                meta={'item': item},
            )
            

In [None]:
# Create a crawler process (no settings passed here), schedule the
# IndeedScrapping spider on it, and start the crawl.
process = CrawlerProcess()
process.crawl(IndeedScrapping)
process.start()