In [None]:
# -*- coding: utf-8 -*-
#source: https://python.gotrained.com/scrapy-tutorial-web-scraping-craigslist/
import scrapy
from scrapy import Request
 

class JobsSpider(scrapy.Spider):
    """Scrape healthcare job listings from the Seattle Craigslist search pages.

    ``parse`` handles a search-results page: it yields one item per listing
    and then follows the "next" pagination link back into ``parse``.

    ``parse_page`` is a callback for an individual posting page. Nothing in
    this class schedules it; it only runs if a Request is issued with
    ``callback=self.parse_page`` and a ``meta`` dict carrying the 'URL',
    'Title' and 'Address' values it reads.
    """

    name = 'jobs'    #Name of the spider

    #Domains the spider is allowed to crawl. Scrapy expects bare domain names
    #here (no scheme, path or query string); URL entries are ignored with a warning.
    allowed_domains = ['seattle.craigslist.org']

    #URL(s) the crawl starts from
    start_urls = ['https://seattle.craigslist.org/search/hea?']

    def parse(self, response):
        """Yield one item per listing on a results page, then follow pagination.

        Each item is a dict with the keys 'URL', 'Title' and 'Address'.
        When the page has a "next" link, a Request for it is yielded with
        this method as the callback.
        """
        #'//' selects matching nodes anywhere in the document.
        #No extract() here: the selectors themselves are kept so that each
        #listing wrapper can be queried with relative XPaths below.
        jobs = response.xpath('//p[@class="result-info"]')

        for job in jobs:
            #Relative XPath: the <a> directly under this listing's <p>
            title = job.xpath('a/text()').extract_first()

            #extract_first("") falls back to an empty string when the listing
            #has no result-hood span, so the slice below never runs on None.
            #The slice drops the first two characters and the last one
            #(presumably the " (" and ")" wrapped around the location - verify
            #against the page markup).
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]

            #The href may be relative; resolve it against the page URL
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)

            yield {'URL': absolute_url, 'Title': title, 'Address': address}

        #Follow the pagination link only when one is present
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first("")
        if relative_next_url:
            absolute_next_url = response.urljoin(relative_next_url)
            #dont_filter=True tells the scheduler not to filter this request
            yield Request(absolute_next_url, callback=self.parse, dont_filter=True)

    def parse_page(self, response):
        """Yield one detailed item for a single posting page.

        The 'URL', 'Title' and 'Address' values are read from
        ``response.meta`` (None when a key is absent). 'Description' is the
        concatenated text nodes of the element with id "postingbody".
        'Compensation' and 'Employment Type' are the first and second
        ``<b>`` text nodes found under ``<p class="attrgroup">``; each
        defaults to an empty string when the page does not provide it.
        """
        url = response.meta.get('URL')
        title = response.meta.get('Title')
        address = response.meta.get('Address')

        description = ''.join(response.xpath('//*[@id = "postingbody"]/text()').extract())

        #Extract the attribute values once and guard the positional lookups,
        #so a posting with fewer than two attributes does not raise IndexError.
        attributes = response.xpath('//p[@class = "attrgroup"]/b/text()').extract()
        compensation = attributes[0] if len(attributes) > 0 else ''
        employment_type = attributes[1] if len(attributes) > 1 else ''

        yield {'URL': url, 'Title': title, 'Address': address, 'Description': description, 'Compensation': compensation, 'Employment Type': employment_type}