# Indeed Scraper

## Importing Libraries

In [1]:
import csv
import logging
import time
from urllib.parse import urlencode

import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.spiders import Rule, CrawlSpider

### Defining Class
Defining the attributes (fields) that the scraper collects for each job posting.

In [2]:
class JobItem(scrapy.Item):
    """Container for one scraped job posting.

    The spider in this notebook fills the search-level fields (``job_title``,
    ``location``) when it builds the search request, and the posting-level
    fields when it parses the individual job page.
    """
    job_title = scrapy.Field() # search title, e.g. "software engineer"
    location = scrapy.Field() # "<city>, <state>", e.g. "Boston, MA"
    scrap_date = scrapy.Field() # date the listing page was parsed, formatted "%d-%m-%Y"
    scrap_websiste = scrapy.Field() # e.g. indeed.com; NOTE(review): name is misspelt ("websiste") and the spider in this notebook never sets it
    job_posting_date = scrapy.Field() # NOTE(review): never set by the spider in this notebook
    job_posting_title = scrapy.Field() # title shown on the posting page, e.g. "Software engineer in MathWorks"
    company = scrapy.Field() # company text taken from the posting page
    job_posting_url = scrapy.Field() # URL of the posting, e.g. www.indeed.com/softwareengineer1.html
    job_posting_desc = scrapy.Field() # plain-text job description (HTML stripped with BeautifulSoup)
    job_posting_salary = scrapy.Field() # salary text as shown, e.g. "90000 a year"

`url_searches` is a list of searches (one dictionary of search attributes per search) used to scrape indeed.com.

Search Attributes: 
- Title
- City
- State

In [3]:
# One entry per Indeed search: the job title to query and the city/state to search in.
url_searches = [
    dict(title='software engineer', city='Boston', state='MA'),
    dict(title='data scientist', city='Boston', state='MA'),
    dict(title='technical writer', city='San Francisco', state='CA'),
    # Disabled search, kept for reference:
    # dict(title='data analyst', city='San Francisco', state='CA'),
]

Defining class `IndeedScrapper`

Scraping is done to a depth level of 1. That is, each search-results page is scraped to find the jobs listed on it. Each job links to its own page, which is then scraped for the job attributes defined in class `JobItem`. The scraper then returns to the search-results page to scrape the next job.

In [4]:
class IndeedScrapper(scrapy.Spider):
    """Spider that scrapes job postings from indeed.com search results.

    For every search in ``url_searches`` it requests a few paginated result
    pages (``parse``), follows each job card found there to the posting page
    (``parse_next_site``) and yields one ``JobItem`` per posting page.
    Items are exported as JSON to ``indeed_<dd-mm-YYYY>.json``.
    """
    name = "IndeedScrapping"

    allowed_domains = ["indeed.com"]
    # NOTE: `rules` is only consulted by CrawlSpider; on a plain scrapy.Spider
    # it has no effect. It is kept so the class attribute stays available.
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_next_site"
        )
    ]
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Depth 0 = search-results pages, depth 1 = individual postings.
        'DEPTH_LIMIT': 1,
        'FEED_FORMAT':'json',
        'FEED_URI': 'indeed_'+time.strftime("%d-%m-%Y")+'.json',
    }
    
    def __init__(self, url_searches=url_searches, *args, **kwargs):
        """Create the spider.

        Args:
            url_searches: list of dicts with ``'title'``, ``'city'`` and
                ``'state'`` keys; defaults to the module-level ``url_searches``.
            *args, **kwargs: forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.url_searches = url_searches
        # Start every run with an empty feed file. This is done here, once,
        # before the feed exporter opens the file; truncating it while the
        # crawl is running would discard items that were already exported.
        self._reset_feed_file()

    def _reset_feed_file(self):
        """Truncate the file named by the ``FEED_URI`` custom setting, if any."""
        feed_path = (self.custom_settings or {}).get('FEED_URI')
        if feed_path:
            with open(feed_path, 'w'):
                pass

    def start_requests(self):
        """Yield one request per (search, results page) combination.

        Each request carries its own ``JobItem`` in ``meta['item']``,
        pre-filled with the search title, the location and the page URL.
        """
        for obj in self.url_searches:
            location = obj['city']+", "+obj['state']
            # NOTE(review): offsets 10, 20, 30 skip the first results page
            # (start=0); kept as in the original - confirm this is intended.
            for page in range(10,40,10):
                # urlencode percent-escapes the values (space -> '+', ',' -> '%2C');
                # a list of pairs keeps the parameter order stable.
                query = urlencode([('q', obj['title']), ('l', location), ('start', page)])
                paginated_url = "https://www.indeed.com/jobs?"+query
                # A fresh item per request: requests are handled concurrently,
                # so a shared item would be overwritten by other callbacks.
                item = JobItem()
                item['location'] = location
                item['job_title'] = obj['title']
                item['job_posting_url'] = paginated_url
                yield scrapy.Request(
                    paginated_url,
                    meta={'start_url': paginated_url, 'item': item},
                    callback=self.parse,
                )
                
    def parse_next_site(self, response):
        """Parse a single job-posting page and yield the completed item.

        The item is yielded exactly once per response, even when no
        ``div.jobsearch-JobComponent`` element is found (the posting-level
        fields are then left unset).
        """
        item = response.meta['item']
        for job in response.css('div.jobsearch-JobComponent'):
            title = job.css('div.jobsearch-DesktopStickyContainer > h3.jobsearch-JobInfoHeader-title::text').get()
            item['job_posting_title'] = title
            desc = job.css('div.jobsearch-JobComponent-description').get()
            # .get() returns None when the description node is missing;
            # BeautifulSoup cannot parse None, so store None in that case.
            item['job_posting_desc'] = BeautifulSoup(desc, "lxml").text if desc is not None else None
            item['job_posting_salary'] = job.css('span.icl-u-xs-mr--xs::text').get()
            item['company'] = job.css('div.icl-u-lg-mr--sm::text').get()
        yield item

    def parse(self, response):
        """Parse a search-results page and follow every job card that has a link.

        Yields one request per job card; each request gets its own copy of
        the search-level item with ``scrap_date`` and ``job_posting_url`` set.
        """
        search_item = response.meta['item']
        scrap_date = time.strftime("%d-%m-%Y")
        for job in response.css('div.jobsearch-SerpJobCard'):
            url = job.css('a::attr("href")').get()
            if url is None:
                continue
            url = response.urljoin(url)
            # Copy so that each posting is filled in independently.
            item = search_item.copy()
            item['scrap_date'] = scrap_date
            item['job_posting_url'] = url
            # dont_filter=True: the same posting may be reached from several searches.
            yield scrapy.Request(
                url,
                callback=self.parse_next_site,
                dont_filter=True,
                meta={'item': item},
            )

In [5]:
# Run the spider in-process. crawl() is given the spider *class*;
# "IndeedScrapping" is only the spider's `name` string, not a Python name,
# so passing it as an identifier raises NameError on a fresh kernel.
# NOTE: process.start() blocks until the crawl finishes, and the Twisted
# reactor cannot be restarted - restart the kernel before re-running this cell.
process = CrawlerProcess()
process.crawl(IndeedScrapper)
process.start()

2019-04-15 15:53:34 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2019-04-15 15:53:34 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 18.7.0, Python 3.5.4 |Anaconda custom (64-bit)| (default, Nov  8 2017, 14:34:30) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.5.0 (OpenSSL 1.0.2r  26 Feb 2019), cryptography 2.1.4, Platform Windows-10-10.0.17134-SP0
2019-04-15 15:53:34 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'indeed_15-04-2019.json', 'LOG_LEVEL': 30, 'DEPTH_LIMIT': 1}
