Commit

Add files via upload
GoTrained committed Aug 5, 2017
1 parent 6c4107f commit e46c2cd
Showing 4 changed files with 67 additions and 67 deletions.
58 changes: 29 additions & 29 deletions craigslist/spiders/all-pages-content.py
import scrapy
from scrapy import Request

class JobsSpider(scrapy.Spider):
    name = "jobscontent"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://newyork.craigslist.org/search/egr"]

    def parse(self, response):
        # Each search result sits in a <p class="result-info"> element.
        jobs = response.xpath('//p[@class="result-info"]')
        for job in jobs:
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = job.xpath('a/text()').extract_first()
            # The hood text is rendered like " (Brooklyn)"; [2:-1] strips
            # the wrapping " (" and ")".
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            # Carry the listing fields into parse_page via request meta.
            yield Request(absolute_url, callback=self.parse_page,
                          meta={'URL': absolute_url, 'Title': title, 'Address': address})
        # Follow pagination; note that extract_first() returns None on the
        # last results page (see the guard sketch after this file).
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = "https://newyork.craigslist.org" + relative_next_url
        yield Request(absolute_next_url, callback=self.parse)

    def parse_page(self, response):
        url = response.meta.get('URL')
        title = response.meta.get('Title')
        address = response.meta.get('Address')
        # The posting body is split across several text nodes; join and trim.
        description = "".join(line for line in response.xpath('//*[@id="postingbody"]/text()').extract()).strip()
        compensation = response.xpath('//p[@class="attrgroup"]/span[1]/b/text()').extract_first()
        employment_type = response.xpath('//p[@class="attrgroup"]/span[2]/b/text()').extract_first()
        yield {'URL': url, 'Title': title, 'Address': address,
               'Description': description, 'Compensation': compensation,
               'Employment Type': employment_type}
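One edge case in the pagination above: on the last results page the "button next" XPath matches nothing, extract_first() returns None, and the string concatenation raises a TypeError. A minimal guard for the final three lines of parse, reusing the spider's own names (a sketch, not part of this commit):

relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
if relative_next_url is not None:
    # Only schedule the next request when a next page actually exists.
    yield Request(response.urljoin(relative_next_url), callback=self.parse)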
34 changes: 17 additions & 17 deletions craigslist/spiders/all-pages.py
import scrapy
from scrapy import Request

class JobsSpider(scrapy.Spider):
    name = "jobsall"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://newyork.craigslist.org/search/egr"]

    def parse(self, response):
        jobs = response.xpath('//p[@class="result-info"]')
        for job in jobs:
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = job.xpath('a/text()').extract_first()
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            yield {'URL': absolute_url, 'Title': title, 'Address': address}
        # Unlike all-pages-content.py, the next-page link is resolved with
        # response.urljoin() rather than a hardcoded domain prefix.
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = response.urljoin(relative_next_url)
        yield Request(absolute_next_url, callback=self.parse)
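response.urljoin() is the more robust of the two approaches: it resolves the href against the page actually being parsed, so the spider keeps working if it is pointed at another Craigslist subdomain. The same resolution with the standard library (the ?s=120 offset is a hypothetical next-page href, not taken from this commit):

from urllib.parse import urljoin

# A server-relative next-page href resolved against the current page URL.
print(urljoin("https://newyork.craigslist.org/search/egr", "/search/egr?s=120"))
# https://newyork.craigslist.org/search/egr?s=120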
26 changes: 13 additions & 13 deletions craigslist/spiders/one-page.py
import scrapy

class JobsSpider(scrapy.Spider):
    name = "jobsone"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://newyork.craigslist.org/search/egr"]

    def parse(self, response):
        jobs = response.xpath('//p[@class="result-info"]')
        for job in jobs:
            relative_url = job.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = job.xpath('a/text()').extract_first()
            address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            yield {'URL': absolute_url, 'Title': title, 'Address': address}
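The [2:-1] slice on the neighborhood text is easy to misread: it strips the punctuation Craigslist wraps around the hood. A quick illustration with a made-up raw value:

# Hypothetical raw text of the result-hood span, as Craigslist renders it.
raw_hood = " (Brooklyn)"
print(raw_hood[2:-1])  # Brooklyn

The extract_first("") default matters here too: when a listing has no hood span, the slice runs on an empty string and yields "" instead of raising on None.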
16 changes: 8 additions & 8 deletions craigslist/spiders/titles.py
import scrapy

class JobsSpider(scrapy.Spider):
    name = "titles"
    allowed_domains = ["craigslist.org"]
    start_urls = ["https://newyork.craigslist.org/search/egr"]

    def parse(self, response):
        # extract() returns all matching title strings as a list.
        titles = response.xpath('//a[@class="result-title hdrlnk"]/text()').extract()
        for title in titles:
            yield {'Title': title}
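These spiders are written for the scrapy crawl CLI (e.g. scrapy crawl titles -o titles.json), but they can also be driven from a script. A minimal sketch for the titles spider, assuming the standard project layout for the import and Scrapy 2.1+ for the FEEDS setting; the output filename is illustrative:

from scrapy.crawler import CrawlerProcess

from craigslist.spiders.titles import JobsSpider  # assumed import path

process = CrawlerProcess(settings={
    # FEEDS requires Scrapy >= 2.1; older versions use FEED_URI/FEED_FORMAT.
    "FEEDS": {"titles.json": {"format": "json"}},
})
process.crawl(JobsSpider)
process.start()  # blocks until the crawl finishes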
