From e46c2cd1b2a57e1793c85a337891f2d675c0d687 Mon Sep 17 00:00:00 2001 From: GoTrained Date: Sat, 5 Aug 2017 13:40:31 +0300 Subject: [PATCH] Add files via upload --- craigslist/spiders/all-pages-content.py | 58 ++++++++++++------------- craigslist/spiders/all-pages.py | 34 +++++++-------- craigslist/spiders/one-page.py | 26 +++++------ craigslist/spiders/titles.py | 16 +++---- 4 files changed, 67 insertions(+), 67 deletions(-) diff --git a/craigslist/spiders/all-pages-content.py b/craigslist/spiders/all-pages-content.py index 455c836..373193d 100644 --- a/craigslist/spiders/all-pages-content.py +++ b/craigslist/spiders/all-pages-content.py @@ -2,33 +2,33 @@ from scrapy import Request class JobsSpider(scrapy.Spider): - name = "jobscontent" - allowed_domains = ["craigslist.org"] - start_urls = ["https://newyork.craigslist.org/search/egr"] + name = "jobscontent" + allowed_domains = ["craigslist.org"] + start_urls = ["https://newyork.craigslist.org/search/egr"] - def parse(self, response): - jobs = response.xpath('//p[@class="result-info"]') - - for job in jobs: - relative_url = job.xpath('a/@href').extract_first() - absolute_url = response.urljoin(relative_url) - title = job.xpath('a/text()').extract_first() - address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] - - yield Request(absolute_url, callback=self.parse_page, meta={'URL': absolute_url, 'Title': title, 'Address':address}) - - relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first() - absolute_next_url = "https://newyork.craigslist.org" + relative_next_url - yield Request(absolute_next_url, callback=self.parse) - - def parse_page(self, response): - url = response.meta.get('URL') - title = response.meta.get('Title') - address = response.meta.get('Address') - - description = "".join(line for line in response.xpath('//*[@id="postingbody"]/text()').extract()).strip() - - compensation = 
response.xpath('//p[@class="attrgroup"]/span[1]/b/text()').extract_first() - employment_type = response.xpath('//p[@class="attrgroup"]/span[2]/b/text()').extract_first() - - yield{'URL': url, 'Title': title, 'Address':address, 'Description':description, 'Compensation':compensation, 'Employment Type':employment_type} \ No newline at end of file + def parse(self, response): + jobs = response.xpath('//p[@class="result-info"]') + + for job in jobs: + relative_url = job.xpath('a/@href').extract_first() + absolute_url = response.urljoin(relative_url) + title = job.xpath('a/text()').extract_first() + address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] + + yield Request(absolute_url, callback=self.parse_page, meta={'URL': absolute_url, 'Title': title, 'Address':address}) + + relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first() + absolute_next_url = response.urljoin(relative_next_url) + yield Request(absolute_next_url, callback=self.parse) + + def parse_page(self, response): + url = response.meta.get('URL') + title = response.meta.get('Title') + address = response.meta.get('Address') + + description = "".join(line for line in response.xpath('//*[@id="postingbody"]/text()').extract()).strip() + + compensation = response.xpath('//p[@class="attrgroup"]/span[1]/b/text()').extract_first() + employment_type = response.xpath('//p[@class="attrgroup"]/span[2]/b/text()').extract_first() + + yield{'URL': url, 'Title': title, 'Address':address, 'Description':description, 'Compensation':compensation, 'Employment Type':employment_type} \ No newline at end of file diff --git a/craigslist/spiders/all-pages.py b/craigslist/spiders/all-pages.py index 8ced5ee..e18f213 100644 --- a/craigslist/spiders/all-pages.py +++ b/craigslist/spiders/all-pages.py @@ -2,21 +2,21 @@ from scrapy import Request class JobsSpider(scrapy.Spider): - name = "jobsall" - allowed_domains = ["craigslist.org"] - 
start_urls = ["https://newyork.craigslist.org/search/egr"] + name = "jobsall" + allowed_domains = ["craigslist.org"] + start_urls = ["https://newyork.craigslist.org/search/egr"] - def parse(self, response): - jobs = response.xpath('//p[@class="result-info"]') - - for job in jobs: - relative_url = job.xpath('a/@href').extract_first() - absolute_url = response.urljoin(relative_url) - title = job.xpath('a/text()').extract_first() - address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] - - yield{'URL':absolute_url, 'Title':title, 'Address':address} - - relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first() - absolute_next_url = response.urljoin(relative_next_url) - yield Request(absolute_next_url, callback=self.parse) \ No newline at end of file + def parse(self, response): + jobs = response.xpath('//p[@class="result-info"]') + + for job in jobs: + relative_url = job.xpath('a/@href').extract_first() + absolute_url = response.urljoin(relative_url) + title = job.xpath('a/text()').extract_first() + address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] + + yield{'URL':absolute_url, 'Title':title, 'Address':address} + + relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first() + absolute_next_url = response.urljoin(relative_next_url) + yield Request(absolute_next_url, callback=self.parse) \ No newline at end of file diff --git a/craigslist/spiders/one-page.py b/craigslist/spiders/one-page.py index ab68820..3fc4c48 100644 --- a/craigslist/spiders/one-page.py +++ b/craigslist/spiders/one-page.py @@ -1,17 +1,17 @@ import scrapy class JobsSpider(scrapy.Spider): - name = "jobsone" - allowed_domains = ["craigslist.org"] - start_urls = ["https://newyork.craigslist.org/search/egr"] + name = "jobsone" + allowed_domains = ["craigslist.org"] + start_urls = ["https://newyork.craigslist.org/search/egr"] - def parse(self, 
response): - jobs = response.xpath('//p[@class="result-info"]') - - for job in jobs: - relative_url = job.xpath('a/@href').extract_first() - absolute_url = response.urljoin(relative_url) - title = job.xpath('a/text()').extract_first() - address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] - - yield{'URL':absolute_url, 'Title':title, 'Address':address} \ No newline at end of file + def parse(self, response): + jobs = response.xpath('//p[@class="result-info"]') + + for job in jobs: + relative_url = job.xpath('a/@href').extract_first() + absolute_url = response.urljoin(relative_url) + title = job.xpath('a/text()').extract_first() + address = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1] + + yield{'URL':absolute_url, 'Title':title, 'Address':address} \ No newline at end of file diff --git a/craigslist/spiders/titles.py b/craigslist/spiders/titles.py index ba68c78..a5cc35e 100644 --- a/craigslist/spiders/titles.py +++ b/craigslist/spiders/titles.py @@ -1,12 +1,12 @@ import scrapy class JobsSpider(scrapy.Spider): - name = "titles" - allowed_domains = ["craigslist.org"] - start_urls = ["https://newyork.craigslist.org/search/egr"] + name = "titles" + allowed_domains = ["craigslist.org"] + start_urls = ["https://newyork.craigslist.org/search/egr"] - def parse(self, response): - titles = response.xpath('//a[@class="result-title hdrlnk"]/text()').extract() - - for title in titles: - yield {'Title': title} \ No newline at end of file + def parse(self, response): + titles = response.xpath('//a[@class="result-title hdrlnk"]/text()').extract() + + for title in titles: + yield {'Title': title} \ No newline at end of file