#### 通过列表页爬取所有文章的url地址
爬取完第一页后，将第二页传递给scrapy，让scrapy自动去爬取第二页
1. 改写start_urls
    * `start_urls = ['http://blog.jobbole.com/110287/']` -> `start_urls = ['http://blog.jobbole.com/all-posts/']`（由单篇文章页改为文章列表页，与下文完整代码一致）
2. 改写parse函数
    ```
    def parse(self, response):
        """Schedule every article on a list page, then the next list page.

        1. Extract the article URLs from the list page and hand each one to
           the detail callback (``parse_detail``) for field extraction.
        2. Extract the next-page URL and hand it to scrapy for download.

        Needs ``from scrapy.http import Request`` and
        ``from urllib import parse`` in scope.

        :param response: the downloaded list page
        :return: generator of ``Request`` objects for scrapy to download
        """
        # Collect every article URL on the list page. Each URL is wrapped in a
        # Request object so that scrapy downloads it and then runs the callback.
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()

        # On some sites the href is incomplete: it carries no domain, and the
        # domain of the current page is implied. For example
        # 'http://blog.jobbole.com/110287/' may be written as '/110287/', and
        # the extracted href then contains no domain at all. Only when post_url
        # is already a complete URL would this be enough:
        #     yield Request(url=post_url, callback=self.parse_detail)
        # So the complete URL is built as:
        #     domain taken from response.url + path taken from post_url (href)
        # which is what parse.urljoin(response.url, post_url) does:
        #     urljoin('http://blog.jobbole.com/all-posts/', '/110287/')
        #         -> 'http://blog.jobbole.com/110287/'
        #     urljoin('http://www.cwi.nl/Python.html', 'http://www.cwi.nl/FAQ.html')
        #         -> 'http://www.cwi.nl/FAQ.html'   (already absolute: unchanged)
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # Extract the next page and hand it to scrapy for download. The callback
        # here is parse, not parse_detail, because next_url is a list page and
        # not an article page.
        # extract_first() yields a single string (or None on the last page);
        # extract() would yield a list, which urljoin rejects with a TypeError.
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
    ```
3. 增添parse_detail函数作为解析函数(回调函数)

In [2]:
# Draft of parse_detail kept inside a triple-quoted string literal: running
# this cell defines no function, it only evaluates (and echoes) the string.
'''
 def parse_detail(self, response):
        # 提取文章具体字段

        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()

        create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0]. \
            strip().replace('·', '').strip()

        praise_num = int(response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0])

        fav_num = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        match_re = re.match('.*?(\d+).*', fav_num)
        if match_re:
            fav_num = match_re.group(1)
        else:
            fav_num = 0

        comments_num = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
        match_re = re.match('.*?(\d+).*', comments_num)
        if match_re:
            comments_num = match_re.group(1)
        else:
            comments_num = 0
        
        tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag_list = [element for element in tag_list if not element.strip().endswith('评论')]
        tags = ','.join(tag_list)
        pass
'''

'\n def parse_detail(self, response):\n        # 提取文章具体字段\n\n        title = response.xpath(\'//div[@class="entry-header"]/h1/text()\').extract()\n\n        create_time = response.xpath(\'//p[@class="entry-meta-hide-on-mobile"]/text()\').extract()[0].             strip().replace(\'·\', \'\').strip()\n\n        praise_num = int(response.xpath("//span[contains(@class, \'vote-post-up\')]/h10/text()").extract()[0])\n\n        fav_num = response.xpath("//span[contains(@class, \'bookmark-btn\')]/text()").extract()[0]\n        match_re = re.match(\'.*?(\\d+).*\', fav_num)\n        if match_re:\n            fav_num = match_re.group(1)\n\n        comments_num = response.xpath("//a[@href=\'#article-comment\']/span/text()").extract()[0]\n        match_re = re.match(\'.*?(\\d+).*\', comments_num)\n        if match_re:\n            comments_num = match_re.group(1)\n\n        tag_list = response.xpath(\'//p[@class="entry-meta-hide-on-mobile"]/a/text()\').extract()\n        tag_list = [element for el

response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()  
Out[4]:
['http://blog.jobbole.com/114321/',
 'http://blog.jobbole.com/114319/',
 'http://blog.jobbole.com/114311/',
 'http://blog.jobbole.com/114308/',
 'http://blog.jobbole.com/114303/',
 'http://blog.jobbole.com/114297/',
 'http://blog.jobbole.com/114285/',
 'http://blog.jobbole.com/114283/',
 'http://blog.jobbole.com/114280/',
 'http://blog.jobbole.com/114276/',
 'http://blog.jobbole.com/114273/',
 'http://blog.jobbole.com/114270/',
 'http://blog.jobbole.com/114268/',
 'http://blog.jobbole.com/114261/',
 'http://blog.jobbole.com/114168/',
 'http://blog.jobbole.com/114256/',
 'http://blog.jobbole.com/114253/',
 'http://blog.jobbole.com/114250/',
 'http://blog.jobbole.com/114167/',
 'http://blog.jobbole.com/114241/']
 ___
 如何获取下一页？

<img src='14.png' width=500>
<img src='15.png' width=500>

next_url = response.css('.next.page-numbers::attr(href)').extract()  
Out[10]: ['http://blog.jobbole.com/all-posts/page/2/']

### 完整代码

In [4]:
class JobboleSpider(scrapy.Spider):
    """Spider that walks the jobbole list pages and visits every article.

    ``parse`` handles list pages: it schedules one request per article and one
    request for the next list page. ``parse_detail`` handles article pages.

    The names ``scrapy``, ``Request`` (``from scrapy.http import Request``),
    ``parse`` (``from urllib import parse``) and ``re`` must be imported before
    this class is defined; otherwise the class statement itself fails with a
    NameError.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Schedule every article on a list page, then the next list page.

        1. Extract the article URLs from the list page and hand each one to
           ``parse_detail`` for field extraction.
        2. Extract the next-page URL and hand it to scrapy for download.

        :param response: the downloaded list page
        :return: generator of ``Request`` objects for scrapy to download
        """
        print(response.url)
        # Article links found on the list page; each is handed to parse_detail.
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            # urljoin completes hrefs that lack a domain and leaves absolute
            # URLs untouched, so print the URL that is really requested.
            article_url = parse.urljoin(response.url, post_url)
            print(article_url)
            yield Request(url=article_url, callback=self.parse_detail)

        # The next page is another list page, so the callback is parse again
        # (not parse_detail).
        # extract_first() yields one string (or None on the last page);
        # extract() would yield a list, which urljoin rejects with a TypeError.
        next_url = response.css('.next.page-numbers::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in *text* as an int, else *default*."""
        match_re = re.search(r'\d+', text or '')
        return int(match_re.group()) if match_re else default

    def parse_detail(self, response):
        """Extract the individual fields of one article page.

        Missing nodes fall back to an empty string / 0 instead of raising
        IndexError or ValueError.

        NOTE(review): the extracted values are only bound to local names; the
        method still returns None, exactly as before.

        :param response: the downloaded article page
        :return: None
        """
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first(default='')

        # The first text node holds the date followed by a '·' separator.
        create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()') \
            .extract_first(default='').strip().replace('·', '').strip()

        praise_num = self._first_int(
            response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract_first(default=''))

        # The bookmark and comment labels mix a count with text; the count may
        # be absent, in which case the value is 0.
        fav_num = self._first_int(
            response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract_first(default=''))

        comments_num = self._first_int(
            response.xpath("//a[@href='#article-comment']/span/text()").extract_first(default=''))

        # The tag links also include the "N 评论" comment link; drop it.
        tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag_list = [element for element in tag_list if not element.strip().endswith('评论')]
        tags = ','.join(tag_list)

NameError: name 'scrapy' is not defined