In [4]:
class BasicAnalyzer:
    """The simplest page analyzer: hands back the ``text`` attribute of a response."""

    def parse(self, task_name, r):
        """
        Return the body text of response ``r``.

        ``task_name`` is accepted so that every analyzer shares the same
        ``parse(task_name, r)`` signature; it is not used here.
        """
        page_text = r.text
        return page_text

# class Cralwer(object):

In [5]:
import requests
from bs4 import BeautifulSoup
import time


class Cralwer(object):
    """
    Fetch a sequence of URLs one by one and hand each response to an analyzer.

    A task is either a URL string or a ``(task_name, url)`` 2-tuple.  The
    crawler is its own iterator: ``for result in crawler`` performs one fetch
    per iteration and yields whatever the analyzer's ``parse`` returns.
    """

    def __init__(self, task_or_tasks, analyzer=None,
                 headers=None, timeout=30, encoding=None, wait_time=-1):
        """
        Parameters:
            task_or_tasks: a single URL string, or a list/tuple of tasks.
                Note that a tuple is always treated as a *sequence of tasks*,
                so a single named task must be wrapped: ``[(name, url)]``.
            analyzer: object with a ``parse(task_name, response)`` method.
                Defaults to a fresh ``BasicAnalyzer()``.
            headers: HTTP headers passed to ``requests.get``.  Defaults to a
                new empty dict per instance (not a shared mutable default).
            timeout: timeout passed to ``requests.get``.
            encoding: if not None, assigned to ``response.encoding`` before
                the response is handed to the analyzer.
            wait_time: seconds to sleep before each fetch; values <= 0
                disable the wait.

        Raises:
            TypeError: if ``task_or_tasks`` is not a str, list or tuple.
        """
        self.tasks = self._as_task_list(task_or_tasks)
        print(self.tasks)
        # Resolve the default lazily so one analyzer instance is not created
        # at class-definition time and shared by every crawler.
        self.analyzer = BasicAnalyzer() if analyzer is None else analyzer
        self.headers = {} if headers is None else headers
        self.timeout = timeout
        self.encoding = encoding
        self.wait_time = wait_time

        # (task_name, status_code) for every attempted fetch; status_code is
        # None when the request itself failed.
        self.response_codes = []

        # Iterator over the task list; each next() yields the next task.
        # Items appended to self.tasks before the iterator is exhausted are
        # still visited.
        self.__iterator = iter(self.tasks)

    @staticmethod
    def _as_task_list(task_or_tasks):
        """Normalize a single URL string or a list/tuple of tasks into a new list."""
        if isinstance(task_or_tasks, str):
            return [task_or_tasks]
        if isinstance(task_or_tasks, (list, tuple)):
            return list(task_or_tasks)
        raise TypeError(
            "task_or_tasks must be a str, list or tuple, got {}".format(
                type(task_or_tasks).__name__))

    def add_tasks(self, task_or_tasks):
        """
        Append one URL string or a list/tuple of tasks to the task list.

        Raises:
            TypeError: if ``task_or_tasks`` is not a str, list or tuple
                (previously such input was silently ignored).
        """
        self.tasks += self._as_task_list(task_or_tasks)

    def crawl(self):
        """
        Take the next task from the task sequence and fetch it.

        Sleeps ``wait_time`` seconds first when ``wait_time`` is positive.
        Returns the analyzer's result, or None if the request failed.
        Raises StopIteration when no tasks are left.
        """
        task_uri = next(self.__iterator)
        if self.wait_time > 0:
            print("等待{}秒后开始抓取".format(self.wait_time))
            time.sleep(self.wait_time)
        return self.__process_task(task_uri)

    def __process_task(self, task):
        """
        Perform the fetch described by ``task`` and pass the response to the analyzer.

        Raises:
            ValueError: if ``task`` is neither a str nor a 2-tuple.
        """
        if isinstance(task, str):
            # A bare string is the URL itself; the task has no name.
            task_name = None
        elif isinstance(task, tuple) and len(task) == 2:
            # A 2-tuple is (task_name, url).
            task_name, task = task
        else:
            raise ValueError("无法识别任务:{}".format(task))

        print(task_name, task)
        try:
            r = requests.get(task, headers=self.headers, timeout=self.timeout)
        except requests.RequestException:
            # Connection errors, timeouts, invalid URLs, ...: record the
            # failure with a status code of None and carry on.  Only
            # request-related errors are swallowed, so KeyboardInterrupt and
            # programming errors still propagate.
            self.response_codes.append((task_name, None))
            return None

        if self.encoding is not None:
            r.encoding = self.encoding
        self.response_codes.append((task_name, r.status_code))
        # Hand the response to the analyzer's parse() method.
        return self.analyzer.parse(task_name, r)

    def __iter__(self):
        """
        Return the iterator for ``for x in crawler`` loops.

        The class implements ``__next__`` itself, so the crawler object is
        its own iterator and ``self`` is returned.
        """
        return self

    def __next__(self):
        """Run the next pending task and return the analyzer's result for it."""
        return self.crawl()

    def crawl_all(self):
        """
        Run every remaining task and return the results as a list.

        Works because the crawler is iterable (see ``__iter__``/``__next__``).
        """
        return list(self)


In [8]:
# Send a desktop Firefox User-Agent header with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
}
# One named task: (task name, URL).
c = Cralwer(
    [('CSDN', 'https://blog.csdn.net/sxingming/article/details/51479039')],
    headers=headers,
    encoding='utf8',
    wait_time=2,
)
c.crawl()

[('CSDN', 'https://blog.csdn.net/sxingming/article/details/51479039')]
等待2秒后开始抓取
CSDN https://blog.csdn.net/sxingming/article/details/51479039


'\n<!DOCTYPE html>\n<html lang="zh-CN">\n<head>\n    <meta charset="utf-8">\n    <link rel="canonical" href="https://blog.csdn.net/sxingming/article/details/51479039"/>\n    <meta http-equiv="content-type" content="text/html; charset=utf-8">\n    <meta name="renderer" content="webkit"/>\n    <meta name="force-rendering" content="webkit"/>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>\n    <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">\n    <meta name="report" content=\'{"pid": "blog", "spm":"1001.2101"}\'>\n    <meta name="referrer" content="always">\n    <meta http-equiv="Cache-Control" content="no-siteapp" /><link rel="alternate" media="handheld" href="#" />\n    <meta name="shenma-site-verification" content="5a59773ab8077d4a62bf469ab966a63b_1497598848">\n    <meta name="applicable-device" content="pc">\n    <link  href="https://g.csdnimg.cn/static/logo/favicon32.ico"  rel="shortcut

In [None]:
class BasicAnalyzer:
    """The simplest page analyzer: hands back the ``text`` attribute of a response."""

    def parse(self, task_name, r):
        """
        Return the body text of response ``r``.

        ``task_name`` is accepted so that every analyzer shares the same
        ``parse(task_name, r)`` signature; it is not used here.
        """
        page_text = r.text
        return page_text

# class LinkAnalyzer(BasicAnalyzer):

In [11]:
# We can also collect every link on a page and write them to a CSV file.
class LinkAnalyzer(BasicAnalyzer):
    """
    Analyzer that writes the anchor text and href of each ``<a>`` tag to a CSV file.

    The file is reopened in write mode on every ``parse`` call, so it always
    holds the links of the most recently parsed page only.
    """

    def __init__(self, filename, encodings=None):
        """
        Parameters:
            filename: path of the CSV file to write.
            encodings: accepted for backward compatibility; currently unused.
        """
        self.filename = filename

    def parse(self, task_name, r):
        """
        Extract the links from response ``r`` and write them to ``self.filename``.

        Only ``<a>`` tags whose ``.string`` is not None are written.  A tag
        without an ``href`` attribute gets an empty second field.
        Returns None.
        """
        import csv  # local import keeps this cell self-contained

        html_text = super().parse(task_name, r)
        soup = BeautifulSoup(html_text, 'html.parser')  # built-in HTML parser
        # csv.writer quotes fields containing commas, quotes or newlines, so
        # such anchor text no longer produces malformed rows.  newline='' is
        # the mode the csv module documents for files it writes.
        with open(self.filename, 'w', newline='') as fout:
            writer = csv.writer(fout)
            writer.writerow(["锚文本", "超链接"])
            for tag in soup.find_all('a'):
                if tag.string is not None:
                    writer.writerow([tag.string, tag.get('href')])
        
# Crawl the index page; LinkAnalyzer writes the links it finds to links.csv.
c = Cralwer(
    ["http://jinyongxiaoshuo.com/xiaoaojianghu/"],
    analyzer=LinkAnalyzer('links.csv'),
    headers=headers,
    encoding='utf8',
    wait_time=2,
)
c.crawl_all()

['http://jinyongxiaoshuo.com/xiaoaojianghu/']
等待2秒后开始抓取
None http://jinyongxiaoshuo.com/xiaoaojianghu/


[None]

# class FileStorageAnalyzer(BasicAnalyzer):

In [21]:
# 我们还可以把所有页面都依次抓取下来并保存到文件里
import os

class FileStorageAnalyzer(BasicAnalyzer):
    """Analyzer that saves each fetched page as an ``.html`` file in a directory."""

    def __init__(self, dir_path):
        """
        Parameters:
            dir_path: directory the pages are written to.
        """
        self.dir_path = dir_path
        # Sequence number used to name pages whose task has no string name.
        self.cnt = 0

    def parse(self, task_name, r):
        """
        Write the text of response ``r`` to a file and return that text.

        The file is ``<task_name>.html`` when ``task_name`` is a string,
        otherwise a zero-padded counter such as ``0000.html``.
        """
        html_text = super().parse(task_name, r)  # page text from the base analyzer

        if isinstance(task_name, str):
            file_path = os.path.join(self.dir_path, "{}.html".format(task_name))
        else:
            file_path = os.path.join(self.dir_path, "{0:04}.html".format(self.cnt))
            self.cnt += 1

        # Create the target directory on demand instead of failing on the
        # first write.  An empty dir_path means the current directory.
        if self.dir_path:
            os.makedirs(self.dir_path, exist_ok=True)

        # Write as UTF-8 explicitly: the saved pages are later read back with
        # encoding='utf8', and the locale default (e.g. gbk on Windows) may be
        # unable to encode some characters.
        with open(file_path, 'w', encoding='utf8') as fout:
            fout.write(html_text)
        return html_text

import csv

tasks = []
# NOTE: links.csv is the file produced by running the previous cell.
with open("links.csv", 'r', newline='') as fin:
    header = fin.readline()  # skip the header row
    for row in csv.reader(fin):
        # Blank or single-field lines carry no (name, uri) pair; skip them
        # instead of failing on tuple unpacking.
        if len(row) < 2:
            continue
        # The URI is the last field.  If unquoted anchor text contained
        # commas, the name was split across several fields; join them back.
        *name_parts, uri = row
        name = ','.join(name_parts).strip()
        uri = uri.strip()
        if uri.startswith('http://jinyongxiaoshuo.com/xiaoaojianghu/15'):
            tasks.append((name, uri))

c = Cralwer(tasks, analyzer=FileStorageAnalyzer("D:/A1Python/人工智能与Python/作业/大作业（2）/大作业2-爬虫网页解析文本分析/笑傲江湖/"),
            wait_time=2, encoding='utf8')
c.crawl_all()
print(c.response_codes)

[('第01章\u3000灭门', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1573.html'), ('第02章\u3000聆秘', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1572.html'), ('第03章\u3000救难', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1571.html'), ('第04章\u3000坐斗', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1570.html'), ('第05章\u3000治伤', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1569.html'), ('第06章\u3000洗手', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1568.html'), ('第07章\u3000授谱', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1567.html'), ('第08章\u3000面壁', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1566.html'), ('第09章\u3000邀客', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1565.html'), ('第10章\u3000传剑', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1564.html'), ('第11章\u3000聚气', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1563.html'), ('第12章\u3000围攻', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1562.html'), ('第13章\u3000学琴', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1561.html'), ('第14章\u3000论杯', 'http://jinyongxiaoshuo.com/xiaoaojianghu/1560

#  提取正文和词频统计

In [45]:
import os
from bs4 import BeautifulSoup
import jieba
import pandas as pd
import numpy as np

class XAJHAnalyzer():
    """
    Turn saved chapter HTML pages into plain text and count name occurrences.

    ``parse`` extracts the paragraph text of each saved page into a ``.txt``
    file; ``text_cut`` segments each ``.txt`` file with jieba, counts how
    often each name from a dictionary file occurs, and saves the table as CSV.
    """

    # Characters replaced by a space before segmentation.  Same set as the
    # original literal; the backslash is now escaped explicitly instead of
    # relying on the invalid escape sequence "\、".
    _PUNCTUATION = '，。《》\\、“”（）!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    _PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION, " "))

    def parse(self, location, output_dir=None):
        """
        Extract the text of every ``*html`` file in ``location`` into a ``.txt`` file.

        The output file is named after the first 7 characters of the page's
        ``<h1>`` text, or after the HTML file's stem when the page has no
        ``<h1>``.  The text of all ``<p>`` tags is written back to back,
        without separators.

        Parameters:
            location: directory containing the saved HTML pages.
            output_dir: directory for the ``.txt`` files; defaults to
                ``location`` (previously a hard-coded absolute path).
        """
        if output_dir is None:
            output_dir = location
        for html in sorted(os.listdir(location)):
            if not html.endswith('html'):
                continue
            path = os.path.join(location, html)
            with open(path, 'r', encoding='utf8') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
            if soup.h1 is not None:
                chapter = soup.h1.text[:7]
            else:
                # No heading to name the chapter after; use the file stem.
                chapter = os.path.splitext(html)[0]
            contents = [p.text for p in soup.find_all('p')]
            article = '{}.txt'.format(chapter)
            with open(os.path.join(output_dir, article), 'w', encoding='utf8') as chap:
                chap.writelines(contents)

    def text_cut(self, location, dictionary):
        """
        Count name occurrences in every ``*txt`` file of ``location`` and save them.

        Parameters:
            location: directory containing the chapter ``.txt`` files.
            dictionary: path of a UTF-8 text file of whitespace-separated
                names; also passed to jieba as the user dictionary.

        Prints ``done`` when finished.
        """
        with open(dictionary, 'r', encoding='utf8') as d:
            names = d.read().split()

        data_dict = {}
        # Sorted so that the column order of the result is deterministic.
        for chapter in sorted(os.listdir(location)):
            if not chapter.endswith('txt'):
                continue
            path = os.path.join(location, chapter)
            with open(path, 'r', encoding='utf8') as f:
                contents = f.read()
            # Replace punctuation with spaces in a single pass.
            contents = contents.translate(self._PUNCTUATION_TABLE)
            data_dict[chapter] = self.process(dictionary, contents, names)
        self.save(data_dict, names)
        print('done')

    def process(self, dictionary, contents, names):
        """
        Segment ``contents`` with jieba and count occurrences of each name.

        Returns a list of floats aligned with ``names``: element ``i`` is the
        number of segmented words equal to ``names[i]``.
        """
        jieba.load_userdict(dictionary)
        words = jieba.lcut(contents)
        return self._count_names(words, names)

    @staticmethod
    def _count_names(words, names):
        """
        Count how many times each entry of ``names`` appears in ``words``.

        The result has exactly ``len(names)`` elements (previously a fixed
        160, which broke for any other dictionary size).  Counts stay floats
        so the saved CSV keeps its existing number format.  When a name is
        listed more than once, hits go to its first position, as with
        ``list.index``.
        """
        first_index = {}
        for i, name in enumerate(names):
            first_index.setdefault(name, i)

        counts = [0.0] * len(names)
        for word in words:
            i = first_index.get(word)
            if i is not None:  # ignore words that are not names
                counts[i] += 1
        return counts

    def save(self, data_dict, names):
        """Write the counts to CSV: one row per name, one column per chapter file."""
        df = pd.DataFrame(index=names, data=data_dict)
        df.to_csv('笑傲江湖中人物出现次数.csv')
   
# Inputs: the name dictionary file and the folder that holds the chapter files.
dictionary = "笑傲江湖人物.txt"
location = 'D:/A1Python/人工智能与Python/作业/大作业（2）/大作业2-爬虫网页解析文本分析/笑傲江湖/'

a = XAJHAnalyzer()
# Uncomment to regenerate the chapter .txt files from the saved HTML pages.
#a.parse(location)
a.text_cut(location, dictionary)

done
