汽车投诉信息采集：
---
- 数据源：http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-1.shtml
- 投诉编号，投诉品牌，投诉车系，投诉车型，问题简述，典型问题，投诉时间，投诉状态
- 可以采用Python爬虫，或者第三方可视化工具

# 1.导入模块
---

In [1]:
import datetime
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1.1 版本信息

In [2]:
# Echo the installed pandas version for reproducibility.
pd.__version__

'1.0.5'

In [3]:
# Echo the installed requests version for reproducibility.
requests.__version__

'2.24.0'

## 1.2 构建js的每日链接
典型问题的数据信息存储在js中，该js链接中参数version的值为当天日期。

In [4]:
# Today's date; the next cell formats it into the JS URL's "version" parameter.
a = datetime.date.today()
print(a)
# Compact YYYYMMDD form (echoed by the notebook as the cell result).
a.strftime("%Y%m%d")

2020-08-22


'20200822'

In [5]:
# Probe the typical-problem JS file, passing today's date as the "version"
# query parameter, to confirm that the URL pattern works.
params = {"version": a.strftime("%Y%m%d")}
js_url = "http://www.12365auto.com/js/cTypeInfo.js"
headers = {
    # The value is the browser string only; repeating the "User-Agent: "
    # header name inside the value would send a malformed user agent.
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
# requests never times out by default, so bound the wait explicitly.
res = requests.get(js_url, params=params, headers=headers, timeout=10)
# Echo the HTTP status (200 means the file was served).
res.status_code

200

In [7]:
# Peek at the first 100 characters of the downloaded JS text.
res.text[:100]

'var cTypeInfo = [{"id":1,"name":"发动机","value":"A","zf":"z","items":[{"id":9,"title":"异响"},{"id":11,"'

以上猜想可行：将参数version设为当天日期，即可正常获取该js中的典型问题数据。

# 2.定义爬取车质网投诉信息的类
---
1. 爬取前10页的信息
2. 根据爬取到的典型问题的字母找到它对应的文字内容；
3. 将爬取到的内容写入本地csv。

In [11]:
class CarQualitySpider:
    """Scrape complaint listings from 12365auto.com into pandas DataFrames.

    The constructor downloads the site's "typical problem" lookup table;
    ``spider()`` then walks the listing pages and translates each complaint's
    problem codes (a category letter followed by an item id, e.g. ``A9``)
    into their titles.

    Attributes:
        headers: HTTP headers sent with every request.
        timeout: Per-request timeout in seconds.
        all_info_df: One row per scraped complaint.
        typical_infos_df: Problem categories parsed from cTypeInfo.js; the
            ``items`` column holds each category's list of id/title dicts.
        problems_items_df: Cache of the (id, title) items of every category
            that has been looked up so far.
        start_urls: Listing page URLs that ``spider()`` will fetch.
    """

    # Names of the <td> cells of one complaint row, in page order.
    _ROW_FIELDS = ('id', 'brands', 'car_model', 'types', 'desc',
                   'problems_cate', 'datetime', 'status')

    def __init__(self, pages=10, timeout=10):
        """Prepare the lookup table and the list of listing URLs.

        Args:
            pages: Number of listing pages to scrape, starting at page 1.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.headers = {
            # The value is the browser string only; repeating the
            # "User-Agent: " header name inside it sends a malformed agent.
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
        self.timeout = timeout

        # Accumulates every scraped complaint.
        self.all_info_df = pd.DataFrame(
            columns=list(self._ROW_FIELDS) + ['typical_problems'])

        # Holds the category table parsed from the "typical problem" JS file.
        self.typical_infos_df = pd.DataFrame(columns=['id', 'name', 'value', 'zf', 'items'])

        # Fill typical_infos_df from the site.
        self.__get_typical_problems_info()

        # Cache of the items of each category located through its 'value' letter.
        self.problems_items_df = pd.DataFrame(columns=['id', 'title'])

        # Page numbers start at 1; build one URL per requested page.
        link = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-{p}.shtml'
        self.start_urls = [link.format(p=p) for p in range(1, pages + 1)]

    def __get_typical_problems_info(self):
        """Download the typical-problem table and store it in typical_infos_df.

        Raises:
            Exception: if the JS file is not served with HTTP status 200.
            ValueError: if no JSON array can be found or parsed in the file.
        """
        url = 'http://www.12365auto.com/js/cTypeInfo.js'
        # The "version" parameter is today's date, which fetches the current list.
        params = {"version": datetime.date.today().strftime("%Y%m%d")}
        response = requests.get(url, headers=self.headers, params=params,
                                timeout=self.timeout)
        if response.status_code != 200:
            raise Exception("js爬取失败，状态码:", response.status_code)

        # The file looks like `var cTypeInfo = [...]`; pull out the array literal.
        match = re.search(r"\[.*\]", response.text)
        if match is None:
            raise ValueError("js内容中未找到典型问题列表")
        # Parse as data with json instead of eval(): the text comes from the
        # network and must never be executed as Python code.
        records = json.loads(match.group(0))
        self.typical_infos_df = pd.DataFrame(records)

    def __parse_problems(self, value, item_id):
        """Return the title of problem ``item_id`` in category ``value``.

        The items of a category are added to problems_items_df the first time
        one of them is resolved; later lookups are answered from that cache.

        Args:
            value: Category letter, matched against typical_infos_df['value'].
            item_id: Integer id of the item inside that category.

        Raises:
            LookupError: if the category or the item id is unknown.
        """
        # NOTE(review): the cache is keyed by item id alone, as in the
        # original lookup; this assumes ids are unique across categories.
        cache = self.problems_items_df
        cached_titles = cache.loc[cache['id'] == item_id, 'title']
        if not cached_titles.empty:
            return cached_titles.iloc[0]

        categories = self.typical_infos_df
        matches = categories.loc[categories['value'] == value, 'items']
        if matches.empty:
            raise LookupError("unknown problem category: %r" % (value,))

        # The cell holds a list of {'id', 'title'} dicts; tabulate it.
        items_df = pd.DataFrame(matches.iloc[0], columns=['id', 'title'])
        titles = items_df.loc[items_df['id'] == item_id, 'title']
        if titles.empty:
            raise LookupError(
                "unknown problem id %r in category %r" % (item_id, value))

        # Cache only after a successful lookup so that a bad id can never
        # cause the same category to be appended twice.
        if cache.empty:
            self.problems_items_df = items_df
        else:
            self.problems_items_df = pd.concat([cache, items_df], ignore_index=True)
        return titles.iloc[0]

    def _translate_problem_codes(self, problems_cate):
        """Translate a comma-separated code string into comma-terminated titles.

        Each code is a category letter followed by an item id. A code that is
        malformed or missing from the lookup table is kept verbatim, so one
        unexpected code does not abort the scrape.
        """
        titles = []
        for raw_code in problems_cate.split(','):
            code = raw_code.strip()
            if not code:
                continue
            try:
                title = self.__parse_problems(code[0], int(code[1:]))
            except (ValueError, LookupError):
                title = code
            titles.append(title)
        # Every title is followed by a comma, matching the stored CSV format.
        return ''.join(title + ',' for title in titles)

    def spider(self):
        """Fetch every URL in start_urls and add the parsed rows to all_info_df.

        A page that cannot be fetched, answers with a non-200 status, or lacks
        the complaint table is reported and skipped.
        """
        for url in self.start_urls:
            try:
                res = requests.get(url, headers=self.headers, timeout=self.timeout)
            except requests.RequestException as exc:
                print("该链接爬取出现了异常:", url, exc)
                continue
            print(url, res.status_code)
            if res.status_code != 200:
                print("该链接爬取出现了异常:", url)
                continue

            soup = BeautifulSoup(res.text, 'html.parser')
            table = soup.find('table', class_='ar_c')
            if table is None:
                print("该链接爬取出现了异常:", url)
                continue

            page_rows = []
            # Start from the second <tr>; the first one is skipped.
            for tr in table.find_all('tr')[1:]:
                cells = [td.text for td in tr.find_all('td')]
                if len(cells) != len(self._ROW_FIELDS):
                    # Not a complaint row (unexpected number of cells).
                    continue
                record = dict(zip(self._ROW_FIELDS, cells))
                # Store the problem codes translated into their titles.
                record['typical_problems'] = self._translate_problem_codes(
                    record['problems_cate'])
                page_rows.append(record)

            if not page_rows:
                continue
            # One concat per page instead of one append per row:
            # DataFrame.append was removed in pandas 2.0 and copied the whole
            # frame on every call.
            page_df = pd.DataFrame(page_rows, columns=self.all_info_df.columns)
            if self.all_info_df.empty:
                self.all_info_df = page_df
            else:
                self.all_info_df = pd.concat(
                    [self.all_info_df, page_df], ignore_index=True)

# 3.开始爬虫

## 3.1实例化后，查看属性初始值

In [12]:
# Build a spider for 10 listing pages; the constructor also downloads the
# typical-problem lookup table.
car_spider = CarQualitySpider(10)

In [13]:
# Result table: still empty because spider() has not been called yet.
car_spider.all_info_df

Unnamed: 0,id,brands,car_model,types,desc,problems_cate,datetime,status,typical_problems


In [14]:
# DataFrame of the "typical problem" categories: name, letter code and items.
car_spider.typical_infos_df

Unnamed: 0,id,name,value,zf,items
0,1,发动机,A,z,"[{'id': 9, 'title': '异响'}, {'id': 11, 'title':..."
1,2,变速器,B,z,"[{'id': 19, 'title': '顿挫'}, {'id': 20, 'title'..."
2,3,离合器,C,z,"[{'id': 16, 'title': '打滑'}, {'id': 17, 'title'..."
3,4,转向系统,D,z,"[{'id': 57, 'title': '失灵'}, {'id': 58, 'title'..."
4,5,制动系统,E,z,"[{'id': 29, 'title': '刹车失灵'}, {'id': 42, 'titl..."
5,6,轮胎,F,z,"[{'id': 10, 'title': '磨损'}, {'id': 39, 'title'..."
6,7,前后桥及悬挂系统,G,z,"[{'id': 25, 'title': '减震器漏油'}, {'id': 51, 'tit..."
7,8,车身附件及电器,H,z,"[{'id': 15, 'title': '车身异响'}, {'id': 21, 'titl..."
8,272,服务态度,I,f,"[{'id': 281, 'title': '互相推诿'}, {'id': 282, 'ti..."
9,273,人员技术,J,f,"[{'id': 298, 'title': '多次返修'}, {'id': 299, 'ti..."


In [15]:
# Item cache: empty until spider() resolves the first problem code.
car_spider.problems_items_df

Unnamed: 0,id,title


In [16]:
# The listing page URLs that spider() will fetch.
car_spider.start_urls

['http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-1.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-2.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-3.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-4.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-5.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-6.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-7.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-8.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-9.shtml',
 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-10.shtml']

## 3.2 调用爬取方法

In [17]:
# Fetch every listing page; each URL is printed with its HTTP status.
car_spider.spider()

http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-1.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-2.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-3.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-4.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-5.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-6.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-7.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-8.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-9.shtml 200
http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-10.shtml 200


## 3.3 调用all_info_df属性，将结果写入CSV

In [18]:
# Write the scraped rows to a CSV file in the working directory, without the
# DataFrame index column.
car_spider.all_info_df.to_csv('./car_qualites.csv', index=False)

# Progress message shown once the file has been written.
print("已完成!")

已完成!


In [19]:
# Check each column for missing values left by the scrape.
car_spider.all_info_df.isnull().any() 

id                  False
brands              False
car_model           False
types               False
desc                False
problems_cate       False
datetime            False
status              False
typical_problems    False
dtype: bool