In [51]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from lxml import etree

import re
import csv
import math
import time
import requests

# 1.1 che168 province / municipality lookup table.
#     key:   Chinese display name of the region (used in progress messages).
#     value: region slug substituted into the listing-page URL path.
# The crawl visits regions in this dict's iteration order.
# NOTE(review): 'namenggu' and 'shan_xi' are not standard pinyin spellings;
# presumably they mirror the slugs the site itself uses -- confirm against the
# live site before "correcting" them.
address_dict = {
    '安徽': 'anhui',
    '北京': 'beijing',
    '重庆': 'chongqing',
    '福建': 'fujian',
    '广东': 'guangdong',
    '广西': 'guangxi',
    '贵州': 'guizhou',
    '甘肃': 'gansu',
    '海南': 'hainan',
    '河南': 'henan',
    '湖北': 'hubei',
    '湖南': 'hunan',
    '河北': 'hebei',
    '黑龙江': 'heilongjiang',
    '江苏': 'jiangsu',
    '江西': 'jiangxi',
    '吉林': 'jilin',
    '辽宁': 'liaoning',
    '内蒙古': 'namenggu',
    '宁夏': 'ningxia',
    '青海': 'qinghai',
    '陕西': 'shan_xi',
    '四川': 'sichuan',
    '上海': 'shanghai',
    '山西': 'shanxi',
    '山东': 'shandong',
    '天津': 'tianjin',
    '新疆': 'xinjiang',
    '西藏': 'xizang',
    '云南': 'yunnan',
    '浙江': 'zhejiang'
}


# 2.1 Determine the total number of matching cars
def get_car_num(choose_url, timeout=30):
    """Return the number of cars reported by a filtered listing page.

    The count is read from the ``value`` attribute of the
    ``<input id="firstcarnum">`` element that is a direct child of ``<body>``.
    The request goes through the module-level ``session`` and ``headers``.

    Args:
        choose_url: URL of the filtered listing page.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        The car count as an ``int``.

    Raises:
        IndexError: If the page contains no ``firstcarnum`` input.
        ValueError: If the input's value is not an integer.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    response = session.get(choose_url, headers=headers, timeout=timeout)

    # etree.HTML() returns None when it is handed an empty document.
    tree = etree.HTML(response.content)

    # Absolute path to the counter element; the relative equivalent would be
    # "//*[@id='firstcarnum']/@value".
    values = []
    if tree is not None:
        values = tree.xpath("/html/body/input[@id='firstcarnum']/@value")

    if not values:
        # Same exception type as indexing an empty result, but with a message
        # that says which page was malformed.
        raise IndexError("no 'firstcarnum' input found in %s" % choose_url)

    return int(values[0])


# 2.2 Collect the car detail URLs of one listing page
def add_car_url_to_list(url, timeout=30):
    """Append the link of every car card on a listing page to ``car_url_list``.

    A car card is an ``<li>`` whose class attribute is exactly
    ``cards-li list-photo-li``; the ``href`` of its first ``<a>`` is collected.
    Results are appended to the module-level ``car_url_list``, which the caller
    must have initialised; nothing is returned.

    Args:
        url: URL of the listing page to scan.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        urllib.error.URLError: On network failure or timeout.
    """
    # The context manager closes the HTTP response (and its socket) as soon as
    # the document has been parsed, instead of leaking it.
    with urlopen(url, timeout=timeout) as response:
        page = BeautifulSoup(response, 'lxml')

    for card in page.find_all('li', {'class': 'cards-li list-photo-li'}):
        link = card.find('a')

        # Skip cards that carry no usable link rather than aborting the crawl.
        if link is None or not link.get('href'):
            continue

        car_url_list.append(link['href'])


# 3.1.0 Long (absolute) URL conversion
# Path of a car detail page as it appears inside the "mobile-agent" meta tag.
_MOBILE_PATH_RE = re.compile(r"/(dealer|lianmeng)/\d*/\d*\.html")


def long_url_change(url, timeout=30):
    """Return the mobile-site path advertised by a desktop detail page.

    The desktop page is fetched through the module-level ``session`` and
    ``headers``; the path is extracted from the ``<meta http-equiv="mobile-agent">``
    tag inside ``<head>``.

    Args:
        url: Absolute URL of the desktop detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A path such as ``/dealer/<digits>/<digits>.html`` (or ``/lianmeng/...``),
        without scheme or host.

    Raises:
        ValueError: If the page has no mobile-agent meta tag or the tag does not
            contain a recognisable detail-page path.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    response = session.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, 'lxml')

    head = page.find('head')
    meta = None
    if head is not None:
        meta = head.find('meta', {'http-equiv': 'mobile-agent'})

    # The regex runs over the tag's full markup, as the path sits inside one of
    # its attribute values.
    match = _MOBILE_PATH_RE.search(str(meta)) if meta is not None else None
    if match is None:
        raise ValueError("no mobile detail-page path found in %s" % url)

    return match.group()


# 3.1 Classify a used-car URL and normalise it to its mobile-site URL
def judge_and_format_url(url):
    """Return the mobile-site (m.che168.com) URL for a car detail link.

    Args:
        url: A link taken from a listing page. Either an absolute URL
            (``http://`` / ``https://``) or a site-relative path containing
            ``/dealer/`` or ``/lianmeng/``.

    Returns:
        The absolute mobile URL of the same car.

    Raises:
        ValueError: If ``url`` is neither absolute nor a recognised relative
            detail path.
    """
    # Scheme and host of the mobile site.
    mobile_url_head = "https://m.che168.com"

    # Long link: the mobile path has to be looked up on the desktop page.
    # Test the scheme prefix rather than the substring, so that a relative path
    # which merely mentions "https" in its query string is not misrouted.
    if url.startswith(("http://", "https://")):
        return mobile_url_head + long_url_change(url)

    # Short link: already a site-relative detail path.
    # Each substring needs its own membership test; `"/dealer/" or ... in url`
    # is always truthy and would accept any input.
    if "/dealer/" in url or "/lianmeng/" in url:
        return mobile_url_head + url

    raise ValueError("unrecognised car URL: %r" % (url,))


# 4.1 Extract the model id (specid) and VIN code from a car's detail page
_SPEC_ID_RE = re.compile(r"specid: '\d*'")
_VINCODE_RE = re.compile(r"vincode: '\w*'")


def get_vincode(url, timeout=30):
    """Fetch a car detail page and pull out its spec id and VIN code.

    Only the first ``<script type="text/javascript">`` inside ``<body>`` is
    searched. The stored values are the full matched snippets, e.g.
    ``specid: '123'``, not just the quoted part.

    Args:
        url: URL of the (mobile) detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A three-element list ``[url, specid_snippet, vincode_snippet]``. A
        snippet is ``''`` when it cannot be found, so one missing field does
        not discard the other and a page without the script does not abort the
        crawl.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    response = session.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, 'lxml')

    # Tolerate pages that lack <body> or the inline script entirely.
    body = page.find('body')
    script_tag = None
    if body is not None:
        script_tag = body.find('script', {'type': 'text/javascript'})
    script = script_tag.get_text() if script_tag is not None else ''

    one_vincode_list = [url]
    for pattern in (_SPEC_ID_RE, _VINCODE_RE):
        match = pattern.search(script)
        one_vincode_list.append(match.group() if match else '')

    return one_vincode_list


# --------------------------以上为封装函数------------------------- #

# --------------------------以下为采集程序------------------------- #

# Request setup: one shared session plus browser-like headers. Both are read as
# globals by the helper functions defined earlier in this file.
session = requests.Session()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

# Brand slug to crawl (chosen manually).
brand = 'maibahe'

# For a quick test run, rebind address_dict here to a small subset of regions
# (for example only the fujian, guangdong, sichuan and yunnan entries).

# Filtered listing page: region slug, brand slug, 1-based page number.
listing_url_template = 'https://www.che168.com/%s/%s/a0_0msdgscncgpi1ltocsp%dexv1/'

# Result rows: one [url, spec id, vincode] list per car.
all_vin_list = []

# One human-readable status line per region.
tip_list = []

# Total number of cars reported by the site, including regions that are skipped.
count_car_num = 0

try:
    # ------------------------------ data collection ------------------------------
    # Visit every province / municipality in turn.
    for address_chineseName, address in address_dict.items():
        # The first listing page carries the region's total car count.
        choose_url = listing_url_template % (address, brand, 1)
        print(choose_url)

        car_num = get_car_num(choose_url)
        print(car_num)

        count_car_num += car_num

        if car_num == 0:
            tip = "<<%s>>地区无符合条件车辆" % address_chineseName
        elif car_num < 4000:
            print("<<%s>>地区开始采集..." % address_chineseName)

            # Pages are assumed to hold 40 cars each.
            choose_url_max_num = math.ceil(car_num / 40)

            # Global on purpose: add_car_url_to_list() appends to this list.
            car_url_list = []

            for choose_url_num in range(1, choose_url_max_num + 1):
                choose_url = listing_url_template % (address, brand, choose_url_num)
                add_car_url_to_list(choose_url)

                time.sleep(1)

            # Normalise every collected link to its mobile URL and scrape it.
            for no_format_url in car_url_list:
                format_url = judge_and_format_url(no_format_url)
                all_vin_list.append(get_vincode(format_url))

                time.sleep(1)

            time.sleep(3)

            tip = "<<%s>>地区二手车VIN采集完成！" % address_chineseName
        else:
            # 4000 cars at 40 per page is the 100-page mark named in the message.
            tip = "<<%s>>地区符合条件数量超100页，请重新选择！" % address_chineseName

        tip_list.append(tip)
        print(tip)

        time.sleep(5)

finally:
    # ------------------------------- write results -------------------------------
    # Runs even when the crawl is interrupted, so partial results are kept.
    # Encodings are explicit because the text is Chinese and the platform
    # default may not be able to encode it.
    summary = '------品牌《%s》------VIN采集结果记录(%d条，包括未采集条数)' % (brand, count_car_num)
    with open('result_tip_%s.txt' % brand, 'w', encoding='utf-8') as result_tip_file:
        result_tip_file.write('\n'.join([summary] + tip_list))

    # utf-8-sig adds a BOM so spreadsheet tools detect the encoding.
    with open('%s.csv' % brand, 'w', newline='', encoding='utf-8-sig') as result_vincode_file:
        writer = csv.writer(result_vincode_file)
        writer.writerow(['url', '车型id', 'vincode'])
        writer.writerows(all_vin_list)

    print("Game Over!")



https://www.che168.com/anhui/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<安徽>>地区无符合条件车辆
https://www.che168.com/beijing/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<北京>>地区无符合条件车辆
https://www.che168.com/chongqing/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
2
<<重庆>>地区开始采集...
<<重庆>>地区二手车VIN采集完成！
https://www.che168.com/fujian/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<福建>>地区无符合条件车辆
https://www.che168.com/guangdong/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
1
<<广东>>地区开始采集...
<<广东>>地区二手车VIN采集完成！
https://www.che168.com/guangxi/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<广西>>地区无符合条件车辆
https://www.che168.com/guizhou/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<贵州>>地区无符合条件车辆
https://www.che168.com/gansu/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<甘肃>>地区无符合条件车辆
https://www.che168.com/hainan/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<海南>>地区无符合条件车辆
https://www.che168.com/henan/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<河南>>地区无符合条件车辆
https://www.che168.com/hubei/maibahe/a0_0msdgscncgpi1ltocsp1exv1/
0
<<湖北>>地区无符合条件车辆
https://www.che168.c

KeyboardInterrupt: 

In [3]:
# Re-export the results held in memory (brand, count_car_num, tip_list and
# all_vin_list come from the crawl cell).
# Encodings are explicit because the text is Chinese and the platform default
# may not be able to encode it; `with` closes the files even if a write fails.

# Per-region status lines go to a text file.
summary = '------品牌《%s》------VIN采集结果记录(%d条，包括未采集条数)' % (brand, count_car_num)
with open('result_tip.txt', 'w', encoding='utf-8') as result_tip_file:
    result_tip_file.write('\n'.join([summary] + tip_list))

# Scraped rows go to a CSV file; utf-8-sig adds a BOM so spreadsheet tools
# detect the encoding.
with open('second_hand_car_vincode_%s.csv' % brand, 'w', newline='', encoding='utf-8-sig') as result_vincode_file:
    writer = csv.writer(result_vincode_file)

    # Header row, then one row per car.
    writer.writerow(['url', '车型id', 'vincode'])
    writer.writerows(all_vin_list)

print("Game Over!")

Game Over!
