In [59]:
import requests
from bs4 import BeautifulSoup
import queue
import threading
import re
import csv
import functools
import os
from time import time

In [60]:
class jobObject:
    """Plain record holding the fields scraped for one job posting."""

    # Fixed attribute set: instances get no per-instance __dict__, so only
    # these names can be assigned.
    __slots__ = (
        'tag_href',
        'job',
        'company',
        'address',
        'salary',
        'tag',
    )

In [61]:
# Decorator that measures how long a function call takes.
def excute_time(func):
    """Wrap ``func`` so that every call is timed, reported and logged.

    After each call the elapsed wall-clock time is printed and appended as
    one row to ``运行时间统计.csv`` in the current working directory, together
    with the function name and the ``thread_num`` keyword argument (an empty
    string when the call did not pass one).  The header row is written only
    when the file does not exist yet.  The wrapped function's return value is
    passed through unchanged; nothing is logged if the call raises.
    """
    stats_file = '运行时间统计.csv'
    columns = ['函数', '运行时间', '线程数']

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        began = time()
        outcome = func(*args, **kwargs)
        spend = time() - began
        print('%s() 运行时间: %d' % (func.__name__, spend))

        row = {
            '函数': func.__name__,
            '运行时间': spend,
            '线程数': kwargs.get('thread_num', ''),
        }

        # Decide about the header before opening: append mode creates the
        # file, so the existence check must come first.
        is_new_file = not os.path.isfile(stats_file)
        with open(stats_file, 'a', newline='') as f:
            writer = csv.DictWriter(f, columns)
            if is_new_file:
                writer.writeheader()
            writer.writerow(row)

        return outcome

    return wrapper

In [62]:
# Decorator that prints which thread is executing the wrapped function.
def get_current_thread_info(func):
    """Announce the current thread's name before each call to ``func``.

    The call's arguments and return value are forwarded untouched.
    """
    @functools.wraps(func)
    def announce(*args, **kwargs):
        worker_name = threading.current_thread().name
        print('[%s()] %s processing!' % (func.__name__, worker_name))
        return func(*args, **kwargs)

    return announce

In [63]:
# Save the proxies listed on free-proxy-list.net to a local file
# (ip_pool.txt by default).
@excute_time
def ipToFile(file_name='ip_pool.txt', timeout=10):
    """Download the free-proxy-list.net table and store it as ``ip:port`` lines.

    Every proxy is written on its own line and every line, including the last
    one, ends with a newline.

    Args:
        file_name: Path of the text file to create or overwrite.
        timeout: Seconds to wait for the site before giving up.

    Raises:
        requests.exceptions.RequestException: The page could not be fetched
            (network error, timeout, or a non-success HTTP status).
        RuntimeError: The proxy table was not found in the downloaded page.
    """
    url = 'https://free-proxy-list.net/'
    # Without a timeout a stalled connection would block the whole run.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, features='html5lib')
    div_tag = soup.find('div', 'table-responsive')
    if div_tag is None or div_tag.tbody is None:
        raise RuntimeError('proxy table not found at %s' % url)

    lines = []
    # Iterate over real <tr>/<td> elements instead of raw child nodes, so
    # whitespace text nodes between rows or cells cannot break the parsing.
    for tr in div_tag.tbody.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < 2:
            continue
        ip = cells[0].get_text(strip=True)
        port = cells[1].get_text(strip=True)
        lines.append('%s:%s\n' % (ip, port))

    with open(file_name, 'w') as f:
        f.write(''.join(lines))

In [64]:
# Read the proxy addresses stored in the ip_pool file.
# Returns the proxies as a queue.Queue.
@excute_time
def getIPool(file_name='ip_pool.txt'):
    """Load ``ip:port`` lines from ``file_name`` into a queue of proxy dicts.

    Each non-blank line becomes ``{'http': line, 'https': line}``, the mapping
    shape passed as ``proxies=`` to ``requests.get`` elsewhere in this file.
    Blank lines are skipped wherever they occur, so the result does not
    depend on whether the file ends with a trailing newline.

    Args:
        file_name: Path of the text file to read.

    Returns:
        queue.Queue: One proxy dict per address, in file order.
    """
    proxies = queue.Queue()

    with open(file_name, 'r') as f:
        for line in f:
            ip = line.strip()
            if not ip:
                # Skip blanks instead of blindly dropping the last entry,
                # which would lose a real proxy in a file without a
                # trailing newline.
                continue
            proxies.put({'http': ip, 'https': ip})

    return proxies

In [65]:
# Return a cleaned proxy pool: proxies that cannot reach the target page are
# dropped.
# Takes the proxy queue and the number of threads; returns a new queue.
@excute_time
def getCleanProxies(proxies, **thread_num):
    """Probe every proxy in ``proxies`` concurrently and keep the working ones.

    Each proxy is used for one GET request to ``https://jobs.51job.com/`` with
    a 5 second timeout.  A proxy is kept only if the request completes with
    HTTP status 200.  The input queue is drained in the process.

    Args:
        proxies: ``queue.Queue`` of proxy dicts suitable for ``requests``.
        **thread_num: Accepts ``thread_num=<int>``, the number of worker
            threads (kept as a keyword so the timing decorator can log it).
            Defaults to 1 when omitted.

    Returns:
        queue.Queue: The proxies that answered with status 200.
    """
    #     url = 'http://www.baidu.com'
    url = 'https://jobs.51job.com/'

    worker_count = thread_num.get('thread_num')
    if worker_count is None:
        worker_count = 1

    # Queue of proxies that passed the check.
    ok_proxies = queue.Queue()

    def check_proxy(thread_proxies):
        worker_name = threading.current_thread().name

        while True:
            print('当前剩余处理的ip: ', thread_proxies.qsize())

            # get_nowait() makes "is there work left?" and "take it" a single
            # atomic step.  Checking qsize() and then calling a blocking get()
            # lets another thread take the last item in between, leaving this
            # thread blocked forever and join() below hanging.
            try:
                proxy = thread_proxies.get_nowait()
            except queue.Empty:
                break

            try:
                r = requests.get(url, proxies=proxy, timeout=5)
            except requests.exceptions.RequestException as e:
                # Base class of Timeout, ConnectionError and InvalidHeader;
                # any request failure simply disqualifies the proxy.
                print(worker_name + ' is processing')
                print(e)
                print('remove proxy:', proxy)
            else:
                print(worker_name + ' is processing')
                print('r.status_code:', r.status_code)
                if r.status_code != 200:
                    print('remove proxy:', proxy)
                else:
                    ok_proxies.put(proxy)

            print('-------------------------------------')

    threads = [
        threading.Thread(target=check_proxy, args=(proxies,))
        for _ in range(worker_count)
    ]

    print('线程开始：')
    for thread in threads:
        print(thread.name + '开始运行')
        thread.start()

    # Every worker returns on its own once the queue is empty, so join()
    # cannot block on a thread stuck in get().
    print('线程开始停止运行：')
    for thread in threads:
        thread.join()
        print(thread.name + '已停止')

    return ok_proxies

In [68]:
# Build the jobObjects found at the given list-page URL and return them as a
# list.
# Design note: the list is returned so that an outer function can process it,
# rather than handling everything in here.
@excute_time
def get_jobObjectList(url, proxies):
    """Fetch one job list page through a proxy and parse it into jobObjects.

    Proxies are taken from ``proxies`` one at a time until a request goes
    through; failing proxies are discarded and the working one is put back
    for later callers.  ``proxies.get()`` blocks while the queue is empty.

    Args:
        url: URL of the job list page.
        proxies: ``queue.Queue`` of proxy dicts suitable for ``requests``.

    Returns:
        list: One ``jobObject`` per job row with ``tag_href``, ``job``,
        ``company``, ``address`` and ``salary`` set (``tag`` is not set here).

    Raises:
        RuntimeError: The job list header was not found in the page.
    """
    # TODO: progress output
    print('get_jobObjectList()当前处理链接:', url)

    proxy = proxies.get()
    while True:
        try:
            r = requests.get(url, proxies=proxy, timeout=5)
        except (requests.exceptions.Timeout,
                requests.exceptions.ProxyError,
                requests.exceptions.ConnectionError) as e:
            # This proxy failed: drop it and retry with the next one.
            print(e)
            proxy = proxies.get()
            print('尝试新的proxy')
        else:
            # Success: leave the retry loop.
            break

    # Hand the working proxy back so the pool is not drained by one proxy
    # per successful request.
    proxies.put(proxy)

    r.encoding = 'gbk'
    soup = BeautifulSoup(r.text, features='html5lib')

    # Header row of the job list.
    title = soup.find('div', 'el title')
    if title is None:
        raise RuntimeError('job list not found in page: %s' % url)

    # Every job row is a following sibling <div class="el">.
    job_info_list = title.find_next_siblings('div', 'el')

    # Collect the jobObjects of the current page.
    jobObjectList = []
    for row in job_info_list:
        link = row.find('a')
        infos = row.find_all(class_=re.compile('t[2-4]'))

        job = jobObject()
        job.tag_href = link.attrs['href']
        job.job = link.attrs['title']
        job.company = infos[0].string
        job.address = infos[1].string
        job.salary = infos[2].string
        jobObjectList.append(job)

    print('get_jobObjectList处理完成!')

    return jobObjectList

In [69]:
# Collect the tag list for one jobObject from its detail page.
# Takes the detail-page URL and the proxy queue; returns a list of tags.
# TODO: reading the tags from the page is comparatively slow.
# @excute_time
@get_current_thread_info
def getTag(url, proxies):
    """Fetch a job detail page through a proxy and extract its tags.

    Proxies are taken from ``proxies`` one at a time until a request goes
    through; failing proxies are discarded and the working one is put back
    for other callers.  ``proxies.get()`` blocks while the queue is empty.

    Args:
        url: URL of the job detail page.
        proxies: ``queue.Queue`` of proxy dicts suitable for ``requests``.

    Returns:
        list: Items taken from the ``span.sp4`` elements, followed by the
        non-blank strings of the ``p.t2`` element when the page has one.
    """
    tag_url = url
    proxy = proxies.get()

    # TODO: progress output
    print('getTag()当前处理链接:', tag_url)
    print('getTag()获得的ip:', proxy)
    while True:
        try:
            r_tag = requests.get(tag_url, timeout=5, proxies=proxy)
        except (requests.exceptions.Timeout,
                requests.exceptions.ProxyError,
                requests.exceptions.ConnectionError) as e:
            # This proxy failed: drop it and retry with the next one.
            print(e)
            proxy = proxies.get()
            print('尝试新的proxy')
        else:
            # Success: leave the retry loop.
            break

    # Hand the working proxy back so concurrent callers do not drain the
    # pool by one proxy per successful request.
    proxies.put(proxy)

    r_tag.encoding = 'gbk'
    tag_soup = BeautifulSoup(r_tag.text, features='html5lib')

    # First row of tags on the detail page: the second child node of every
    # span.sp4 (presumably the text after an icon; verify against the page
    # markup).
    tag_list = [span.contents[1] for span in tag_soup.find_all('span', 'sp4')]

    # Second row of tags; some detail pages do not have it.
    tag_2 = tag_soup.find('p', 't2')
    if tag_2 is not None:
        # Keep only strings that are not empty or whitespace-only.
        tag_list.extend(s for s in tag_2.strings if s and s.strip())

    print('getTag()中tag_list:', tag_list)
    print('----------------------')

    return tag_list

In [70]:
# Fill in the tag attribute of every jobObject.
@excute_time
def pack_tag(jobObjectList, proxies, **thread_num):
    """Set ``tag`` on every object in ``jobObjectList`` using worker threads.

    The objects are placed on a work queue; each worker repeatedly takes one
    and assigns ``getTag(obj.tag_href, proxies)`` to ``obj.tag`` until the
    queue is empty.  The objects are modified in place.

    Args:
        jobObjectList: Iterable of jobObjects with ``tag_href`` set.
        proxies: ``queue.Queue`` of proxy dicts, forwarded to ``getTag``.
        **thread_num: Accepts ``thread_num=<int>``, the number of worker
            threads (kept as a keyword so the timing decorator can log it).
            Defaults to 1 when omitted.
    """
    worker_count = thread_num.get('thread_num')
    print('thread_num.get(\'thread_num\'): ', worker_count)
    if worker_count is None:
        worker_count = 1

    # Work queue shared by all threads, instead of one thread walking the
    # list item by item.
    jobObject_queue = queue.Queue()
    for job in jobObjectList:
        jobObject_queue.put(job)

    def pack_object(pending):
        while True:
            # get_nowait() ends the worker as soon as no work is left.  A
            # blocking get() followed by a qsize() check leaves every thread
            # that arrives after the last item was taken (or all of them, for
            # an empty list) blocked forever, so join() never returns.
            try:
                job = pending.get_nowait()
            except queue.Empty:
                return
            job.tag = getTag(job.tag_href, proxies)

    threads = [
        threading.Thread(target=pack_object, args=(jobObject_queue,))
        for _ in range(worker_count)
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()
        print('%s 已停止' % thread.name)

    print('tag组装完成!')


In [71]:
# Write the scraped jobs to the CSV file.
@excute_time
def save_to_csv(jobObjectList):
    """Append one row per jobObject to ``51job.csv`` in the working directory.

    The header row is written only when the file does not exist yet.  Each
    row contains the object's ``job``, ``company``, ``address``, ``salary``
    and ``tag`` attributes, in that column order.
    """
    headers = ['职位', '公司', '地址', '薪酬', '标签']
    attr_names = ('job', 'company', 'address', 'salary', 'tag')

    # Build every row up front, pairing each column title with its attribute.
    rows = [
        {column: getattr(job, attr) for column, attr in zip(headers, attr_names)}
        for job in jobObjectList
    ]

    # Append mode creates the file, so check for it before opening.
    write_header = not os.path.isfile('51job.csv')
    with open('51job.csv', 'a', newline='') as f:
        writer = csv.DictWriter(f, headers)
        if write_header:
            writer.writeheader()
        writer.writerows(rows)

In [None]:
def handle_single(url, proxies, thread_num=20):
    """Scrape one job list page end to end and append the result to the CSV.

    Parses the list page into jobObjects, fills in their tags and saves them.
    Fetching and cleaning the proxy pool is deliberately left to the caller:
    doing it here slowed down the handling of every single page.

    Args:
        url: URL of the job list page.
        proxies: ``queue.Queue`` of already-checked proxy dicts.
        thread_num: Number of worker threads used to collect the tags.
    """
    jobObjectList = get_jobObjectList(url, proxies)
    # Passed by keyword so the timing decorator can record the thread count.
    pack_tag(jobObjectList, proxies, thread_num=thread_num)
    save_to_csv(jobObjectList)

In [76]:
def main():
    """Refresh the proxy pool, scrape the first Java result page and time it.

    Prints the number of usable proxies and the time spent on the page, then
    appends an empty row to ``运行时间统计.csv``.
    """
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,Java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

    # Build the proxy pool: download, load, then keep only working proxies.
    ipToFile()
    proxies = getCleanProxies(getIPool(), thread_num=40)
    print('可用的ip: ', proxies.qsize())

    began = time()
    handle_single(url, proxies)
    finish_time = time() - began
    print('完成写入单页jobObject!')
    print('完成时间: ', finish_time)

    # TODO: the empty row below separates this run's timing statistics from
    # the next run's.
    headers = ['函数', '运行时间', '线程数']
    with open('运行时间统计.csv', 'a', newline='') as f:
        csv.DictWriter(f, headers).writerow(dict.fromkeys(headers, ''))


main()

ipToFile() 运行时间: 1
getIPool() 运行时间: 0
线程开始：
Thread-179开始运行
当前剩余处理的ip: Thread-180开始运行
 300
当前剩余处理的ip: Thread-181开始运行
 299
当前剩余处理的ip: Thread-182开始运行
 298
当前剩余处理的ip: Thread-183开始运行
 297
当前剩余处理的ip: Thread-184开始运行
 296
当前剩余处理的ip: Thread-185开始运行
 295
当前剩余处理的ip: Thread-186开始运行
 294
当前剩余处理的ip: Thread-187开始运行
 293
当前剩余处理的ip: Thread-188开始运行
 292
当前剩余处理的ip: Thread-189开始运行
 291
当前剩余处理的ip: Thread-190开始运行
 290
当前剩余处理的ip: Thread-191开始运行
 289
当前剩余处理的ip: Thread-192开始运行
 288
当前剩余处理的ip: Thread-193开始运行
 287
当前剩余处理的ip: Thread-194开始运行
 286
当前剩余处理的ip: Thread-195开始运行
 285
当前剩余处理的ip: Thread-196开始运行
 284
当前剩余处理的ip: Thread-197开始运行
 283
当前剩余处理的ip: Thread-198开始运行
 282
当前剩余处理的ip: Thread-199开始运行
 281
当前剩余处理的ip: Thread-200开始运行
 280
当前剩余处理的ip: Thread-201开始运行
 279
当前剩余处理的ip: Thread-202开始运行
 278
当前剩余处理的ip: Thread-203开始运行
 277
当前剩余处理的ip: Thread-204开始运行
 276
当前剩余处理的ip: Thread-205开始运行
 275
当前剩余处理的ip: Thread-206开始运行
 274
当前剩余处理的ip: Thread-207开始运行
 273
当前剩余处理的ip: Thread-208开始运行
 272
当前剩余处理的ip: Thread-209开始运行
 271
当前剩余处理的ip: 

KeyboardInterrupt: 

- 装饰器查看每个函数处理的时间，具体优化
- 清洗ip的函数的代码不应该放在处理单页数据的函数里面
----
现在想的是多线程来处理，一个线程调用一个handle_single()，就是
一个线程执行完成一个页面的所有操作，不知道这样速度是不是会提升

可以写一个模拟程序，来看看是不是会提升速度

----

最耗时的操作是pack_tag，98%的时间在这里被消耗!