# Parser

In [1]:
import sys
import time
import requests
from lxml import etree

from utils.logger import logger

class Parser(object):
    """Scrape free proxy listings from several public proxy-list websites.

    Each ``parse_*`` method downloads the listing page(s) of one site and
    returns a list of dicts with the string-valued keys ``'ip'``, ``'port'``
    and ``'http_type'``.  A failed download (non-200 status or a ``requests``
    error) is logged through ``logger`` and stops the pagination of that
    listing; the proxies gathered so far are still returned.
    """

    # Seconds to wait for a listing page before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _cell_text(cell):
        """Return the stripped text of a cell, or '' when the cell has no text."""
        return (cell.text or '').strip()

    @classmethod
    def _row_to_item(cls, cells, ip_idx, port_idx, type_idx=None, http_type=''):
        """Build one proxy dict from the cells of a table row.

        cells     -- sequence of elements exposing a ``.text`` attribute.
        ip_idx    -- position of the IP cell.
        port_idx  -- position of the port cell.
        type_idx  -- position of the HTTP/HTTPS cell, or None when the page
                     has no such column (then *http_type* is used verbatim).

        Returns None for rows that cannot describe a proxy: too few cells
        (e.g. header rows) or an empty IP / port cell.
        """
        last_needed = max(ip_idx, port_idx, -1 if type_idx is None else type_idx)
        if len(cells) <= last_needed:
            return None
        ip = cls._cell_text(cells[ip_idx])
        port = cls._cell_text(cells[port_idx])
        if not ip or not port:
            return None
        if type_idx is not None:
            http_type = cls._cell_text(cells[type_idx])
        return {'ip': ip, 'port': port, 'http_type': http_type}

    @classmethod
    def _rows_to_items(cls, rows, cell_xpath, ip_idx, port_idx, type_idx=None, http_type=''):
        """Convert row elements to proxy dicts, silently skipping unusable rows."""
        items = []
        for row in rows:
            item = cls._row_to_item(row.xpath(cell_xpath), ip_idx, port_idx, type_idx, http_type)
            if item is not None:
                items.append(item)
        return items

    def _fetch_html(self, url, label, headers=None):
        """Download *url* and return its parsed HTML tree, or None on failure.

        label -- page name used in the error log, e.g. ``"qydaili page <3>"``.
        """
        try:
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            logger.error("download %s failed.(%s)" % (label, exc))
            return None
        if response.status_code != 200:
            logger.error("download %s failed.(status_code:%d)" % (label, response.status_code))
            return None
        html = etree.HTML(response.text)
        if html is None:
            # Defensive: nothing parseable came back, treat it like a failed download.
            logger.error("download %s failed.(empty document)" % label)
        return html

    def _parse_paged_table(self, site, url_template, num, type_idx=None):
        """Scrape pages 1..num of a site whose rows live in ``//table/tbody/tr``.

        url_template must contain a single ``%d`` for the page number.  The
        IP is read from the first cell and the port from the second one.
        """
        items = []
        for page_id in range(1, num + 1):
            html = self._fetch_html(url_template % page_id, "%s page <%d>" % (site, page_id))
            if html is None:
                break
            items.extend(self._rows_to_items(html.xpath('//table/tbody/tr'), 'td', 0, 1, type_idx))
            time.sleep(1)   # pause between consecutive page requests
        logger.info("get %d IPs from %s." % (len(items), site))
        return items

    # ------------------------------------------------------------------
    # Site parsers
    # ------------------------------------------------------------------
    def parse_kuaidaili(self, num=5):
        """Scrape the first *num* pages of both kuaidaili.com free lists."""
        items = []
        for target in ('inha', 'intr'):
            for page_id in range(1, num + 1):
                html = self._fetch_html(
                    "https://www.kuaidaili.com/free/%s/%d/" % (target, page_id),
                    "kuaidaili page <%s,%d>" % (target, page_id))
                if html is None:
                    # Give up on this list only; the other list is still tried.
                    break
                rows = html.xpath('//*[@id="list"]/table/tbody/tr')
                items.extend(self._rows_to_items(rows, 'td', 0, 1, 3))
                time.sleep(1)   # pause between consecutive page requests
        logger.info("get %d IPs from kuaidaili." % len(items))
        return items

    def parse_shenjidaili(self):
        """Scrape the stable HTTP and HTTPS tables of shenjidaili.com."""
        items = []
        html = self._fetch_html("http://www.shenjidaili.com/open/", "shenjidaili page")
        if html is not None:
            # The first row of each table is skipped (presumably the header row).
            rows = (html.xpath('//*[@id="pills-stable_http"]/table/tr')[1:]
                    + html.xpath('//*[@id="pills-stable_https"]/table/tr')[1:])
            items = self._rows_to_items(rows, 'td', 0, 1, 3)
        logger.info("get %d IPs from shenjidaili." % len(items))
        return items

    def parse_qydaili(self, num = 10):
        """Scrape the first *num* pages of the qydaili.com free list."""
        return self._parse_paged_table(
            "qydaili", "http://www.qydaili.com/free/?action=china&page=%d", num, type_idx=3)

    def parse_superfastip(self, num = 10):
        """Scrape the first *num* pages of the superfastip.com free list."""
        return self._parse_paged_table(
            "superfastip", "http://www.superfastip.com/welcome/freeip/%d", num, type_idx=3)

    def parse_89ip(self, num=18):
        """Scrape the first *num* pages of 89ip.cn; ``http_type`` is always ''."""
        return self._parse_paged_table("89ip", "http://www.89ip.cn/index_%d.html", num)

    def parse_data5u(self):
        """Scrape the five free listings of data5u.com (one page each)."""
        targets = ('', '/gngn', '/gnpt', '/gwgn', '/gwpt')
        header = {
            'Host': 'www.data5u.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        base_url = "http://www.data5u.com/free%s/index.shtml"
        items = []
        for target in targets:
            html = self._fetch_html(base_url % target, "data5u page <%s>" % target, headers=header)
            if html is None:
                break
            # Each proxy is a <ul class="l2"> whose fields are span/li elements.
            rows = html.xpath('//ul[@class="l2"]')
            items.extend(self._rows_to_items(rows, 'span/li', 0, 1, 3))
            time.sleep(1)   # pause between consecutive page requests
        logger.info("get %d IPs from data5u." % len(items))
        return items

    # Disabled: 66ip.cn answers with HTTP 521 and a JavaScript challenge that
    # has to be bypassed before the listing can be downloaded.
    #
    # def parse_66ip(self, num=5):
    #     items = []
    #     for page_id in range(1, num+1):
    #         response = requests.get("http://www.66ip.cn/%d.html" % page_id)
    #         if response.status_code != 200:
    #             logger.error("download 66ip page <%d> failed.(status_code:%d)" % (page_id, response.status_code))
    #             break
    #         html = etree.HTML(response.text)
    #         trs = html.xpath('//table/tr')
    #         for tr in trs:
    #             item = {}
    #             tds = tr.xpath('td')
    #             item['ip'] = tds[0].text.strip()        # IP
    #             item['port'] = tds[1].text.strip()      # Port
    #             item['http_type'] = ''                  # HTTP or HTTPS
    #             items.append(item)
    #         time.sleep(1)
    #     logger.info("get %d IPs from 66ip." % len(items))
    #     return items

    def parse_31f(self):
        """Scrape the HTTP and HTTPS proxy tables of 31f.cn.

        The page has no protocol column, so ``http_type`` is derived from the
        listing that was downloaded ('http' or 'https').
        """
        listings = (('http-proxy/', 'http'), ('https-proxy/', 'https'))
        base_url = "http://31f.cn/%s"
        items = []
        for target, http_type in listings:
            html = self._fetch_html(base_url % target, "31f page <%s>" % target)
            if html is None:
                break
            # The first row is skipped (presumably the header row); the IP and
            # port are read from the second and third cells.
            rows = html.xpath('//table[1]/tr')[1:]
            items.extend(self._rows_to_items(rows, 'td', 1, 2, http_type=http_type))
            time.sleep(1)   # pause between consecutive page requests
        logger.info("get %d IPs from 31f." % len(items))
        return items

if __name__ == '__main__':
    # Manual smoke test: scrape 31f.cn once and print how many proxies came back.
    parser = Parser()

    res = parser.parse_31f()
    print(len(res))

get 100 IPs from 31f.
100


In [37]:
import execjs
import re
# Experiment: request page 1 of 66ip.cn and, when the site answers with HTTP
# status 521, print the JavaScript it sent after partially unwrapping it.
response = requests.get("http://www.66ip.cn/1.html")
if response.status_code == 521:
    # Earlier attempt, kept for reference:
#res = response.text.replace("<script>","").replace("eval","").replace("</script>.*","").strip('\\n')
    # Join the bodies of all <script>...</script> pairs found in the response.
    res = ''.join(re.findall('<script>(.*?)</script>', response.text))
    # Turn each `eval` token into an assignment so the wrapped expression is
    # stored in `eval_test` instead of being evaluated (presumably the script
    # has the form eval(...) -- TODO confirm).
    res = res.replace('eval','var eval_test=')
    # NOTE(review): this executes JavaScript downloaded from a remote site in
    # the local JS runtime used by execjs; treat the input as untrusted.
    x=execjs.compile(res)
    print(x.eval('eval_test'))

var _58=function(){setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);document.cookie='__jsl_clearance=1555510517.408|0|'+(function(){var _58=[(-~(+!!/!/)+[]+[[]][0]),[-~[]]+[5],[-~[]]+(~~![]+[[]][0]),[-~[]]+[-~(+[])-~[]+4],[5],[-~[]]+[-~[]],[-~[]]+[-~[]+(-~![]+[-~((-~{}+[-~{}-~{}]>>-~{}-~{}))]>>-~![])],[-~[]+(-~![]+[-~((-~{}+[-~{}-~{}]>>-~{}-~{}))]>>-~![])],[-~((-~{}<<((+!!/!/)|-~(+!!/!/))))],[-~[]]+(-~(+!!/!/)+[]+[[]][0]),[-~[]]+(-~[(-~{}+[-~{}-~{}]>>-~{}-~{})+(-~{}+[-~{}-~{}]>>-~{}-~{})]+[]+[[]][0]),[-~[-~{}-~{}]],[-~[]],[(-~{}<<2)],[-~[]]+[-~[-~{}-~{}]],[-~[]]+[(-~{}<<2)],[-~(+[])-~[]+4],(-~[(-~{}+[-~{}-~{}]>>-~{}-~{})+(-~{}+[-~{}-~{}]>>-~{}-~{})]+[]+[[]][0]),(~~![]+[[]][0])];for(var _9=0;_9<_58.length;_9++){_58[_9]=['D','mv','C',(-~[(-~{}+[-~{}-~{}]>>-~{}-~{})+(-~{}+[-~{}-~{}]>>-~{}-~{})]+[]+[[]][0])+[[(-~[]<<-~[])]/~~{}+[]+[[]][0]][0].charAt(-~[-~{}-~{}]+(-~[]<<-~[])+(-~[]<<-~[]))+[{}+[]][0].charAt((-~{}<<((+!!/!/)|-~(+!!/!/

# Checker

In [2]:
import re
import time
import requests
from lxml import etree

from utils.logger import logger

# Numeric codes for the protocols a proxy supports.  The combined entry is the
# bitwise OR of the two single-protocol codes.
HTTP_TYPE = {'HTTP': 1, 'HTTPS': 2}
HTTP_TYPE['HTTP,HTTPS'] = HTTP_TYPE['HTTP'] | HTTP_TYPE['HTTPS']

class Checker(object):
    """Validate a proxy address and probe which protocols it can relay."""

    # Dotted-quad IPv4 address, every octet in 0-255.  Used with fullmatch(),
    # so no ^/$ anchors are needed and a trailing newline is rejected.
    _IP_RE = re.compile(
        r'((25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}'
        r'(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)'
    )

    # (flag bit, URL scheme, probe URL).  requests selects the proxy entry by
    # the scheme of the requested URL, so the HTTPS probe has to request an
    # https:// URL or the proxy would not be used at all.
    _PROBES = (
        (1, 'http', 'http://www.httpbin.org/headers'),
        (2, 'https', 'https://www.httpbin.org/headers'),
    )

    def __init__(self):
        pass

    def _check_ip(self, ip):
        """Return True when *ip* is a string holding a valid IPv4 address."""
        if not isinstance(ip, str):
            return False
        return self._IP_RE.fullmatch(ip) is not None

    def _check_port(self, port):
        """Return True when *port* converts to an integer within 0..65535.

        Values that cannot be converted are logged and rejected.
        """
        try:
            number = int(port)
        except (ValueError, TypeError):
            logger.error("%s is not a valid port." % str(port))
            return False
        return 0 <= number <= 65535

    def check(self, ip, port, timeout=5):
        """Probe the proxy ``ip:port`` over HTTP and HTTPS.

        ip      -- IPv4 address as a string.
        port    -- port number as an int or a numeric string (the parsers
                   deliver strings).
        timeout -- seconds allowed for each of the two probe requests.

        Returns ``(ip, port, http_type)`` with *port* as an int and
        *http_type* a bitmask (1 = HTTP works, 2 = HTTPS works, 3 = both),
        or None when the address is invalid or neither probe succeeded.
        """
        if not (self._check_ip(ip) and self._check_port(port)):
            return None
        port = int(port)
        # The proxy itself is contacted over plain HTTP for both probes.
        proxy_url = "http://%s:%d" % (ip, port)

        http_type = 0
        for flag, scheme, url in self._PROBES:
            try:
                res = requests.get(url, proxies={scheme: proxy_url}, timeout=timeout)
            except requests.RequestException:
                logger.error("<%s://%s:%d> is not available." % (scheme, ip, port))
                continue
            if res.ok:
                http_type |= flag

        if http_type > 0:
            return (ip, port, http_type)
        return None

    @staticmethod
    def get_address():
        """Placeholder, not implemented yet.

        Declared static so it can be called on the class (as before) as well
        as on an instance.
        """
        pass
if __name__ == '__main__':
    # Manual smoke test: probe one hard-coded proxy and print what check()
    # returns (a tuple, or None).
    checker = Checker()
    print(checker.check('118.24.61.165', 8118))

None


# Scheduler

# ProxyHub

In [None]:
import threading
import time

from parser import Parser
from checker import Checker

class ProxyHub(object):
    """Hold the proxy pools and the worker loops that fill and verify them.

    Attributes (plain lists):
        pre_working_list -- items scraped by Parser that still await checking.
        working_list     -- truthy results returned by Checker.check.
        error_list       -- not used by any method yet.
    """

    def __init__(self):
        self.pre_working_list = []
        self.working_list = []
        self.error_list = []

    def _parser_threading(self):
        """Scrape every supported site in an endless loop and queue the results.

        After each full round the sizes of the pending and the working pool
        are printed and the loop sleeps for 30 seconds.  Never returns.
        """
        parser = Parser()
        scrapers = (
            parser.parse_kuaidaili,
            parser.parse_qydaili,
            parser.parse_data5u,
            parser.parse_shenjidaili,
            parser.parse_superfastip,
            parser.parse_31f,
            parser.parse_89ip,
        )
        while True:
            for scrape in scrapers:
                self.pre_working_list.extend(scrape())
            print(len(self.pre_working_list), len(self.working_list))
            time.sleep(30)

    def _check_one(self, checker):
        """Check the oldest pending item with *checker*.

        Returns False when nothing was pending, True otherwise.  A truthy
        check result is appended to ``working_list``.
        """
        try:
            # EAFP: the list may have been drained between a length test and
            # the pop if several checker loops run at once.
            item = self.pre_working_list.pop(0)
        except IndexError:
            return False
        res = checker.check(item['ip'], item['port'])
        if res:
            self.working_list.append(res)
        return True

    def _checker_thread_1(self):
        """Verify pending proxies in an endless loop.  Never returns."""
        checker = Checker()
        while True:
            if len(self.pre_working_list) > 50:
                # NOTE(review): behaviour kept from the original -- the loop
                # pauses one second per item while more than 50 items are
                # pending; confirm this throttle is intended.
                time.sleep(1)
            if not self._check_one(checker):
                # Nothing to do yet: wait for the parser loop to queue items.
                time.sleep(1)

    def start(self):
        """Placeholder, not implemented yet."""
        pass

    def get(self, n=1, callback = None):
        """Invoke *callback* (with no arguments) when one is supplied.

        n is accepted but not used yet.  Returns None.
        """
        if callback is not None:
            callback()

    def stop(self):
        """Placeholder, not implemented yet."""
        pass