# 数据库连接对象

In [9]:
import pymysql
import traceback


class DBHelper:
    def __init__(self):
        # 链接数据库
        try:
            # charset 默认是 latin1, 查询到中文会是？？
            # charset='utf8mb4' 避免有表情时插入错误
            self.__db = pymysql.connect(
                host='127.0.0.1',
                user='root',
                password='111111',
                database='test',
                charset='utf8mb4')
            self.__cur = self.__db.cursor()
        except pymysql.Error as e:
            print('链接数据库失败：', traceback.print_exc())

    def insert(self, table, myDict):
        # 答案中存在表情会出错
        # 答案中存在双引号会出错，sql语句会发生歧义
        # 插入一条数据
        try:
            cols = ','.join(myDict.keys())
            values = ','.join(
                map(lambda x: '"' + str(x) + '"', myDict.values()))
            sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, cols, values)
            result = self.__cur.execute(sql)
            self.__db.commit()
        except pymysql.Error as e:
            print('插入失败：', traceback.print_exc())
            # 发生错误时回滚
            # DML 语句，执行完之后，处理的数据，都会放在回滚段中（除了 SELECT 语句），
            # 等待用户进行提交（COMMIT）或者回滚 （ROLLBACK），当用户执行 COMMIT / ROLLBACK后，
            # 放在回滚段中的数据就会被删除。
            self.__db.rollback()

    def query(self, sql):
        try:
            self.__cur.execute(sql)
            result = self.__cur.fetchall()
            self.__db.commit()
            if result:
                return result
            else:
                return None
            
        except  pymysql.Error as e:
            print("数据库-查询异常", traceback.print_exc())
            
    def check_exist(self, table, record):
        try:
            sql = 'SELECT COUNT(*) FROM %s WHERE url= %s' % (table, record)
            result = self.__cur.execute(sql)
            if result > 0:
                return True
            else:
                return False
        except pymysql.Error as e:
            print('查询失败！', traceback.print_exc())
            return False
            
            
    def close(self ):
        self.__cur.close()
        self.__db.close()


# 测试使用代理

In [57]:
import random
UserAgent_List = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

def constructHeaders():
    headers = {
    'User-Agent': random.choice(UserAgent_List),
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
    }
    return headers
constructHeaders()

{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 'Accept-Encoding': 'gzip',
 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}

In [58]:
import requests
import re

url_proxy = 'http://www.xicidaili.com/nt/'
num = 2 #爬取页数，前两页
def ip_test(ip, headers, url_for_test='https://www.baidu.com', set_timeout=10):
    try:
        rp = requests.get(url_for_test, headers=headers, proxies={'http': ip[0] + ':' + ip[1]}, timeout = set_timeout)
        if rp.status_code == 200:
            return True
        else:
            return False
        
    except BaseException as e:
        return False
    
    
def sraw_ip(url_proxy, num,headers, url_for_test='https://www.baidu.com'):    
    ip_list = []
    dbhelper = DBHelper()
    for num_page in range (1, num +1):
        url_proxy = url_proxy + str(num_page)
        resp = requests.get(url_proxy, headers=headers, timeout = 10)
        resp.encoding = 'utf-8'
        htmlContent = resp.text
        
        pattern = re.compile('<td class="country">.*?alt="Cn" />.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>',re.S)
        iterms = re.findall(pattern, htmlContent)
        for ip in iterms:
            if ip_test(ip, headers, url_for_test):
                print('测试通过，IP地址为' + str(ip[0]) + ':' + str(ip[1]))
                ip_list.append(ip[0] + ':' + ip[1])
                myDict = {'ip':ip[0] + ':' + ip[1]}
                dbhelper.insert('t_meizitu_proxy_ip', myDict)
            else:
                print('测试失败！')
    dbhelper.close()
    return ip_list

headers = constructHeaders()
sraw_ip(url_proxy, num, headers)
    

测试通过，IP地址为220.249.185.178:9999
测试通过，IP地址为61.155.164.108:3128
测试通过，IP地址为14.29.84.50:8080
测试通过，IP地址为123.7.38.31:9999
测试通过，IP地址为122.114.122.212:9999
测试通过，IP地址为123.139.56.238:9999
测试通过，IP地址为221.223.64.30:9797
测试通过，IP地址为115.183.11.158:9999
测试通过，IP地址为61.163.136.52:9999
测试通过，IP地址为119.39.68.6:808
测试通过，IP地址为60.191.134.165:9999
测试通过，IP地址为113.200.214.164:9999
测试通过，IP地址为171.37.209.103:9797
测试通过，IP地址为116.17.15.156:9999
测试通过，IP地址为101.81.140.63:9000
测试通过，IP地址为61.155.164.111:3128
测试通过，IP地址为122.72.18.34:80
测试通过，IP地址为116.25.100.62:9797
测试通过，IP地址为122.72.18.35:80
测试通过，IP地址为112.74.94.142:3128
测试通过，IP地址为121.43.178.58:3128
测试通过，IP地址为219.135.164.245:3128
测试通过，IP地址为113.78.88.148:9797
测试通过，IP地址为118.119.168.172:9999
测试通过，IP地址为113.77.243.161:9797
测试通过，IP地址为125.93.192.192:3128
测试通过，IP地址为202.98.197.242:3128
测试通过，IP地址为27.46.39.172:9797
测试通过，IP地址为112.250.65.222:53281
测试通过，IP地址为119.29.92.171:8888
测试通过，IP地址为101.81.104.211:53281
测试通过，IP地址为113.88.65.83:9797
测试通过，IP地址为203.174.112.13:3128
测试通过，IP地址为125.47.64.161:9999
测试通过，

['220.249.185.178:9999',
 '61.155.164.108:3128',
 '14.29.84.50:8080',
 '123.7.38.31:9999',
 '122.114.122.212:9999',
 '123.139.56.238:9999',
 '221.223.64.30:9797',
 '115.183.11.158:9999',
 '61.163.136.52:9999',
 '119.39.68.6:808',
 '60.191.134.165:9999',
 '113.200.214.164:9999',
 '171.37.209.103:9797',
 '116.17.15.156:9999',
 '101.81.140.63:9000',
 '61.155.164.111:3128',
 '122.72.18.34:80',
 '116.25.100.62:9797',
 '122.72.18.35:80',
 '112.74.94.142:3128',
 '121.43.178.58:3128',
 '219.135.164.245:3128',
 '113.78.88.148:9797',
 '118.119.168.172:9999',
 '113.77.243.161:9797',
 '125.93.192.192:3128',
 '202.98.197.242:3128',
 '27.46.39.172:9797',
 '112.250.65.222:53281',
 '119.29.92.171:8888',
 '101.81.104.211:53281',
 '113.88.65.83:9797',
 '203.174.112.13:3128',
 '125.47.64.161:9999',
 '58.252.6.165:9000',
 '219.136.175.142:9797',
 '202.112.180.59:808',
 '61.155.164.106:3128',
 '1.196.161.20:9999',
 '14.117.208.159:9797',
 '222.186.45.58:56859',
 '27.46.37.228:9797',
 '113.200.159.155:9999'

In [None]:
import requests
import random
import time


dbhelper = DBHelper()
result = dbhelper.query('SELECT ip from t_meizitu_proxy_ip')
headers = constructHeaders()
if result: 
    while True:
        proxy_ip = random.choice(result)[0].split(':')[0]
        d = {'loginname':'admin', 'loginpwd': '111111'}
        resp = requests.post("https://yfalogin.wdcloud.cc/admin-login/user/login/auth", headers = headers, data = d, proxies = {'https' : proxy_ip}, timeout = 10)
        print(resp.text, proxy_ip)
        

In [6]:
list = range(1,100)
list_len = len(list) -1
for i, x in enumerate(list):
    if i == list_len:
        print("=====================")
    else:
        pass


0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
30 31
31 32
32 33
33 34
34 35
35 36
36 37
37 38
38 39
39 40
40 41
41 42
42 43
43 44
44 45
45 46
46 47
47 48
48 49
49 50
50 51
51 52
52 53
53 54
54 55
55 56
56 57
57 58
58 59
59 60
60 61
61 62
62 63
63 64
64 65
65 66
66 67
67 68
68 69
69 70
70 71
71 72
72 73
73 74
74 75
75 76
76 77
77 78
78 79
79 80
80 81
81 82
82 83
83 84
84 85
85 86
86 87
87 88
88 89
89 90
90 91
91 92
92 93
93 94
94 95
95 96
96 97
97 98
