# 反爬：代理伺服器/IP

* 了解「IP 黑/白名單」的反爬蟲機制
* 「IP 黑/白名單」反爬蟲的因應策略

## 作業目標

* 目前程式中的 proxy_ips 是手動輸入的，請改為自動從 https://free-proxy-list.net/ 抓取 Proxy 清單來產生 proxy_ips，並找出其中可用的 Proxy。




In [1]:
from bs4 import BeautifulSoup
import requests
import random

In [2]:
proxy_ips = []

# Request https://free-proxy-list.net/ and collect the free proxies listed in
# its table as "ip:port" strings.

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
url = 'https://free-proxy-list.net/'
# A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page as the table.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

tbody = soup.find('tbody')
if tbody is None:
    raise RuntimeError('No proxy table found at ' + url)

# Only the first 100 rows are used; the first two cells of a row are joined
# as "<cell 0>:<cell 1>" (IP and port).
for tr in tbody.find_all('tr')[:100]:
    cells = tr.find_all('td')
    if len(cells) < 2:
        # Skip rows that do not carry both columns.
        continue
    proxy_ips.append(cells[0].text + ':' + cells[1].text)

print(proxy_ips)

['201.76.11.140:31181', '5.39.101.15:8118', '91.195.162.178:8080', '176.119.134.71:23500', '46.151.108.6:30874', '46.101.131.190:8118', '185.37.213.76:30695', '103.216.48.81:8080', '222.124.173.147:53281', '179.184.230.194:46495', '118.172.201.105:36307', '203.190.116.140:56163', '36.67.47.187:51827', '154.66.217.126:40476', '182.53.197.202:45609', '95.158.153.69:49753', '115.74.201.137:42108', '36.67.85.218:55021', '182.160.117.130:53281', '45.125.222.97:30569', '93.179.209.216:57520', '103.36.124.121:54803', '36.89.183.241:43219', '103.80.238.199:53281', '139.255.87.236:41882', '41.217.216.45:33602', '1.10.187.149:44976', '79.78.184.12:56834', '176.111.180.209:54237', '103.53.72.20:53749', '201.48.226.249:41846', '124.41.240.171:42190', '103.18.79.246:8080', '203.153.20.64:53281', '103.209.65.12:6666', '212.3.208.252:30091', '45.235.87.4:51996', '189.5.248.83:3128', '190.7.141.66:56146', '83.171.99.160:57659', '90.183.101.238:43511', '195.182.22.178:45063', '182.52.238.111:30098', '1

In [3]:
# Send a request through each free proxy to http://ip.filefab.com/index.php
# and keep the proxies that answer with a page reporting an IP address.

available_proxies = []

# Slicing (rather than indexing over range(100)) avoids an IndexError when
# fewer than 100 proxies were collected.
for ip in proxy_ips[:100]:
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                            proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        # A page without <h1 id="ipd"> raises AttributeError here, which
        # marks the proxy as failed.
        print(soup.find('h1', id='ipd').text.strip())
    except Exception:
        # Free proxies fail in many different ways, so the catch stays broad;
        # unlike a bare `except:`, it still lets Ctrl-C interrupt the loop.
        print('Fail')
        continue
    available_proxies.append(ip)
    # Stop probing once six working proxies have been collected.
    if len(available_proxies) > 5:
        break

print(available_proxies)

Use 201.76.11.140:31181
Fail
Use 5.39.101.15:8118
Fail
Use 91.195.162.178:8080
Your IP address: 91.195.162.178
Use 176.119.134.71:23500
Fail
Use 46.151.108.6:30874
Fail
Use 46.101.131.190:8118
Fail
Use 185.37.213.76:30695
Your IP address: 185.37.213.76
Use 103.216.48.81:8080
Fail
Use 222.124.173.147:53281
Fail
Use 179.184.230.194:46495
Fail
Use 118.172.201.105:36307
Fail
Use 203.190.116.140:56163
Your IP address: 203.190.116.140
Use 36.67.47.187:51827
Fail
Use 154.66.217.126:40476
Fail
Use 182.53.197.202:45609
Fail
Use 95.158.153.69:49753
Your IP address: 95.158.153.69
Use 115.74.201.137:42108
Fail
Use 36.67.85.218:55021
Fail
Use 182.160.117.130:53281
Fail
Use 45.125.222.97:30569
Fail
Use 93.179.209.216:57520
Your IP address: 93.179.209.210
Use 103.36.124.121:54803
Fail
Use 36.89.183.241:43219
Fail
Use 103.80.238.199:53281
Fail
Use 139.255.87.236:41882
Fail
Use 41.217.216.45:33602
Your IP address: 41.217.216.45
['91.195.162.178:8080', '185.37.213.76:30695', '203.190.116.140:56163', '95

In [13]:
# Send a request through each previously found working proxy to
# https://httpbin.org/ip to check whether the proxy really replaces the IP.

for ip in available_proxies:
    print('Use', ip)
    try:
        resp = requests.get('https://httpbin.org/ip',
                            proxies={'http': ip, 'https': ip}, timeout=10)
        # httpbin reports the caller's IP as JSON; a non-JSON body raises
        # and is counted as a failure.
        print(resp.json())
    except Exception:
        # Broad on purpose (connection, timeout, and decode errors), but
        # unlike a bare `except:` it still lets Ctrl-C interrupt the loop.
        print('Fail')

Use 91.195.162.178:8080
{'origin': '91.195.162.178, 91.195.162.178'}
Use 185.37.213.76:30695
{'origin': '185.37.213.76, 185.37.213.76'}
Use 203.190.116.140:56163
Fail
Use 95.158.153.69:49753
Fail
Use 93.179.209.216:57520
Fail
Use 41.217.216.45:33602
Fail
Use 91.195.162.178:8080
{'origin': '91.195.162.178, 91.195.162.178'}
