# Ref: https://codelike.pro/create-a-crawler-with-rotating-ip-proxy-in-python/

# Import Libraries

In [1]:
# Get the text data from the first job posting
import pandas as pd
import requests # For website connections
from bs4 import BeautifulSoup # For HTML parsing
import time
from fake_useragent import UserAgent
import random

# Import the Data

In [2]:
try:
    Https_txt_file = 'proxyLists/https_proxys_part_2.txt'
    #Download New HTTPS IPs .txt file from https://www.proxy-list.download/HTTPS
    https_proxys = pd.read_csv(Https_txt_file, sep=" ", header=None)
    #Convert to array
    https_proxys = https_proxys[0].values
    #Convert to list
    https_proxys = list(https_proxys)
except:
    print('There are no https proxies to import')

try:
    Http_txt_file = 'proxyLists/http_proxys_part_2.txt'
    #Download New HTTPS IPs .txt file from https://www.proxy-list.download/HTTP
    http_proxys = pd.read_csv(Http_txt_file, sep=" ", header=None)
    #Convert to array
    http_proxys = http_proxys[0].values
    #Convert to list
    http_proxys = list(http_proxys)
except:
    print('There are no http proxies to import')

In [3]:
# Make a dictionary for the proxies to test
testProxies = {'https':https_proxys,'http':http_proxys}

In [4]:
# Retrieve a random user-agent
header = {}
def randHeader():
    ua = UserAgent()
    header['User-Agent'] = ua.random

In [5]:
# Retrieve a random index for the https proxy
def randomIndex(conn_type):
    num = random.randint(0, len(testProxies[conn_type]) - 1)
    return num

In [6]:
proxyDict = {}
def randProx(conn_type):
    # Choose a single random proxy to use
    proxy_index = randomIndex(conn_type)
    proxy = testProxies[conn_type][proxy_index]
    proxyDict[conn_type] = conn_type +'://'+ proxy
    return proxy_index

In [7]:
# Initialize Dictionary for good proxies 
good_http_proxy = []
good_https_proxy = []
conn_time_http = []
conn_time_https = []
goodProxy = {'https':{'proxy':good_https_proxy,'time':conn_time_https},
             'http':{'proxy':good_http_proxy,'time':conn_time_http}} 

In [8]:
def filterProxys(url):
    timer = 0
    count = 0
    if url[0:5] == 'http:':
        conn_type = 'http'
        
    elif url[0:5] == 'https':
        conn_type = 'https'

    total_proxies = len(testProxies[conn_type])
    for n in range(len(testProxies[conn_type])):
        time.sleep(1)
        print(str(total_proxies-n)+' proxies left to filter')
            
        try:
            # Choose a random header
            randHeader()
            
        except:
            # Sometimes the header reassignment function returns an error
            pass
            
        # Choose a random proxy
        proxy_index = randProx(conn_type)

        try:
            # Try to connect to the url
            t1 = time.time()
            soup = requests.get(url,headers=header,proxies=proxyDict,timeout=5)
            t2 = time.time()
            t = t2-t1
                
            if  str(soup) == '<Response [200]>': # the connection is successful when <Response [200]> is returned
                print(str(testProxies[conn_type][proxy_index])+' is good')
                del testProxies[conn_type][proxy_index]
                timer += t
                count += 1
                print('average conn time: '+str(timer/count)+'\n')
                #Add to the good proxy and its connection time
                goodProxy[conn_type]['proxy'].append(proxyDict[conn_type])
                goodProxy[conn_type]['time'].append(t)
                    
            elif str(soup) != '<Response [200]>': #The connection was bad
                print(str(testProxies[conn_type][proxy_index])+' is bad'+'\n')
                del testProxies[conn_type][proxy_index]
                    
                    
        except: # The connection took too long 
            print(str(testProxies[conn_type][proxy_index])+' is slow'+'\n')
            del testProxies[conn_type][proxy_index]
    try:
        print('average proxy time:'+str(timer/count))
    except:
        print('No proxies to text')

# Filter the Https proxies

In [None]:
url = 'https://www.proxy-list.download/HTTPS'
filterProxys(url)

227 proxies left to filter
98.172.142.99:8080 is bad

226 proxies left to filter
157.230.163.11:80 is slow

225 proxies left to filter
50.195.207.133:47593 is bad

224 proxies left to filter
50.239.245.103:80 is slow

223 proxies left to filter
3.86.73.213:80 is slow

222 proxies left to filter
206.189.168.170:80 is slow

221 proxies left to filter
50.239.245.102:80 is slow

220 proxies left to filter
157.230.157.60:80 is slow

219 proxies left to filter
50.244.252.166:8081 is good
average conn time: 0.754698038101

218 proxies left to filter
35.229.113.175:443 is good
average conn time: 0.676149964333

217 proxies left to filter
192.119.203.170:48678 is slow

216 proxies left to filter
35.236.34.184:3128 is slow

215 proxies left to filter
35.236.26.70:80 is slow

214 proxies left to filter
50.235.28.146:3128 is slow

213 proxies left to filter
3.94.230.156:80 is slow

212 proxies left to filter
159.89.141.36:8080 is slow

211 proxies left to filter
198.199.122.218:80 is slow

210 pro

# Filter the Http proxies

In [10]:
url = 'https://www.proxy-list.download/HTTP'
filterProxys(url)

No proxies to text


# Overwrite the source text file with partially filtered or empty ip list

In [None]:
# overwrite the existing source of https proxy data with the filtered data
with open('proxyLists/https_proxys_part_2.txt', "w") as output:
    for line in testProxies['https']:
        output.write("%s\n" % line)

In [None]:
# overwrite the existing source of http proxy data with the filtered data
with open('proxyLists/http_proxys_part_2.txt', "w") as output:
    for line in testProxies['http']:
        output.write("%s\n" % line)

# Get all proxys that had connections times less than one second

In [None]:
df = pd.DataFrame(goodProxy['https'])
fast_proxies_https = list(df['proxy'][df['time']<1])

In [None]:
df = pd.DataFrame(goodProxy['http'])
fast_proxies_http = list(df['proxy'][df['time']<1])

# Import the single filtered files of proxys that had connections times less than one second and combine with the new filtered proxys

In [None]:
# Import
try:
    existing_https_proxies = list(pd.read_csv('../filtered_https_proxys.txt', header = None)[0])
except: #If the file is empty
    existing_https_proxies = []
try:
    existing_http_proxies = list(pd.read_csv('../filtered_http_proxys.txt', header = None)[0])
except: #If the file is empty
    existing_http_proxies = []

In [None]:
# add the new proxys to the existing ones if they are not already there
[existing_https_proxies.append(x) 
 for x in fast_proxies_https 
 if x not in existing_https_proxies]

[existing_http_proxies.append(x) 
 for x in fast_proxies_http 
 if x not in existing_http_proxies]

# Rewrite the final proxys text file with the new and old filtered proxies

In [None]:
#append filtered http proxys to existing .txt file
with open('../filtered_http_proxys.txt', "w") as output:
    for line in existing_http_proxies:
        output.write("%s\n" % line)

In [None]:
# append filtered https proxies to existing text file
with open('../filtered_https_proxys.txt', "w") as output:
    for line in existing_https_proxies:
        output.write("%s\n" % line)