Skip to content

Commit

Permalink
feat: grab up-to-date proxies and validate prior to returning
Browse files Browse the repository at this point in the history
  • Loading branch information
Justintime50 committed Dec 8, 2021
1 parent de750f3 commit fc25736
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 231 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# CHANGELOG

## v0.2.0 (2021-12-07)

* Removes proxy list from repo entirely
* Retrieves an updated proxy list each time you request a proxy
* Tests that the proxy works prior to returning it to you

## v0.1.1 (2021-12-07)

* Overhauls the proxy list with different proxies (tested more thoroughly) as well as moves the list from a hardcoded constant to a text file
Expand Down
10 changes: 5 additions & 5 deletions README.md
Expand Up @@ -13,13 +13,11 @@ Retrieve proxy servers.

</div>

Finding and storing a list of proxies can be taxing. Simply import `proxlist` and have it give you a rotating random proxy to run your requests through.

The list of currently configured proxies have `SSL` support, were tested to be able to accept connections (3 independent tests to ensure consistency), and were able to serve requests within 15 seconds (your mileage may vary based on the content you are sending/receiving through the proxy and where you are located in the world; if you receive timeouts, simply bump the timeout up or try again). This may change over time as proxies change and the list gets updated.
Finding and storing a list of proxies can be taxing - especially ones that are free and may stop working only minutes from now. `proxlist` will validate the proxy and return a rotating random proxy to you so you don't need to keep a list of proxies or ensure its contents are still valid.

Proxies are returned in the form of strings (eg: `ip:port`).

These proxies come from all over the world and may not be performant, this package is intended for testing purposes and I make no guarantee about where the data sent through these proxies goes - this package should not (yet) be considered for production applications.
These proxies come from all over the world and may not be performant for a production application. This package (for now) is intended for testing purposes and I make no guarantee about where the data sent through these proxies goes or how it's handled. The list of proxies rotates rapidly and is free and open source.

## Install

Expand Down Expand Up @@ -47,7 +45,9 @@ proxies = {
'https': f'http://{proxy}',
}

response = requests.get('https://google.com', proxies=proxies)
# Depending on the proxy and your location in the world, you may need to adjust the timeout
# to give the proxy enough time to route your request.
response = requests.get('https://google.com', proxies=proxies, timeout=15)
print(response.text)
```

Expand Down
100 changes: 0 additions & 100 deletions proxlist/data/proxies_to_validate.txt

This file was deleted.

24 changes: 0 additions & 24 deletions proxlist/data/proxy_list.txt

This file was deleted.

73 changes: 63 additions & 10 deletions proxlist/proxies.py
@@ -1,28 +1,81 @@
import os
import random
from typing import List

import requests
from bs4 import BeautifulSoup # type: ignore


def random_proxy() -> str:
    """Return a random, working proxy as an `ip:port` string.

    A fresh proxy list is retrieved, shuffled for randomness, and each
    candidate is validated in turn until a working one is found.

    Raises:
        Exception: if no working proxy could be found at this time.
    """
    proxy_list = _get_proxies()
    # Shuffle up front so callers get a different working proxy on each call.
    random.shuffle(proxy_list)

    for proxy in proxy_list:
        if _validate_proxy(proxy):
            # Return the proxy we just validated — not an arbitrary (and
            # possibly dead) entry picked from the full list afterwards.
            return proxy

    raise Exception('No working proxies were found at this time, please try again later.')


def list_proxies() -> List[str]:
    """Return every proxy (ip:port) currently scraped from the proxy source."""
    return _get_proxies()


def _open_proxy_list():
    """Opens the current proxy list text file."""
    # NOTE(review): this path is relative to the current working directory,
    # not this module's location — it only resolves when run from the repo root.
    proxy_filepath = os.path.join('proxlist', 'data', 'proxy_list.txt')
    with open(proxy_filepath, 'r') as filename:
        data = filename.readlines()
    # Strip trailing newlines and surrounding whitespace from each entry.
    # NOTE(review): no return statement is visible here — the built list
    # appears discarded; presumably the tail of this function is truncated
    # in this view (TODO confirm against the full file).
    proxy_list = [line_item.replace('\n', '').strip() for line_item in data]
def _get_proxies() -> List[str]:
    """Gets a list of proxies from https://www.sslproxies.org by scraping the proxy table.

    Returns:
        A list of `ip:port` strings, one per row of the site's proxy table.

    Raises:
        requests.RequestException: if the proxy site cannot be reached or
            returns an HTTP error status.
    """
    proxy_list = []

    # Use a timeout so we fail fast instead of hanging if the site is down.
    # (The previous `try/except Exception: raise` was a no-op and is removed.)
    website = requests.get('https://www.sslproxies.org', timeout=10)
    # Don't attempt to scrape an error page (eg: 403/503).
    website.raise_for_status()

    soup = BeautifulSoup(website.text, 'html.parser')
    table = soup.find('table').find('tbody')

    for table_entry in table.find_all('tr'):
        entry_elements = [td.text.strip() for td in table_entry.find_all('td')]
        # Column order on the site: IP first, port second.
        ip_address = entry_elements[0]
        port = entry_elements[1]
        # TODO: Eventually get more info like the country, anonymity, etc from this list

        proxy_list.append(f'{ip_address}:{port}')

    return proxy_list


def _validate_proxy(proxy: str) -> bool:
    """Validates that a proxy is working (these free proxies can come and go within minutes),
    test them before returning to the user.

    Returns True if a request routed through `proxy` (an `ip:port` string)
    reaches the target via that proxy's IP, False otherwise.
    """
    proxy_works = False

    url = 'https://google.com'
    # Present a browser User-Agent so the target is less likely to reject the probe.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
    }
    proxies = {
        "http": f"http://{proxy}",
        "https": f"http://{proxy}",
    }

    try:
        # A `3` second timeout here is pretty generous, but it's what we are going with for now
        # `stream=True` keeps the underlying socket open so we can inspect it below.
        with requests.get(url, proxies=proxies, headers=headers, timeout=3, stream=True) as r:
            # NOTE(review): `r.raw.connection.sock` reaches into urllib3
            # internals — may break across urllib3 versions; verify on upgrade.
            if r.raw.connection.sock:
                # `proxies['http'].split(':')[1][2:]` extracts the bare IP from
                # 'http://ip:port' ('//ip' with the leading '//' sliced off);
                # matching it against the socket's peer confirms the request
                # actually went through the proxy.
                if r.raw.connection.sock.getpeername()[0] == proxies['http'].split(':')[1][2:]:
                    proxy_works = True
    except Exception:
        # Couldn't connect to proxy, discard
        pass

    return proxy_works
77 changes: 0 additions & 77 deletions scripts/validate_proxies.py

This file was deleted.

10 changes: 3 additions & 7 deletions setup.py
Expand Up @@ -5,6 +5,7 @@

REQUIREMENTS = [
'requests == 2.*',
'beautifulsoup4 == 4.*',
]

DEV_REQUIREMENTS = [
Expand All @@ -20,20 +21,15 @@

setuptools.setup(
name='proxlist',
version='0.1.1',
version='0.2.0',
description='Your project description here',
long_description=long_description,
long_description_content_type="text/markdown",
url='http://github.com/Justintime50/proxlist',
author='Justintime50',
license='MIT',
packages=setuptools.find_packages(),
package_data={
'proxlist': [
'py.typed',
'data/proxy_list.txt',
],
},
package_data={'proxlist': ['py.typed']},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down

0 comments on commit fc25736

Please sign in to comment.