136 changes: 136 additions & 0 deletions src/html_tracing/utilities/agents.py
@@ -0,0 +1,136 @@
"""Module User Agents."""

from __future__ import annotations

import json
import random
from logging import INFO, basicConfig, info
from pathlib import Path

import requests
from bs4 import BeautifulSoup, ResultSet, Tag

basicConfig(level=INFO)


class UserAgents:
"""Interface representing user agents utilities.

MDN Web Docs:
-
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent

Wikipedia:
-
https://en.wikipedia.org/wiki/User_agent
"""

def __init__(
self: UserAgents,
filename: str = "user-agents",
folder: str = "datas",
directory: Path = Path(__file__).parent,
) -> None:
"""Interface representing user agents utilities.

Args:
filename (str, optional):
The filename to save the list of user agents.
Defaults to "user-agents".
folder (str, optional):
The folder to save the list of user agents.
Defaults to "datas".
directory (Path, optional):
The directory to save the list of user agents.
Defaults to Path(__file__).parent.
"""
path = directory / folder
self.path_html = path / f"{filename}.html"
self.path_json = path / f"{filename}.json"
self.endpoints = ["windows", "macos", "ios", "chrome-os", "android"]
self.url = "https://www.whatismybrowser.com/guides/the-latest-user-agent/"

def fetch(
self: UserAgents,
) -> None:
"""Fetch and save the user agents HTML tables."""
data: str = ""

for endpoint in self.endpoints:
response = requests.get(url=f"{self.url}{endpoint}", timeout=10)
info("GET %s%s -> status=%s ok=%s", self.url, endpoint, response.status_code, response.ok)
soup = BeautifulSoup(markup=response.content, features="html.parser")
data += str(soup.find("table"))

self.path_html.parent.mkdir(parents=True, exist_ok=True)
self.path_html.write_text(data=data, encoding="utf-8")

def convert(
self: UserAgents,
) -> None:
"""Convert the user agents HTML tables to JSON format."""
soup = BeautifulSoup(markup=self.path_html.read_text(), features="html.parser")
tables: ResultSet[Tag] = soup.find_all("table")
bodies: list[Tag] = [table.find("tbody") for table in tables]
rows: list[Tag] = [row for body in bodies for row in body.find_all("tr")]
lists = [row.select("td:last-child ul li span") for row in rows]
data = [span.string for spans in lists for span in spans]
self.save(data=data)

def refresh(
self: UserAgents,
) -> None:
"""Refresh the list of user agents."""
self.fetch()
self.convert()

def save(
self: UserAgents,
data: list[str],
path: Path | None = None,
) -> None:
"""Save the user agents list.

Args:
----
data (list[str]):
The user agents list.
path (Path | None, optional):
The save path. Defaults to None.
"""
(path or self.path_json).write_text(json.dumps(obj=data))

def load(
self: UserAgents,
) -> list[str]:
"""Load the user agents list.

Returns
-------
list[str]:
The list of user agents.
"""
return json.loads(s=self.path_json.read_text())

def extract(
self: UserAgents,
limit: int | None = None,
) -> list[str]:
"""Extract a list of random user agents.

Args:
----
limit (int | None, optional):
The maximum number of user agents. Defaults to None.

Returns
-------
list[str]:
The list of user agents.
"""
agents = self.load()

return random.sample(
population=agents,
k=len(agents) if limit is None else limit,
)
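
A minimal usage sketch of the new UserAgents class, assuming the src/ layout makes it importable as html_tracing.utilities.agents and that the whatismybrowser.com pages are reachable:

# Hypothetical usage of the new UserAgents helper; not part of the diff.
from html_tracing.utilities.agents import UserAgents

agents = UserAgents()  # defaults write to datas/user-agents.html and datas/user-agents.json
agents.refresh()       # fetch() the per-OS HTML tables, then convert() them to JSON
print(agents.extract(limit=3))  # three random user-agent strings from the saved list
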
139 changes: 45 additions & 94 deletions src/html_tracing/utilities/proxies.py
@@ -4,7 +4,7 @@

import functools
from dataclasses import asdict, dataclass
from logging import INFO, basicConfig, info
from logging import INFO, basicConfig
from pathlib import Path
from typing import Any, Callable, NamedTuple

@@ -13,7 +13,6 @@
import requests
from bs4 import BeautifulSoup, ResultSet, Tag
from pandas import DataFrame
from requests import exceptions

basicConfig(level=INFO)

@@ -94,28 +93,46 @@ class Query(NamedTuple):


class Proxies:
"""Interface representating proxy utilities."""
"""Interface representating proxy utilities.

MDN Web Docs:
-
https://developer.mozilla.org/en-US/docs/Web/HTTP/Proxy_servers_and_tunneling

Wikipedia:
-
https://en.wikipedia.org/wiki/Proxy_server
"""

def __init__(
self: Proxies,
url: str = "https://free-proxy-list.net/",
filename: str = "proxies",
folder: str = "datas",
directory: Path = Path(__file__).parent,
) -> None:
self.url = url
self.filename = filename
self.folder = folder
self.directory = directory
self.path = self.directory / self.folder
self.path_html = self.path / f"{self.filename}.html"
self.path_csv = self.path / f"{self.filename}.csv"
"""Interface representating proxy utilities.

Args:
filename (str, optional):
The filename to save the list of user agents.
Defaults to "proxies".
folder (str, optional):
The folder to save the list of user agents.
Defaults to "datas".
directory (Path, optional):
The directory to save the list of user agents.
Defaults to Path(__file__).parent.
"""
path = directory / folder
self.path_html = path / f"{filename}.html"
self.path_csv = path / f"{filename}.csv"
self.url = "https://free-proxy-list.net/"
self.headers = Headers()
self.operators = Operators()
self.session = requests.Session()
self.keys = list(asdict(self.headers).keys())

def fetch_proxies(
def fetch(
self: Proxies,
) -> None:
"""Fetch and save the proxy HTML table."""
@@ -124,17 +141,24 @@ def fetch_proxies(
self.path_html.parent.mkdir(parents=True, exist_ok=True)
self.path_html.write_text(data=str(soup.find("table")), encoding="utf-8")

def format_proxies(
def convert(
self: Proxies,
) -> None:
"""Convert the proxy HTML table to CSV format."""
table = BeautifulSoup(markup=self.path_html.read_text(), features="html.parser")
rows: ResultSet[Tag] = table.find("tbody").find_all("tr")
rows_cells: list[ResultSet[Tag]] = [row.find_all("td") for row in rows]
datas: list[tuple[str | None, ...]] = [tuple(cell.string for cell in cells) for cells in rows_cells]
self.save_proxies(datas=datas)
self.save(datas=datas)

def query_proxies(
def refresh(
self: Proxies,
) -> None:
"""Refresh the list of proxies."""
self.fetch()
self.convert()

def query(
self: Proxies,
queries: list[Query],
) -> DataFrame:
@@ -143,7 +167,7 @@ def query_proxies(
Usage:
-----

dataframe = self.query_proxies(
dataframe = self.query(
queries=[
Query(data="US", key=keys.code, operator=operators.eq),

@@ -161,14 +185,14 @@
DataFrame:
The filtered dataframe.
"""
dataframe = self.load_proxies()
dataframe = self.load()
conditions = [
operator(data, dataframe.get(key)) for data, key, operator in queries
]
reducer = functools.reduce(np.logical_and, conditions)
return dataframe[reducer]

def extract_proxies(
def extract(
self: Proxies,
limit: int | None = None,
dataframe: DataFrame | None = None,
@@ -188,13 +212,11 @@
The list of proxies.
"""
datas = (
self.load_proxies(limit=limit)
if dataframe is None
else dataframe.head(n=limit)
self.load(limit=limit) if dataframe is None else dataframe.head(n=limit)
)[[self.headers.host, self.headers.port]]
return [f"{host}:{port}" for _, (host, port) in datas.iterrows()]

def load_proxies(
def load(
self: Proxies,
limit: int | None = None,
) -> DataFrame:
@@ -212,7 +234,7 @@
"""
return pd.read_csv(filepath_or_buffer=self.path_csv, nrows=limit)

def save_proxies(
def save(
self: Proxies,
datas: list[str],
path: Path | None = None,
@@ -228,74 +250,3 @@
"""
dataframe = DataFrame(data=datas, columns=self.keys)
dataframe.to_csv(path_or_buf=path or self.path_csv, index=False)

def session_proxy(
self: Proxies,
proxy: str,
) -> None:
"""Assign a proxy to a requests session.

Args:
----
proxy (str):
The session proxy.
"""
self.session.proxies = {"http": proxy, "https": proxy}

def session_request(
self: Proxies,
url: str,
timeout: float = 5,
) -> requests.Response:
"""Request a URL using a session proxy.

Args:
----
url (str):
The URL to request.
timeout (float, optional):
The time (seconds) to wait before giving up. Defaults to 5.

Returns
-------
requests.Response:
The HTTP request response.
"""
return self.session.get(url=url, timeout=timeout)

def session_requests(
self: Proxies,
url: str,
proxies: list[str],
) -> requests.Response | None:
"""Request a URL with a session using different proxies.

Args:
----
url (str):
The URL to request.
proxies (list[str]):
The list of proxies.

Returns
-------
requests.Response | None:
The HTTP request response.
"""
for proxy in proxies:
self.session_proxy(proxy=proxy)

try:
response = self.session_request(url=url)

if response.ok:
info(f"PROXY SUCCESS: {proxy}")
return response

info(f"PROXY FAILED: {proxy} - RESPONSE: {response.status_code}")
continue
except exceptions.RequestException:
info(f"PROXY FAILED: {proxy}")
continue

return None
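
A similar sketch for the reworked Proxies interface. Headers, Operators, and Query are defined in parts of proxies.py that this diff leaves collapsed, so the field names used below (code, host, port) are inferred from the query() docstring and from extract():

# Hypothetical usage of the renamed Proxies methods; not part of the diff.
from html_tracing.utilities.proxies import Proxies, Query

proxies = Proxies()
proxies.refresh()              # fetch() the HTML table, then convert() it to CSV

keys = proxies.headers         # column names, e.g. keys.code, keys.host, keys.port
operators = proxies.operators  # comparison helpers, e.g. operators.eq

us_only = proxies.query(
    queries=[Query(data="US", key=keys.code, operator=operators.eq)],
)
print(proxies.extract(limit=5, dataframe=us_only))  # ["host:port", ...]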