136 changes: 136 additions & 0 deletions src/html_tracing/utilities/agents.py
@@ -0,0 +1,136 @@
"""Module User Agents."""

from __future__ import annotations

import json
import random
from logging import INFO, basicConfig, info
from pathlib import Path

import requests
from bs4 import BeautifulSoup, ResultSet, Tag

basicConfig(level=INFO)


class UserAgents:
"""Interface representing user agents utilities.

MDN Web Docs:
-
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent

Wikipedia:
-
https://en.wikipedia.org/wiki/User_agent
"""

def __init__(
self: UserAgents,
filename: str = "user-agents",
folder: str = "datas",
directory: Path = Path(__file__).parent,
) -> None:
"""Interface representing user agents utilities.

Args:
filename (str, optional):
The filename to save the list of user agents.
Defaults to "user-agents".
folder (str, optional):
The folder to save the list of user agents.
Defaults to "datas".
directory (Path, optional):
The directory to save the list of user agents.
Defaults to Path(__file__).parent.
"""
path = directory / folder
self.path_html = path / f"{filename}.html"
self.path_json = path / f"{filename}.json"
self.endpoints = ["windows", "macos", "ios", "chrome-os", "android"]
self.url = "https://www.whatismybrowser.com/guides/the-latest-user-agent/"

def fetch(
self: UserAgents,
) -> None:
"""Fetch and save the user agents HTML tables."""
data: str = ""

for endpoint in self.endpoints:
response = requests.get(url=f"{self.url}{endpoint}", timeout=10)
info("GET %s%s -> status=%s ok=%s", self.url, endpoint, response.status_code, response.ok)
soup = BeautifulSoup(markup=response.content, features="html.parser")
data += str(soup.find("table"))

self.path_html.parent.mkdir(parents=True, exist_ok=True)
self.path_html.write_text(data=data, encoding="utf-8")

def convert(
self: UserAgents,
) -> None:
"""Convert the user agents HTML tables to JSON format."""
soup = BeautifulSoup(markup=self.path_html.read_text(), features="html.parser")
tables: ResultSet[Tag] = soup.find_all("table")
bodies: list[Tag] = [table.find("tbody") for table in tables]
rows: list[Tag] = [row for body in bodies for row in body.find_all("tr")]
lists = [row.select("td:last-child ul li span") for row in rows]
data = [span.string for spans in lists for span in spans]
self.save(data=data)

def refresh(
self: UserAgents,
) -> None:
"""Refresh the list of user agents."""
self.fetch()
self.convert()

def save(
self: UserAgents,
data: list[str],
path: Path | None = None,
) -> None:
"""Save the user agents list.

Args:
----
data (list[str]):
The user agents list.
path (Path | None, optional):
The save path. Defaults to None.
"""
(path or self.path_json).write_text(json.dumps(obj=data))

def load(
self: UserAgents,
) -> list[str]:
"""Load the user agents list.

Returns
-------
list[str]:
The list of user agents.
"""
return json.loads(s=self.path_json.read_text())

def extract(
self: UserAgents,
limit: int | None = None,
) -> list[str]:
"""Extract a list of random user agents.

Args:
----
limit (int | None, optional):
The maximum number of user agents. Defaults to None.

Returns
-------
list[str]:
The list of user agents.
"""
agents = self.load()

return random.sample(
population=agents,
k=len(agents) if limit is None else limit,
)
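
A minimal usage sketch of the new UserAgents class, assuming the src/ layout makes it importable as html_tracing.utilities.agents and that the whatismybrowser.com pages are reachable:

# Hypothetical usage of the new UserAgents helper; not part of the diff.
from html_tracing.utilities.agents import UserAgents

agents = UserAgents()  # defaults write to datas/user-agents.html and datas/user-agents.json
agents.refresh()       # fetch() the per-OS HTML tables, then convert() them to JSON
print(agents.extract(limit=3))  # three random user-agent strings from the saved list
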
139 changes: 45 additions & 94 deletions src/html_tracing/utilities/proxies.py
@@ -4,7 +4,7 @@

import functools
from dataclasses import asdict, dataclass
from logging import INFO, basicConfig, info
from logging import INFO, basicConfig
from pathlib import Path
from typing import Any, Callable, NamedTuple

@@ -13,7 +13,6 @@
import requests
from bs4 import BeautifulSoup, ResultSet, Tag
from pandas import DataFrame
from requests import exceptions

basicConfig(level=INFO)

@@ -94,28 +93,46 @@ class Query(NamedTuple):


class Proxies:
"""Interface representating proxy utilities."""
"""Interface representating proxy utilities.

MDN Web Docs:
-
https://developer.mozilla.org/en-US/docs/Web/HTTP/Proxy_servers_and_tunneling

Wikipedia:
-
https://en.wikipedia.org/wiki/Proxy_server
"""

def __init__(
self: Proxies,
url: str = "https://free-proxy-list.net/",
filename: str = "proxies",
folder: str = "datas",
directory: Path = Path(__file__).parent,
) -> None:
self.url = url
self.filename = filename
self.folder = folder
self.directory = directory
self.path = self.directory / self.folder
self.path_html = self.path / f"{self.filename}.html"
self.path_csv = self.path / f"{self.filename}.csv"
"""Interface representating proxy utilities.

Args:
filename (str, optional):
The filename to save the list of user agents.
Defaults to "proxies".
folder (str, optional):
The folder to save the list of user agents.
Defaults to "datas".
directory (Path, optional):
The directory to save the list of user agents.
Defaults to Path(__file__).parent.
"""
path = directory / folder
self.path_html = path / f"{filename}.html"
self.path_csv = path / f"{filename}.csv"
self.url = "https://free-proxy-list.net/"
self.headers = Headers()
self.operators = Operators()
self.session = requests.Session()
self.keys = list(asdict(self.headers).keys())

def fetch_proxies(
def fetch(
self: Proxies,
) -> None:
"""Fetch and save the proxy HTML table."""
@@ -124,17 +141,24 @@ def fetch_proxies(
self.path_html.parent.mkdir(parents=True, exist_ok=True)
self.path_html.write_text(data=str(soup.find("table")), encoding="utf-8")

def format_proxies(
def convert(
self: Proxies,
) -> None:
"""Convert the proxy HTML table to CSV format."""
table = BeautifulSoup(markup=self.path_html.read_text(), features="html.parser")
rows: ResultSet[Tag] = table.find("tbody").find_all("tr")
rows_cells: list[ResultSet[Tag]] = [row.find_all("td") for row in rows]
datas: list[tuple[str | None, ...]] = [tuple(cell.string for cell in cells) for cells in rows_cells]
self.save_proxies(datas=datas)
self.save(datas=datas)

def query_proxies(
def refresh(
self: Proxies,
) -> None:
"""Refresh the list of proxies."""
self.fetch()
self.convert()

def query(
self: Proxies,
queries: list[Query],
) -> DataFrame:
@@ -143,7 +167,7 @@ def query_proxies(
Usage:
-----

dataframe = self.query_proxies(
dataframe = self.query(
queries=[
Query(data="US", key=keys.code, operator=operators.eq),

@@ -161,14 +185,14 @@
DataFrame:
The filtered dataframe.
"""
dataframe = self.load_proxies()
dataframe = self.load()
conditions = [
operator(data, dataframe.get(key)) for data, key, operator in queries
]
reducer = functools.reduce(np.logical_and, conditions)
return dataframe[reducer]

def extract_proxies(
def extract(
self: Proxies,
limit: int | None = None,
dataframe: DataFrame | None = None,
@@ -188,13 +212,11 @@
The list of proxies.
"""
datas = (
self.load_proxies(limit=limit)
if dataframe is None
else dataframe.head(n=limit)
self.load(limit=limit) if dataframe is None else dataframe.head(n=limit)
)[[self.headers.host, self.headers.port]]
return [f"{host}:{port}" for _, (host, port) in datas.iterrows()]

def load_proxies(
def load(
self: Proxies,
limit: int | None = None,
) -> DataFrame:
@@ -212,7 +234,7 @@
"""
return pd.read_csv(filepath_or_buffer=self.path_csv, nrows=limit)

def save_proxies(
def save(
self: Proxies,
datas: list[str],
path: Path | None = None,
@@ -228,74 +250,3 @@
"""
dataframe = DataFrame(data=datas, columns=self.keys)
dataframe.to_csv(path_or_buf=path or self.path_csv, index=False)

def session_proxy(
self: Proxies,
proxy: str,
) -> None:
"""Assign a proxy to a requests session.

Args:
----
proxy (str):
The session proxy.
"""
self.session.proxies = {"http": proxy, "https": proxy}

def session_request(
self: Proxies,
url: str,
timeout: float = 5,
) -> requests.Response:
"""Request a URL using a session proxy.

Args:
----
url (str):
The URL to request.
timeout (float, optional):
The time (seconds) to wait before giving up. Defaults to 5.

Returns
-------
requests.Response:
The HTTP request response.
"""
return self.session.get(url=url, timeout=timeout)

def session_requests(
self: Proxies,
url: str,
proxies: list[str],
) -> requests.Response | None:
"""Request a URL with a session using different proxies.

Args:
----
url (str):
The URL to request.
proxies (list[str]):
The list of proxies.

Returns
-------
requests.Response | None:
The HTTP request response.
"""
for proxy in proxies:
self.session_proxy(proxy=proxy)

try:
response = self.session_request(url=url)

if response.ok:
info(f"PROXY SUCCESS: {proxy}")
return response

info(f"PROXY FAILED: {proxy} - RESPONSE: {response.status_code}")
continue
except exceptions.RequestException:
info(f"PROXY FAILED: {proxy}")
continue

return None
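
A similar sketch for the reworked Proxies interface. Headers, Operators, and Query are defined in parts of proxies.py that this diff leaves collapsed, so the field names used below (code, host, port) are inferred from the query() docstring and from extract():

# Hypothetical usage of the renamed Proxies methods; not part of the diff.
from html_tracing.utilities.proxies import Proxies, Query

proxies = Proxies()
proxies.refresh()              # fetch() the HTML table, then convert() it to CSV

keys = proxies.headers         # column names, e.g. keys.code, keys.host, keys.port
operators = proxies.operators  # comparison helpers, e.g. operators.eq

us_only = proxies.query(
    queries=[Query(data="US", key=keys.code, operator=operators.eq)],
)
print(proxies.extract(limit=5, dataframe=us_only))  # ["host:port", ...]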