From 38f8edc78d28842311aa63bd38c033469f8fbbbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sat, 28 Oct 2023 19:48:54 -0400 Subject: [PATCH 01/16] build(agents): remove logger --- src/html_tracing/utilities/agents.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/html_tracing/utilities/agents.py b/src/html_tracing/utilities/agents.py index 27ffbed..9224fda 100644 --- a/src/html_tracing/utilities/agents.py +++ b/src/html_tracing/utilities/agents.py @@ -4,7 +4,7 @@ import json import random -from logging import INFO, basicConfig, info +from logging import INFO, basicConfig from pathlib import Path import requests @@ -58,7 +58,6 @@ def fetch( for endpoint in self.endpoints: response = requests.get(url=f"{self.url}{endpoint}", timeout=10) - info(response.ok, response.status_code) soup = BeautifulSoup(markup=response.content, features="html.parser") data += str(soup.find("table")) From 303d6af952d1d202f8b7639594d9e02c58938d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sat, 28 Oct 2023 19:49:16 -0400 Subject: [PATCH 02/16] build(session): update session with internal function --- src/html_tracing/utilities/session.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/html_tracing/utilities/session.py b/src/html_tracing/utilities/session.py index 7e25b5b..1cc102c 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/html_tracing/utilities/session.py @@ -43,17 +43,14 @@ def proxy( def request( self: Session, url: str, - agent: str | None = None, timeout: float = 10, ) -> requests.Response: - """Request a URL using a session proxy. + """Request a URL using a session. Args: ---- url (str): The URL to request. - agent (str): - The user agent for the headers. timeout (float, optional): The time (seconds) to wait before giving up. Defaults to 5. @@ -62,11 +59,7 @@ def request( requests.Response: The HTTP request reponse. """ - return self.session.get( - url=url, - timeout=timeout, - headers=None if agent is None else {"User-Agent": agent}, - ) + return self.session.get(url=url, timeout=timeout) def requests( self: Session, @@ -94,6 +87,7 @@ def requests( for proxy in proxies: self.proxy(proxy=proxy) agent = random.choice(seq=agents) # noqa: S311 + self.session.headers.update({"User-Agent": agent}) info(f"Session with proxy {proxy} and agent {agent}.\n") From 4186acedfc49c971b1ac11bea19edcc0bb81b176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 11:14:30 -0400 Subject: [PATCH 03/16] build(clone): implement functions to clone a website --- src/html_tracing/utilities/clone.py | 208 ++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 src/html_tracing/utilities/clone.py diff --git a/src/html_tracing/utilities/clone.py b/src/html_tracing/utilities/clone.py new file mode 100644 index 0000000..59a83b2 --- /dev/null +++ b/src/html_tracing/utilities/clone.py @@ -0,0 +1,208 @@ +"""Module Clone.""" + +from __future__ import annotations + +import re +from pathlib import Path +from typing import TYPE_CHECKING + +from bs4 import BeautifulSoup, ResultSet, Tag + +if TYPE_CHECKING: + from utilities.session import Session + + +class Clone: + """Interface representing clone utilities.""" + + def __init__( + self: Clone, + domain: str, + markup: str | bytes, + folder: str = "temp", + directory: Path = Path(__file__).parent, + ) -> None: + """Interface representing clone utilities. + + Args: + ---- + domain (str): + The website domain to clone. + markup (str | bytes): + The website markup to clone. + folder (str, optional): + The clone folder. Defaults to "temp". + directory (Path, optional): + The clone directory. Defaults to Path(__file__).parent. + """ + self.assets: list[str] = [] + self.domain = domain + self.soup = BeautifulSoup(markup=markup, features="html5lib") + self.path = directory / folder / domain[domain.index("//") + 2 : -1] + self.path_assets = Path("assets") + self.setup() + + def setup( + self: Clone, + ) -> None: + """Create the directory and folders for the cloned website.""" + self.path.mkdir( + exist_ok=True, + parents=True, + ) + (self.path / self.path_assets).mkdir(exist_ok=True) + + def save_html( + self: Clone, + ) -> int: + """Save the HTML clone.""" + return (self.path / "index.html").write_text( + data=self.soup.prettify(), + encoding="utf-8", + ) + + def save_asset( + self: Clone, + data: bytes, + filename: str, + ) -> None: + """Save an asset file. + + Args: + ---- + data (bytes): + The asset data. + filename (str): + The asset filename. + """ + (self.path / self.path_assets / filename).write_bytes(data=data) + + def sync_images( + self: Clone, + session: Session, + ) -> None: + """Sync the images from the cloned website. + + Args: + ---- + session (Session): + The requests session. + """ + images = self.soup.find_all(name="img") + + for image in images: + source = image["src"] + filename = Path(source).name + image["src"] = self.path_assets / filename + self.assets.append(filename) + response = session.request(url=self.domain + source) + + if response is not None: + self.save_asset( + data=response.content, + filename=filename, + ) + + def sync_links( + self: Clone, + session: Session, + ) -> None: + """Sync the links from the cloned website. + + Args: + ---- + session (Session): + The requests session. + """ + stylesheets = self.soup.find_all(name="link") + + for stylesheet in stylesheets: + source: str = stylesheet["href"] + + if source.startswith("https"): + continue + + path = Path(source) + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] + self.assets.append(filename) + stylesheet["href"] = self.path_assets / filename + response = session.request(url=self.domain + source) + + if response is not None: + self.save_asset( + data=response.content, + filename=filename, + ) + + def sync_scripts( + self: Clone, + session: Session, + *, + nosync: bool = True, + ) -> None: + """Sync the scripts from the cloned website. + + Args: + ---- + session (Session): + The requests session. + nosync (bool, optional): + If true, remove the scripts. Defaults to True. + """ + noscripts: ResultSet[Tag] = self.soup.find_all(name="noscript") + for noscript in noscripts: + noscript.extract() + + scripts: ResultSet[Tag] = self.soup.find_all(name="script") + for script in scripts: + if nosync or "src" not in script.attrs or script["src"].startswith("https"): + script.extract() + continue + + source: str = script["src"] + path = Path(source) + filename = path.stem + path.suffix[0 : path.suffix.find("?")] + self.assets.append(filename) + script["src"] = self.path_assets / filename + response = session.request(url=self.domain + source) + + if response is not None: + self.save_asset( + data=response.content, + filename=filename, + ) + + def sync_fonts( + self: Clone, + session: Session, + ) -> None: + """Sync the fonts from the cloned website. + + Args: + session (Session): + The requests session. + """ + stylesheets = list( + filter(lambda filename: filename.endswith("css"), self.assets), + ) + + for stylesheet in stylesheets: + path_stylesheet = self.path / self.path_assets / stylesheet + content = path_stylesheet.read_text() + + sources: list[str] = re.findall(r"src: ?url\(([^)]+)\)", string=content) + + for source in sources: + url = source.replace('"', "") + response = session.request(self.domain + url) + + if response is not None: + path = Path(url) + filename = path.stem + path.suffix[0 : path.suffix.find("?")] + self.assets.append(filename) + path_stylesheet.write_text(data=content.replace(url, filename)) + self.save_asset( + data=response.content, + filename=filename, + ) From c10c71cf4aa6ce84f2d3023cf8ea1d3d55fa52d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 13:14:00 -0400 Subject: [PATCH 04/16] build(proxies): enable reuse of dataframe in query --- src/html_tracing/utilities/proxies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/html_tracing/utilities/proxies.py b/src/html_tracing/utilities/proxies.py index dab4a15..d7cb82a 100644 --- a/src/html_tracing/utilities/proxies.py +++ b/src/html_tracing/utilities/proxies.py @@ -161,6 +161,7 @@ def refresh( def query( self: Proxies, queries: list[Query], + dataframe: DataFrame | None = None, ) -> DataFrame: """Find specific proxies using query conditions. @@ -185,7 +186,7 @@ def query( DataFrame: The filtered dataframe. """ - dataframe = self.load() + dataframe = self.load() if dataframe is None else dataframe conditions = [ operator(data, dataframe.get(key)) for data, key, operator in queries ] From dfaf938bfd9f80a386d026e893e6132c56e22b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 15:48:57 -0400 Subject: [PATCH 05/16] build(logger): implement logging utilities --- src/html_tracing/utilities/logger.py | 43 ++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/html_tracing/utilities/logger.py diff --git a/src/html_tracing/utilities/logger.py b/src/html_tracing/utilities/logger.py new file mode 100644 index 0000000..d8c5cc4 --- /dev/null +++ b/src/html_tracing/utilities/logger.py @@ -0,0 +1,43 @@ +"""Module Logger.""" + + +from __future__ import annotations + +import logging +import sys + + +class Logger: + """Interface representing logger utilities.""" + + def __init__(self: Logger, *, debug: bool = False) -> None: + logging.basicConfig(level=logging.DEBUG if debug else logging.INFO) + + def debug_(self: Logger, msg: str) -> None: + """Log a message with severity 'DEBUG'.""" + logging.debug(msg=msg) + + def warn_(self: Logger, msg: str) -> None: + """Log a message with severity 'WARN'.""" + logging.warning(msg=msg) + + def info_(self: Logger, msg: str) -> None: + """Log a message with severity 'INFO'.""" + logging.info(msg=msg) + + def error_(self: Logger, msg: str) -> None: + """Log a message with severity 'ERROR'.""" + logging.error(msg=msg) + + def critical_(self: Logger, msg: str) -> None: + """Log a message with severity 'CRITICAL'.""" + logging.critical(msg=msg) + + def trace_(self: Logger, msg: str | None = None) -> None: + """Log a message with severity 'DEBUG' tracing the called function.""" + function = sys._getframe(1).f_code.co_name # noqa: SLF001 + + self.debug_(msg=f"CALL function {function}. {msg}") + + +logger = Logger() From dc8f73e9c9c127be2bb55666dc4df70f25113d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 16:35:26 -0400 Subject: [PATCH 06/16] build(logger): update logger logic with tracing --- src/html_tracing/utilities/logger.py | 33 +++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/html_tracing/utilities/logger.py b/src/html_tracing/utilities/logger.py index d8c5cc4..387541e 100644 --- a/src/html_tracing/utilities/logger.py +++ b/src/html_tracing/utilities/logger.py @@ -10,8 +10,18 @@ class Logger: """Interface representing logger utilities.""" - def __init__(self: Logger, *, debug: bool = False) -> None: - logging.basicConfig(level=logging.DEBUG if debug else logging.INFO) + def __init__( + self: Logger, + *, + debugging: bool = False, + tracing: bool = False, + ) -> None: + self.debugging = debugging + self.tracing = tracing + + logging.basicConfig( + level=logging.DEBUG if debugging else logging.INFO, + ) def debug_(self: Logger, msg: str) -> None: """Log a message with severity 'DEBUG'.""" @@ -35,9 +45,22 @@ def critical_(self: Logger, msg: str) -> None: def trace_(self: Logger, msg: str | None = None) -> None: """Log a message with severity 'DEBUG' tracing the called function.""" - function = sys._getframe(1).f_code.co_name # noqa: SLF001 + if not self.tracing: + return + + frame = sys._getframe(1) # noqa: SLF001 + + message = "TRACE" + + if "self" in frame.f_locals: + message += f" - CLASS {frame.f_locals['self'].__class__.__name__}" + + message += f" - FUNCTION {frame.f_code.co_name}" + + if msg is not None: + message += f" - {msg}" - self.debug_(msg=f"CALL function {function}. {msg}") + self.info_(msg=f"{message}\n") -logger = Logger() +logger = Logger(tracing=True) From 3b5672b26a4d615eeb24c93996fd03aa1496a67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 16:47:25 -0400 Subject: [PATCH 07/16] build(logger): update logging with new line config --- src/html_tracing/utilities/logger.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/html_tracing/utilities/logger.py b/src/html_tracing/utilities/logger.py index 387541e..c328f40 100644 --- a/src/html_tracing/utilities/logger.py +++ b/src/html_tracing/utilities/logger.py @@ -15,11 +15,13 @@ def __init__( *, debugging: bool = False, tracing: bool = False, + newline: bool = True, ) -> None: self.debugging = debugging self.tracing = tracing logging.basicConfig( + format="%(msg)s" + "\n" if newline else "", level=logging.DEBUG if debugging else logging.INFO, ) @@ -60,7 +62,7 @@ def trace_(self: Logger, msg: str | None = None) -> None: if msg is not None: message += f" - {msg}" - self.info_(msg=f"{message}\n") + self.info_(msg=f"{message}") logger = Logger(tracing=True) From eb8094c4d376f342aecc2a8a6755cf21d6728073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 17:16:30 -0400 Subject: [PATCH 08/16] build(logger): update logger with internal logging --- src/html_tracing/utilities/logger.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/html_tracing/utilities/logger.py b/src/html_tracing/utilities/logger.py index c328f40..16f70f7 100644 --- a/src/html_tracing/utilities/logger.py +++ b/src/html_tracing/utilities/logger.py @@ -17,33 +17,37 @@ def __init__( tracing: bool = False, newline: bool = True, ) -> None: + formatter = "%(name)s:%(levelname)s => %(msg)s" + "\n" if newline else "" + handler = logging.StreamHandler() + handler.setFormatter(fmt=logging.Formatter(fmt=formatter)) + logger = logging.getLogger(name="LOG") + logger.addHandler(hdlr=handler) + logger.setLevel(level=logging.DEBUG if debugging else logging.INFO) + + self.logger = logger self.debugging = debugging self.tracing = tracing - - logging.basicConfig( - format="%(msg)s" + "\n" if newline else "", - level=logging.DEBUG if debugging else logging.INFO, - ) + self.newline = newline def debug_(self: Logger, msg: str) -> None: """Log a message with severity 'DEBUG'.""" - logging.debug(msg=msg) + self.logger.debug(msg=msg) def warn_(self: Logger, msg: str) -> None: """Log a message with severity 'WARN'.""" - logging.warning(msg=msg) + self.logger.warning(msg=msg) def info_(self: Logger, msg: str) -> None: """Log a message with severity 'INFO'.""" - logging.info(msg=msg) + self.logger.info(msg=msg) def error_(self: Logger, msg: str) -> None: """Log a message with severity 'ERROR'.""" - logging.error(msg=msg) + self.logger.error(msg=msg) def critical_(self: Logger, msg: str) -> None: """Log a message with severity 'CRITICAL'.""" - logging.critical(msg=msg) + self.logger.critical(msg=msg) def trace_(self: Logger, msg: str | None = None) -> None: """Log a message with severity 'DEBUG' tracing the called function.""" From 39b7e66661ba07866700801054a79757b864daf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 17:17:26 -0400 Subject: [PATCH 09/16] build(logger): use logger in utilities methods to visualise progression --- src/html_tracing/utilities/agents.py | 5 ++-- src/html_tracing/utilities/clone.py | 40 ++++++++++++++++++--------- src/html_tracing/utilities/proxies.py | 5 ++-- src/html_tracing/utilities/session.py | 12 ++++---- 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/html_tracing/utilities/agents.py b/src/html_tracing/utilities/agents.py index 9224fda..82e4e54 100644 --- a/src/html_tracing/utilities/agents.py +++ b/src/html_tracing/utilities/agents.py @@ -4,13 +4,11 @@ import json import random -from logging import INFO, basicConfig from pathlib import Path import requests from bs4 import BeautifulSoup, ResultSet, Tag - -basicConfig(level=INFO) +from utilities.logger import logger class UserAgents: @@ -80,6 +78,7 @@ def refresh( self: UserAgents, ) -> None: """Refresh the list of user agents.""" + logger.trace_() self.fetch() self.convert() diff --git a/src/html_tracing/utilities/clone.py b/src/html_tracing/utilities/clone.py index 59a83b2..175b5d4 100644 --- a/src/html_tracing/utilities/clone.py +++ b/src/html_tracing/utilities/clone.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING from bs4 import BeautifulSoup, ResultSet, Tag +from utilities.logger import logger if TYPE_CHECKING: from utilities.session import Session @@ -46,16 +47,17 @@ def setup( self: Clone, ) -> None: """Create the directory and folders for the cloned website.""" - self.path.mkdir( - exist_ok=True, - parents=True, - ) + logger.trace_() + + self.path.mkdir(exist_ok=True, parents=True) (self.path / self.path_assets).mkdir(exist_ok=True) def save_html( self: Clone, ) -> int: """Save the HTML clone.""" + logger.trace_() + return (self.path / "index.html").write_text( data=self.soup.prettify(), encoding="utf-8", @@ -75,6 +77,8 @@ def save_asset( filename (str): The asset filename. """ + logger.trace_(msg=filename) + (self.path / self.path_assets / filename).write_bytes(data=data) def sync_images( @@ -88,11 +92,18 @@ def sync_images( session (Session): The requests session. """ - images = self.soup.find_all(name="img") + images: ResultSet[Tag] = self.soup.find_all(name="img") for image in images: - source = image["src"] - filename = Path(source).name + if ("src" not in image.attrs and "data-cfsrc" not in image.attrs) or image[ + "src" + ].startswith("https"): + continue + + source = image["src"] if "src" in image.attrs else image["data-cfsrc"] + path = Path(source) + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] image["src"] = self.path_assets / filename self.assets.append(filename) response = session.request(url=self.domain + source) @@ -114,14 +125,13 @@ def sync_links( session (Session): The requests session. """ - stylesheets = self.soup.find_all(name="link") + stylesheets: ResultSet[Tag] = self.soup.find_all(name="link") for stylesheet in stylesheets: - source: str = stylesheet["href"] - - if source.startswith("https"): + if "href" not in stylesheet.attrs or stylesheet["href"].startswith("https"): continue + source: str = stylesheet["href"] path = Path(source) index = path.suffix.find("?") filename = path.name if index < 0 else path.stem + path.suffix[0:index] @@ -162,7 +172,8 @@ def sync_scripts( source: str = script["src"] path = Path(source) - filename = path.stem + path.suffix[0 : path.suffix.find("?")] + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] self.assets.append(filename) script["src"] = self.path_assets / filename response = session.request(url=self.domain + source) @@ -199,7 +210,10 @@ def sync_fonts( if response is not None: path = Path(url) - filename = path.stem + path.suffix[0 : path.suffix.find("?")] + index = path.suffix.find("?") + filename = ( + path.name if index < 0 else path.stem + path.suffix[0:index] + ) self.assets.append(filename) path_stylesheet.write_text(data=content.replace(url, filename)) self.save_asset( diff --git a/src/html_tracing/utilities/proxies.py b/src/html_tracing/utilities/proxies.py index d7cb82a..3acdfd3 100644 --- a/src/html_tracing/utilities/proxies.py +++ b/src/html_tracing/utilities/proxies.py @@ -4,7 +4,6 @@ import functools from dataclasses import asdict, dataclass -from logging import INFO, basicConfig from pathlib import Path from typing import Any, Callable, NamedTuple @@ -13,8 +12,7 @@ import requests from bs4 import BeautifulSoup, ResultSet, Tag from pandas import DataFrame - -basicConfig(level=INFO) +from utilities.logger import logger @dataclass @@ -155,6 +153,7 @@ def refresh( self: Proxies, ) -> None: """Refresh the list of proxies.""" + logger.trace_() self.fetch() self.convert() diff --git a/src/html_tracing/utilities/session.py b/src/html_tracing/utilities/session.py index 1cc102c..25a3a7f 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/html_tracing/utilities/session.py @@ -3,12 +3,10 @@ from __future__ import annotations import random -from logging import INFO, basicConfig, info, warning import requests from requests import exceptions - -basicConfig(level=INFO) +from utilities.logger import logger class Session: @@ -89,19 +87,19 @@ def requests( agent = random.choice(seq=agents) # noqa: S311 self.session.headers.update({"User-Agent": agent}) - info(f"Session with proxy {proxy} and agent {agent}.\n") + logger.info_(f"Session with proxy {proxy} and agent {agent}.") try: response = self.request(url=url, timeout=timeout) if response.ok: - info("Session SUCCESS\n") + logger.info_("Session SUCCESS") return response - warning(f"Session FAILED with status code {response.status_code}.\n") + logger.warn_(f"Session FAILED with code {response.status_code}.") continue except exceptions.RequestException as error: - warning(f"Session FAILED with error {error}.\n") + logger.error_(f"Session FAILED with error {error}.") continue return None From 9fae2705dadfa5f474fdc82fcc083f146846a399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 18:23:23 -0400 Subject: [PATCH 10/16] build(clone): skip fetch if file exists --- src/html_tracing/utilities/clone.py | 72 ++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/src/html_tracing/utilities/clone.py b/src/html_tracing/utilities/clone.py index 175b5d4..7e9b856 100644 --- a/src/html_tracing/utilities/clone.py +++ b/src/html_tracing/utilities/clone.py @@ -95,17 +95,26 @@ def sync_images( images: ResultSet[Tag] = self.soup.find_all(name="img") for image in images: - if ("src" not in image.attrs and "data-cfsrc" not in image.attrs) or image[ - "src" - ].startswith("https"): + with_src = "src" in image.attrs + with_data = "data-cfsrc" in image.attrs + + if ( + not (with_src or with_data) + or with_src + and image["src"].startswith("https") + ): continue - source = image["src"] if "src" in image.attrs else image["data-cfsrc"] + source = image["src"] if with_src else image["data-cfsrc"] path = Path(source) index = path.suffix.find("?") filename = path.name if index < 0 else path.stem + path.suffix[0:index] image["src"] = self.path_assets / filename - self.assets.append(filename) + + if (self.path / self.path_assets / filename).exists(): + self.assets.append(filename) + continue + response = session.request(url=self.domain + source) if response is not None: @@ -113,6 +122,7 @@ def sync_images( data=response.content, filename=filename, ) + self.assets.append(filename) def sync_links( self: Clone, @@ -125,18 +135,27 @@ def sync_links( session (Session): The requests session. """ - stylesheets: ResultSet[Tag] = self.soup.find_all(name="link") + links: ResultSet[Tag] = self.soup.find_all(name="link") - for stylesheet in stylesheets: - if "href" not in stylesheet.attrs or stylesheet["href"].startswith("https"): + for link in links: + if "href" not in link.attrs or link["href"].startswith("https"): + continue + + source: str = link["href"] + + if source.startswith("//"): + link["href"] = "https:" + source continue - source: str = stylesheet["href"] path = Path(source) index = path.suffix.find("?") filename = path.name if index < 0 else path.stem + path.suffix[0:index] - self.assets.append(filename) - stylesheet["href"] = self.path_assets / filename + link["href"] = self.path_assets / filename + + if (self.path / self.path_assets / filename).exists(): + self.assets.append(filename) + continue + response = session.request(url=self.domain + source) if response is not None: @@ -144,6 +163,7 @@ def sync_links( data=response.content, filename=filename, ) + self.assets.append(filename) def sync_scripts( self: Clone, @@ -174,8 +194,12 @@ def sync_scripts( path = Path(source) index = path.suffix.find("?") filename = path.name if index < 0 else path.stem + path.suffix[0:index] - self.assets.append(filename) script["src"] = self.path_assets / filename + + if (self.path / self.path_assets / filename).exists(): + self.assets.append(filename) + continue + response = session.request(url=self.domain + source) if response is not None: @@ -183,6 +207,7 @@ def sync_scripts( data=response.content, filename=filename, ) + self.assets.append(filename) def sync_fonts( self: Clone, @@ -202,21 +227,26 @@ def sync_fonts( path_stylesheet = self.path / self.path_assets / stylesheet content = path_stylesheet.read_text() - sources: list[str] = re.findall(r"src: ?url\(([^)]+)\)", string=content) + sources: list[str] = re.findall(r"url\(([^)]+)\)", string=content) for source in sources: url = source.replace('"', "") - response = session.request(self.domain + url) + path = Path(url) + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] - if response is not None: - path = Path(url) - index = path.suffix.find("?") - filename = ( - path.name if index < 0 else path.stem + path.suffix[0:index] - ) + if (self.path / self.path_assets / filename).exists(): self.assets.append(filename) - path_stylesheet.write_text(data=content.replace(url, filename)) + continue + + response = session.request(url=self.domain + url, delay=2) + + if response is not None: self.save_asset( data=response.content, filename=filename, ) + content = content.replace(url, filename) + self.assets.append(filename) + + path_stylesheet.write_text(data=content) From 15e936517f4566aa340b47b57fb9b3a46b6e31d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 29 Oct 2023 18:23:43 -0400 Subject: [PATCH 11/16] build(session): add delay between request --- src/html_tracing/utilities/session.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/html_tracing/utilities/session.py b/src/html_tracing/utilities/session.py index 25a3a7f..0859803 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/html_tracing/utilities/session.py @@ -3,6 +3,7 @@ from __future__ import annotations import random +import time import requests from requests import exceptions @@ -41,6 +42,7 @@ def proxy( def request( self: Session, url: str, + delay: float = 2, timeout: float = 10, ) -> requests.Response: """Request a URL using a session. @@ -57,6 +59,8 @@ def request( requests.Response: The HTTP request reponse. """ + time.sleep(delay) + return self.session.get(url=url, timeout=timeout) def requests( From 8c41bccec6c0f3afeae65484f3d52d00fb5be72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Mon, 30 Oct 2023 06:53:01 -0400 Subject: [PATCH 12/16] docs(session): update docstring with delay time --- src/html_tracing/utilities/clone.py | 6 +++--- src/html_tracing/utilities/session.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/html_tracing/utilities/clone.py b/src/html_tracing/utilities/clone.py index 7e9b856..56897dc 100644 --- a/src/html_tracing/utilities/clone.py +++ b/src/html_tracing/utilities/clone.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING from bs4 import BeautifulSoup, ResultSet, Tag -from utilities.logger import logger +from logger import logger if TYPE_CHECKING: from utilities.session import Session @@ -47,7 +47,7 @@ def setup( self: Clone, ) -> None: """Create the directory and folders for the cloned website.""" - logger.trace_() + logger.trace_(msg=f"DOMAIN {self.domain}") self.path.mkdir(exist_ok=True, parents=True) (self.path / self.path_assets).mkdir(exist_ok=True) @@ -77,7 +77,7 @@ def save_asset( filename (str): The asset filename. """ - logger.trace_(msg=filename) + logger.trace_(msg=f"FILE {filename}") (self.path / self.path_assets / filename).write_bytes(data=data) diff --git a/src/html_tracing/utilities/session.py b/src/html_tracing/utilities/session.py index 0859803..c55e705 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/html_tracing/utilities/session.py @@ -51,6 +51,8 @@ def request( ---- url (str): The URL to request. + timeout (float, optional): + The time (seconds) to wait between requests. Defaults to 2. timeout (float, optional): The time (seconds) to wait before giving up. Defaults to 5. From 524805892686646ae9b8b15664781e1282fb339f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 5 Nov 2023 16:52:38 -0500 Subject: [PATCH 13/16] build(session): update request with delay for access and read --- src/html_tracing/utilities/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/html_tracing/utilities/session.py b/src/html_tracing/utilities/session.py index c55e705..a456265 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/html_tracing/utilities/session.py @@ -63,7 +63,7 @@ def request( """ time.sleep(delay) - return self.session.get(url=url, timeout=timeout) + return self.session.get(url=url, timeout=(timeout, timeout)) def requests( self: Session, From 5ae6f638454eaff75c4ef841842cc7f3a2911e2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 5 Nov 2023 17:37:16 -0500 Subject: [PATCH 14/16] refactor(clone): update clone logic --- src/html_tracing/utilities/clone.py | 268 ++++++++++++++++++---------- 1 file changed, 174 insertions(+), 94 deletions(-) diff --git a/src/html_tracing/utilities/clone.py b/src/html_tracing/utilities/clone.py index 56897dc..db1fc92 100644 --- a/src/html_tracing/utilities/clone.py +++ b/src/html_tracing/utilities/clone.py @@ -4,24 +4,59 @@ import re from pathlib import Path +from types import SimpleNamespace from typing import TYPE_CHECKING from bs4 import BeautifulSoup, ResultSet, Tag -from logger import logger + +if __name__ == "__main__": + from logger import logger +else: + from utilities.logger import logger if TYPE_CHECKING: from utilities.session import Session +class ClonePaths(SimpleNamespace): + """Interface representing clone directory paths.""" + + def __init__( + self: ClonePaths, + directory: str | Path, + folder: str, + filename: str, + ) -> None: + directory = Path(directory) + path = directory / folder + + self.directory = directory + self.folder = folder + self.filename = filename + self.fonts = path / "fonts" + self.images = path / "images" + self.pages = path / "pages" + self.scripts = path / "scripts" + self.styles = path / "styles" + + if not path.exists(): + path.mkdir(parents=True) + self.fonts.mkdir() + self.images.mkdir() + self.pages.mkdir() + self.scripts.mkdir() + self.styles.mkdir() + + class Clone: """Interface representing clone utilities.""" def __init__( self: Clone, - domain: str, + directory: str | Path, + filename: str | Path, markup: str | bytes, - folder: str = "temp", - directory: Path = Path(__file__).parent, + url: str, ) -> None: """Interface representing clone utilities. @@ -36,29 +71,26 @@ def __init__( directory (Path, optional): The clone directory. Defaults to Path(__file__).parent. """ - self.assets: list[str] = [] - self.domain = domain - self.soup = BeautifulSoup(markup=markup, features="html5lib") - self.path = directory / folder / domain[domain.index("//") + 2 : -1] - self.path_assets = Path("assets") - self.setup() + logger.trace_(msg=url) - def setup( - self: Clone, - ) -> None: - """Create the directory and folders for the cloned website.""" - logger.trace_(msg=f"DOMAIN {self.domain}") + domain = url[url.find("//") + 2 : -1] - self.path.mkdir(exist_ok=True, parents=True) - (self.path / self.path_assets).mkdir(exist_ok=True) + self.paths = ClonePaths( + directory=directory, + filename=filename, + folder=domain, + ) + self.url = url + self.soup = BeautifulSoup(markup=markup, features="html5lib") + self.source_attributes = ["src", "href", "data-cfsrc"] def save_html( self: Clone, ) -> int: """Save the HTML clone.""" - logger.trace_() + logger.trace_(msg=self.paths.filename) - return (self.path / "index.html").write_text( + return (self.paths.pages / self.paths.filename).write_text( data=self.soup.prettify(), encoding="utf-8", ) @@ -66,7 +98,7 @@ def save_html( def save_asset( self: Clone, data: bytes, - filename: str, + path: Path, ) -> None: """Save an asset file. @@ -74,12 +106,71 @@ def save_asset( ---- data (bytes): The asset data. - filename (str): - The asset filename. + path (Path): + The asset path. """ - logger.trace_(msg=f"FILE {filename}") + logger.trace_(msg=f"ASSET {path}") + + path.write_bytes(data=data) + + def create_path_asset( + self: Clone, + source: str, + folder: Path, + ) -> Path: + """Create a path to an asset. - (self.path / self.path_assets / filename).write_bytes(data=data) + Args: + ---- + source (str): The asset source. + folder (Path): The asset folder. + + Returns + ------- + Path: The asset path. + """ + path = Path(source) + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] + return folder / filename + + def create_path_source( + self: Clone, + filename: str, + folder: Path, + ) -> str: + """Create a path to a ressource. + + Args: + ---- + filename (str): The ressource filename. + folder (Path): The ressource folder. + + Returns + ------- + str: The ressource path. + """ + return Path("..", folder.name, filename).as_posix() + + def find_source_attribute( + self: Clone, + tag: Tag, + ) -> str | None: + """Find the HTML element source attribute. + + Args: + ---- + tag (Tag): The HTML element. + + Returns + ------- + str | None: The source attribute. + """ + for source_attribute in self.source_attributes: + if source_attribute in tag.attrs: + return source_attribute + + return None def sync_images( self: Clone, @@ -95,34 +186,25 @@ def sync_images( images: ResultSet[Tag] = self.soup.find_all(name="img") for image in images: - with_src = "src" in image.attrs - with_data = "data-cfsrc" in image.attrs - - if ( - not (with_src or with_data) - or with_src - and image["src"].startswith("https") - ): + attribute = self.find_source_attribute(tag=image) + + if attribute is None or image[attribute].startswith("https"): continue - source = image["src"] if with_src else image["data-cfsrc"] - path = Path(source) - index = path.suffix.find("?") - filename = path.name if index < 0 else path.stem + path.suffix[0:index] - image["src"] = self.path_assets / filename + source = image.get(key=attribute) + path = self.create_path_asset(source=source, folder=self.paths.images) + image["src"] = self.create_path_source( + filename=path.name, + folder=self.paths.images, + ) - if (self.path / self.path_assets / filename).exists(): - self.assets.append(filename) + if path.exists(): continue - response = session.request(url=self.domain + source) + response = session.request(url=self.url + source) if response is not None: - self.save_asset( - data=response.content, - filename=filename, - ) - self.assets.append(filename) + self.save_asset(data=response.content, path=path) def sync_links( self: Clone, @@ -138,32 +220,35 @@ def sync_links( links: ResultSet[Tag] = self.soup.find_all(name="link") for link in links: - if "href" not in link.attrs or link["href"].startswith("https"): + attribute = self.find_source_attribute(tag=link) + + if attribute is None or link[attribute].startswith("https"): continue - source: str = link["href"] + source = link.get(key=attribute) + + if source.startswith("https"): + continue if source.startswith("//"): link["href"] = "https:" + source continue - path = Path(source) - index = path.suffix.find("?") - filename = path.name if index < 0 else path.stem + path.suffix[0:index] - link["href"] = self.path_assets / filename + folder = ( + self.paths.styles + if link.get(key="rel")[0] == "stylesheet" + else self.paths.images + ) + path = self.create_path_asset(source=source, folder=folder) + link["href"] = self.create_path_source(filename=path.name, folder=folder) - if (self.path / self.path_assets / filename).exists(): - self.assets.append(filename) + if path.exists(): continue - response = session.request(url=self.domain + source) + response = session.request(url=self.url + source) if response is not None: - self.save_asset( - data=response.content, - filename=filename, - ) - self.assets.append(filename) + self.save_asset(data=response.content, path=path) def sync_scripts( self: Clone, @@ -186,28 +271,26 @@ def sync_scripts( scripts: ResultSet[Tag] = self.soup.find_all(name="script") for script in scripts: - if nosync or "src" not in script.attrs or script["src"].startswith("https"): + attribute = self.find_source_attribute(tag=script) + + if nosync or attribute is None or script[attribute].startswith("https"): script.extract() continue - source: str = script["src"] - path = Path(source) - index = path.suffix.find("?") - filename = path.name if index < 0 else path.stem + path.suffix[0:index] - script["src"] = self.path_assets / filename + source = script.get(key=attribute) + path = self.create_path_asset(source=source, folder=self.paths.scripts) + script["src"] = self.create_path_source( + filename=path.name, + folder=self.paths.scripts, + ) - if (self.path / self.path_assets / filename).exists(): - self.assets.append(filename) + if path.exists(): continue - response = session.request(url=self.domain + source) + response = session.request(url=self.url + source) if response is not None: - self.save_asset( - data=response.content, - filename=filename, - ) - self.assets.append(filename) + self.save_asset(data=response.content, path=path) def sync_fonts( self: Clone, @@ -219,34 +302,31 @@ def sync_fonts( session (Session): The requests session. """ - stylesheets = list( - filter(lambda filename: filename.endswith("css"), self.assets), - ) - - for stylesheet in stylesheets: - path_stylesheet = self.path / self.path_assets / stylesheet - content = path_stylesheet.read_text() + for path_stylesheet in list(self.paths.styles.iterdir()): + stylesheet = path_stylesheet.read_text() + urls: list[str] = re.findall(pattern=r"url\(([^)]+)\)", string=stylesheet) + fonts = filter(lambda url: url.find("woff") > -1, urls) - sources: list[str] = re.findall(r"url\(([^)]+)\)", string=content) + for font in fonts: + source = font.replace('"', "") + path = self.create_path_asset(source=source, folder=self.paths.fonts) - for source in sources: - url = source.replace('"', "") - path = Path(url) - index = path.suffix.find("?") - filename = path.name if index < 0 else path.stem + path.suffix[0:index] - - if (self.path / self.path_assets / filename).exists(): - self.assets.append(filename) + if path.exists(): continue - response = session.request(url=self.domain + url, delay=2) + response = session.request(url=self.url + source) if response is not None: self.save_asset( data=response.content, - filename=filename, + path=path, + ) + stylesheet = stylesheet.replace( + source, + self.create_path_source( + file=path.name, + folder=self.paths.fonts, + ), ) - content = content.replace(url, filename) - self.assets.append(filename) - path_stylesheet.write_text(data=content) + path_stylesheet.write_text(data=stylesheet) From 5cf20f6d1f91888d9c4bd911269e90ad7a155f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 5 Nov 2023 21:25:52 -0500 Subject: [PATCH 15/16] build(packages): enable imports from different packages --- pyproject.toml | 2 +- src/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 src/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 0301a31..afb19b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Operating System :: OS Independent", "Programming Language :: Python", ] -packages = [] +packages = [{ include = "*", from = "src" }] include = [{ path = "tests", format = "sdist" }] exclude = [] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e5f6dba --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Package Source.""" From d00e78391db1971bc463411104129720bbc0b340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Menard?= Date: Sun, 5 Nov 2023 21:26:43 -0500 Subject: [PATCH 16/16] refactor(project): update project structure --- src/html_tracing/utilities/__init__.py | 1 - src/utilities/__init__.py | 8 ++++++++ src/{html_tracing => }/utilities/logger.py | 3 --- src/web_scraping/__init__.py | 9 +++++++++ src/{html_tracing/utilities => web_scraping}/agents.py | 2 +- src/{html_tracing/utilities => web_scraping}/clone.py | 8 ++------ src/{html_tracing/utilities => web_scraping}/proxies.py | 2 +- src/{html_tracing/utilities => web_scraping}/session.py | 2 +- 8 files changed, 22 insertions(+), 13 deletions(-) delete mode 100644 src/html_tracing/utilities/__init__.py create mode 100644 src/utilities/__init__.py rename src/{html_tracing => }/utilities/logger.py (98%) create mode 100644 src/web_scraping/__init__.py rename src/{html_tracing/utilities => web_scraping}/agents.py (99%) rename src/{html_tracing/utilities => web_scraping}/clone.py (98%) rename src/{html_tracing/utilities => web_scraping}/proxies.py (99%) rename src/{html_tracing/utilities => web_scraping}/session.py (98%) diff --git a/src/html_tracing/utilities/__init__.py b/src/html_tracing/utilities/__init__.py deleted file mode 100644 index 0db3994..0000000 --- a/src/html_tracing/utilities/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Package Utilities.""" diff --git a/src/utilities/__init__.py b/src/utilities/__init__.py new file mode 100644 index 0000000..2df5983 --- /dev/null +++ b/src/utilities/__init__.py @@ -0,0 +1,8 @@ +"""Package Utilities.""" + + +from .logger import Logger + +__all__ = ["Logger"] + +logger = Logger(tracing=True) diff --git a/src/html_tracing/utilities/logger.py b/src/utilities/logger.py similarity index 98% rename from src/html_tracing/utilities/logger.py rename to src/utilities/logger.py index 16f70f7..8f9c84c 100644 --- a/src/html_tracing/utilities/logger.py +++ b/src/utilities/logger.py @@ -67,6 +67,3 @@ def trace_(self: Logger, msg: str | None = None) -> None: message += f" - {msg}" self.info_(msg=f"{message}") - - -logger = Logger(tracing=True) diff --git a/src/web_scraping/__init__.py b/src/web_scraping/__init__.py new file mode 100644 index 0000000..6a960cf --- /dev/null +++ b/src/web_scraping/__init__.py @@ -0,0 +1,9 @@ +"""Package Web Scraping.""" + + +from .agents import UserAgents +from .clone import Clone +from .proxies import Proxies, Query +from .session import Session + +__all__ = ["UserAgents", "Clone", "Proxies", "Session", "Query"] diff --git a/src/html_tracing/utilities/agents.py b/src/web_scraping/agents.py similarity index 99% rename from src/html_tracing/utilities/agents.py rename to src/web_scraping/agents.py index 82e4e54..f3d4b66 100644 --- a/src/html_tracing/utilities/agents.py +++ b/src/web_scraping/agents.py @@ -8,7 +8,7 @@ import requests from bs4 import BeautifulSoup, ResultSet, Tag -from utilities.logger import logger +from utilities import logger class UserAgents: diff --git a/src/html_tracing/utilities/clone.py b/src/web_scraping/clone.py similarity index 98% rename from src/html_tracing/utilities/clone.py rename to src/web_scraping/clone.py index db1fc92..857d0bb 100644 --- a/src/html_tracing/utilities/clone.py +++ b/src/web_scraping/clone.py @@ -8,14 +8,10 @@ from typing import TYPE_CHECKING from bs4 import BeautifulSoup, ResultSet, Tag - -if __name__ == "__main__": - from logger import logger -else: - from utilities.logger import logger +from utilities import logger if TYPE_CHECKING: - from utilities.session import Session + from .session import Session class ClonePaths(SimpleNamespace): diff --git a/src/html_tracing/utilities/proxies.py b/src/web_scraping/proxies.py similarity index 99% rename from src/html_tracing/utilities/proxies.py rename to src/web_scraping/proxies.py index 3acdfd3..2d72642 100644 --- a/src/html_tracing/utilities/proxies.py +++ b/src/web_scraping/proxies.py @@ -12,7 +12,7 @@ import requests from bs4 import BeautifulSoup, ResultSet, Tag from pandas import DataFrame -from utilities.logger import logger +from utilities import logger @dataclass diff --git a/src/html_tracing/utilities/session.py b/src/web_scraping/session.py similarity index 98% rename from src/html_tracing/utilities/session.py rename to src/web_scraping/session.py index a456265..5c0b629 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/web_scraping/session.py @@ -7,7 +7,7 @@ import requests from requests import exceptions -from utilities.logger import logger +from utilities import logger class Session: