diff --git a/pyproject.toml b/pyproject.toml
index 0301a31..afb19b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
 ]
-packages = []
+packages = [{ include = "*", from = "src" }]
 include = [{ path = "tests", format = "sdist" }]
 exclude = []
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e5f6dba
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+"""Package Source."""
diff --git a/src/html_tracing/utilities/__init__.py b/src/html_tracing/utilities/__init__.py
deleted file mode 100644
index 0db3994..0000000
--- a/src/html_tracing/utilities/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Package Utilities."""
diff --git a/src/utilities/__init__.py b/src/utilities/__init__.py
new file mode 100644
index 0000000..2df5983
--- /dev/null
+++ b/src/utilities/__init__.py
@@ -0,0 +1,8 @@
+"""Package Utilities."""
+
+
+from .logger import Logger
+
+__all__ = ["Logger"]
+
+logger = Logger(tracing=True)
diff --git a/src/utilities/logger.py b/src/utilities/logger.py
new file mode 100644
index 0000000..8f9c84c
--- /dev/null
+++ b/src/utilities/logger.py
@@ -0,0 +1,69 @@
+"""Module Logger."""
+
+
+from __future__ import annotations
+
+import logging
+import sys
+
+
+class Logger:
+    """Interface representing logger utilities."""
+
+    def __init__(
+        self: Logger,
+        *,
+        debugging: bool = False,
+        tracing: bool = False,
+        newline: bool = True,
+    ) -> None:
+        # Parenthesized so the conditional only applies to the trailing newline.
+        formatter = "%(name)s:%(levelname)s => %(message)s" + ("\n" if newline else "")
+        handler = logging.StreamHandler()
+        handler.setFormatter(fmt=logging.Formatter(fmt=formatter))
+        logger = logging.getLogger(name="LOG")
+        logger.addHandler(hdlr=handler)
+        logger.setLevel(level=logging.DEBUG if debugging else logging.INFO)
+
+        self.logger = logger
+        self.debugging = debugging
+        self.tracing = tracing
+        self.newline = newline
+
+    def debug_(self: Logger, msg: str) -> None:
+        """Log a message with severity 'DEBUG'."""
+        self.logger.debug(msg=msg)
+
+    def warn_(self: Logger, msg: str) -> None:
+        """Log a message with severity 'WARNING'."""
+        self.logger.warning(msg=msg)
+
+    def info_(self: Logger, msg: str) -> None:
+        """Log a message with severity 'INFO'."""
+        self.logger.info(msg=msg)
+
+    def error_(self: Logger, msg: str) -> None:
+        """Log a message with severity 'ERROR'."""
+        self.logger.error(msg=msg)
+
+    def critical_(self: Logger, msg: str) -> None:
+        """Log a message with severity 'CRITICAL'."""
+        self.logger.critical(msg=msg)
+
+    def trace_(self: Logger, msg: str | None = None) -> None:
+        """Log a message with severity 'INFO' tracing the calling function."""
+        if not self.tracing:
+            return
+
+        frame = sys._getframe(1)  # noqa: SLF001
+
+        message = "TRACE"
+
+        if "self" in frame.f_locals:
+            message += f" - CLASS {frame.f_locals['self'].__class__.__name__}"
+
+        message += f" - FUNCTION {frame.f_code.co_name}"
+
+        if msg is not None:
+            message += f" - {msg}"
+
+        self.info_(msg=message)
diff --git a/src/web_scraping/__init__.py b/src/web_scraping/__init__.py
new file mode 100644
index 0000000..6a960cf
--- /dev/null
+++ b/src/web_scraping/__init__.py
@@ -0,0 +1,9 @@
+"""Package Web Scraping."""
+
+
+from .agents import UserAgents
+from .clone import Clone
+from .proxies import Proxies, Query
+from .session import Session
+
+__all__ = ["UserAgents", "Clone", "Proxies", "Session", "Query"]
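Note: a minimal sketch of how the shared `logger` instance behaves once imported; the `Crawler` class below is hypothetical, added only to show the `trace_` output, which is derived from the format string and frame inspection above.

    from utilities import logger

    class Crawler:  # hypothetical caller, not part of this PR
        def run(self) -> None:
            # trace_ reads the caller's frame, so it reports this class and method.
            logger.trace_(msg="start")

    logger.info_(msg="hello")
    # LOG:INFO => hello            (plus a blank line, since newline=True)
    Crawler().run()
    # LOG:INFO => TRACE - CLASS Crawler - FUNCTION run - start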
diff --git a/src/html_tracing/utilities/agents.py b/src/web_scraping/agents.py
similarity index 96%
rename from src/html_tracing/utilities/agents.py
rename to src/web_scraping/agents.py
index 27ffbed..f3d4b66 100644
--- a/src/html_tracing/utilities/agents.py
+++ b/src/web_scraping/agents.py
@@ -4,13 +4,11 @@
 
 import json
 import random
-from logging import INFO, basicConfig, info
 from pathlib import Path
 
 import requests
 from bs4 import BeautifulSoup, ResultSet, Tag
-
-basicConfig(level=INFO)
+from utilities import logger
 
 
 class UserAgents:
@@ -58,7 +56,6 @@ def fetch(
 
         for endpoint in self.endpoints:
            response = requests.get(url=f"{self.url}{endpoint}", timeout=10)
-            info(response.ok, response.status_code)
            soup = BeautifulSoup(markup=response.content, features="html.parser")
            data += str(soup.find("table"))
 
@@ -81,6 +78,7 @@ def refresh(
         self: UserAgents,
     ) -> None:
         """Refresh the list of user agents."""
+        logger.trace_()
         self.fetch()
         self.convert()
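With the ad-hoc basicConfig/info calls gone, the shared tracing logger takes over. A sketch of what the instrumented refresh() now emits, assuming UserAgents can be constructed with defaults (its constructor is not shown in this diff; refresh() also hits the live endpoints):

    from web_scraping import UserAgents

    UserAgents().refresh()
    # LOG:INFO => TRACE - CLASS UserAgents - FUNCTION refresh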
+ """ + path = Path(source) + index = path.suffix.find("?") + filename = path.name if index < 0 else path.stem + path.suffix[0:index] + return folder / filename + + def create_path_source( + self: Clone, + filename: str, + folder: Path, + ) -> str: + """Create a path to a ressource. + + Args: + ---- + filename (str): The ressource filename. + folder (Path): The ressource folder. + + Returns + ------- + str: The ressource path. + """ + return Path("..", folder.name, filename).as_posix() + + def find_source_attribute( + self: Clone, + tag: Tag, + ) -> str | None: + """Find the HTML element source attribute. + + Args: + ---- + tag (Tag): The HTML element. + + Returns + ------- + str | None: The source attribute. + """ + for source_attribute in self.source_attributes: + if source_attribute in tag.attrs: + return source_attribute + + return None + + def sync_images( + self: Clone, + session: Session, + ) -> None: + """Sync the images from the cloned website. + + Args: + ---- + session (Session): + The requests session. + """ + images: ResultSet[Tag] = self.soup.find_all(name="img") + + for image in images: + attribute = self.find_source_attribute(tag=image) + + if attribute is None or image[attribute].startswith("https"): + continue + + source = image.get(key=attribute) + path = self.create_path_asset(source=source, folder=self.paths.images) + image["src"] = self.create_path_source( + filename=path.name, + folder=self.paths.images, + ) + + if path.exists(): + continue + + response = session.request(url=self.url + source) + + if response is not None: + self.save_asset(data=response.content, path=path) + + def sync_links( + self: Clone, + session: Session, + ) -> None: + """Sync the links from the cloned website. + + Args: + ---- + session (Session): + The requests session. + """ + links: ResultSet[Tag] = self.soup.find_all(name="link") + + for link in links: + attribute = self.find_source_attribute(tag=link) + + if attribute is None or link[attribute].startswith("https"): + continue + + source = link.get(key=attribute) + + if source.startswith("https"): + continue + + if source.startswith("//"): + link["href"] = "https:" + source + continue + + folder = ( + self.paths.styles + if link.get(key="rel")[0] == "stylesheet" + else self.paths.images + ) + path = self.create_path_asset(source=source, folder=folder) + link["href"] = self.create_path_source(filename=path.name, folder=folder) + + if path.exists(): + continue + + response = session.request(url=self.url + source) + + if response is not None: + self.save_asset(data=response.content, path=path) + + def sync_scripts( + self: Clone, + session: Session, + *, + nosync: bool = True, + ) -> None: + """Sync the scripts from the cloned website. + + Args: + ---- + session (Session): + The requests session. + nosync (bool, optional): + If true, remove the scripts. Defaults to True. 
+ """ + noscripts: ResultSet[Tag] = self.soup.find_all(name="noscript") + for noscript in noscripts: + noscript.extract() + + scripts: ResultSet[Tag] = self.soup.find_all(name="script") + for script in scripts: + attribute = self.find_source_attribute(tag=script) + + if nosync or attribute is None or script[attribute].startswith("https"): + script.extract() + continue + + source = script.get(key=attribute) + path = self.create_path_asset(source=source, folder=self.paths.scripts) + script["src"] = self.create_path_source( + filename=path.name, + folder=self.paths.scripts, + ) + + if path.exists(): + continue + + response = session.request(url=self.url + source) + + if response is not None: + self.save_asset(data=response.content, path=path) + + def sync_fonts( + self: Clone, + session: Session, + ) -> None: + """Sync the fonts from the cloned website. + + Args: + session (Session): + The requests session. + """ + for path_stylesheet in list(self.paths.styles.iterdir()): + stylesheet = path_stylesheet.read_text() + urls: list[str] = re.findall(pattern=r"url\(([^)]+)\)", string=stylesheet) + fonts = filter(lambda url: url.find("woff") > -1, urls) + + for font in fonts: + source = font.replace('"', "") + path = self.create_path_asset(source=source, folder=self.paths.fonts) + + if path.exists(): + continue + + response = session.request(url=self.url + source) + + if response is not None: + self.save_asset( + data=response.content, + path=path, + ) + stylesheet = stylesheet.replace( + source, + self.create_path_source( + file=path.name, + folder=self.paths.fonts, + ), + ) + + path_stylesheet.write_text(data=stylesheet) diff --git a/src/html_tracing/utilities/proxies.py b/src/web_scraping/proxies.py similarity index 97% rename from src/html_tracing/utilities/proxies.py rename to src/web_scraping/proxies.py index dab4a15..2d72642 100644 --- a/src/html_tracing/utilities/proxies.py +++ b/src/web_scraping/proxies.py @@ -4,7 +4,6 @@ import functools from dataclasses import asdict, dataclass -from logging import INFO, basicConfig from pathlib import Path from typing import Any, Callable, NamedTuple @@ -13,8 +12,7 @@ import requests from bs4 import BeautifulSoup, ResultSet, Tag from pandas import DataFrame - -basicConfig(level=INFO) +from utilities import logger @dataclass @@ -155,12 +153,14 @@ def refresh( self: Proxies, ) -> None: """Refresh the list of proxies.""" + logger.trace_() self.fetch() self.convert() def query( self: Proxies, queries: list[Query], + dataframe: DataFrame | None = None, ) -> DataFrame: """Find specific proxies using query conditions. @@ -185,7 +185,7 @@ def query( DataFrame: The filtered dataframe. 
""" - dataframe = self.load() + dataframe = self.load() if dataframe is None else dataframe conditions = [ operator(data, dataframe.get(key)) for data, key, operator in queries ] diff --git a/src/html_tracing/utilities/session.py b/src/web_scraping/session.py similarity index 77% rename from src/html_tracing/utilities/session.py rename to src/web_scraping/session.py index 7e25b5b..5c0b629 100644 --- a/src/html_tracing/utilities/session.py +++ b/src/web_scraping/session.py @@ -3,12 +3,11 @@ from __future__ import annotations import random -from logging import INFO, basicConfig, info, warning +import time import requests from requests import exceptions - -basicConfig(level=INFO) +from utilities import logger class Session: @@ -43,17 +42,17 @@ def proxy( def request( self: Session, url: str, - agent: str | None = None, + delay: float = 2, timeout: float = 10, ) -> requests.Response: - """Request a URL using a session proxy. + """Request a URL using a session. Args: ---- url (str): The URL to request. - agent (str): - The user agent for the headers. + timeout (float, optional): + The time (seconds) to wait between requests. Defaults to 2. timeout (float, optional): The time (seconds) to wait before giving up. Defaults to 5. @@ -62,11 +61,9 @@ def request( requests.Response: The HTTP request reponse. """ - return self.session.get( - url=url, - timeout=timeout, - headers=None if agent is None else {"User-Agent": agent}, - ) + time.sleep(delay) + + return self.session.get(url=url, timeout=(timeout, timeout)) def requests( self: Session, @@ -94,20 +91,21 @@ def requests( for proxy in proxies: self.proxy(proxy=proxy) agent = random.choice(seq=agents) # noqa: S311 + self.session.headers.update({"User-Agent": agent}) - info(f"Session with proxy {proxy} and agent {agent}.\n") + logger.info_(f"Session with proxy {proxy} and agent {agent}.") try: response = self.request(url=url, timeout=timeout) if response.ok: - info("Session SUCCESS\n") + logger.info_("Session SUCCESS") return response - warning(f"Session FAILED with status code {response.status_code}.\n") + logger.warn_(f"Session FAILED with code {response.status_code}.") continue except exceptions.RequestException as error: - warning(f"Session FAILED with error {error}.\n") + logger.error_(f"Session FAILED with error {error}.") continue return None