Merged
26 commits
1e18716
Fathom_net initializer
Andrey170170 Aug 8, 2024
84adde8
Fathom_net update
Andrey170170 Aug 13, 2024
95e8bef
Small fixes
Andrey170170 Aug 14, 2024
ddad5f2
Small fixes
Andrey170170 Aug 16, 2024
bff688f
EoL download
Andrey170170 Aug 31, 2024
0edeeb8
ReadMe file update
Andrey170170 Sep 19, 2024
049e086
Update README.md
Andrey170170 Sep 19, 2024
035839e
Lila download
Andrey170170 Oct 1, 2024
08f7a62
Updated tools checkpoint options
Andrey170170 Jan 31, 2025
bbe77c4
Added filtration to gbif initializer using basisOfRecords
Andrey170170 Jan 31, 2025
34ecfd7
Initializers refactoring
Andrey170170 Feb 6, 2025
307054b
Initializers refactoring
Andrey170170 Feb 6, 2025
03b265f
Documentation
Andrey170170 Feb 10, 2025
b37de48
Documentation
Andrey170170 Feb 14, 2025
91b936c
Update pyproject.toml
Andrey170170 Apr 8, 2025
49601ab
Documentation
Andrey170170 Apr 11, 2025
56a01d6
Apply suggestions from code review
Andrey170170 Apr 11, 2025
0e42a1d
Documentation
Andrey170170 Apr 11, 2025
f033f96
Update README.md
Andrey170170 Apr 15, 2025
6f2d463
Apply suggestions from code review
Andrey170170 Apr 16, 2025
ffe80da
Small fixes
Andrey170170 Apr 16, 2025
c344b1b
Update src/distributed_downloader/core/fake_profiler.py
Andrey170170 Apr 16, 2025
d4b05c0
Update src/distributed_downloader/core/initializers/README.md
Andrey170170 Apr 16, 2025
44d8a12
Small fixes
Andrey170170 Apr 16, 2025
944ba4b
Small fixes
Andrey170170 Apr 17, 2025
9a1f145
Apply suggestions from code review
Andrey170170 Apr 17, 2025
164 changes: 152 additions & 12 deletions README.md
@@ -29,7 +29,7 @@ in [issue #1](https://github.com/Imageomics/distributed-downloader/issues/1)).
official websites:
- [OpenMPI](https://docs.open-mpi.org/en/v5.0.x/installing-open-mpi/quickstart.html)
- [IntelMPI](https://www.intel.com/content/www/us/en/docs/mpi-library/developer-guide-linux/2021-6/installation.html)
3. Install required package:
3. Install the required package:
- For general use:
```commandline
pip install git+https://github.com/Imageomics/distributed-downloader
@@ -44,33 +44,86 @@ in [issue #1](https://github.com/Imageomics/distributed-downloader/issues/1)).
`distributed-downloader` utilizes multiple nodes on a High Performance Computing (HPC) system (specifically, an HPC
with `slurm` workload manager) to download a collection of images specified in a given tab-delimited text file.

### Configuration

The downloader is configured using a YAML configuration file. Here's an example configuration:

```yaml
# Example configuration file
path_to_input: "/path/to/input/urls.csv"
path_to_output: "/path/to/output"

output_structure:
  urls_folder: "urls"
  logs_folder: "logs"
  images_folder: "images"
  schedules_folder: "schedules"
  profiles_table: "profiles.csv"
  ignored_table: "ignored.csv"
  inner_checkpoint_file: "checkpoint.json"
  tools_folder: "tools"

downloader_parameters:
  num_downloads: 1
  max_nodes: 20
  workers_per_node: 20
  cpu_per_worker: 1
  header: true
  image_size: 224
  logger_level: "INFO"
  batch_size: 10000
  rate_multiplier: 0.5
  default_rate_limit: 3

tools_parameters:
  num_workers: 1
  max_nodes: 10
  workers_per_node: 20
  cpu_per_worker: 1
  threshold_size: 10000
  new_resize_size: 224
```

### Main script

There are one manual step to get the downloader running as designed:
There is one manual step to get the downloader running as designed:
You need to call the function `download_images` from the package `distributed_downloader` with the `config_path` as an argument.
This will initialize the file structure in the output folder, partition the input file, profile the servers for their
possible download speed, and start downloading images. If downloading didn't finish, you can call the same function with
the same `config_path` argument to continue downloading.

```python
from distributed_downloader import download_images

# Start or continue downloading
download_images("/path/to/config.yaml")
```

The downloader has two logging profiles (see the configuration snippet after this list):

- `INFO` - logs only the most important information, for example when a batch is started and finished. It also logs out
- `INFO` - logs only the most important information, for example, when a batch is started and finished. It also logs out
any error that occurred during download, image decoding, or writing batch to the filesystem
- `DEBUG` - logs all information, for example logging start and finish of each downloaded image.
- `DEBUG` - logs all information, for example, logging start and finish of each downloaded image.
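
The logging profile appears to correspond to the `logger_level` field in the configuration example above; switching to per-image logging would then look like this (a minimal sketch, showing only the relevant key):

```yaml
downloader_parameters:
  logger_level: "DEBUG"  # the example configuration above uses "INFO"
```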

### Tools script

After downloading is finished, you can use the `tools` package perform various operations on them.
After downloading is finished, you can use the `tools` package to perform various operations on the downloaded images.
To do this, you need to call the function `apply_tools` from the package `distributed_downloader` with the `config_path`
and `tool_name` as arguments.
Following tools are available:

- `resize` - resizes images to a new size
- `image_verification` - verifies images by checking if they are corrupted
- `duplication_based` - removes duplicate images
- `size_based` - removes images that are too small

You can also add your own tool, the instructions are in the section below.

```python
from distributed_downloader import apply_tools

# Apply a specific tool
apply_tools("/path/to/config.yaml", "resize")
```

The following tools are available:

- `resize` - resizes images to a new size (specified in config)
- `image_verification` - verifies images by checking if they are corrupted
- `duplication_based` - removes duplicate images using MD5 hashing
- `size_based` - removes images that are too small (threshold specified in config)

### Creating a new tool

@@ -87,7 +140,34 @@ You can also add your own tool by creating 3 classes and registering them with r
- Each tool should have a `run` method that will be called by the main script.
- Each tool should be registered with a decorator from a respective package (`FilterRegister` from `filters` etc.)

## Rules for scripts:
Example of creating a custom tool:

```python
from distributed_downloader.tools import FilterRegister, SchedulerRegister, RunnerRegister, ToolsBase


@FilterRegister("my_custom_tool")
class MyCustomToolFilter(ToolsBase):
    def run(self):
        # Implementation of filter step
        pass


@SchedulerRegister("my_custom_tool")
class MyCustomToolScheduler(ToolsBase):
    def run(self):
        # Implementation of scheduler step
        pass


@RunnerRegister("my_custom_tool")
class MyCustomToolRunner(ToolsBase):
    def run(self):
        # Implementation of runner step
        pass
```
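
A minimal usage sketch, assuming the module defining these classes is imported before `apply_tools` runs so that the decorators can register the tool:

```python
from distributed_downloader import apply_tools

# "my_custom_tool" is the name passed to the register decorators above
apply_tools("/path/to/config.yaml", "my_custom_tool")
```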

## Environment Variables

All scripts can expect the following custom environment variables; tool-specific variables are only initialized
when the respective tool is called (a short sketch of reading them follows the list):
@@ -123,3 +203,63 @@ when respective tool is called:
- `TOOLS_CPU_PER_WORKER`
- `TOOLS_THRESHOLD_SIZE`
- `TOOLS_NEW_RESIZE_SIZE`
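
For example, a tool script might read the tool-related variables like this (a minimal sketch using only the variables listed above; the fallback values are illustrative assumptions, not defaults defined by the package):

```python
import os

# Tool-specific variables are only set when the corresponding tool is invoked,
# so fall back to illustrative values when they are absent.
cpu_per_worker = int(os.environ.get("TOOLS_CPU_PER_WORKER", "1"))
threshold_size = int(os.environ.get("TOOLS_THRESHOLD_SIZE", "10000"))
new_resize_size = int(os.environ.get("TOOLS_NEW_RESIZE_SIZE", "224"))

print(cpu_per_worker, threshold_size, new_resize_size)
```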

## Working with downloaded data

Downloaded data is stored in `images_folder` (configured in the config file),
partitioned by `server_name` and `partition_id`, in two parquet files with the following schemas:

- `successes.parquet`:
  - uuid: string - downloaded dataset internal unique identifier (created to distinguish between all component datasets downloaded with this package)
  - source_id: string - id of the entry provided by its source (e.g., `gbifID`)
  - identifier: string - source URL of the image
  - is_license_full: boolean - True indicates that `license`, `source`, and `title` all have non-null values for that
    particular entry.
  - license: string
  - source: string
  - title: string
  - hashsum_original: string - MD5 hash of the original image data
  - hashsum_resized: string - MD5 hash of the resized image data
  - original_size: [height, width] - dimensions of the original image
  - resized_size: [height, width] - dimensions after resizing
  - image: bytes - binary image data

- `errors.parquet`:
  - uuid: string - downloaded dataset internal unique identifier (created to distinguish between all component datasets downloaded with this package)
  - identifier: string - URL of the image
  - retry_count: integer - number of download attempts
  - error_code: integer - HTTP or other error code
  - error_msg: string - detailed error message

For general operations that do not involve the `image` column (e.g., counting the total number of entries or computing a
size distribution), it is recommended to use Spark or a similar framework. For any operation that does involve the
`image` column, it is recommended to use Pandas or a similar library to access each parquet file separately.
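
As a sketch of the per-file access pattern (the partition path below is a hypothetical placeholder; substitute the actual `server_name` and `partition_id` directories from your `images_folder`, and note that the column names come from the schema above):

```python
import pandas as pd

# Read a single partition's successes file; selecting only the needed columns
# avoids pulling the heavy `image` column into memory when it is not required.
path = "/path/to/output/images/<server_name>/<partition_id>/successes.parquet"
df = pd.read_parquet(path, columns=["uuid", "source_id", "identifier", "resized_size"])

print(f"{len(df)} successfully downloaded entries in this partition")
print(df.head())
```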

## Supported Image Formats

The downloader supports most common image formats, including:

- JPEG/JPG
- PNG
- GIF (first frame only)
- BMP
- TIFF

## Error Handling and Troubleshooting

Common issues and solutions:

1. **Rate limiting errors**: If you see many errors with code 429, adjust the `default_rate_limit` in your config to a
   lower value (see the example after this list).

2. **Memory issues**: If the process is killed due to memory constraints, try reducing `batch_size` or
`workers_per_node` in your config.

3. **Corrupt images**: Images that cannot be decoded are logged in the errors parquet file with appropriate error codes.

4. **Resuming failed downloads**: The downloader creates checkpoints automatically. Simply run the same command again to
resume from the last checkpoint.
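
For example, a more conservative configuration for the rate-limiting and memory cases above might look like this (the values are illustrative, not recommended defaults):

```yaml
downloader_parameters:
  default_rate_limit: 2   # lower than the example config's 3 to reduce HTTP 429 responses
  batch_size: 5000        # smaller batches reduce per-worker memory usage
  workers_per_node: 10    # fewer workers per node also lowers memory pressure
```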

## License

This project is licensed under the MIT License - see the LICENSE file for details.
30 changes: 7 additions & 23 deletions environment.yaml
@@ -3,51 +3,35 @@ channels:
- conda-forge
- defaults
dependencies:
- openmpi
- python
- uv
- opencv
- pyspark
- python>=3.10,<=3.12
- attrs
- brotli
- certifi
- charset-normalizer
- cramjam
- cython
- exceptiongroup
- fsspec
- hatchling
- idna
- inflate64
- iniconfig
- openmpi
- mpi4py
- multivolumefile
- numpy
- packaging
- opencv
- pandas
- pathspec
- pillow
- pip
- pluggy
- psutil
- py4j
- pyarrow
- pybcj
- pycryptodomex
- pyppmd
- pytest
- python-dateutil
- pyspark>=3.4.0
- python-dotenv
- pytz
- pyyaml
- pyzstd
- requests
- setuptools
- six
- texttable
- tomli
- trove-classifiers
- typing-extensions
- tzdata
- urllib3
- wheel
# Development dependencies
- pytest
- ruff
43 changes: 36 additions & 7 deletions pyproject.toml
@@ -1,32 +1,61 @@
[build-system]
requires = ["hatchling", "hatch-requirements-txt"]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/distributed_downloader"]

[project]
name = "distributed_downloader"
dynamic = ["dependencies", "version"]
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <=3.12"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]

[tool.hatch.metadata.hooks.requirements_txt]
files = ["requirements.txt"]
dependencies = [
"attrs",
"brotli",
"cramjam",
"cython",
"fsspec",
"inflate64",
"mpi4py",
"multivolumefile",
"opencv-python",
"pandas",
"pathspec",
"pillow",
"psutil",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"python-dotenv",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"texttable",
"trove-classifiers",
"typing-extensions",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",