Skip to content

added object detection #49

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions jigsawstack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def __init__(
).image_generation



class AsyncJigsawStack:
validate: AsyncValidate
web: AsyncWeb
Expand Down Expand Up @@ -229,5 +230,6 @@ def __init__(
).image_generation



# Names exported as the package's public API
__all__ = ["JigsawStack", "Search", "JigsawStackError", "AsyncJigsawStack"]
62 changes: 0 additions & 62 deletions jigsawstack/_client.py

This file was deleted.

3 changes: 2 additions & 1 deletion jigsawstack/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .async_request import AsyncRequest, AsyncRequestConfig
from ._config import ClientConfig
from typing import Any, Dict, List, cast
from typing_extensions import NotRequired, TypedDict
from typing_extensions import NotRequired, TypedDict, Literal
from .custom_typing import SupportedAccents
from .helpers import build_path

Expand All @@ -14,6 +14,7 @@ class TextToSpeechParams(TypedDict):
accent: NotRequired[SupportedAccents]
speaker_clone_url: NotRequired[str]
speaker_clone_file_store_key: NotRequired[str]
return_type: NotRequired[Literal["url", "binary", "base64"]]


class TTSCloneParams(TypedDict):
Expand Down
2 changes: 2 additions & 0 deletions jigsawstack/image_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ class ImageGenerationParams(TypedDict):
File store key to use as image input.
"""

return_type: NotRequired[Literal["url", "binary", "base64"]]

class ImageGenerationResponse(TypedDict):
success: bool
"""
Expand Down
2 changes: 1 addition & 1 deletion jigsawstack/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class SearchParams(TypedDict):
Two-letter country code to localize search results (e.g. 'US', 'GB')
"""

auto_scrape: bool
auto_scrape: NotRequired[bool]
"""
Whether to automatically scrape content from search result URLs
"""
Expand Down
4 changes: 3 additions & 1 deletion jigsawstack/translate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Any, Dict, List, Union, cast, overload
from typing_extensions import NotRequired, TypedDict
from typing_extensions import NotRequired, TypedDict, Literal
from .request import Request, RequestConfig
from .async_request import AsyncRequest
from typing import List, Union
Expand All @@ -20,6 +20,8 @@ class TranslateImageParams(TypedDict):
The file store key of the image to translate.
"""

return_type: NotRequired[Literal["url", "binary", "base64"]]

class TranslateParams(TypedDict):
target_language: str
"""
Expand Down
125 changes: 119 additions & 6 deletions jigsawstack/vision.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,128 @@
from typing import Any, Dict, List, Union, cast, Optional
from typing_extensions import NotRequired, TypedDict
from typing_extensions import NotRequired, TypedDict, Literal
from typing import Any, Dict, List, cast
from typing_extensions import NotRequired, TypedDict
from typing_extensions import NotRequired, TypedDict, Literal
from .request import Request, RequestConfig
from .async_request import AsyncRequest, AsyncRequestConfig
from ._config import ClientConfig


class OCRParams(TypedDict):
class Point(TypedDict):
    """A 2-D point described by integer ``x`` and ``y`` coordinates."""

    x: int
    """
    X coordinate of the point
    """

    y: int
    """
    Y coordinate of the point
    """


class BoundingBox(TypedDict):
    """Rectangle described by its four corner points plus its width and height."""

    top_left: Point
    """
    Top-left corner of the bounding box
    """

    top_right: Point
    """
    Top-right corner of the bounding box
    """

    bottom_left: Point
    """
    Bottom-left corner of the bounding box
    """

    bottom_right: Point
    """
    Bottom-right corner of the bounding box
    """

    width: int
    """
    Width of the bounding box
    """

    height: int
    """
    Height of the bounding box
    """


class GuiElement(TypedDict):
    """A single GUI element: its bounding box and, when available, its content."""

    bounds: BoundingBox
    """
    Bounding box coordinates of the GUI element
    """

    content: Union[str, None]
    """
    Content of the GUI element, can be null if no object detected
    """


class DetectedObject(TypedDict):
    """A single detected object: its bounding box and an optional mask."""

    bounds: BoundingBox
    """
    Bounding box coordinates of the detected object
    """

    mask: NotRequired[str]
    """
    URL or base64 string depending on return_type - only present for some objects
    """



class ObjectDetectionParams(TypedDict):
url: NotRequired[str]
"""
URL of the image to process
"""

file_store_key: NotRequired[str]
"""
File store key of the image to process
"""

prompts: NotRequired[List[str]]
"""
List of prompts for object detection
"""

features: NotRequired[List[Literal["object_detection", "gui"]]]
"""
List of features to enable: object_detection, gui
"""

annotated_image: NotRequired[bool]
"""
Whether to return an annotated image
"""

return_type: NotRequired[Literal["url", "base64"]]
"""
Format for returned images: url or base64
"""


class ObjectDetectionResponse(TypedDict):
    """Response payload returned by object detection.

    The detection-specific keys are optional; which ones are present depends
    on the request (see the per-key notes below).
    """

    success: bool
    """
    Whether the request was successful
    """

    annotated_image: NotRequired[str]
    """
    URL or base64 string of annotated image (included only if annotated_image=true and objects/gui_elements exist)
    """

    gui_elements: NotRequired[List[GuiElement]]
    """
    List of detected GUI elements (included only if features includes "gui")
    """

    objects: NotRequired[List[DetectedObject]]
    """
    List of detected objects (included only if features includes "object_detection")
    """


class VOCRParams(TypedDict):
Expand Down Expand Up @@ -60,7 +173,7 @@ def vocr(self, params: VOCRParams) -> OCRResponse:
).perform_with_content()
return resp

def object_detection(self, params: OCRParams) -> OCRResponse:
def object_detection(self, params: ObjectDetectionParams) -> ObjectDetectionResponse:
path = "/ai/object_detection"
resp = Request(
config=self.config,
Expand Down Expand Up @@ -97,9 +210,9 @@ async def vocr(self, params: VOCRParams) -> OCRResponse:
).perform_with_content()
return resp

async def object_detection(self, params: OCRParams) -> OCRResponse:
async def object_detection(self, params: ObjectDetectionParams) -> ObjectDetectionResponse:
path = "/ai/object_detection"
resp = AsyncRequest(
resp = await AsyncRequest(
config=self.config,
path=path,
params=cast(Dict[Any, Any], params),
Expand Down
7 changes: 4 additions & 3 deletions jigsawstack/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ class DNSResponse(TypedDict):
# HTML to Any
#
class HTMLToAnyParams(TypedDict):
html: str
url: str
goto_options: NotRequired[object]
html: NotRequired[str]
url: NotRequired[str]
goto_options: NotRequired[Dict[str, Union[int, str]]]
scale: NotRequired[int]
full_page: NotRequired[bool]
omit_background: NotRequired[bool]
Expand All @@ -59,6 +59,7 @@ class HTMLToAnyParams(TypedDict):
is_mobile: NotRequired[bool]
dark_mode: NotRequired[bool]
use_graphic_renderer: NotRequired[bool]
return_type: NotRequired[Literal["url", "binary", "base64"]]


class HTMLToAnyResponse(TypedDict):
Expand Down
36 changes: 36 additions & 0 deletions tests/test_object_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from unittest.mock import MagicMock
import unittest
from jigsawstack.exceptions import JigsawStackError
import jigsawstack
import pytest
import asyncio
import logging

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level sync and async clients, built with default constructor arguments.
# NOTE(review): the async test visible in this file builds its own client, so
# `async_jigsaw` does not appear to be used here -- confirm before removing.
jigsaw = jigsawstack.JigsawStack()
async_jigsaw = jigsawstack.AsyncJigsawStack()


def test_object_detection_response():
    """Call the sync object-detection endpoint and check it reports success.

    A ``JigsawStackError`` raised by the client is converted into an explicit
    pytest failure so the report shows the SDK error message.
    """
    try:
        # Only the client call can raise JigsawStackError, so keep the try minimal.
        result = jigsaw.vision.object_detection({"url": "https://rogilvkqloanxtvjfrkm.supabase.co/storage/v1/object/public/demo/Collabo%201080x842.jpg"})
    except JigsawStackError as e:
        pytest.fail(f"Unexpected JigsawStackError: {e}")
    else:
        print(result)
        assert result["success"] is True


def test_object_detection_response_async():
    """Call the async object-detection endpoint and check it reports success.

    The coroutine is driven with ``asyncio.run`` so the test itself stays a
    plain synchronous function. A ``JigsawStackError`` is converted into an
    explicit pytest failure.
    """

    async def _test():
        client = jigsawstack.AsyncJigsawStack()
        try:
            # Only the awaited client call can raise JigsawStackError.
            result = await client.vision.object_detection({"url": "https://rogilvkqloanxtvjfrkm.supabase.co/storage/v1/object/public/demo/Collabo%201080x842.jpg"})
        except JigsawStackError as e:
            pytest.fail(f"Unexpected JigsawStackError: {e}")
        else:
            print(result)
            assert result["success"] is True

    asyncio.run(_test())

18 changes: 17 additions & 1 deletion tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@


def test_search_suggestion_response():
    """Run a web search through the sync client and check it reports success.

    NOTE(review): despite the name, this calls ``web.search`` with the same
    arguments as ``test_ai_search_response`` -- confirm whether a dedicated
    suggestions call was intended here.
    """
    try:
        # Only the client call can raise JigsawStackError, so keep the try minimal.
        result = jigsaw.web.search({"query": "Where is San Francisco"})
    except JigsawStackError as e:
        pytest.fail(f"Unexpected JigsawStackError: {e}")
    else:
        assert result["success"] is True


def test_ai_search_response():
    """Run a web search through the sync client and check it reports success.

    A ``JigsawStackError`` raised by the client is converted into an explicit
    pytest failure so the report shows the SDK error message.
    """
    try:
        # Only the client call can raise JigsawStackError, so keep the try minimal.
        result = jigsaw.web.search({"query": "Where is San Francisco"})
    except JigsawStackError as e:
        pytest.fail(f"Unexpected JigsawStackError: {e}")
    else:
        assert result["success"] is True


def test_search_suggestion_response_async():
async def _test():
client = jigsawstack.AsyncJigsawStack()
try:
Expand All @@ -25,7 +41,7 @@ async def _test():
asyncio.run(_test())


def test_ai_search_response():
def test_ai_search_response_async():
async def _test():
client = jigsawstack.AsyncJigsawStack()
try:
Expand Down