# Usage Example

For a comprehensive guide to the query language, see: https://whoosh.readthedocs.io/en/latest/querylang.html

## Import ``sayt``

In [1]:
from sayt.api import (
    DataSet,
    IdField,
    TextField,
    NumericField,
    NgramField,
    NgramWordsField,
    StoredField,
    T_RECORD,
    T_KWARGS,
    T_DOWNLOADER,
    T_CACHE_KEY_DEF,
    T_CONTEXT,
    T_EXTRACTOR,
    T_RefreshableDataSetResult,
    RefreshableDataSet,
)

In [2]:
import typing as T
import os
import uuid
import random
from pathlib import Path

import faker
from fixa.timer import DateTimeTimer
from diskcache import Cache
from rich import print as rprint

In [3]:
dir_here = Path(os.getcwd())

fake = faker.Faker()

## Define your dataset schema

Let's say each record in our dataset holds the details of a book.

```
{
    "id": "id-1234",
    "title": "Sustainable Energy - without the hot air",
    "author": "MacKay, David JC",
    "year": 2009,
}
```

- We want to match ``id`` only if the query matches the id exactly.
- We want to match ``title`` when words in the query match words in the title, case-insensitively.
- We want to match ``author`` when any n-gram of characters in the query matches the author name.
- We want to use range queries to filter on ``year``.

In [4]:
ds = DataSet(
    dir_index=dir_here.joinpath(".index"), # directory where the index is located
    index_name="my-dataset", # unique name of your dataset
    fields=[
        # unique ID field (the query has to match the id exactly)
        IdField(name="id", stored=True),
        # match by token (word) or phrase
        TextField(name="title", stored=True),
        # match by n-gram characters (minsize / maxsize presumably bound the n-gram length)
        NgramField(
            name="author",
            stored=True,
            minsize=2,
            maxsize=6,
        ),
        # numeric field, used for range queries on the year
        NumericField(
            name="year",
            stored=True,
            sortable=True,
            ascending=False,
        ),
    ],
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # directory where the cache is located
    cache_key="my-dataset", # unique cache key for your dataset
    cache_expire=10, # how long until the cache expires (in seconds)
    cache_tag="dev", # tag used for batch deletion: give many datasets the same tag if you want to delete their caches together
)
# reset the index before inserting the example data
ds.remove_all_index()

Insert some dummy data for testing.

In [5]:
# a single book record shaped like the schema example above
data = [
    {
        "id": "id-1234",
        "title": "Sustainable Energy - without the hot air",
        "author": "MacKay, David JC",
        "year": 2009,
    },
]

# hand the records to the dataset so that they can be searched
ds.build_index(data=data)

In [6]:
def run_query(query, limit: int = 5, simple_response: bool = True):
    """Search the ``ds`` dataset and pretty-print whatever it returns.

    :param query: the query string handed to ``ds.search``.
    :param limit: forwarded to ``ds.search`` as ``limit``.
    :param simple_response: forwarded to ``ds.search`` as ``simple_response``.
    """
    rprint(ds.search(query, limit=limit, simple_response=simple_response))

## Multi Field Match

By default, ``sayt`` tries to match the query against all searchable fields.

In [7]:
run_query("id-1234")

In [8]:
run_query("energy")

In [9]:
run_query("dav")

In [10]:
run_query("2009")

## Specify the Field you want to match

You can use the ``${field_name}:${query}`` syntax to search on a specific field.

In [11]:
run_query("id:id-1234")

In [12]:
run_query("title:energy")

In [13]:
run_query("author:dav")

In [14]:
run_query("year:2009")

## Range Query

You can use the ``${field_name}:${comparison_operator}${value}`` syntax to run a range query on a specific field.

In [15]:
run_query("year:>2000")

In [16]:
run_query("year:<2020")

In [17]:
run_query("year:>2000 AND year:<2020")

In [18]:
run_query("year:[2000 TO]")

In [19]:
run_query("year:[TO 2020]")

In [20]:
run_query("year:[2000 TO 2020]")

## Logical Operator

You can use the ``AND``, ``OR`` and ``NOT`` operators to combine multiple criteria. By default, the operator is ``AND``.

In [21]:
run_query("title:energy OR author:xyz")

In [22]:
run_query("title:monster OR author:dav")

In [23]:
run_query("title:monster AND author:xyz")

## Fuzzy Search

You can use the ``${field_name}:${term}~${edit_distance}`` syntax to do a fuzzy search on a ``TextField``.

In [24]:
run_query("title:energi~1")

## Use More Data

In [25]:
# generate random book records with the same keys as the schema above:
# a random hex id, a fake sentence as the title, a fake person name as the
# author and a random year between 1980 and 2020 (both ends included)
data = [
    {
        "id": uuid.uuid4().hex,
        "title": fake.sentence(),
        "author": fake.name(),
        "year": random.randint(1980, 2020),
    }
    for _ in range(5000) # 5,000 records
]
# measure how long it takes to index all of the records
with DateTimeTimer("index time"):
    ds.build_index(data=data)

index time: from 2023-10-01 02:54:35.292755 to 2023-10-01 02:54:35.983787 elapsed 0.691032 second.


In [26]:
with DateTimeTimer("query time"):
    run_query("police man")

query time: from 2023-10-01 02:54:35.987750 to 2023-10-01 02:54:36.043777 elapsed 0.056027 second.


## Return an ElasticSearch-Like HTTP Response

You can use ``simple_response=False`` to return a result that looks like an ElasticSearch HTTP response. It gives you the query time, the number of results, and whether the cache was hit.

In [27]:
run_query("police man", simple_response=False)

In [28]:
run_query("police man", simple_response=False)

## Refreshable DataSet

In [29]:
# The downloader is a callable that pulls the dataset we need and returns
# a list of records, where each record is a dict. It will be called when
# your cache has expired or when you force a data refresh.
# This example downloader takes an environment name as input and returns
# a list of VM machine records in that environment.
def downloader(env: str) -> T.List[T.Dict[str, T.Any]]:
    """Return ten dummy machine records for the ``env`` environment.

    :param env: environment name, embedded in each record's ``name``.

    :return: a list of ``{"id": ..., "name": ...}`` dicts with ids 1 to 10.
    """
    machine_count = 10
    records = []
    for index in range(1, machine_count + 1):
        records.append({"id": index, "name": f"{index}th-{env}-machine"})
    return records

# The cache key definition can be either a literal value used as the cache
# key, or a callable that takes the download kwargs and the optional context
# data as input and returns the cache key.
# The evaluated value is used as part of the naming convention for
# ``index_name``, ``cache_key`` and ``cache_tag``.
def cache_key_def(
    download_kwargs: T_KWARGS,
    context: T_CONTEXT,
):
    """Build the cache key from the ``env`` entry of the download kwargs.

    :param download_kwargs: the download kwargs; must contain ``"env"``.
    :param context: optional context data, not used by this example.

    :return: a one-item list holding ``download_kwargs["env"]``.
    """
    env = download_kwargs["env"]
    return [env]

# Convert a record into a whoosh-indexable document. The document schema
# should match the definition in ``fields``.
def extractor(
    record: T_RECORD,
    download_kwargs: T_KWARGS,
    context: T_CONTEXT,
) -> T_RECORD:
    """Turn one downloaded record into a document with ``message`` and ``raw``.

    :param record: a downloaded record; must contain ``"name"``.
    :param download_kwargs: the download kwargs, not used by this example.
    :param context: context data; must contain ``"greeting"``.

    :return: a dict whose ``message`` is the greeting followed by the record
        name, and whose ``raw`` is the untouched input record.
    """
    message = f"{context['greeting']} {record['name']}"
    return {"message": message, "raw": record}

# We would like to use n-gram words search on the ``message`` field
# and store the raw record as it is in the ``raw`` field.
# These two names are the keys of the dict returned by ``extractor``.
fields = [
    NgramWordsField(
        name="message",
        stored=True,
        minsize=2,
        maxsize=6,
    ),
    StoredField(
        name="raw",
    ),
]

rds = RefreshableDataSet(
    downloader=downloader,
    cache_key_def=cache_key_def,
    extractor=extractor,
    fields=fields,
    dir_index=dir_here.joinpath(".index"), # directory where the index is located
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # directory where the cache is located
    cache_expire=10, # NOTE(review): presumably in seconds, like ``cache_expire`` of ``DataSet`` -- confirm
    context={"greeting": "Hello"}, # ``extractor`` reads ``context["greeting"]``
)
# reset both the index and the cache before running the searches below
rds.remove_all_index()
rds.remove_all_cache()

In [30]:
# This is the first search, so we expect fresh = True and cache = False
# in the printed result.
res = rds.search(
    download_kwargs={"env": "dev"},
    refresh_data=True,
    query="dev",
    limit=3,
)
rprint(res)

In [31]:
# This is the second search with the same arguments (and without
# ``refresh_data``), so we expect fresh = False and cache = True.
res = rds.search(
    download_kwargs={"env": "dev"},
    query="dev",
    limit=3,
)
rprint(res)

In [32]:
# Here we force a data refresh with ``refresh_data=True``, so we expect
# fresh = True and cache = False again.
res = rds.search(
    download_kwargs={"env": "dev"},
    refresh_data=True,
    query="dev",
    limit=3,
)
rprint(res)