# Usage Example

For a comprehensive guide to the query language, see: https://whoosh.readthedocs.io/en/latest/querylang.html

## Import ``sayt``

In [1]:
from sayt.api import (
    DataSet,
    IdField,
    TextField,
    NumericField,
    NgramField,
    NgramWordsField,
    StoredField,
    T_DOCUMENT,
    T_DOWNLOADER,
)

In [2]:
import typing as T
import os
import time
import uuid
import random
from pathlib import Path

import faker
from fixa.timer import DateTimeTimer
from diskcache import Cache
from rich import print as rprint

In [3]:
# current working directory; the ``.index`` and ``.cache`` paths used below are joined onto it
dir_here = Path.cwd()

fake = faker.Faker()

## Define your dataset schema

Let's say each document in our dataset describes the details of a book.

```python
{
    "id": "id-1234",
    "title": "Sustainable Energy - without the hot air",
    "author": "MacKay, David JC",
    "year": 2009,
}
```

- We want to match ``id`` only if the query matches the id exactly.
- We want to match ``title`` when words in the query match words in the title, case-insensitively.
- We want to match ``author`` when any n-gram of characters in the query matches part of the author name.
- We want to use range query to filter on ``year``.

In [4]:
# Schema of the dataset: one entry per field of a book document.
fields = [
    # unique ID field, intended to match only when the query equals the id exactly
    IdField(name="id", stored=True),
    # match by token (word) or phrase
    TextField(name="title", stored=True),
    # match by n-gram characters (n-gram sizes bounded by minsize / maxsize)
    NgramField(
        name="author",
        stored=True,
        minsize=2,
        maxsize=6,
    ),
    # numeric field, used for range queries on the year
    NumericField(
        name="year",
        stored=True,
        sortable=True,
        ascending=False,
    ),
    # stored-only field (this is not a range-query field); presumably kept with
    # the document but not searched -- NOTE(review): the sample documents in
    # this notebook never set "raw", confirm it is optional
    StoredField(
        name="raw",
    ),
]

## Define the downloader function

A downloader is a function with zero arguments that returns a list of searchable documents.

In [5]:
def downloader():
    """Return the sample dataset: a list holding a single book document."""
    book = dict(
        id="id-1234",
        title="Sustainable Energy - without the hot air",
        author="MacKay, David JC",
        year=2009,
    )
    return [book]

## Define the Dataset Object

Dataset is an abstraction of a searchable dataset. It defines how you want to index and search your dataset, how to download your dataset, and where to store the index and cache.

In [6]:
# Build the searchable dataset from the schema and the downloader defined above.
ds = DataSet(
    dir_index=dir_here.joinpath(".index"), # directory where the index is stored
    index_name="my-dataset", # unique name of your dataset
    fields=fields,
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # directory where the cache is stored
    cache_key="my-dataset", # unique cache key for your dataset
    cache_expire=10, # how long until the cache expires (in seconds)
    cache_tag="my-dataset", # tag used for batch deletion: give several datasets the same tag to delete their caches together
    downloader=downloader,
)
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing

## Play with the search method

The ``DataSet.search`` method is the main API: it performs the search and handles caching, dataset refreshing, and all the other details.

In [7]:
def run_query(query, limit: int = 5, simple_response: bool = True):
    """
    Search the notebook-level dataset ``ds`` and pretty-print the result.

    :param query: query string handed to ``ds.search``.
    :param limit: forwarded to ``ds.search`` as ``limit``.
    :param simple_response: forwarded to ``ds.search`` as ``simple_response``.
    """
    rprint(ds.search(query, limit=limit, simple_response=simple_response))

### Multi Field Match

By default, ``sayt`` tries to match the query against all searchable fields.

In [8]:
run_query("id-1234")

In [9]:
run_query("energy")

In [10]:
run_query("dav")

In [11]:
run_query("2009")

### Specify the Field you want to match

You can use the ``${field_name}:${query}`` syntax to search on a specific field.

In [12]:
run_query("id:id-1234")

In [13]:
run_query("title:energy")

In [14]:
run_query("author:dav")

In [15]:
run_query("year:2009")

### Range Query

You can use the ``${field_name}:${comparison_operator}${value}`` syntax to do a range query on a specific field.

In [16]:
run_query("year:>2000")

In [17]:
run_query("year:<2020")

In [18]:
run_query("year:>2000 AND year:<2020")

In [19]:
run_query("year:[2000 TO]")

In [20]:
run_query("year:[TO 2020]")

In [21]:
run_query("year:[2000 TO 2020]")

### Logical Operator

You can use the ``AND``, ``OR``, and ``NOT`` operators to connect multiple criteria. By default, it is ``AND``.

In [22]:
run_query("title:energy OR author:xyz")

In [23]:
run_query("title:monster OR author:dav")

In [24]:
run_query("title:monster AND author:xyz")

### Fuzzy Search

You can use the ``${field_name}:${term}~${edit_distance}`` syntax to do a fuzzy search on a ``TextField``.

In [25]:
run_query("title:energi~1")

## Elasticsearch-like results

You can set ``simple_response=False`` to return Elasticsearch-like results.

In [26]:
rprint(ds.search("David", simple_response=False))

## Enable logging

You can set ``verbose=True`` to show a detailed log.

In [27]:
def downloader_5000_records():
    """Return 5,000 randomly generated book documents (a larger dataset to work on)."""
    records = []
    for _ in range(5000):  # 5,000
        record = {
            "id": uuid.uuid4().hex,
            "title": fake.sentence(),
            "author": fake.name(),
            "year": random.randint(1980, 2020),
        }
        records.append(record)
    return records

In [28]:
# Swap in the larger downloader and run a verbose search.
# NOTE: in the recorded run below the dataset had not expired yet, so the log
# reports that the downloader was skipped and the search returned 0 documents
# from the index that had already been built.
ds.downloader = downloader_5000_records
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))

+----- ⏱ 🟢 🔎 Start 'searching' ------------------------------------------------+
🔎 
🔎 dataset is NOT expired, skip the downloader
🔎 NOT hit query cache!
🔎 preprocessing query ...
🔎 run search on index my-dataset...
🔎   search took: 0 milliseconds
🔎   return: 0 documents
🔎   dataset is fresh: False
🔎   hit cache: False
🔎 
+----- ⏰ 🔴 🔎 End 'searching', elapsed = 0.03 sec ------------------------------+


## Query Caching

The query result is automatically cached as long as the dataset is not expired. If we run the same query again, the log below shows ``HIT query cache!`` and the search returns almost instantly.

In [29]:
rprint(ds.search("police", limit=3, simple_response=False, verbose=True))

+----- ⏱ 🟢 🔎 Start 'searching' ------------------------------------------------+
🔎 
🔎 dataset is NOT expired, skip the downloader
🔎 HIT query cache!
🔎   search took: 0 milliseconds
🔎   return: 0 documents
🔎   dataset is fresh: False
🔎   hit cache: True
🔎 
+----- ⏰ 🔴 🔎 End 'searching', elapsed = 0.01 sec ------------------------------+


## Automatically Refresh the Dataset

You may want to automatically re-download the dataset every X seconds / hours / days. To do so, just set the expire time: the downloader function is re-run automatically once the dataset has expired.

In [30]:
# Use the 5,000-record downloader and a 1-second expiry, then start from a
# clean index and cache so the first search has to build the index.
ds.downloader = downloader_5000_records
ds.cache_expire = 1
ds.remove_all_index()
ds.remove_all_cache()

print("=== First run, it will download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))

print("=== Second run, it will not download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))

# wait for the 1-second expiry set above to elapse before searching again
time.sleep(1)
print("=== Third run, it will automatically download the data ===")
rprint(ds.search("police", limit=1, simple_response=False, verbose=True))

=== First run, it will download the data ===
+----- ⏱ 🟢 🔎 Start 'searching' ------------------------------------------------+
🔎 
🔎 dataset is expired, need to rebuild the index
🔎 +----- ⏱ 🟢 🏗 Start 'build index' --------------------------------------------+
🔎 🏗 
🔎 🏗 exam the index write lock ...
🔎 🏗   nice, it is not locked, working on indexing ...
🔎 🏗     finished indexing 5000 documents, commit the index.
🔎 🏗     the dataset will expire in 1 seconds.
🔎 🏗 
🔎 +----- ⏰ 🔴 🏗 End 'build index', elapsed = 1.16 sec --------------------------+
🔎 NOT hit query cache!
🔎 preprocessing query ...
🔎 run search on index my-dataset...
🔎   search took: 4 milliseconds
🔎   return: 1 documents
🔎   dataset is fresh: True
🔎   hit cache: False
🔎 
+----- ⏰ 🔴 🔎 End 'searching', elapsed = 1.64 sec ------------------------------+


=== Second run, it will not download the data ===
+----- ⏱ 🟢 🔎 Start 'searching' ------------------------------------------------+
🔎 
🔎 dataset is NOT expired, skip the downloader
🔎 HIT query cache!
🔎   search took: 4 milliseconds
🔎   return: 1 documents
🔎   dataset is fresh: False
🔎   hit cache: True
🔎 
+----- ⏰ 🔴 🔎 End 'searching', elapsed = 0.00 sec ------------------------------+


=== Third run, it will automatically download the data ===
+----- ⏱ 🟢 🔎 Start 'searching' ------------------------------------------------+
🔎 
🔎 dataset is expired, need to rebuild the index
🔎 +----- ⏱ 🟢 🏗 Start 'build index' --------------------------------------------+
🔎 🏗 
🔎 🏗 exam the index write lock ...
🔎 🏗   nice, it is not locked, working on indexing ...
🔎 🏗     finished indexing 5000 documents, commit the index.
🔎 🏗     the dataset will expire in 1 seconds.
🔎 🏗 
🔎 +----- ⏰ 🔴 🏗 End 'build index', elapsed = 0.75 sec --------------------------+
🔎 NOT hit query cache!
🔎 preprocessing query ...
🔎 run search on index my-dataset...
🔎   search took: 4 milliseconds
🔎   return: 1 documents
🔎   dataset is fresh: True
🔎   hit cache: False
🔎 
+----- ⏰ 🔴 🔎 End 'searching', elapsed = 1.17 sec ------------------------------+


## Downloader function has parameters

In [40]:
def _downloader(env: str):
    return [
        {"id": f"id-{i}-{env}", "title": f"my {i}th {env} machine"}
        for i in range(1, 1+10)
    ]


def create_per_environment_dataset(env: str):
    """
    Create a ``DataSet`` dedicated to one environment.

    The parameterized ``_downloader`` is wrapped in a zero-argument closure
    that binds ``env``, and that closure is passed as the dataset's downloader.

    :param env: environment name; used in the index name, the cache key,
        the cache tag and the downloaded documents.
    :return: the configured ``DataSet``.
    """
    # the index name, cache key and cache tag all share this per-environment name
    dataset_name = f"my-{env}-dataset"

    def downloader():
        return _downloader(env=env)

    # where the index is stored
    dir_index = dir_here.joinpath(".index")
    schema = [
        # unique ID field
        IdField(name="id", stored=True),
        # match by token (word) or phrase
        TextField(name="title", stored=True),
    ]
    # where the cache is stored
    cache = Cache(str(dir_here.joinpath(".cache")), tag_index=True)
    return DataSet(
        dir_index=dir_index,
        index_name=dataset_name,
        fields=schema,
        cache=cache,
        cache_key=dataset_name,
        cache_expire=10,  # how long until the cache expires (in seconds)
        cache_tag=dataset_name,
        downloader=downloader,
    )

In [41]:
# build the "dev" dataset, then start from a clean index and cache
ds = create_per_environment_dataset(env="dev")
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
# every title generated for this environment contains the word "dev"
rprint(ds.search("dev"))

In [42]:
# build the "prod" dataset; it uses the same ``.index`` / ``.cache`` directories
# as the "dev" dataset but a different index name, cache key and cache tag
ds = create_per_environment_dataset(env="prod")
# NOTE(review): confirm whether these resets are scoped to this dataset only
# or also clear other datasets stored in the same directories
ds.remove_all_index() # reset everything before testing
ds.remove_all_cache() # reset everything before testing
# every title generated for this environment contains the word "prod"
rprint(ds.search("prod"))