# Usage Example

For a comprehensive guide to the query language, see the Whoosh documentation: https://whoosh.readthedocs.io/en/latest/querylang.html

## Import ``sayt``

In [1]:
from sayt.api import (
    DataSet,
    IdField,
    TextField,
    NumericField,
    NgramField,
)

In [2]:
import os
import uuid
import random
from pathlib import Path

import faker
from fixa.timer import DateTimeTimer
from diskcache import Cache
from rich import print as rprint

In [3]:
# Everything this example writes (index and cache) is placed under the
# current working directory.
dir_here = Path.cwd()

# Faker instance used further down to generate random titles and author names.
fake = faker.Faker()

## Define your dataset schema

In [32]:
# Describe the searchable dataset: where its index and cache live on disk and
# which fields every document carries.
ds = DataSet(
    dir_index=dir_here.joinpath(".index"), # directory where the index is stored
    index_name="my-dataset", # unique name of this dataset
    fields=[
        # unique ID field
        IdField(name="id", stored=True),
        # text field: matched by token (word) or phrase
        TextField(name="title", stored=True),
        # n-gram field: matched by character n-grams (sizes 2 to 6 here)
        NgramField(
            name="author",
            stored=True,
            minsize=2,
            maxsize=6,
        ),
        # numeric field: supports range queries
        NumericField(
            name="year",
            stored=True,
            sortable=True,
            ascending=False,
        ),
    ],
    cache=Cache(str(dir_here.joinpath(".cache")), tag_index=True), # directory where the cache is stored
    cache_key="my-dataset", # unique cache key for this dataset
    cache_expire=10, # how long a cache entry lives before it expires (in seconds)
    cache_tag="dev", # tag used for batch deletion: give several datasets the same tag to delete their caches together
)
# NOTE(review): presumably clears any index left over from an earlier run so
# the example starts from a clean state -- confirm against the sayt docs.
ds.remove_all_index()

Insert some dummy data for testing:

In [33]:
# One hand-written document; its values are the ones the example queries
# below search for.
sample_doc = {
    "id": "id-1234",
    "title": "Sustainable Energy - without the hot air",
    "author": "MacKay, David JC",
    "year": 2009,
}
data = [sample_doc]

ds.build_index(data=data)

In [34]:
def run_query(query):
    """Search the dataset ``ds`` with ``query`` and pretty-print the result."""
    rprint(ds.search(query))

## Multi Field Match

In [35]:
# No field prefix is given; "id-1234" is the id of the sample document.
run_query("id-1234")

In [8]:
# "energy" is a word in the sample title ("Sustainable Energy - without the hot air").
run_query("energy")

In [9]:
# "dav" is a 3-character fragment of the sample author "MacKay, David JC";
# the author field is an n-gram field, so a fragment should be enough to match.
run_query("dav")

In [10]:
# "2009" is the year of the sample document.
run_query("2009")

## Specify the Field you want to match

In [11]:
# "field:term" syntax: restrict the match to the ``id`` field.
run_query("id:id-1234")

In [12]:
# Restrict the match to the ``title`` text field.
run_query("title:energy")

In [13]:
# Restrict the match to the ``author`` n-gram field.
run_query("author:dav")

In [14]:
# Restrict the match to the ``year`` numeric field.
run_query("year:2009")

## Range Query

In [15]:
# Lower bound only: years greater than 2000 (the sample document is from 2009).
run_query("year:>2000")

In [16]:
# Upper bound only: years less than 2020 (the sample document is from 2009).
run_query("year:<2020")

In [17]:
# Both bounds combined with AND: years greater than 2000 and less than 2020.
run_query("year:>2000 AND year:<2020")

In [18]:
# Bracket range syntax with the upper bound left open: from 2000 upward.
run_query("year:[2000 TO]")

In [19]:
# Bracket range syntax with the lower bound left open: up to 2020.
run_query("year:[TO 2020]")

In [20]:
# Bracket range syntax with both bounds: from 2000 to 2020.
run_query("year:[2000 TO 2020]")

## Logical Operator

In [21]:
# OR with only the first clause applicable to the sample document: its title
# contains "energy", while its author does not contain "xyz".
run_query("title:energy OR author:xyz")

In [22]:
# OR with only the second clause applicable to the sample document: its title
# does not contain "monster", while its author contains "dav".
run_query("title:monster OR author:dav")

In [23]:
# AND where neither clause applies to the sample document ("monster" is not in
# its title, "xyz" is not in its author), so no hit is expected.
run_query("title:monster AND author:xyz")

## Fuzzy Search

In [24]:
# "~1" asks for a fuzzy match within edit distance 1, so the misspelling
# "energi" should still find "energy" in the sample title.
run_query("title:energi~1")

## Use More data

In [48]:
def _fake_book():
    """Return one randomly generated document with the dataset's four fields."""
    return {
        "id": uuid.uuid4().hex,
        "title": fake.sentence(),
        "author": fake.name(),
        "year": random.randint(1980, 2020),
    }


# 5,000 random documents, then measure how long indexing them takes.
data = [_fake_book() for _ in range(5_000)]
with DateTimeTimer("index time"):
    ds.build_index(data=data)

index time: from 2023-09-25 18:55:13.616607 to 2023-09-25 18:55:14.305656 elapsed 0.689049 second.


In [49]:
# Measure how long a two-word query takes against the randomly generated
# documents; titles are random sentences, so the hits vary from run to run.
with DateTimeTimer("query time"):
    run_query("police man")

query time: from 2023-09-25 18:55:14.309403 to 2023-09-25 18:55:14.392765 elapsed 0.083362 second.
