In [88]:
!pip install -q chonkie docling model2vec rich torch transformers tqdm requests /Users/ln/dev/helix-py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [128]:
from chonkie import RecursiveChunker, RecursiveRules, RecursiveLevel
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer, AutoModel
from rich.console import Console
from rich.text import Text
from typing import List
import numpy as np
import os
import torch
from tqdm import tqdm
import requests

import helix
from helix.client import ragloaddocs, ragsearchdocs

In [96]:
console = Console()

# A wrapper to pretty print
def rprint(text: str, console: Console=console, width: int = 80) -> None:
  richtext = Text(text)
  console.print(richtext.wrap(console, width=width))

In [97]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [98]:
helix_docs_endpoints = [
        "https://docs.helix-db.com/info/hql",
        "https://docs.helix-db.com/introduction/cookbook/basic",
        "https://docs.helix-db.com/get-started/installation",
        "https://docs.helix-db.com/get-started/sql-ingestion"
        "https://docs.helix-db.com/hql/schema-definition",
        "https://docs.helix-db.com/hql/query-structure",
        "https://docs.helix-db.com/hql/source/source",
        "https://docs.helix-db.com/hql/source/adding",
        "https://docs.helix-db.com/hql/steps/traversals/steps_nodes",
        "https://docs.helix-db.com/hql/steps/traversals/steps_edges",
        "https://docs.helix-db.com/hql/steps/conditions",
        "https://docs.helix-db.com/hql/steps/anonymous",
        "https://docs.helix-db.com/hql/steps/properties/property-access",
        "https://docs.helix-db.com/hql/steps/properties/property-additions",
        "https://docs.helix-db.com/hql/steps/properties/property-exclusion",
        "https://docs.helix-db.com/hql/steps/properties/property-remappings",
        "https://docs.helix-db.com/hql/steps/deleting",
        "https://docs.helix-db.com/hql/steps/updating",
        "https://docs.helix-db.com/hql/steps/operations",
        "https://docs.helix-db.com/hql/vectors/inserting",
        "https://docs.helix-db.com/hql/vectors/searching",
        "https://docs.helix-db.com/hql/types"
]

In [99]:
converter = DocumentConverter()
results = [converter.convert(doc) for doc in helix_docs_endpoints]
text_results = [res.document.export_to_markdown() for res in results]
text = "\n".join(text_results)

In [89]:
#rprint(text)

In [112]:
rules = RecursiveRules(
    levels=[
        RecursiveLevel(delimiters=['######', '#####', '####', '###', '##', '#']),
        RecursiveLevel(delimiters=['\n\n', '\n', '\r\n', '\r']),
        RecursiveLevel(delimiters='.?!;:'),
        RecursiveLevel()
    ]
)
chunker = RecursiveChunker(rules=rules, chunk_size=200)

In [113]:
chunks = chunker(text)
print(f"Total number of chunks: {len(chunks)}")
# @title A quick look at our chunks~
for chunk in chunks[:4]:
    rprint(chunk.text)
    print('-'*80, '\n\n')

Total number of chunks: 30


-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




In [114]:
def vectorize_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].squeeze().tolist()
    return embedding

def vectorize_chunked(chunked: List[str]) -> List[List[float]]:
    # embedding dims: 768
    vectorized = []
    for chunk in tqdm(chunked):
        embedding = vectorize_text(chunk)
        vectorized.append(embedding)
    return vectorized

In [115]:
items = [chunk.text for chunk in chunks]
vectors = vectorize_chunked(items)
print(f"doc length: {len(text)} chars, num of vectors: {len(vectors)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:03<00:00,  9.98it/s]

30





doc length: 19802 chars, num of vectors: 30


In [117]:
OLLAMA_API_URL = "http://localhost:11434/api/generate"

def get_ollama_response(prompt):
    payload = {
        #"model": "deepseek-r1:7b",
        "model": "llama3.1:8b",
        "prompt": prompt,
        "stream": False
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    if response.status_code == 200:
        return response.json()["response"]
    else:
        raise Exception(f"Ollama API request failed with status {response.status_code}")

In [72]:
"""
def create_prompt(chunks: List[str], query: str) -> str:
  prompt_template = """<instructions>
  Based on the provided contexts, answer the given question to the best of your ability. Remember to also add citations at appropriate points in the format of square brackets like [1][2][3], especially at sentence or paragraph endings.
  You will be given 4 passages in the context, marked with a label 'Doc [1]:' to denote the passage number. Use that number for citations. Answer only from the given context, and if there's no appropriate context, reply "No relevant context found!".
  </instructions>

  <context>
  {context}
  </context>

  <query>
  {query}
  </query>
  """
  context = "\n\n".join([f"Doc {i+1}: {chunk}" for i, chunk in enumerate(chunks)])
  prompt = prompt_template.format(context=context, query=query)
  return prompt
"""

SyntaxError: unterminated string literal (detected at line 5) (4255936425.py, line 5)

In [118]:
def create_prompt(user_prompt: str, context):
    reformated_prompt = f"""<instructions>
    Based on the provided contexts, answer the given question to the best of your ability. Answer only from the given context."
    </instructions>

    <context>
    {context}
    </context>

    <query>
    {user_prompt}
    </query>
    """

    return reformated_prompt

In [123]:
db = helix.Client(local=True)

# insert all the docs to vectors into helix
db.query(ragloaddocs([(text, vectors)])

[32m[HELIX][0m Helix instance found at 'http://0.0.0.0:6969'


In [133]:
user_prompt = "how do I exclude a property in helix ql?"
query_embedding = vectorize_text(user_prompt)
res = db.query(ragsearchdocs(query_embedding, 4))[0][0]['content']
response = get_ollama_response(create_prompt(user_prompt, res))
print(f"reponse: {response}")

[32m[HELIX][0m Querying 'http://0.0.0.0:6969/ragsearchdocs': 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 42.59it/s][0m


reponse: In HelixQL, you can exclude a property from being returned in the query results by using the `NOT` keyword or the `EXCLUDE` clause.

Here are a few examples:

**Example 1: Using NOT**
```helixql
SELECT * FROM myentity WHERE name = 'John' AND NOT age
```
In this example, the `age` property will not be returned in the results.

**Example 2: Using EXCLUDE**
```helixql
SELECT *, EXCLUDE(age) AS excluded FROM myentity WHERE name = 'John'
```
In this example, the `age` property will be excluded from the results and will only appear as a placeholder (in this case, with an alias of `excluded`).

You can also use `EXCLUDE` to exclude multiple properties:
```helixql
SELECT *, EXCLUDE(age, city) AS excluded FROM myentity WHERE name = 'John'
```
Note that when using `NOT`, you should make sure that the property is not used in any aggregate functions (such as `SUM`, `AVG`, etc.) or in any calculations.

When using `EXCLUDE`, be aware that it will only exclude the specified properties from 