In [1]:
!pip install -q chonkie docling model2vec rich torch transformers tqdm requests /Users/ln/dev/helix-py


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from chonkie import RecursiveChunker, RecursiveRules, RecursiveLevel
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer, AutoModel
from rich.console import Console
from rich.text import Text
from typing import List
import numpy as np
import os
import torch
from tqdm import tqdm
import requests

import helix
from helix.client import Query, ragloaddocs
from helix.types import Payload

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
console = Console()

# A wrapper to pretty print
def rprint(text: str, console: Console=console, width: int = 80) -> None:
  richtext = Text(text)
  console.print(richtext.wrap(console, width=width))

In [4]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [5]:
helix_docs_endpoints = [
        "https://docs.helix-db.com/info/hql",
        "https://docs.helix-db.com/introduction/cookbook/basic",
        "https://docs.helix-db.com/get-started/installation",
        "https://docs.helix-db.com/get-started/sql-ingestion"
        "https://docs.helix-db.com/hql/schema-definition",
        "https://docs.helix-db.com/hql/query-structure",
        "https://docs.helix-db.com/hql/source/source",
        "https://docs.helix-db.com/hql/source/adding",
        "https://docs.helix-db.com/hql/steps/traversals/steps_nodes",
        "https://docs.helix-db.com/hql/steps/traversals/steps_edges",
        "https://docs.helix-db.com/hql/steps/conditions",
        "https://docs.helix-db.com/hql/steps/anonymous",
        "https://docs.helix-db.com/hql/steps/properties/property-access",
        "https://docs.helix-db.com/hql/steps/properties/property-additions",
        "https://docs.helix-db.com/hql/steps/properties/property-exclusion",
        "https://docs.helix-db.com/hql/steps/properties/property-remappings",
        "https://docs.helix-db.com/hql/steps/deleting",
        "https://docs.helix-db.com/hql/steps/updating",
        "https://docs.helix-db.com/hql/steps/operations",
        "https://docs.helix-db.com/hql/vectors/inserting",
        "https://docs.helix-db.com/hql/vectors/searching",
        "https://docs.helix-db.com/hql/types"
]

In [6]:
converter = DocumentConverter()
results = [converter.convert(doc) for doc in helix_docs_endpoints]
text_results = [res.document.export_to_markdown() for res in results]
text = "\n".join(text_results)
#rprint(text)

In [7]:
rules = RecursiveRules(
    levels=[
        RecursiveLevel(delimiters=['######', '#####', '####', '###', '##', '#']),
        RecursiveLevel(delimiters=['\n\n', '\n', '\r\n', '\r']),
        RecursiveLevel(delimiters='.?!;:'),
        RecursiveLevel()
    ]
)
chunker = RecursiveChunker(rules=rules, chunk_size=200)

In [8]:
chunks = chunker(text)
print(f"Total number of chunks: {len(chunks)}")
# @title A quick look at our chunks~
for chunk in chunks[:4]:
    rprint(chunk.text)
    print('-'*80, '\n\n')

Total number of chunks: 30


-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




-------------------------------------------------------------------------------- 




In [9]:
def vectorize_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].squeeze().tolist()
    return embedding

def vectorize_chunked(chunked: List[str]) -> List[List[float]]:
    # embedding dims: 768
    vectorized = []
    for chunk in tqdm(chunked):
        embedding = vectorize_text(chunk)
        vectorized.append(embedding)
    return vectorized

OLLAMA_API_URL = "http://localhost:11434/api/generate"

def get_ollama_response(prompt):
    payload = {
        #"model": "deepseek-r1:7b",
        "model": "llama3.1:8b",
        "prompt": prompt,
        "stream": False
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    if response.status_code == 200:
        return response.json()["response"]
    else:
        raise Exception(f"Ollama API request failed with status {response.status_code}")

In [10]:
items = [chunk.text for chunk in chunks]
vectors = vectorize_chunked(items)
print(f"doc length: {len(text)} chars, num of vectors: {len(vectors)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 16.04it/s]

doc length: 19802 chars, num of vectors: 30





In [11]:
def create_prompt(user_prompt: str, context):
    reformated_prompt = f"""<instructions>
    Based on the provided contexts, answer the given question to the best of your ability. Answer only from the given context."
    </instructions>

    <context>
    {context}
    </context>

    <query>
    {user_prompt}
    </query>
    """

    return reformated_prompt

In [24]:
db = helix.Client(local=True)

class ragsearchdocs(Query):
    def __init__(self, query_vector: List[float], k: int=4):
        super().__init__()
        self.query_vector = query_vector
        self.k = k

    def query(self) -> List[Payload]:
        return [{ "query": self.query_vector, "k": self.k }]

    def response(self, response):
        return response.get('doc_node')[0]['content']

[32m[HELIX][0m Helix instance found at 'http://0.0.0.0:6969'


In [25]:
# insert all the docs to vectors into helix
db.query(ragloaddocs([(text, vectors)]))

[32m[HELIX][0m Querying 'http://0.0.0.0:6969/ragloaddocs': 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.11it/s][0m


[None]

In [36]:
user_prompt = "how do I exclude a property in helix qeury language?"
query_embedding = vectorize_text(user_prompt)
res = db.query(ragsearchdocs(query_embedding, 4))[0]
assert(len(res) > 5)
response = get_ollama_response(create_prompt(user_prompt, res))
print(f"reponse: {response}")

[32m[HELIX][0m Querying 'http://0.0.0.0:6969/ragsearchdocs': 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 26.08it/s][0m


reponse: In Helix Query Language, you can exclude a property by using the `!` symbol followed by the name of the property. For example:

```
SELECT !property_name
FROM my_table
```

This will return all rows from `my_table`, but with the `property_name` column excluded.

Note that this is just one way to exclude properties in Helix Query Language. The specific syntax may vary depending on your use case and requirements.

Here are some additional examples:

* To select all columns except `column1`, you can use: `SELECT !column1`
* To select all columns except `column1` and `column2`, you can use: `SELECT !column1, !column2`

Keep in mind that when excluding properties, the resulting data will have fewer columns than the original table. If you need to work with the full dataset, including all columns, you should not exclude any properties.
