In [31]:
import io
import zipfile
import requests
import frontmatter
import numpy as np
from minsearch import Index, VectorSearch
from openai import OpenAI, APIConnectionError, RateLimitError, APIStatusError
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)

In [5]:
def read_repo_data(repo_owner, repo_name):
    base_url = "https://codeload.github.com"
    repo_url = f"{base_url}/{repo_owner}/{repo_name}/zip/refs/heads/main"
    
    resp = requests.get(repo_url)
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    with zipfile.ZipFile(io.BytesIO(initial_bytes=resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename.lower()
            if not (filename.endswith(".md") or filename.endswith(".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read()
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data['filename'] = filename
                    repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
        return repository_data

In [6]:
evidently_docs = read_repo_data('evidentlyai', 'docs')

In [7]:
openai_client = OpenAI()

@retry(
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=stop_after_attempt(max_attempt_number=5),
    retry=retry_if_exception_type((APIConnectionError, RateLimitError, APIStatusError))
)
def llm(prompt, model='gpt-4o-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [8]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()


In [9]:
def intelligent_chunking(text):
    """Creating intelligent chunks"""
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [10]:
def doc_processing(doc):
    """Doc processing"""
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = intelligent_chunking(doc_content)
    chunked_docs = []
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        chunked_docs.append(section_doc)
    return chunked_docs

In [11]:
doc_results = []
MAX_WORKERS = 8
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures_data = {executor.submit(doc_processing, doc): doc for doc in evidently_docs}

    for future in tqdm(as_completed(futures_data), total=len(futures_data), desc="Processing"):
        try:
            doc_sections = future.result()
            doc_results.extend(doc_sections)
        except Exception as e:
            failed_doc = futures_data[future]
            print(f"\nError processing document: {e}")     

Processing:   0%|          | 0/95 [00:00<?, ?it/s]

In [16]:
doc_results[0:20]

[{'title': 'Create Plant',
  'openapi': 'POST /plants',
  'filename': 'docs-main/api-reference/endpoint/create.mdx',
  'section': "Sure! However, I notice that there's no specific content provided within the `<DOCUMENT>` tags. Please provide the text you would like me to structure into logical sections for a Q&A system, and I'll be happy to help!"},
 {'title': 'Delete Plant',
  'openapi': 'DELETE /plants/{id}',
  'filename': 'docs-main/api-reference/endpoint/delete.mdx',
  'section': 'It seems that you\'ve referenced a "provided document," but I can\'t see any content in your message. If you could share the details or content from the document, I\'d be happy to help you split it into logical sections for a Q&A system. Please provide the text you\'d like organized.'},
 {'title': 'Introduction',
  'description': 'Example section for showcasing API endpoints',
  'filename': 'docs-main/api-reference/introduction.mdx',
  'section': '## Welcome\n\nThere are two ways to build API documentatio

In [17]:
index = Index(text_fields=['title', 'description', 'filename', 'section'],
              keyword_fields=[])
index.fit(doc_results)

<minsearch.minsearch.Index at 0x109a62b40>

In [18]:
query = 'What should be in a test dataset for AI evaluation?'
search_results = index.search(query)

In [19]:
print(search_results)

[{'title': 'RAG evaluation dataset', 'description': 'Synthetic data for RAG.', 'filename': 'docs-main/synthetic-data/rag_data.mdx', 'section': '## Overview of Retrieval-Augmented Generation (RAG) Systems\n\nRetrieval-Augmented Generation (RAG) systems rely on retrieving answers from a knowledge base before generating responses. To evaluate them effectively, a test dataset reflecting what the system *should* know is essential.'}, {'title': 'RAG evaluation dataset', 'description': 'Synthetic data for RAG.', 'filename': 'docs-main/synthetic-data/rag_data.mdx', 'section': '## Creating a RAG Test Dataset\n\nYou can generate a ground truth RAG dataset directly from your data source.\n\n### Steps to Create a Dataset\n\n1. **Create a Project**\n   - In the Evidently UI, start a new Project or open an existing one.\n   - Navigate to “Datasets” in the left menu.\n   - Click “Generate” and select the “RAG” option.\n\n   ![](/images/synthetic/synthetic_data_select_method.png)\n\n2. **Upload Your K

In [20]:
from sentence_transformers import SentenceTransformer

In [21]:
# model for qa purpose in english
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
record = doc_results[8]

In [26]:
record_embedding = embedding_model.encode(record['section'])

In [27]:
query_embedding = embedding_model.encode(query)
similarity = query_embedding.dot(record_embedding)

In [29]:
#Not the best similarity
similarity

np.float32(0.24755448)

In [32]:
doc_results_embeddings = []
for d in tqdm(doc_results):
    v = embedding_model.encode(d['section'])
    doc_results_embeddings.append(v)
doc_results_embeddings = np.array(doc_results_embeddings)
doc_vindex = VectorSearch()
doc_vindex.fit(doc_results_embeddings, doc_results)

  0%|          | 0/770 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x32713c410>

In [42]:
doc_results_embeddings

array([[ 0.08784615, -0.01384743, -0.02574886, ..., -0.03691017,
         0.02301694,  0.00571875],
       [ 0.06002798,  0.02636276, -0.01116224, ..., -0.0110259 ,
        -0.0324511 ,  0.01139372],
       [ 0.01155654,  0.00877271, -0.05588597, ..., -0.0116422 ,
        -0.06385411, -0.01948082],
       ...,
       [ 0.03450141,  0.02419186,  0.03138496, ...,  0.01626617,
        -0.00152633,  0.01150939],
       [ 0.03450141,  0.02419186,  0.03138496, ...,  0.01626617,
        -0.00152633,  0.01150939],
       [-0.01549505,  0.03854848,  0.04474124, ...,  0.03841428,
        -0.01975037,  0.01021559]], shape=(770, 768), dtype=float32)

# Lets implement hybrid search

In [33]:
query = "How can I evaluate classification model results, and ensure numerical data is not drifted?"

In [35]:
text_results = index.search(query=query, num_results=5)
q_v = embedding_model.encode(query)
vector_results = doc_vindex.search(query_vector=q_v, num_results=5)

final_results = text_results + vector_results

In [36]:
final_results

[{'title': 'Synthetic data',
  'description': 'Generating test cases and datasets.',
  'filename': 'docs-main/synthetic-data/introduction.mdx',
  'section': '## Use Cases for Synthetic Test Inputs\n\nEvidently Cloud can be utilized for multiple purposes, including:\n\n* **Experiments**: Create test data to see how your LLM application handles various inputs.\n* **Regression Testing**: Validate changes in your AI system before deployment.\n* **Adversarial Testing**: Assess how your system manages tricky or unexpected inputs.\n\nOnce the data is generated, you can evaluate the results using the Evidently Cloud interface or the Evidently Python library.'},
 {'title': 'Synthetic data',
  'description': 'Generating test cases and datasets.',
  'filename': 'docs-main/synthetic-data/introduction.mdx',
  'section': '## Example of Generating Test Inputs\n\nAn illustrative example of how to generate synthetic test inputs can be found in the accompanying GIF, depicting the data generation process

In [39]:
import hashlib

In [40]:
def text_search(query):
    return index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return doc_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        text_to_hash = result['filename'] + ' ' + result['section'][0:250]
        encoded_string = text_to_hash.encode('utf-8')
        hash_object = hashlib.sha256(encoded_string)
        hex_digest = hash_object.hexdigest()
        if hex_digest not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results

In [41]:
hybrid_search(query=query)

[{'title': 'Synthetic data',
  'description': 'Generating test cases and datasets.',
  'filename': 'docs-main/synthetic-data/introduction.mdx',
  'section': '## Use Cases for Synthetic Test Inputs\n\nEvidently Cloud can be utilized for multiple purposes, including:\n\n* **Experiments**: Create test data to see how your LLM application handles various inputs.\n* **Regression Testing**: Validate changes in your AI system before deployment.\n* **Adversarial Testing**: Assess how your system manages tricky or unexpected inputs.\n\nOnce the data is generated, you can evaluate the results using the Evidently Cloud interface or the Evidently Python library.'},
 {'title': 'Synthetic data',
  'description': 'Generating test cases and datasets.',
  'filename': 'docs-main/synthetic-data/introduction.mdx',
  'section': '## Example of Generating Test Inputs\n\nAn illustrative example of how to generate synthetic test inputs can be found in the accompanying GIF, depicting the data generation process

In [43]:
import pickle

In [44]:
data_to_save = {
    'embeddings': doc_results_embeddings,
    'documents': doc_results
}

In [46]:
file_path = "vector_search_data.pkl"
try:
    with open(f'{file_path}', 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"Data successfully pickled and saved to {file_path}")
except pickle.PicklingError as e:
    print(f"An error occurred during pickling: {e}")
except IOError as e:
    print(f"An I/O error occurred: {e}")

Data successfully pickled and saved to vector_search_data.pkl
