In [1]:
import io
from typing import Iterable, Callable
import zipfile
import traceback
from dataclasses import dataclass

import requests


@dataclass
class RawRepositoryFile:
    filename: str
    content: str


class GithubRepositoryDataReader:
    """
    Downloads and parses markdown and code files from a GitHub repository.
    """

    def __init__(self,
                repo_owner: str,
                repo_name: str,
                allowed_extensions: Iterable[str] | None = None,
                filename_filter: Callable[[str], bool] | None = None
        ):
        """
        Initialize the GitHub repository data reader.
        
        Args:
            repo_owner: The owner/organization of the GitHub repository
            repo_name: The name of the GitHub repository
            allowed_extensions: Optional set of file extensions to include
                    (e.g., {"md", "py"}). If not provided, all file types are included
            filename_filter: Optional callable to filter files by their path
        """
        prefix = "https://codeload.github.com"
        self.url = (
            f"{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main"
        )

        if allowed_extensions is not None:
            self.allowed_extensions = {ext.lower() for ext in allowed_extensions}

        if filename_filter is None:
            self.filename_filter = lambda filepath: True
        else:
            self.filename_filter = filename_filter

    def read(self) -> list[RawRepositoryFile]:
        """
        Download and extract files from the GitHub repository.
        
        Returns:
            List of RawRepositoryFile objects for each processed file
            
        Raises:
            Exception: If the repository download fails
        """
        resp = requests.get(self.url)
        if resp.status_code != 200:
            raise Exception(f"Failed to download repository: {resp.status_code}")

        zf = zipfile.ZipFile(io.BytesIO(resp.content))
        repository_data = self._extract_files(zf)
        zf.close()

        return repository_data

    def _extract_files(self, zf: zipfile.ZipFile) -> list[RawRepositoryFile]:
        """
        Extract and process files from the zip archive.
        
        Args:
            zf: ZipFile object containing the repository data

        Returns:
            List of RawRepositoryFile objects for each processed file
        """
        data = []

        for file_info in zf.infolist():
            filepath = self._normalize_filepath(file_info.filename)

            if self._should_skip_file(filepath):
                continue

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode("utf-8", errors="ignore")
                    if content is not None:
                        content = content.strip()

                    file = RawRepositoryFile(
                        filename=filepath,
                        content=content
                    )
                    data.append(file)

            except Exception as e:
                print(f"Error processing {file_info.filename}: {e}")
                traceback.print_exc()
                continue

        return data

    def _should_skip_file(self, filepath: str) -> bool:
        """
        Determine whether a file should be skipped during processing.
        
        Args:
            filepath: The file path to check
            
        Returns:
            True if the file should be skipped, False otherwise
        """
        filepath = filepath.lower()

        # directory
        if filepath.endswith("/"):
            return True

        # hidden file
        filename = filepath.split("/")[-1]
        if filename.startswith("."):
            return True

        if self.allowed_extensions:
            ext = self._get_extension(filepath)
            if ext not in self.allowed_extensions:
                return True

        if not self.filename_filter(filepath):
            return True

        return False

    def _get_extension(self, filepath: str) -> str:
        """
        Extract the file extension from a filepath.
        
        Args:
            filepath: The file path to extract extension from
            
        Returns:
            The file extension (without dot) or empty string if no extension
        """
        filename = filepath.lower().split("/")[-1]
        if "." in filename:
            return filename.rsplit(".", maxsplit=1)[-1]
        else:
            return ""

    def _normalize_filepath(self, filepath: str) -> str:
        """
        Removes the top-level directory from the file path inside the zip archive.
        'repo-main/path/to/file.py' -> 'path/to/file.py'
        
        Args:
            filepath: The original filepath from the zip archive
            
        Returns:
            The normalized filepath with top-level directory removed
        """
        parts = filepath.split("/", maxsplit=1)
        if len(parts) > 1:
            return parts[1]
        else:
            return parts[0]

In [2]:
def read_github_data():
    allowed_extensions = {"md", "mdx"}

    repo_owner = 'evidentlyai'
    repo_name = 'docs'

    reader = GithubRepositoryDataReader(
        repo_owner,
        repo_name,
        allowed_extensions=allowed_extensions
    )
    
    return reader.read()

In [3]:
data_raw = read_github_data()
print(f"Downloaded {len(data_raw)} files")

Downloaded 95 files


In [4]:
!uv add python-frontmatter

[2mResolved [1m152 packages[0m [2min 0.87ms[0m[0m
[2mAudited [1m147 packages[0m [2min 0.99ms[0m[0m


In [5]:
print(data_raw[40].content)

---
title: "Evidently and GitHub actions"
description: "Testing LLM outputs as part of the CI/CD flow."
---

You can use Evidently together with GitHub Actions to automatically test the outputs of your LLM agent or application - as part of every code push or pull request.

## How the integration work:

- You define a test dataset of inputs (e.g. test prompts with or without reference answers). You can store it as a file, or save the dataset at Evidently Cloud callable by Dataset ID.
- Run your LLM system or agent against those inputs inside CI.
- Evidently automatically evaluates the outputs using the user-specified config (which defines the Evidently descriptors, tests and Report composition), including methods like:
  - LLM judges (e.g., tone, helpfulness, correctness)
  - Custom Python functions
  - Dataset-level metrics like classification quality
- If any test fails, the CI job fails.
- You get a detailed test report with pass/fail status and metrics.

![](/images/examples/github_

In [6]:
import frontmatter
def parse_data(data_raw):
    

    data_parsed = []
    for f in data_raw:
        post = frontmatter.loads(f.content)
        data = post.to_dict()
        data['filename'] = f.filename
        data_parsed.append(data)

    return data_parsed

In [7]:
parsed_data = parse_data(data_raw)

In [8]:
parsed_data[40]

{'title': 'Evidently and GitHub actions',
 'description': 'Testing LLM outputs as part of the CI/CD flow.',
 'content': 'You can use Evidently together with GitHub Actions to automatically test the outputs of your LLM agent or application - as part of every code push or pull request.\n\n## How the integration work:\n\n- You define a test dataset of inputs (e.g. test prompts with or without reference answers). You can store it as a file, or save the dataset at Evidently Cloud callable by Dataset ID.\n- Run your LLM system or agent against those inputs inside CI.\n- Evidently automatically evaluates the outputs using the user-specified config (which defines the Evidently descriptors, tests and Report composition), including methods like:\n  - LLM judges (e.g., tone, helpfulness, correctness)\n  - Custom Python functions\n  - Dataset-level metrics like classification quality\n- If any test fails, the CI job fails.\n- You get a detailed test report with pass/fail status and metrics.\n\n![]

In [9]:
parsed_data[10]['content']

'You can view or export Reports in multiple formats.\n\n**Pre-requisites**:\n\n* You know how to [generate Reports](/docs/library/report).\n\n## Log to Workspace\n\nYou can save the computed Report in Evidently Cloud or your local workspace.\n\n```python\nws.add_run(project.id, my_eval, include_data=False)\n```\n\n<Info>\n  **Uploading evals**. Check Quickstart examples [for ML](/quickstart_ml) or [for LLM](/quickstart_llm) for a full workflow.\n</Info>\n\n## View in Jupyter notebook\n\nYou can directly render the visual summary of evaluation results in interactive Python environments like Jupyter notebook or Colab.\n\nAfter running the Report, simply call the resulting Python object:\n\n```python\nmy_report\n```\n\nThis will render the HTML object directly in the notebook cell.\n\n## HTML\n\nYou can also save this interactive visual Report as an HTML file to open in a browser:\n\n```python\nmy_report.save_html(“file.html”)\n```\n\nThis option is useful for sharing Reports with others 

In [10]:
"""
Document chunking utilities for splitting large documents into smaller, overlapping pieces.

This module provides functionality to break down documents into chunks using a sliding
window approach, which is useful for processing large texts in smaller, manageable pieces
while maintaining context through overlapping content.
"""

from typing import Any, Dict, Iterable, List


def sliding_window(
        seq: Iterable[Any],
        size: int,
        step: int
    ) -> List[Dict[str, Any]]:
    """
    Create overlapping chunks from a sequence using a sliding window approach.

    Args:
        seq: The input sequence (string or list) to be chunked.
        size (int): The size of each chunk/window.
        step (int): The step size between consecutive windows.

    Returns:
        list: A list of dictionaries, each containing:
            - 'start': The starting position of the chunk in the original sequence
            - 'content': The chunk content

    Raises:
        ValueError: If size or step are not positive integers.

    Example:
        >>> sliding_window("hello world", size=5, step=3)
        [{'start': 0, 'content': 'hello'}, {'start': 3, 'content': 'lo wo'}]
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        batch = seq[i:i+size]
        result.append({'start': i, 'content': batch})
        if i + size > n:
            break

    return result


def chunk_documents(
        documents: Iterable[Dict[str, str]],
        size: int = 2000,
        step: int = 1000,
        content_field_name: str = 'content'
) -> List[Dict[str, str]]:
    """
    Split a collection of documents into smaller chunks using sliding windows.

    Takes documents and breaks their content into overlapping chunks while preserving
    all other document metadata (filename, etc.) in each chunk.

    Args:
        documents: An iterable of document dictionaries. Each document must have a content field.
        size (int, optional): The maximum size of each chunk. Defaults to 2000.
        step (int, optional): The step size between chunks. Defaults to 1000.
        content_field_name (str, optional): The name of the field containing document content.
                                          Defaults to 'content'.

    Returns:
        list: A list of chunk dictionaries. Each chunk contains:
            - All original document fields except the content field
            - 'start': Starting position of the chunk in original content
            - 'content': The chunk content

    Example:
        >>> documents = [{'content': 'long text...', 'filename': 'doc.txt'}]
        >>> chunks = chunk_documents(documents, size=100, step=50)
        >>> # Or with custom content field:
        >>> documents = [{'text': 'long text...', 'filename': 'doc.txt'}]
        >>> chunks = chunk_documents(documents, content_field_name='text')
    """
    results = []

    for doc in documents:
        doc_copy = doc.copy()
        doc_content = doc_copy.pop(content_field_name)
        chunks = sliding_window(doc_content, size=size, step=step)
        for chunk in chunks:
            chunk.update(doc_copy)
        results.extend(chunks)

    return results

In [11]:
chunks = chunk_documents(parsed_data)

In [12]:
#chunks[100]
from minsearch import Index

In [13]:
index= Index(
    text_fields=["content","filename","title","description"],
)
index.fit(chunks)

<minsearch.minsearch.Index at 0x7498343116d0>

In [14]:
search_results = index.search("How do I use llm-as-ajudge for evals?")

In [16]:
def search(query):
    return index.search(query=query, num_results=15)

In [18]:
question = "How do I use llm-as-ajudge for evals?"

In [19]:
instructions = """
You're an assistant that helps with the documnentation. Answer the QUESTION based on the CONTEXT from the search engine of our documentation.
Use only the facts from the CONTEXT when answering that QUESTION.
When answering the question, provide the reference to the file with the source. Use the filename field for that. 
The repo url is: https://github.com/evidentlyai/docs/
Include code examples when relevant.
If the question is discussed in multiple documents, cite all ot them.
Don't use markdown or any formatting in the output.
""".strip()

prompt_template="""
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()



In [20]:
def build_prompt(question, search_result):
    context = json.dumps(search_result)

    prompt = prompt_template.format(
        question=question,
        context=context
    ).strip()

    return prompt
    

In [22]:
from openai import OpenAI

In [37]:
openai_client = OpenAI()

def llm(user_prompt,instructions=None, model="gpt-4o-mini"):
    messages=[]
    if instructions:
        messages.append({
            "role":"system",
            "content":"instructions"
        })
    messages.append({
        "role":"user",
        "content":user_prompt
    })
    response = openai_client.responses.create(
        model=model,
        input=messages
    )
    return response.output_text

In [38]:
import json

In [41]:

def rag(query):
    search_result = search(query)
    prompt = build_prompt(query, search_result)
    answer = llm(prompt)
    return answer

In [43]:
answer = rag(question)

In [44]:
print(answer)

To use an LLM as a judge for evaluations, follow these steps:

### 1. **Installation and Setup**
First, install the required package:

```bash
pip install evidently
```

Next, import the necessary libraries:

```python
import pandas as pd
import numpy as np
from evidently import Dataset, DataDefinition, Report
from evidently.presets import TextEvals
from evidently.llm.templates import BinaryClassificationPromptTemplate
```

Don't forget to set your OpenAI API key:

```python
import os
os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
```

### 2. **Create the Dataset**
Create a dataset that includes:
- **Questions**: Inputs sent to the LLM.
- **Target responses**: Approved responses considered accurate.
- **New responses**: Responses generated from the system.
- **Manual labels**: Labels indicating whether a response is correct.

For example:

```python
data = {
    'question': ["What is the capital of France?", "Explain photosynthesis."],
    'target_response': ["Paris", "Photosynthesis is the