In [1]:
import nest_asyncio

# Core Python imports
import asyncio
import os
import io
import re
import sys
import json
import shutil
import zipfile
import base64
from typing import Dict, Any, List, Optional, Tuple, Union
from pathlib import Path
from dataclasses import dataclass
from enum import Enum
from dotenv import load_dotenv

# Core llama_index imports - wrapped in try/except to handle missing stubs
# and different package versions gracefully
try:
    from llama_index.core import Settings, VectorStoreIndex, StorageContext, load_index_from_storage
    from llama_index.core.workflow import Workflow, step, Event, StartEvent, StopEvent
    from llama_index.embeddings.openai import OpenAIEmbedding
    from llama_index.readers.github import GithubRepositoryReader, GithubClient
    from llama_index.llms.openai import OpenAI
    from llama_index.llms.anthropic import Anthropic
    from llama_index.core.llms import ChatMessage, MessageRole
except ImportError as e:
    print(f"Warning: Some llama_index imports failed ({e}). Functionality may be limited.")

# Apply nest_asyncio to allow nested event loops, i.e. let an asyncio loop be
# re-entered while it is already running.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and later cells call library code that drives a loop itself — confirm.
nest_asyncio.apply()



In [2]:
from dotenv import load_dotenv
# Call load_dotenv() outside the assert: statements inside `assert` are removed
# when Python runs with -O, which would silently skip loading the .env file.
dotenv_loaded = load_dotenv()
# load_dotenv() returns False when it set no variables (e.g. no .env file found).
assert dotenv_loaded, (
    "load_dotenv() did not load any variables; create a .env file "
    "(e.g. with OPENAI_API_KEY and GITHUB_TOKEN) next to this notebook"
)

In [3]:
def _configure_openai_embeddings() -> None:
    """Set Settings.embed_model to OpenAIEmbedding when OPENAI_API_KEY is set."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if openai_api_key:
        Settings.embed_model = OpenAIEmbedding(api_key=openai_api_key)


def _index_cache_name(
    owner: str,
    repo: str,
    branch: str,
    include_dirs: Optional[List[str]],
) -> str:
    """Return the cache sub-directory name for one repo/branch/include_dirs combination."""
    import hashlib

    name = f"{owner}_{repo}_{branch}"
    if include_dirs is not None:
        # A filtered index holds different documents than the whole-repo index,
        # so it must not share a cache directory with it. Order and duplicates
        # in include_dirs do not change the filter, hence sorted(set(...)).
        joined = "\n".join(sorted(set(include_dirs)))
        digest = hashlib.sha256(joined.encode("utf-8")).hexdigest()[:12]
        name = f"{name}_{digest}"
    return name


async def build_github_retriever(
    github_token: Optional[str],
    owner: str = "microservices-patterns",
    repo: str = "ftgo-application",
    branch: str = "master",
    include_dirs: Optional[List[str]] = None,
    force_rebuild: bool = False
):
    """Build and return a LlamaIndex retriever for a GitHub repository.

    Example usage:
        retriever = await build_github_retriever(os.getenv("GITHUB_TOKEN"))
        docs = retriever.retrieve("Where are the pattern examples for sagas?")

    The index is cached on disk under ``.cache/github_indexes``. The cache
    directory is keyed by owner, repo and branch, plus a digest of
    ``include_dirs`` when a directory filter is given, so differently
    filtered indexes never overwrite or shadow each other.

    The function will set Settings.embed_model to OpenAIEmbedding using the
    current OPENAI_API_KEY if available. It also stores the returned
    retriever in the module-level global ``GITHUB_RETRIEVER``.

    Args:
        github_token: GitHub API token
        owner: Repository owner
        repo: Repository name
        branch: Branch name
        include_dirs: List of directories to include (None for all)
        force_rebuild: If True, rebuild index even if cached version exists

    Returns:
        A retriever created with ``index.as_retriever(similarity_top_k=5)``.

    Raises:
        ValueError: If ``github_token`` is None or empty.
    """
    global GITHUB_RETRIEVER
    if not github_token:
        raise ValueError("github_token is required to read private or API-rate-limited repos")

    # Setup cache directory
    cache_dir = Path(".cache/github_indexes")
    cache_dir.mkdir(parents=True, exist_ok=True)
    persist_dir = cache_dir / _index_cache_name(owner, repo, branch, include_dirs)

    # Try to load cached index if it exists and force_rebuild is False
    if not force_rebuild and persist_dir.exists():
        try:
            print(f"Loading cached index from {persist_dir}")
            # Embeddings must be configured before the index is loaded/queried
            _configure_openai_embeddings()
            storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
            index = load_index_from_storage(storage_context)
            retriever = index.as_retriever(similarity_top_k=5)
            GITHUB_RETRIEVER = retriever
            print("Successfully loaded cached index")
            return retriever
        except Exception as e:
            # Deliberate best-effort: a missing/corrupt cache falls through to a rebuild
            print(f"Failed to load cached index: {e}, rebuilding...")

    github_client = GithubClient(github_token=github_token, verbose=False)

    # Directory filter: an empty EXCLUDE list reads the whole repository,
    # otherwise only the requested directories are included.
    if include_dirs is None:
        filter_directories = ([], GithubRepositoryReader.FilterType.EXCLUDE)
    else:
        filter_directories = (include_dirs, GithubRepositoryReader.FilterType.INCLUDE)

    # Skip images, JSON and notebooks. Every entry needs its leading dot to
    # match a file extension; the previous "json" entry (no dot) never matched.
    filter_file_extensions = (
        [
            ".png",
            ".jpg",
            ".jpeg",
            ".gif",
            ".svg",
            ".ico",
            ".json",
            ".ipynb",
        ],
        GithubRepositoryReader.FilterType.EXCLUDE,
    )

    reader = GithubRepositoryReader(
        github_client=github_client,
        owner=owner,
        repo=repo,
        use_parser=False,
        verbose=True,
        filter_directories=filter_directories,
        filter_file_extensions=filter_file_extensions,
    )

    print(f"Loading repository {owner}/{repo} (branch={branch}) via GitHub API")
    documents = reader.load_data(branch=branch)
    print(f"Loaded {len(documents)} documents from GitHub repository")

    # Ensure embeddings are configured (use OPENAI_API_KEY if set)
    _configure_openai_embeddings()

    # Build an in-memory vector index
    index = VectorStoreIndex.from_documents(documents)

    # Save the index to disk for future use; a failed save only costs a
    # rebuild next time, so it is reported rather than raised.
    try:
        print(f"Saving index to {persist_dir}")
        index.storage_context.persist(persist_dir=str(persist_dir))
        print("Successfully saved index to disk")
    except Exception as e:
        print(f"Warning: Failed to save index to disk: {e}")

    retriever = index.as_retriever(similarity_top_k=5)
    GITHUB_RETRIEVER = retriever
    return retriever

In [None]:
rtr = await build_github_retriever("ghp-...")  # Replace with your GitHub token

Loading cached index from .cache/github_indexes/microservices-patterns_ftgo-application_master
Loading llama_index.core.storage.kvstore.simple_kvstore from .cache/github_indexes/microservices-patterns_ftgo-application_master/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from .cache/github_indexes/microservices-patterns_ftgo-application_master/index_store.json.


2025-11-04 22:48:21,521 - INFO - Loading all indices.


Successfully loaded cached index


In [14]:
# Location of the persisted index for microservices-patterns/ftgo-application@master
cache_dir = Path(
    ".cache/github_indexes/microservices-patterns_ftgo-application_master/"
)

In [None]:
# Rebuild a storage context from the persisted index files in cache_dir.
# persist_dir is passed as a string, matching how build_github_retriever calls
# StorageContext.from_defaults.
storage_context = StorageContext.from_defaults(persist_dir=str(cache_dir))