In [1]:
# !pip install 'ragna[all]'

In [2]:
import asyncio
import itertools
from pprint import pprint
from dotenv import load_dotenv

from ragna import Config
from ragna.core import Rag
from ragna.assistants import (
    Claude,
    ClaudeInstant,
    Gpt4,
    Gpt35Turbo16k,
    Mpt7bInstruct,
    Mpt30bInstruct,
    RagnaDemoAssistant,
)
from ragna.source_storages import Chroma, LanceDB, RagnaDemoSourceStorage


In [3]:
# Load environment variables (presumably the API keys for the hosted assistants
# used further down) from the local .env file.
# load_dotenv() returns False when the file is missing or sets no variables, so
# fail early with an actionable message instead of a bare AssertionError.
assert load_dotenv("./.env"), (
    "No environment variables were loaded from ./.env - create the file next "
    "to this notebook and add the API keys required by the assistants used below."
)

In [None]:
# Create the example document that every RAG pipeline below is asked about.

# The text is spelled out line by line so that the trailing spaces in front of
# the line breaks (which are part of the content) stay visible and cannot be
# stripped accidentally by an editor.
document_content = (
    "Einhörner versuchen sich im Baseballspiel, während \n"
    "Katzen elegant Ballett tanzen und eine Schildkröte hartnäckig einen Marathon \n"
    "absolviert, begleitet von Wolken, die wie Zuckerwatte am Himmel schweben."
)

document_path = "ragna_example_doc.txt"
# The text contains umlauts, so pin the encoding: without it open() falls back
# to the platform's locale encoding and the file is not reliably UTF-8.
with open(document_path, "w", encoding="utf-8") as file:
    file.write(document_content)

### Simplest dry demo RAG pipeline (demo vector store, demo LLM)

In [None]:
config = Config()
rag = Rag(config)

async with rag.chat(
    documents=[document_path],
    source_storage=RagnaDemoSourceStorage,
    assistant=RagnaDemoAssistant,
) as chat:
    prompt = "Was machen Einhörner, Katzen und die Schildkröte?"
    answer = await chat.answer(prompt)

print(answer)

### Simple RAG using the OpenAI API

In [None]:
# Components to compare: each source storage is paired with each assistant
# when the experiments are built further down in this cell.
source_storages = [Chroma, LanceDB]
assistants = [Gpt35Turbo16k, Gpt4]
# Assistants that are imported at the top of the notebook but currently left
# out of the comparison; append them to `assistants` to include them:
# ClaudeInstant, Claude, Mpt7bInstruct, Mpt30bInstruct

async def answer_prompt(source_storage, assistant):
    """Ask the current global ``prompt`` about the example document.

    Opens a chat on the notebook-level ``rag`` object for ``document_path``
    with the given components, prints the prompt and returns the content of
    the answer message.

    Note: ``rag``, ``document_path`` and ``prompt`` are read from the notebook's
    global namespace when the coroutine actually runs, not when it is created,
    so ``prompt`` must be set before the coroutine is awaited.

    Args:
        source_storage: Source storage passed through to ``rag.chat``.
        assistant: Assistant passed through to ``rag.chat``.

    Returns:
        The ``content`` attribute of the message returned by ``chat.answer``.
    """
    chat_session = rag.chat(
        documents=[document_path],
        source_storage=source_storage,
        assistant=assistant,
    )
    async with chat_session as chat:
        print("Prompt: " + prompt)
        reply = await chat.answer(prompt)
        return reply.content


prompt = "Was machen Einhörner, Katzen und die Schildkröte?"
experiments = {
    (source_storage.display_name(), assistant.display_name()): answer_prompt(
        source_storage, assistant
    )
    for source_storage, assistant in itertools.product(source_storages, assistants)
}

results = dict(zip(experiments.keys(), await asyncio.gather(*experiments.values())))
pprint(results)

In [None]:
source_storage = Chroma # LanceDB, Chroma
assistant = Gpt4 # Gpt35Turbo16k, Gpt4

print(source_storage.display_name(), assistant.display_name())

prompt = "Was machen Einhörner, Katzen und die Giraffe?"
await asyncio.gather(answer_prompt(source_storage, assistant))


### Local LLM

In [None]:
# !conda install pytorch::pytorch torchvision torchaudio -c pytorch
# !conda install optimum

# !pip install auto-gptq    # NEEDS CUDA!

In [5]:
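# NOTE: The custom local assistant below is kept commented out. Per its own
# requirements()/is_available() it needs torch, optimum and auto-gptq installed
# and a CUDA-capable GPU; uncomment the whole cell to try it.
# from ragna.core import Assistant, PackageRequirement, Source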


# class AiroborosAssistant(Assistant):
#     @classmethod
#     def display_name(cls):
#         return "TheBloke/Airoboros-L2-7B-2.2-GPTQ"

#     @classmethod
#     def requirements(cls):
#         return [
#             PackageRequirement("torch"),
#             PackageRequirement("optimum"),
#             PackageRequirement("auto-gptq"),
#         ]

#     @classmethod
#     def is_available(cls):
#         requirements_available = super().is_available()
#         if not requirements_available:
#             return False

#         import torch

#         return torch.cuda.is_available()

#     def __init__(self, config):
#         super().__init__(config)

#         from auto_gptq import AutoGPTQForCausalLM
#         from transformers import AutoTokenizer

#         self.tokenizer = AutoTokenizer.from_pretrained(str(self), use_fast=True)
#         self.model = AutoGPTQForCausalLM.from_quantized(
#             str(self),
#             device_map="auto",
#             use_triton=False,
#             use_safetensors=True,
#             trust_remote_code=False,
#             inject_fused_attention=False,
#         )

#     @property
#     def max_input_size(self) -> int:
#         # FIXME
#         return 1024

#     def answer(
#         self, prompt: str, sources: list[Source], *, max_new_tokens: int = 256
#     ) -> str:
#         template = """
#         A chat about the content of documents.
#         Only use the content listed below to answer any questions from the user.
#         Do not make up information.
#         If you can't answer a question based on the information you are given, just say so.

#         {sources}
        
#         USER: {prompt}
#         ASSISTANT: 
#         """
#         templated_prompt = template.format(
#             sources="- " + "\n - ".join(source.content for source in sources),
#             prompt=prompt,
#         )
#         input_ids = self.tokenizer(
#             templated_prompt, return_tensors="pt"
#         ).input_ids.cuda()
#         output_ids = self.model.generate(
#             inputs=input_ids,
#             do_sample=False,
#             max_new_tokens=max_new_tokens,
#         )
#         output = self.tokenizer.decode(output_ids[0])
#         return output.rsplit("ASSISTANT:", 1)[-1].replace("</s>", "").strip()


# assert AiroborosAssistant.is_available()