In [1]:
# set up chroma db 
import chromadb
# set up ollama
import ollama
import json
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
# Default client from chromadb.Client(); swap in
# chromadb.PersistentClient(path="chromadb/") to keep the data on disk instead.
chroma_client = chromadb.Client()
# To rebuild from scratch, call chroma_client.delete_collection("my_collection")
# before the next line.
# Reuse "my_collection" when it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [3]:
# Next free document number. Start after whatever the collection already holds:
# the collection is opened with get_or_create, so re-running this cell (or
# reopening a persistent store) must not restart at 0 and reuse stored ids.
# NOTE(review): assumes existing ids are the contiguous id0..id{n-1} written by
# add_to_chroma_db; confirm if documents are ever deleted or added elsewhere.
count = collection.count()


def add_to_chroma_db(text: str) -> None:
    """Embed ``text`` and store it in ``collection`` under the next ``id<n>``.

    Args:
        text: The document to embed (with the 'nomic-embed-text' model via
            ollama) and store verbatim alongside its embedding.

    Side effects:
        Adds one record to ``collection`` and increments the module-level
        ``count`` used to generate ids.
    """
    global count
    embed = ollama.embeddings(model='nomic-embed-text', prompt=text)
    collection.add(embeddings=[embed["embedding"]], documents=[text], ids=[f"id{count}"])
    # Only advance after a successful add so a failed call does not burn an id.
    count += 1
def query_chroma_db(text: str, n_results=20):
    """Look up the stored documents closest to ``text``.

    Args:
        text: Query string; it is embedded with the 'nomic-embed-text' model
            through ollama before searching.
        n_results: Number of results requested from the collection.

    Returns:
        The result object of ``collection.query`` unchanged.
    """
    query_embedding = ollama.embeddings(model='nomic-embed-text', prompt=text)["embedding"]
    return collection.query(query_embeddings=query_embedding, n_results=n_results)

In [4]:
def combine_content_with_parents(data):
    """
    Flatten nested data into sentences prefixed by their chain of parent keys.

    Every string found in a ``_content`` list becomes one sentence of the form
    ``"<key1> of <key2> of ...: <stripped line>"``, where the keys are the dict
    keys on the path from the root down to that ``_content`` entry. Lists on
    the path are traversed without contributing a key. Output order follows a
    depth-first walk in dict/list order.

    Args:
        data (dict | list): The JSON-like data with nested structure.

    Returns:
        list: A list of strings where each string is a combined sentence.
            A ``_content`` list directly at the root has no parent keys, so its
            lines are returned as-is (stripped) without a leading ``": "``.
    """

    def walk(node, parents=()):
        # `parents` is an immutable tuple so the default can never be shared
        # and mutated between calls.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == '_content' and isinstance(value, list):
                    prefix = ' of '.join(parents)
                    for line in value:
                        if isinstance(line, str):
                            text = line.strip()
                            # Only prepend "<parents>: " when there are parents.
                            yield f"{prefix}: {text}" if parents else text
                        else:
                            # Nested structure inside _content: keep walking
                            # with the same parents instead of failing on
                            # .strip(); non-container scalars yield nothing.
                            yield from walk(line, parents)
                else:
                    yield from walk(value, parents + (key,))
        elif isinstance(node, list):
            for item in node:
                yield from walk(item, parents)
        # Any other scalar outside a _content list carries no text to emit.

    return list(walk(data))


In [5]:
# Ingest every JSON file in the json_data folder into the vector store.
import os
import json

# os.listdir returns every directory entry in arbitrary order. Keep only *.json
# files (skips things like .ipynb_checkpoints or .DS_Store that would break
# json.load) and sort them so documents get the same ids on every run.
files = sorted(
    name for name in os.listdir("json_data")
    if name.lower().endswith(".json")
)
for file in files:
    # Decode as UTF-8 explicitly: without `encoding`, open() uses the locale
    # default, which is not UTF-8 on every platform.
    with open(os.path.join("json_data", file), encoding="utf-8") as f:
        data = json.load(f)
    # The file is closed before the (slow) embedding calls start.
    combined_sentences = combine_content_with_parents(data)
    for sentence in combined_sentences:
        add_to_chroma_db(sentence)

In [17]:
def ask_rag(question, model='llama3.1'):
    """Answer ``question`` with ``model`` using retrieved documents as context.

    The 10 closest documents are fetched through ``query_chroma_db``, numbered
    from 1, and placed in the system prompt; the question itself is sent as the
    user message.

    Args:
        question: The user's question; used both for retrieval and as the
            chat message.
        model: Name of the ollama chat model to call.

    Returns:
        The text content of the model's reply.
    """
    retrieved = query_chroma_db(question, n_results=10)
    documents = retrieved['documents'][0]

    # "1. <doc> 2. <doc> ..." - every entry ends with a single space.
    context = "".join(
        f"{rank}. {doc} " for rank, doc in enumerate(documents, start=1)
    )

    # Assembled piece by piece so the prompt's exact whitespace (leading
    # newline, 4-space indents, trailing spaces) is visible and preserved.
    system = (
        "\n"
        "    Use the following pieces of context to answer the question at the end. \n"
        "    If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
        "    Use five sentences maximum and keep the answer as concise as possible. \n"
        f"    {context}\n"
        "    "
    )

    messages = [
        {'role': 'system', 'content': system},
        {'role': 'user', 'content': question},
    ]
    reply = ollama.chat(model=model, messages=messages)
    return reply['message']['content']

def ask_question(question, model='llama3.1'):
    """Ask ``model`` the question directly, with no retrieved context.

    This is the no-RAG baseline for ``ask_rag``. It takes the same ``model``
    parameter so both can be run against the same model for comparison.

    Args:
        question: The user's question, sent as the only chat message.
        model: Name of the ollama chat model to call. Defaults to 'llama3.1',
            the model this function previously always used.

    Returns:
        The text content of the model's reply.
    """
    response = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': question}],
    )
    return response['message']['content']

In [28]:
question = "Where can I find wolves"

# Put the same retrieval-augmented question to both models.
llama_response = ask_rag(question, model='llama3.1')
qwen2_response = ask_rag(question, model='qwen2')

# One answer per model, separated by a rule of 80 '=' characters.
separator = "=" * 80
report = "\n".join([
    f"Question: {question}",
    f"llama3.1: {llama_response}",
    separator,
    f"Question: {question}",
    f"QWen2: {qwen2_response}",
])
print(report)


Question: Where can I find wolves
llama3.1: You can find wolves naturally spawning on grass blocks, dirt, coarse dirt, snow (in Bedrock Edition), snow blocks, or podzol in multiple biomes. The specific appearance and spawn rate may vary depending on the biome.
Question: Where can I find wolves
QWen2: Wolves naturally spawn on various surfaces like grass blocks, dirt, coarse dirt, snow (in Bedrock Edition), snow blocks, or podzol. They have a 10% chance of spawning as babies and their appearance depends on the biome they are found in. Wolves can also be spawned using spawn eggs, monster spawners, commands, or due to bordering specific biomes like jungles or savannas.
