### ChatBot

In [1]:
from llama_index.llms.gemini import Gemini
from llama_index.core.llms import ChatMessage
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The key is read from the GEMINI_API_KEY variable; os.getenv returns None when it is unset.
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")

# Gemini chat model used by the ChatBot section of this notebook.
llm = Gemini(
    model="models/gemini-1.5-flash",
    api_key=GOOGLE_API_KEY  # NOTE(review): the upstream example says the client falls back to the GOOGLE_API_KEY env var when no key is passed — confirm this still holds when GEMINI_API_KEY is unset
)

### RAG

In [2]:
import chromadb
from chromadb.utils import embedding_functions

# Configuration for the Chroma vector store used in the RAG section.
CHROMA_DATA_PATH = "chroma_data/"  # directory handed to the persistent client below
EMBED_MODEL = "all-MiniLM-L6-v2"  # model name handed to the embedding function
COLLECTION_NAME = "ex1"  # name of the collection that holds the stories

# Persistent Chroma client rooted at CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [3]:
# Sentence-transformers embedding function; the collection uses it for both
# the stored documents and the query texts.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# `client` is a PersistentClient, so the collection outlives the kernel.
# create_collection() fails when the collection already exists, which broke
# "Restart & Run All" from the second run on; get_or_create_collection()
# reuses the existing collection instead.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the HNSW index
)

In [4]:
folder_path = './data/Short_Stories'

# Parallel lists: documents[i] is the full text of the file named ids[i].
documents = []  # text content of each .txt file
ids = []  # file name of each .txt file, later used as the Chroma document id

# os.listdir() returns entries in arbitrary order; sort them so the lists are
# built in the same order on every run and on every machine.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):  # only plain-text stories are indexed
        continue

    file_path = os.path.join(folder_path, file_name)

    with open(file_path, "r", encoding="utf-8") as file:
        txt_text = file.read()

    # Append the id and the text together, and only after the read succeeded,
    # so a failing file can never leave the two lists misaligned.
    ids.append(file_name)
    documents.append(txt_text)

In [5]:
# The collection is persisted on disk, so this cell may run against a store
# that already contains these ids. upsert() inserts new ids and overwrites
# existing ones, which keeps the cell re-runnable and refreshes the stored
# text when a story file has changed.
collection.upsert(
    documents=documents,
    ids=ids,
)

In [10]:
# Retrieve the single closest document for one natural-language question.
query_results = collection.query(n_results=1, query_texts=["Who had seven sons?"])

# Show which fields the result mapping exposes.
query_results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])

In [12]:
# Show the matched id(s) next to the retrieved text(s). The recorded output below shows the complete story text being returned for the match.
query_results['ids'], query_results["documents"]

([['019.txt']],
 [["There was once a man who had seven sons, and still he had\nno daughter, however much he wished for one.  At length his\nwife again gave him hope of a child, and when it came into\nthe world it was a girl.  The joy was great, but the child was\nsickly and small, and had to be privately baptized on account of\nits weakness.  The father sent one of the boys in haste to the\nspring to fetch water for the baptism.  The other six went with\nhim, and as each of them wanted to be first to fill it, the jug\nfell into the well.  There they stood and did not know what to do,\nand none of them dared to go home.  As they still did not return,\nthe father grew impatient, and said, they have certainly forgotten\nit while playing some game, the wicked boys.  He became afraid that\nthe girl would have to die without being baptized, and in his\nanger cried, I wish the boys were all turned into ravens.  Hardly\nwas the word spoken before he heard a whirring of wings over his\nhead, lo

### RAG - shorter files

In [None]:
# import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
import chromadb

In [13]:
# Create a Chroma client and a new collection named "quickstart".
# NOTE(review): create_collection presumably fails if "quickstart" already exists in this client — confirm before re-running this cell in the same kernel.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("quickstart")

# Embedding model used when building the index.
embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2')
# Alternative model to try:
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Load the stories from disk.
# NOTE(review): this re-binds `documents`, which earlier in the notebook is a plain list of strings.
# NOTE(review): the recorded output shows "NameError: name 'SimpleDirectoryReader' is not defined" — the import cell above is marked `In [None]`, i.e. it was not executed before this cell.
documents = SimpleDirectoryReader("./data/Short_Stories/").load_data()

# Wrap the Chroma collection as a vector store and build the index from the loaded documents.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'SimpleDirectoryReader' is not defined

# WHAT TO DO 

1. Try a different embedding function.
2. The query returns the whole document instead of only the relevant part of it.

In [8]:

# `Settings` was used without ever being imported in this notebook, so the
# cell raised NameError; import it here so the cell runs on its own.
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Set the embedding model on llama-index's global Settings object.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)



## Storing the vector index

- https://docs.llamaindex.ai/en/stable/understanding/storing/storing/
- https://realpython.com/chromadb-vector-database/ - good chromadb introduction
- https://www.datacamp.com/tutorial/llama-index-adding-personal-data-to-llms