In [1]:
from llama_index.llms.gemini import Gemini
from llama_index.core.llms import ChatMessage
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv; typically from a local .env file).
load_dotenv()

# The key is read from GEMINI_API_KEY; this is None when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")

# NOTE(review): per the original comment, Gemini falls back to the
# GOOGLE_API_KEY env var when no key is passed — confirm against the library.
llm = Gemini(api_key=GOOGLE_API_KEY, model="models/gemini-1.5-flash")

In [4]:
import chromadb
from chromadb.utils import embedding_functions

# Vector-store configuration used by the cells below.
COLLECTION_NAME = "ex1"
EMBED_MODEL = "all-MiniLM-L6-v2"
CHROMA_DATA_PATH = "chroma_data/"

# Chroma client whose storage location is CHROMA_DATA_PATH.
client = chromadb.PersistentClient(
    path=CHROMA_DATA_PATH,
)

In [5]:
# Embedding function backed by the SentenceTransformer model named in EMBED_MODEL.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# get_or_create_collection keeps this cell re-runnable: the client is persistent,
# so a plain create_collection fails once the collection exists from an earlier run.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},  # use cosine distance for similarity search
)

In [6]:
folder_path = './data/Short_Stories'

# Parallel lists: documents[i] holds the full text of the file named ids[i].
documents = []  # Text content of each TXT file
ids = []  # File name of each TXT file

# sorted() makes the order reproducible; os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue  # Skip anything that is not a TXT file

    file_path = os.path.join(folder_path, file_name)  # Full file path

    # Read the whole TXT file as UTF-8 text
    with open(file_path, "r", encoding="utf-8") as file:
        txt_text = file.read()

    # Append the id and text together, only after a successful read,
    # so the two lists always stay aligned.
    ids.append(file_name)
    documents.append(txt_text)

In [7]:
# Upsert instead of add: when this cell is re-run against the persisted
# collection, entries whose ids already exist are updated rather than duplicated
# or rejected.
collection.upsert(
    documents=documents,
    ids=ids,
)

In [8]:
# Ask the collection for the single best match to a free-text query.
query_results = collection.query(
    n_results=1,
    query_texts=["Find me some delicious food!"],
)

# Show which fields the result mapping exposes.
query_results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])

In [11]:
# Ids of the documents returned for the query.
query_results["ids"]

[['027.txt']]

## Data Preparation
1. Load raw data.
2. Split it into chunks.
3. Embed it using an embedding model.
4. Store the embeddings in a vector store (e.g., ChromaDB).

In [8]:

# Settings was used without being imported, which raises NameError on a fresh kernel.
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Set the embedding model on LlamaIndex's global Settings object.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)



## Storing the vector index

- https://docs.llamaindex.ai/en/stable/understanding/storing/storing/
- https://realpython.com/chromadb-vector-database/ - a good introduction to ChromaDB
- https://www.datacamp.com/tutorial/llama-index-adding-personal-data-to-llms