# RAG

**Read text and split into chunks**

In [None]:
def split_into_chunks(text, chunk_size, overlap):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    the last ``overlap`` characters of one chunk are repeated at the start of
    the next. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order (empty for empty ``text``).

    Raises:
        ValueError: If the sizes would make the window stand still, move
            backwards, or skip over text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a step <= 0 (the window never
        # advances); a negative overlap would leave gaps between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


with open("rainforest.txt", 'r', encoding='utf-8') as f:
    content = f.read()

chunk_size = 200
overlap = 30
chunks = split_into_chunks(content, chunk_size, overlap)

print(len(chunks))

**Load Gemini model**

In [None]:
import google.generativeai as genai
# Handle to the Gemini text model; the last cell calls generate_content() on it.
# NOTE(review): this runs before genai.configure(...) in the next cell --
# presumably the key is only needed at request time; confirm.
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

**Load API key from .env**

In [None]:
from dotenv import load_dotenv
import os

# Load the variables defined in .env into the process environment.
load_dotenv()

# Look the key up by name (None when the variable is not set) and register it
# with the SDK bound to `genai`.
API_KEY = os.environ.get("API_KEY")
genai.configure(api_key=API_KEY)

**Calculate embeddings for chunks**

In [None]:
# Import the newer SDK under its own alias. Earlier cells bind the name `genai`
# to `google.generativeai` (genai.GenerativeModel / genai.configure); rebinding
# it here would make the API-key cell fail if it is re-run after this one.
from google import genai as google_genai
from google.genai import types

client = google_genai.Client(api_key=API_KEY)

# One embedding object per chunk, appended in the same order as `chunks`, so
# index i here corresponds to chunks[i].
embeddings_for_chunks = []
for chunk in chunks:
    result = client.models.embed_content(
        model="text-embedding-004",
        contents=[chunk],
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
    )

    # A single input was sent, so a single embedding is expected back; the
    # one-element unpacking raises ValueError if that is not the case.
    [embedding] = result.embeddings
    embeddings_for_chunks.append(embedding)

# Show only how many embeddings were produced; echoing every vector floods the
# notebook output.
len(embeddings_for_chunks)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


user_question = input("Ask a question regarding text:")
print(f"User question: {user_question}")

# Embed the user question with the same model and task type used for the
# chunks, so both sets of vectors are comparable.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=[user_question],
    config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
)

# One input was sent, so exactly one embedding is expected back.
[user_question_embedding] = result.embeddings

# Chunks and embeddings are paired by position; a length mismatch means one of
# the earlier cells was re-run without the other and the pairing is wrong.
if len(chunks) != len(embeddings_for_chunks):
    raise ValueError(
        "chunks and embeddings_for_chunks differ in length; "
        "re-run the chunking and embedding cells"
    )

# Compare every chunk embedding with the question embedding in a single call.
# cosine_similarity returns an (n_chunks, 1) matrix; column 0 holds the scores.
chunk_vectors = [emb.values for emb in embeddings_for_chunks]
if chunk_vectors:
    similarities = cosine_similarity(
        chunk_vectors, [user_question_embedding.values]
    )[:, 0]
else:
    # cosine_similarity rejects empty input; no chunks simply means no scores.
    similarities = []

# (chunk, similarity) pairs, most similar first.
chunks_probabilities = sorted(
    zip(chunks, similarities), key=lambda pair: pair[1], reverse=True
)

# Keep the top 20% of chunks; the +1 guarantees at least one chunk is kept
# whenever any chunk exists.
best_chunks = chunks_probabilities[:int(len(chunks_probabilities)*0.2)+1]
print(len(best_chunks))

# Build the context from the selected chunks. They are separated by a blank
# line because they are ordered by relevance, not by position in the document;
# joining them with nothing in between fuses unrelated fragments mid-sentence.
context = "\n\n".join(chunk for chunk, _ in best_chunks)

prompt = f"""
    Answer based on this below information only. Answer should be only one sentence.
    If you do not know the answer to a question, you will politely 
    state that you do not have the information and offer to assist 
    with a related topic.
    {context}

    {user_question}
"""

response = model.generate_content(prompt)

print("Context retrieved from file:")
print(context)

print("Model answer:")
print(response.text)


User question: What is the world's largest rainforest?
2
Context provided:
The Tun tun tun Sahur rainforest, located primarily in Brazil, is the world's largest tropical rainforest, covering an area of approximately 6.7 million square kilometers (2.6 million square miles). I (2.6 million square miles). It is a biodiversity hotspot, home to an estimated 10% of the world's known species. The Amazon River, the second longest river in the world, flows through the rainforest,
Model answer:
The world's largest rainforest is the Tun tun tun Sahur rainforest.

