In [12]:
# Install necessary packages
!pip install crewai sentence-transformers pydantic google-generativeai langchain-community

# Setup imports
from typing import List
from pydantic import BaseModel
from crewai import Agent, Task, Crew
from sentence_transformers import SentenceTransformer, util
import torch
import google.generativeai as genai
import json
import os




In [13]:
# Set your Gemini API key
os.environ["GOOGLE_API_KEY"] = "api+token"
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [14]:
# Define document grading model
class GradeDocument(BaseModel):
    document: str
    grade: bool

# Load SentenceTransformer model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Sample documents


In [15]:
!pip install pypdf faiss-cpu



In [23]:
# Install necessary packages
!pip install pdfminer.six langchain sentence-transformers chromadb

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


[0m

In [24]:

# Load PDF and split into chunks
pdf_path = "/content/gemma.pdf"  # Replace with actual file path
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Chunk the loaded documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)
chunks = splitter.split_documents(documents)

# Initialize embedding model using LangChain-compatible wrapper
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create and store Chroma vector store
persist_directory = "./chroma_store"
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)
vectorstore.persist()
print("Chroma vector store created and persisted locally.")

# Load retriever from stored index
retriever = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model
).as_retriever()

# Define query
query = "the vision encoder, or to the possibility that the 7b-Dino combination is undertrained"

# Retrieve top-k similar chunks
relevant_docs = retriever.get_relevant_documents(query)

# Print results
for i, doc in enumerate(relevant_docs):
    print(f"\nChunk {i+1}:")
    print(doc.page_content)

Chroma vector store created and persisted locally.

Chunk 1:
and the richness of the representation provided by the vision
encoder, or to the possibility that the 7b-Dino combination
is undertrained.
3.2. Effects of Pretraining
We find that skipping the initial connector pretraining al-
most always reduces model performance. With the ex-
ceptions of 2B-Dino on MME Cognition and 7B-CLIP
on MME Cognition, MM-Vet and ScienceQA, the variant
with a pretrained connector outperforms its counterpart that
skipped pretraining. These results do not support the hy-

Chunk 2:
and the richness of the representation provided by the vision
encoder, or to the possibility that the 7b-Dino combination
is undertrained.
3.2. Effects of Pretraining
We find that skipping the initial connector pretraining al-
most always reduces model performance. With the ex-
ceptions of 2B-Dino on MME Cognition and 7B-CLIP
on MME Cognition, MM-Vet and ScienceQA, the variant
with a pretrained connector outperforms its counte

In [26]:
!pip install langchain-google-genai

Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl (1.4 MB)
[0mInstalling collected packages: google-ai-generativelanguage
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is incompatible.[0m[31m
[0mSuccessfully installed google-ai-generativelanguage


In [29]:
# Install necessary packages
!pip install pdfminer.six langchain sentence-transformers chromadb crewai google-generativeai langchain-community

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from crewai import Agent, Task, Crew
import json
import os

# Set Gemini API Key
os.environ["GOOGLE_API_KEY"] = "api_token"

# Load PDF and split into chunks
pdf_path = "/content/gemma.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Chunk the loaded documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)
chunks = splitter.split_documents(documents)

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create and store Chroma vector store
persist_directory = "./chroma_store"
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)
vectorstore.persist()
print("Chroma vector store created and persisted locally.")

# Load retriever from stored index
retriever = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model
).as_retriever()

# Define query context
context = "the vision encoder, or to the possibility that the 7b-Dino combination is undertrained"

# Retrieve top-k similar chunks
relevant_docs = retriever.get_relevant_documents(context)
filtered_docs = [doc.page_content for doc in relevant_docs]

# Load Gemini LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

# Format prompt for grading
grading_prompt = f"""
You are given a list of document chunks. Some are relevant to the following context:

Context: "{context}"

Document Chunks:
{filtered_docs}

Grade each document as relevant (true) or not (false) in the following JSON format:
[
  {{
    "document": "...",
    "grade": true
  }},
  ...
]
"""

# Run grading through Gemini
graded_output = llm.invoke(grading_prompt)
print("Graded Output:\n", graded_output)

# Parse graded output
try:
    graded_docs = json.loads(graded_output)
except Exception as e:
    print("Error parsing LLM output. Please check format.")
    graded_docs = []

# Extract relevant documents
relevant_filtered_docs = [doc['document'] for doc in graded_docs if doc.get('grade') is True]

# Define CrewAI agent and task
gen_agent = Agent(
    role="Response Generator",
    goal="Generate accurate response from relevant documents",
    backstory="Expert in synthesizing information from multiple documents to generate insightful answers.",
    llm=llm,
    verbose=True
)

gen_task = Task(
    description=f"""Using the relevant context below, generate a detailed response.

Relevant Documents:
{relevant_filtered_docs}

Query: {context}
""",
    expected_output="A comprehensive paragraph summarizing the answer based on the context.",
    agent=gen_agent
)

crew = Crew(agents=[gen_agent], tasks=[gen_task], verbose=True)

# Run CrewAI pipeline
result = crew.kickoff()
# print("\nFinal Result:\n", result)

Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl (1.3 MB)
[0mInstalling collected packages: google-ai-generativelanguage
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.4 requires google-ai-generativelanguage<0.7.0,>=0.6.18, but you have google-ai-generativelanguage 0.6.15 which is incompatible.[0m[31m
[0mSuccessfully installed google-ai-generativelanguage
Chroma vector store created and persisted locally.
Graded Output:
 content='```json\n[\n  {\n    "document": "and the richness of the representation provided by the vision\\nencoder, or to the possibility that the 7b-Dino combination\\nis undertrained.\\n3.2. Effects of Pretraining\\nWe find tha


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



Output()

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from crewai import Agent, Task, Crew
import json
import os
import asyncio  # Import asyncio
import time  # Import time

# Install required packages before running this script:
# !pip install pdfminer.six langchain sentence-transformers chromadb crewai google-generativeai langchain-community

# Set Gemini API Key
os.environ["GOOGLE_API_KEY"] = "api_token"  # Replace with your actual API key

# Load PDF and split into chunks
pdf_path = "/content/gemma.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Chunk the loaded documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)
chunks = splitter.split_documents(documents)

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create and store Chroma vector store
persist_directory = "./chroma_store"
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)
vectorstore.persist()
print("Chroma vector store created and persisted locally.")

# Load retriever from stored index
retriever = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model
).as_retriever()

# Define query context
context = "the vision encoder, or to the possibility that the 7b-Dino combination is undertrained"

# Retrieve top-k similar chunks
relevant_docs = retriever.get_relevant_documents(context)

if not relevant_docs:
    print("No relevant documents found for the given context.")
    exit()

# Filtered documents
filtered_docs = [doc.page_content for doc in relevant_docs if doc.page_content.strip() != ""]

if not filtered_docs:
    print("All retrieved documents were empty.")
    exit()

# Load Gemini LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

# Format prompt for grading
grading_prompt = f"""
You are given a list of document chunks. Some are relevant to the following context:

Context: "{context}"

Document Chunks:
{json.dumps(filtered_docs, indent=2)}

Grade each document as relevant (true) or not (false) in the following JSON format:
[
  {{
    "document": "...",
    "grade": true
  }},
  ...
]
"""

# Run grading through Gemini
graded_output = llm.invoke(grading_prompt)
print("Graded Output:\n", graded_output)

# Parse graded output
try:
    # Access the content attribute of the AIMessage object which contains the JSON string
    graded_docs = json.loads(graded_output.content)
except json.JSONDecodeError as e:
    print(f"Error parsing LLM output: {e}. Please check format.")
    graded_docs = []

# Extract relevant documents
relevant_filtered_docs = [doc['document'] for doc in graded_docs if doc.get('grade') is True]

if not relevant_filtered_docs:
    print("No relevant documents were graded as true. Skipping response generation.")
    exit()

# Define CrewAI agent and task
gen_agent = Agent(
    role="Response Generator",
    goal="Generate accurate response from relevant documents",
    backstory="Expert in synthesizing information from multiple documents to generate insightful answers.",
    llm=llm,
    # Set verbose to False to potentially avoid recursion issues with rich rendering
    verbose=False
)

gen_task = Task(
    description=f"""Using the relevant context below, generate a detailed response.

Relevant Documents:
{json.dumps(relevant_filtered_docs, indent=2)}

Query: {context}
""",
    expected_output="A comprehensive paragraph summarizing the answer based on the context.",
    agent=gen_agent
)

# Initialize Crew with verbose=False
crew = Crew(agents=[gen_agent], tasks=[gen_task], verbose=False) # Set verbose to False

# Run CrewAI pipeline
# While asyncio and sleep are imported, direct application here might not solve the recursion issue
# as it's related to the internal workings of rich during verbose logging.
# If you were implementing asynchronous API calls or waiting for external resources,
# you would use await and asyncio.sleep.
# Example of a potential asyncio usage (not directly fixing the rich recursion error):
# async def run_crew():
#     result = crew.kickoff()
#     await asyncio.sleep(2) # Example of a sleep
#     print("\nFinal Result:\n", result)
#
# if __name__ == "__main__":
#     asyncio.run(run_crew())

# For now, running the kickoff synchronously
result = crew.kickoff()
print("\nFinal Result:\n", result)