In [39]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py transformers torch einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
Installing collected packages: einops
Successfully installed einops-0.8.0


In [36]:
import os 
from dotenv import load_dotenv

load_dotenv()

LANGCHAIN_TRACING_V2 = os.environ['LANGCHAIN_TRACING_V2'] = 'true'
LANGCHAIN_ENDPOINT = os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
LANGCHAIN_API_KEY = os.environ['LANGCHAIN_API_KEY']
FIRECRAWL_API_KEY = os.environ['FIRECRAWL_API_KEY']

In [60]:
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(model="llama3.1:latest", format="json", temperature=0)

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import FireCrawlLoader
from langchain.docstore.document import Document

# Load documents
urls = [
    "https://www.tokyotechies.com",
    "https://www.tokyotechies.com/about-us",
    "https://www.tokyotechies.com/solutions/kotae"
]
docs = [FireCrawlLoader(api_key=FIRECRAWL_API_KEY, url=url, mode="scrape").load() for url in urls]

# Flatten the list of documents
docs_list = [item for sublist in docs for item in sublist]

# Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

# Filter out complex metadata
filtered_docs = []
for doc in doc_splits:
    if isinstance(doc, Document) and hasattr(doc, 'metadata'):
        clean_metadata = {k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))}
        filtered_docs.append(Document(page_content=doc.page_content, metadata=clean_metadata))

# Save the filtered_docs to a file or cache for later use
import pickle

with open('filtered_docs.pkl', 'wb') as f:
    pickle.dump(filtered_docs, f)

In [45]:
import pickle 

with open('filtered_docs.pkl', 'rb') as f:
    filtered_docs = pickle.load(f)

print(filtered_docs)



In [49]:
from transformers import AutoModel, AutoTokenizer
import torch
from langchain_community.vectorstores import Chroma

# Load the filtered_docs from the saved file
import pickle

with open('filtered_docs.pkl', 'rb') as f:
    filtered_docs = pickle.load(f)

# Load the embedding model and tokenizer
model_name = "nomic-ai/nomic-embed-text-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Function to generate embeddings using the loaded model
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings[0].cpu().numpy().tolist()  # Convert ndarray to list

# Wrapper class to use with Chroma
class CustomEmbedding:
    def embed_documents(self, texts):
        return [embed_text(text) for text in texts]

    def embed_query(self, text):
        return embed_text(text)

# Instantiate the embedding class
custom_embedding = CustomEmbedding()

# Add documents with embeddings to the vectorDB using the embedding class
vectorstore = Chroma.from_documents(
    documents=filtered_docs,
    collection_name="rag-chroma",
    embedding=custom_embedding,  # Use the embedding class instance
)

retriever = vectorstore.as_retriever()





In [63]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Define the prompt template
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assesing 
    relevance of a retrieved document to a user question. If the document contains keywords related to the user question,
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n 
    Give a binary score 'yes' or 'no' score to indicate whether the docuemnt is relevant to the question. \n 
    Providde the binary score as a JSON with a single key 'score' and no premable or explaination.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retrieved document* \n\n {document} \n\n
    Here is the user question* {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
""",
    input_variables=["question", "document"]
)

# Chain the prompt, LLM, and output parser together
retrieval_grader = prompt | llm | JsonOutputParser()

# Define the user question
question = "What is Tokyo Techies?"

# Retrieve documents related to the question
docs = retriever.invoke(question)

# Get the content of the second retrieved document
doc_txt = docs[1].page_content  # Use page_content instead of page_context

# Grade the relevance of the document
result = retrieval_grader.invoke({"question": question, "document": doc_txt})

# Print the result
print(result)

{'score': 'yes'}


In [71]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate


prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assitant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {question}
    Context: {context}
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>
""",
input_variables=["question", "document"]
)

#Post processing
def format_doc(docs):
    return"\n\n".join(doc.page_content for doc in docs)

rag_chain = prompt | llm | StrOutputParser()

#Run

question = "what is kotae?"
docs = retriever.invoke(question)
generation = rag_chain.invoke({"context": docs, "question": question})
print(generation)

{ "Kotae"  : "A chatbot that automates conversations and delights customers by generating responses from a company's knowledge base, allowing for effortless setup, AI-driven automation, and customization." }
