In [None]:
# Notebook setup: imports, environment variables, and event-loop patching.
import nest_asyncio
import os
import openai
from dotenv import load_dotenv

# Load environment variables from the .env file. FILE_PATH_ENG is read from
# the environment further down; presumably the API credentials used by the
# OpenAI / LangSmith clients come from here as well — TODO confirm.
load_dotenv()

# Attach to the existing event loop when using Jupyter notebooks.
nest_asyncio.apply()

### Load Chroma using Sentence Transformer

In [None]:
from io import BytesIO
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_community.vectorstores.chroma import Chroma
from pypdf import PdfReader
# Build (or refresh) the persisted Chroma collection from the PDF whose path
# is configured through the FILE_PATH_ENG environment variable.
pdf_path = os.getenv("FILE_PATH_ENG")
if not pdf_path:
    # os.getenv returns None when the variable is unset; fail with a clear
    # message instead of an opaque TypeError from open(None).
    raise ValueError(
        "FILE_PATH_ENG is not set; add it to the environment or the .env file."
    )

CHROMA_PERSISTENT_DIR = "../../data/chroma"
CHROMA_COLLECTION_NAME = "book_eng_mini_l6"  # 384
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# First pass: split each page on paragraph / line / sentence / word boundaries
# into pieces of at most 1000 characters.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Second pass: cap every piece at 256 tokens.
# NOTE(review): no model_name is passed, so the splitter uses its library
# default tokenizer, which may differ from the embedding model's — confirm.
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0, tokens_per_chunk=256
)

# Read the whole file up front so the handle is closed before parsing starts.
with open(pdf_path, "rb") as f:
    pdf_bytes = f.read()

doc = PdfReader(BytesIO(pdf_bytes))
docs = []
doc_ids = []
for page_nr, pdf_page in enumerate(doc.pages, start=1):
    pdf_page_text = pdf_page.extract_text()

    # skip pages without any extractable text
    if not pdf_page_text or not pdf_page_text.strip():
        continue

    # character split first, then token split every resulting piece
    token_split_texts = [
        token_chunk
        for char_chunk in character_splitter.split_text(pdf_page_text)
        for token_chunk in token_splitter.split_text(char_chunk)
    ]

    # every chunk of a page carries the same source / 1-based page metadata
    metadatas = [
        {"source": pdf_path, "page": page_nr} for _ in token_split_texts
    ]

    # convert to documents
    documents = character_splitter.create_documents(
        texts=token_split_texts, metadatas=metadatas
    )

    # Deterministic ids (collection, page, position on the page) so that
    # re-running this cell overwrites the stored chunks instead of appending
    # duplicates to the persisted collection.
    doc_ids.extend(
        f"{CHROMA_COLLECTION_NAME}-p{page_nr}-c{chunk_idx}"
        for chunk_idx in range(len(documents))
    )
    docs.extend(documents)

if not docs:
    # Nothing to embed, e.g. a scanned PDF without a text layer.
    raise ValueError(f"No extractable text found in {pdf_path!r}")

db = Chroma.from_documents(
    collection_name=CHROMA_COLLECTION_NAME,
    documents=docs,
    ids=doc_ids,
    embedding=embedding_function,
    persist_directory=CHROMA_PERSISTENT_DIR,
)




### My Chroma Db with SentenceTransformer

In [None]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores.chroma import Chroma


# Re-open the persisted collection. The settings are repeated in this cell,
# presumably so it can run without re-executing the indexing cell.
CHROMA_PERSISTENT_DIR = "../../data/chroma"
CHROMA_COLLECTION_NAME = "book_eng_mini_l6"  # 384
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Chroma(
    collection_name=CHROMA_COLLECTION_NAME,
    persist_directory=CHROMA_PERSISTENT_DIR,
    embedding_function=embedding_function,
)

# Maximal-marginal-relevance retrieval of the top 3 chunks. No
# "score_threshold" is passed: that option only takes effect with
# search_type="similarity_score_threshold" and is silently ignored by "mmr".
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})

In [None]:
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model used for answering; temperature 0 keeps sampling randomness out.
llm = ChatOpenAI(temperature=0)

# Retrieval QA chain; the retrieved source documents are returned together
# with each answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, retriever=retriever, return_source_documents=True
)

### Test RAG

In [35]:
# Smoke test: ask a single question and display only the answer text.
question = "What's the name of the drummer?"
result = qa_chain.invoke({"query": question})
result["result"]

'The name of the drummer is Eugene.'

### Evaluation

In [None]:
# Evaluation questions; each one is paired by position with its reference
# answer in eval_answers.
eval_questions = [
    "What is name of the forrest?",
    "Who is having a birthday?",
]

# Reference answers, in the same order as eval_questions.
eval_answers = [
    "The name of the forrest is Starry Sky",
    "It's Fred birthday",
]

# One example per question, with the reference answer wrapped in a list under
# the "ground_truths" key.
examples = [
    {"query": question_text, "ground_truths": [reference_answer]}
    for question_text, reference_answer in zip(eval_questions, eval_answers)
]

In [None]:
# Run the chain on the second evaluation question and display the answer text.
result = qa_chain.invoke({"query": eval_questions[1]})
result["result"]

In [None]:
# Run the chain on a full example dict, which carries "ground_truths" as well
# as "query". When the notebook is run top to bottom, this `result` is the
# value the single-result evaluator cells below receive.
result = qa_chain.invoke(examples[0])
result["result"]

### Creating Ragas Evaluators

In [36]:
examples

[{'query': 'What is name of the forrest?',
  'ground_truths': ['The name of the forrest is Starry Sky']},
 {'query': 'Who is having a birthday?',
  'ground_truths': ["It's Fred birthday"]}]

In [37]:
result

{'query': "What's the name of the drummer?",
 'result': 'The name of the drummer is Eugene.',
 'source_documents': [Document(page_content='nowadays mr. m is the most famous manager in the undergrowth. he is the one who made forest band to play all over the world. he met fred a long time ago. at that time, he was working a lot and needed to relax a bit. it was then that he discovered the magical " nap in nature " course organized by fred. since that day, they have became great friends, sharing moments of music, laughter, and joy.', metadata={'page': 27, 'source': 'C:\\Users\\nttLu\\OneDrive\\Illustrated Book\\Midjourney\\The Adventure of Starry Sky V3.pdf'}),
  Document(page_content='" hey, wait a minute... what about the instruments? " says fred. " we need the piano and backing tracks for vj, the bass for flora, the guitar for stella, and the drums for eugene... and of course, a microphone for me. how can we get all this before tonight? "', metadata={'page': 39, 'source': 'C:\\Users\\n

In [None]:
from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# Create one evaluator chain per Ragas metric.
# Note: context_rel_chain wraps the context_precision metric.
(
    faithfulness_chain,
    answer_rel_chain,
    context_rel_chain,
    context_recall_chain,
) = (
    RagasEvaluatorChain(metric=ragas_metric)
    for ragas_metric in (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )
)

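A high `faithfulness_score` means the answer is factually consistent with the retrieved source documents, i.e. the claims made in the answer can be inferred from the retrieved context.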

In [None]:
# Score the most recent QA result for faithfulness. The chain is run through
# .invoke(), like qa_chain in this notebook, instead of the deprecated direct
# call on the chain object.
eval_result = faithfulness_chain.invoke(result)
eval_result["faithfulness_score"]

In [None]:
# Score the most recent QA result for context recall. The chain is run through
# .invoke(), like qa_chain in this notebook, instead of the deprecated direct
# call on the chain object.
eval_result = context_recall_chain.invoke(result)
eval_result["context_recall_score"]

### Eval with Predictions

In [None]:
# Run the QA chain over all evaluation examples in a single batch call.
predictions = qa_chain.batch(examples)


In [None]:
predictions

In [None]:
# Evaluate the batch predictions against their examples with the faithfulness
# chain and display what it returns.
evaluation = faithfulness_chain.evaluate(examples, predictions)
evaluation

## Using with Langsmith

### Prepare dataset

In [None]:
from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()
dataset_name = "Ragas_with_LS"

# Reuse the dataset when it can be read; on a LangSmith error, create it and
# upload one example per evaluation question.
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
except LangSmithError:
    dataset = client.create_dataset(
        dataset_name=dataset_name, description="Testing Ragas using LangSmith"
    )
    for example in examples:
        client.create_example(
            inputs={"query": example["query"]},
            outputs={"ground_truths": example["ground_truths"]},
            dataset_id=dataset.id,
        )
    status_prefix = "dataset created: "
else:
    status_prefix = "using existing dataset: "

print(status_prefix, dataset.name)

> **Caveat:** define a factory function that creates a new instance of the chain for each run, so that no state is reused between runs.

In [None]:
# Factory function that returns a new QA chain on every call.
def create_qa_chain(return_context=True):
    """Build a fresh RetrievalQA chain.

    Args:
        return_context: Forwarded as ``return_source_documents``; when true,
            the chain output also contains the retrieved source documents.

    Returns:
        A new RetrievalQA chain built from the notebook-level ``llm`` and
        ``retriever``.
    """
    return RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=return_context,
    )

In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset

# Evaluators applied to the dataset runs. "result" is the key under which the
# QA chain returns its answer, so it is used as the prediction key.
evaluation_config = RunEvalConfig(
    custom_evaluators=[
        faithfulness_chain,
        answer_rel_chain,
        context_rel_chain,
        # context_recall_chain is left disabled here — reason not recorded; TODO confirm.
        #context_recall_chain,
    ],
    prediction_key="result",
)

# The factory itself (not a chain instance) is passed as the chain argument.
# NOTE(review): this rebinds `result`, which earlier cells use for a single QA
# output — re-run those cells before re-using the single-result evaluators.
result = run_on_dataset(
    client,
    dataset_name,
    create_qa_chain,
    evaluation=evaluation_config,
)