In [2]:
import os
import bs4
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Load variables from a .env file (if one is found) into the environment,
# then read the Groq credential; api_key is None when the variable is unset.
load_dotenv()

api_key = os.environ.get('GROQ_API_KEY')

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
def load_llm(model_name="llama3-70b-8192", temperature=0):
    """Build the Groq chat model used throughout the notebook.

    Args:
        model_name: Groq model identifier passed to ``ChatGroq``.
            Defaults to ``"llama3-70b-8192"``.
        temperature: Sampling temperature passed to ``ChatGroq``.
            Defaults to ``0``.

    Returns:
        A ``ChatGroq`` instance created with the module-level ``api_key``.
    """
    return ChatGroq(
        groq_api_key=api_key,
        model_name=model_name,
        temperature=temperature,
    )

In [4]:
# Joined with os.path.join so the separator is correct on every platform.
# A hard-coded backslash only resolves on Windows and depends on an
# unrecognised escape sequence being passed through unchanged.
DEFAULT_PDF_PATH = os.path.join("pdf", "Attention-is-all-you-need.pdf")


def prepare_retriever(pdf_path=DEFAULT_PDF_PATH, chunk_size=1024, chunk_overlap=64):
    """Load a PDF, split it into chunks, embed them and return a retriever.

    Args:
        pdf_path: Path of the PDF to index. Defaults to ``DEFAULT_PDF_PATH``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The retriever obtained from ``Chroma.from_documents(...).as_retriever()``.
    """
    # To index a web page instead, build the loader with WebBaseLoader
    # (already imported at the top of the notebook).
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts_chunks = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    db = Chroma.from_documents(documents=texts_chunks, embedding=embeddings)
    return db.as_retriever()

In [5]:
# Each fragment ends with a space: adjacent string literals are concatenated
# verbatim, so without it the prompt reads "history,generate" / "question,just".
CONTEXTUALIZE_Q_SYSTEM_PROMPT = (
    'Taking into account the chat history and the latest user question that may be referencing the chat history, '
    'generate a new question that can be understood without the chat history. DO NOT answer that question, '
    'just reformulate it if needed and otherwise return it as is.'
)


def generate_history_aware_retriever(llm, retriever):
    """Wrap ``retriever`` so the query is first rewritten using the chat history.

    Args:
        llm: Language model handed to ``create_history_aware_retriever``.
        retriever: Base retriever handed to ``create_history_aware_retriever``.

    Returns:
        The object returned by ``create_history_aware_retriever``.
    """
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", CONTEXTUALIZE_Q_SYSTEM_PROMPT),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [6]:
# Each sentence fragment ends with a space: adjacent string literals are
# concatenated verbatim, so without it the sentences run together
# ("questions.Use", "question.If", ...).
QA_SYSTEM_PROMPT = (
    'You are a helpful assistant that answers questions. '
    'Use the retrieved context to answer the question. '
    'If you do not know the answer your reply should be "I dont know." '
    'Try to keep the answers short unless otherwise specifed by the question.'
    '\n\n'
    '{context}'
)


def create_qa_chain(llm, history_aware_retriever):
    """Build the retrieval chain that answers questions from retrieved context.

    Args:
        llm: Language model handed to ``create_stuff_documents_chain``.
        history_aware_retriever: Retriever handed to ``create_retrieval_chain``.

    Returns:
        The chain returned by ``create_retrieval_chain``.
    """
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", QA_SYSTEM_PROMPT),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    return create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [7]:
# Session id -> chat history; filled lazily by get_session_history.
store = dict()

# Assemble the pipeline stage by stage: chat model, document retriever,
# history-aware query rewriting, then the question-answering chain.
llm = load_llm()
retriever = prepare_retriever()
history_aware_retriever = generate_history_aware_retriever(
    llm=llm,
    retriever=retriever,
)
rag_chain = create_qa_chain(
    llm=llm,
    history_aware_retriever=history_aware_retriever,
)

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``, creating it on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register an empty history for it.
        new_history = ChatMessageHistory()
        store[session_id] = new_history
        return new_history


# Wrap the RAG chain with message-history handling: histories are looked up
# through get_session_history, the user turn is read from "input", prior
# turns are injected as "chat_history", and the reply is taken from "answer".
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="chat_history",
)

  warn_deprecated(


In [8]:
# Evaluation samples: questions, their reference contexts, and an initially
# empty 'answer' column that is filled in by the answer-generation cell.
data_samples = dict(
    question=[
        'What is self-attention?',
        'How many identical layers does the encoder of the transformer have?',
    ],
    answer=[],
    contexts=[
        ['Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.'],
        ['The encoder is composed of a stack of N = 6 identical layers.'],
    ],
)

In [9]:
# Collect the answers in a fresh list and assign it in one step, so re-running
# this cell replaces the 'answer' column rather than appending duplicates
# (which would leave it longer than 'question').
# NOTE(review): every question uses session "abc123", so earlier turns are fed
# back as chat history for later questions - confirm this is intended.
generated_answers = []
for question in data_samples['question']:
    answer = conversational_rag_chain.invoke(
        {"input": question},
        config={
            "configurable": {"session_id": "abc123"}
        },
    )["answer"]
    generated_answers.append(answer)
data_samples['answer'] = generated_answers

In [10]:
# Display the questions, generated answers and reference contexts.
data_samples

{'question': ['What is self-attention?',
  'How many identical layers does the encoder of the transformer have?'],
 'answer': ['Self-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.',
  'The encoder of the transformer has 6 identical layers.'],
 'contexts': [['Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.'],
  ['The encoder is composed of a stack of N = 6 identical layers.']]}

In [20]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness 
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

# Patch asyncio so event loops can be nested inside the notebook's own loop.
import nest_asyncio
nest_asyncio.apply()

# Score the generated answers for faithfulness with ragas, using the
# 'mistral' model through Ollama for both the LLM and the embeddings.
dataset = Dataset.from_dict(data_samples)
score = evaluate(
    dataset,
    metrics=[faithfulness],
    llm=ChatOllama(model='mistral'),
    embeddings=OllamaEmbeddings(model='mistral'),
)
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,faithfulness
0,What is self-attention?,Self-attention is an attention mechanism relat...,"[Self-attention, sometimes called intra-attent...",1.0
1,How many identical layers does the encoder of ...,The encoder of the transformer has 6 identical...,[The encoder is composed of a stack of N = 6 i...,0.5
