In [17]:
import os
import bs4
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Load variables from a local .env file into the process environment, then
# look up the Groq key there (the lookup yields None when the variable is unset).
load_dotenv()

api_key = os.environ.get('GROQ_API_KEY')

In [18]:
def load_llm(model_name="llama3-70b-8192", temperature=0):
    """Build the Groq-hosted chat model used by the RAG pipeline.

    Args:
        model_name: Groq model identifier. Defaults to the model this
            notebook was originally written against.
        temperature: Sampling temperature handed to ChatGroq. Defaults to 0.

    Returns:
        A ChatGroq instance configured with the module-level ``api_key``.
    """
    return ChatGroq(
        groq_api_key=api_key,
        model_name=model_name,
        temperature=temperature,
    )

In [19]:
def prepare_retriever(pdf_path=os.path.join('pdf', 'Attention-is-all-you-need.pdf')):
    """Load a PDF, chunk it, embed the chunks and return a Chroma retriever.

    Args:
        pdf_path: Path of the PDF to index. The default is built with
            ``os.path.join`` so it resolves on every OS; the previous literal
            ``'pdf\\Attention-...'`` relied on the invalid escape ``\\A`` and
            on a Windows-only separator.

    Returns:
        The retriever produced by ``Chroma.as_retriever()`` over the embedded
        chunks of the PDF.
    """
    # An earlier variant of this notebook loaded a web article through
    # WebBaseLoader instead of a PDF; only the PDF path is supported here.
    documents = PyPDFLoader(pdf_path).load()

    # 1024-character chunks with a 64-character overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    texts_chunks = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # NOTE(review): no persist directory or collection name is given, so
    # calling this function repeatedly in one kernel may add the same chunks
    # to the same default collection again -- TODO confirm against Chroma.
    db = Chroma.from_documents(documents=texts_chunks, embedding=embeddings)
    return db.as_retriever()

In [20]:
def generate_history_aware_retriever(llm, retriever):
    """Wrap ``retriever`` so follow-up questions are rewritten before retrieval.

    Args:
        llm: Chat model used to reformulate the latest user question.
        retriever: Base retriever that receives the reformulated question.

    Returns:
        The runnable built by ``create_history_aware_retriever`` from the
        model, the retriever and the contextualization prompt.
    """
    # Adjacent string literals are concatenated as-is, so every fragment but
    # the last ends with a space; without it the prompt reads
    # "history,generate" and "question,just".
    contextualize_q_system_prompt = (
        'Taking into account the chat history and the latest user question that may be referencing the chat history, '
        'generate a new question that can be understood without the chat history. DO NOT answer that question, '
        'just reformulate it if needed and otherwise return it as is.'
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [21]:
def create_qa_chain(llm, history_aware_retriever):
    """Assemble the retrieval + question-answering chain.

    Args:
        llm: Chat model that writes the final answer.
        history_aware_retriever: Retriever used to fetch the documents that
            are stuffed into the ``{context}`` slot of the system prompt.

    Returns:
        The chain built by ``create_retrieval_chain`` from the retriever and
        the stuff-documents QA chain.
    """
    # Adjacent string literals are joined with no separator, so each sentence
    # carries its own trailing space; otherwise the sentences run together
    # ("questions.Use the ..."). "specifed" was also corrected. The quoted
    # fallback reply "I dont know." is deliberately left untouched.
    system_prompt = (
        'You are a helpful assistant that answers questions. '
        'Use the retrieved context to answer the question. '
        'If you do not know the answer your reply should be "I dont know." '
        'Try to keep the answers short unless otherwise specified by the question.'
        '\n\n'
        '{context}'
    )
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    return create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [22]:
# Maps session_id -> chat history object; entries are created on demand by
# get_session_history. Re-running this cell resets every stored history.
store = {}

# Build the pipeline once, in dependency order:
# LLM -> retriever -> history-aware retriever -> QA chain.
llm = load_llm()
retriever = prepare_retriever()
history_aware_retriever = generate_history_aware_retriever(llm, retriever)
rag_chain = create_qa_chain(llm, history_aware_retriever)

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored for ``session_id``, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh history.
        history = store[session_id] = ChatMessageHistory()
        return history


# Attach per-session chat history to the RAG chain. The key names line up with
# the prompt variables ("input", "chat_history") and with the "answer" entry
# that callers read from the chain's result.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)

In [23]:
# Two-sample evaluation set. 'answer' starts empty and is filled in by the
# RAG chain in a later cell; 'contexts' holds one list of reference passages
# per question, in the same order as 'question'.
data_samples = {
    'question': [
        'What is self-attention?',
        'How many identical layers does the encoder of the transformer have?',
    ],
    'answer': [],
    'contexts': [
        ['Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.'],
        ['The encoder is composed of a stack of N = 6 identical layers.'],
    ],
}
# dataset = Dataset.from_dict(data_samples)
# score = evaluate(dataset,metrics=[faithfulness], llm=ChatOllama(model='mistral'), embeddings=OllamaEmbeddings(model='mistral'))
# score.to_pandas()

In [24]:
# Ask the conversational chain every question and store the answers.
# The list is rebuilt from scratch and then assigned, so re-running this cell
# cannot leave 'answer' longer than 'question' (appending in place did, which
# makes the columns unequal and breaks Dataset.from_dict).
# NOTE(review): every question shares session "abc123", so earlier turns are
# part of the chat history for later ones -- confirm this is intended for an
# evaluation of standalone questions.
answers = []
for question in data_samples['question']:
    result = conversational_rag_chain.invoke(
        {"input": question},
        config={
            "configurable": {"session_id": "abc123"}
        },
    )
    answers.append(result["answer"])
data_samples['answer'] = answers

Task exception was never retrieved
future: <Task finished name='Task-630' coro=<as_completed.<locals>.sema_coro() done, defined at c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py:35> exception=TimeoutError()>
Traceback (most recent call last):
  File "C:\Users\irvin\anaconda3\lib\asyncio\tasks.py", line 232, in __step
    result = coro.send(None)
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\langchain_core\language_models\chat_models.py", line 853, in _agenerate_with_cache
    result = await self._agenerate(
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\langchain_community\chat_models\ollama.py", line 323, in _agenerate
    final_chunk = await self._achat_stream_with_aggregation(
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\langchain_community\chat_models\ollama.py", line 244, in _achat_stream_with_aggregation
    async for stream_resp in self._ac

In [25]:
# Inspect the samples now that the 'answer' column has been filled in.
data_samples

{'question': ['What is self-attention?',
  'How many identical layers does the encoder of the transformer have?'],
 'answer': ["The text doesn't explicitly define what self-attention is, but based on the context, it appears to be a type of layer in a neural network that allows the model to attend to different parts of the input sequence in order to compute a representation of the sequence.",
  'The encoder of the transformer has N=6 identical layers.'],
 'contexts': [['Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.'],
  ['The encoder is composed of a stack of N = 6 identical layers.']]}

In [26]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness 
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

import nest_asyncio
# Patch asyncio so nested event loops are allowed inside the notebook kernel
# before ragas runs its evaluation.
nest_asyncio.apply()

# Score faithfulness of each answer against its contexts, using a local
# Ollama 'mistral' model as both the judge LLM and the embedding model.
# NOTE(review): this notebook's saved output contains a TimeoutError raised
# from the ragas executor; if it recurs, a longer timeout via ragas' run
# configuration may be needed -- TODO confirm against the installed version.
dataset = Dataset.from_dict(data_samples)
score = evaluate(
    dataset,
    metrics=[faithfulness],
    llm=ChatOllama(model='mistral'),
    embeddings=OllamaEmbeddings(model='mistral'),
)
score.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,faithfulness
0,What is self-attention?,The text doesn't explicitly define what self-a...,"[Self-attention, sometimes called intra-attent...",1.0
1,How many identical layers does the encoder of ...,The encoder of the transformer has N=6 identic...,[The encoder is composed of a stack of N = 6 i...,0.333333


In [27]:
from datasets import Dataset 
from ragas.metrics import answer_relevancy
from ragas import evaluate

# data_samples = {
#     'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
#     'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
#     'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'], 
#     ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
# }

# Allow nested event loops in the notebook kernel before running ragas.
nest_asyncio.apply()

# Score answer relevancy for the current data_samples, using a local Ollama
# 'mistral' model as both the judge LLM and the embedding model.
dataset = Dataset.from_dict(data_samples)
score = evaluate(
    dataset,
    metrics=[answer_relevancy],
    llm=ChatOllama(model='mistral'),
    embeddings=OllamaEmbeddings(model='mistral'),
)
score.to_pandas()


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,answer_relevancy
0,What is self-attention?,The text doesn't explicitly define what self-a...,"[Self-attention, sometimes called intra-attent...",0.719229
1,How many identical layers does the encoder of ...,The encoder of the transformer has N=6 identic...,[The encoder is composed of a stack of N = 6 i...,0.781346


In [28]:
# Larger evaluation set: 11 active samples. The four columns are parallel --
# entry i of 'contexts' and 'ground_truth' belongs to entry i of 'question'.
# 'answer' starts empty and is filled in by the loop that follows.
# The commented-out entries are further samples that are currently disabled;
# enable or disable them in all three columns together to keep them aligned.
data_samples = {
    'question': [
        'What is the fundamental mechanism that the Transformer model relies on?',
        'How does the Transformer model handle global dependencies?',
        'What components of traditional models does the Transformer model replace?',
        'How many layers does the Transformer model typically have?',
        'What are the two main sub-layers in each Transformer layer?',
        'What is the role of the multi-head self-attention mechanism in the Transformer model?',
        'What does each sub-layer in the Transformer model use for normalization?',
        'What is added to the input embeddings and output embeddings in the Transformer model?',
        'Why are positional encodings used in the Transformer model?',
        'How are positional encodings generated in the Transformer model?',
        'What is the dimension of the input and output embeddings in the Transformer model?',
        # 'What optimization technique is used during the training of the Transformer model?',
        # 'What regularization technique is employed in the Transformer model?',
        # 'How is scaled dot-product attention computed in the Transformer model?',
        # 'What is multi-head attention in the Transformer model?',
        # 'What advantage does multi-head attention provide?',
        # 'What are the dimensions of the queries, keys, and values in each head of the multi-head attention mechanism?',
        # 'What activation function is used in the feed-forward network of the Transformer model?',
        # 'What is the main advantage of using attention mechanisms over recurrent layers?',
        # 'What metric is used to evaluate the performance of the Transformer model?',
        # 'How does the Transformer model perform compared to previous state-of-the-art models?',
        # 'How is the final output of the Transformer model produced?'
    ],
    'answer': [],
    'contexts': [
        ['Instead of using recurrence, the Transformer uses an attention mechanism to draw global dependencies between input and output.'],
        ['The Transformer uses an attention mechanism to draw global dependencies between input and output.'],
        ['Our model, the Transformer, is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.'],
        ['The Transformer model architecture consists of a stack of six identical layers.'],
        ['Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.'],
        ['The first sub-layer in each layer is a multi-head self-attention mechanism, which allows the model to focus on different parts of the input sequence.'],
        ['We employ a residual connection around each of the two sub-layers, followed by layer normalization.'],
        ['We add positional encodings to the input embeddings and the output embeddings at the bottoms of the encoder and decoder stacks.'],
        ['Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence.'],
        ['We use sine and cosine functions of different frequencies as positional encodings.'],
        ['We use learned embeddings to convert the input tokens and output tokens to vectors of dimension d_model.'],
        # ['We use the Adam optimizer with β1 = 0.9, β2 = 0.98, and ε = 10^-9.'],
        # ['We employ residual dropout, with a rate of P_drop = 0.1.'],
        # ['The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by √dk, and apply a softmax function to obtain the weights on the values.'],
        # ['Multi-head attention consists of several attention layers running in parallel, called heads. Each head has its own set of weights for linear transformations of the queries, keys, and values.'],
        # ['Using multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions.'],
        # ['Each head has a dimension of dk = d_model/h.'],
        # ['We apply the ReLU activation function after the first linear transformation in the feed-forward network.'],
        # ['Attention mechanisms can draw dependencies regardless of their distance in the input or output sequences, which makes them more parallelizable than recurrent layers.'],
        # ['We report results using the BLEU score, a common metric for evaluating the quality of machine-translated text.'],
        # ['Our Transformer model outperforms the previously best reported models on both the WMT 2014 English-to-German and English-to-French translation tasks.'],
        # ["The decoder generates the output sequence one token at a time, with each generated token being conditioned on the previously generated tokens and the encoder's output."]
    ],
    'ground_truth': [
        'The fundamental mechanism is the attention mechanism.',
        'By using an attention mechanism.',
        'It replaces recurrence and convolutions.',
        'It typically has six layers.',
        'A multi-head self-attention mechanism and a fully connected feed-forward network.',
        'It allows the model to focus on different parts of the input sequence.',
        'Layer normalization.',
        'Positional encodings.',
        'To inject information about the relative or absolute position of the tokens in the sequence.',
        'Using sine and cosine functions of different frequencies.',
        'The dimension is d_model.',
        # 'The Adam optimizer.',
        # 'Residual dropout.',
        # 'By computing the dot products of the query with all keys, dividing each by √dk, and applying a softmax function.',
        # 'It consists of several attention layers running in parallel, each with its own set of weights.',
        # 'It allows the model to jointly attend to information from different representation subspaces at different positions.',
        # 'The dimension is dk = d_model/h.',
        # 'The ReLU activation function.',
        # 'They can draw dependencies regardless of their distance and are more parallelizable.',
        # 'The BLEU score.',
        # 'It outperforms the previously best reported models.',
        # "The decoder generates the output sequence one token at a time, conditioned on previously generated tokens and the encoder's output."
    ],
}

# Ask the conversational chain every question and store the answers.
# The list is rebuilt from scratch and then assigned, so re-running this part
# of the cell cannot leave 'answer' longer than the other columns (appending
# in place did, which makes the columns unequal and breaks Dataset.from_dict).
# NOTE(review): every question shares session "abc123", so earlier turns are
# part of the chat history for later ones -- confirm this is intended for an
# evaluation of standalone questions.
answers = []
for question in data_samples['question']:
    result = conversational_rag_chain.invoke(
        {"input": question},
        config={
            "configurable": {"session_id": "abc123"}
        },
    )
    answers.append(result["answer"])
data_samples['answer'] = answers

In [29]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from ragas.metrics.critique import harmfulness

# Allow nested event loops in the notebook kernel before running ragas.
nest_asyncio.apply()

# Score faithfulness on the current data_samples, using a local Ollama
# 'mistral' model as both the judge LLM and the embedding model. Only
# faithfulness is computed here even though more metrics are imported above.
dataset = Dataset.from_dict(data_samples)
score = evaluate(
    dataset,
    metrics=[faithfulness],
    llm=ChatOllama(model='mistral'),
    embeddings=OllamaEmbeddings(model='mistral'),
)
score.to_pandas()

Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness
0,What is the fundamental mechanism that the Tra...,The fundamental mechanism that the Transformer...,"[Instead of using recurrence, the Transformer ...",The fundamental mechanism is the attention mec...,1.0
1,How does the Transformer model handle global d...,The Transformer model handles global dependenc...,[The Transformer uses an attention mechanism t...,By using an attention mechanism.,1.0
2,What components of traditional models does the...,The Transformer model replaces recurrence and ...,"[Our model, the Transformer, is based solely o...",It replaces recurrence and convolutions.,1.0
3,How many layers does the Transformer model typ...,The Transformer model typically has 6 identica...,[The Transformer model architecture consists o...,It typically has six layers.,0.5
4,What are the two main sub-layers in each Trans...,The two main sub-layers in each Transformer la...,[Each layer has two sub-layers. The first is a...,A multi-head self-attention mechanism and a fu...,1.0
5,What is the role of the multi-head self-attent...,The role of the multi-head self-attention mech...,[The first sub-layer in each layer is a multi-...,It allows the model to focus on different part...,1.0
6,What does each sub-layer in the Transformer mo...,Each sub-layer in the Transformer model uses L...,[We employ a residual connection around each o...,Layer normalization.,0.0
7,What is added to the input embeddings and outp...,Positional encodings are added to the input em...,[We add positional encodings to the input embe...,Positional encodings.,1.0
8,Why are positional encodings used in the Trans...,Positional encodings are used in the Transform...,[Since our model contains no recurrence and no...,To inject information about the relative or ab...,1.0
9,How are positional encodings generated in the ...,Positional encodings are generated using sinus...,[We use sine and cosine functions of different...,Using sine and cosine functions of different f...,1.0


In [31]:
# Display the result object of the most recent evaluate() call.
score

{'faithfulness': 0.8636}

In [32]:
# Per-sample breakdown of the most recent evaluation as a DataFrame.
score.to_pandas()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness
0,What is the fundamental mechanism that the Tra...,The fundamental mechanism that the Transformer...,"[Instead of using recurrence, the Transformer ...",The fundamental mechanism is the attention mec...,1.0
1,How does the Transformer model handle global d...,The Transformer model handles global dependenc...,[The Transformer uses an attention mechanism t...,By using an attention mechanism.,1.0
2,What components of traditional models does the...,The Transformer model replaces recurrence and ...,"[Our model, the Transformer, is based solely o...",It replaces recurrence and convolutions.,1.0
3,How many layers does the Transformer model typ...,The Transformer model typically has 6 identica...,[The Transformer model architecture consists o...,It typically has six layers.,0.5
4,What are the two main sub-layers in each Trans...,The two main sub-layers in each Transformer la...,[Each layer has two sub-layers. The first is a...,A multi-head self-attention mechanism and a fu...,1.0
5,What is the role of the multi-head self-attent...,The role of the multi-head self-attention mech...,[The first sub-layer in each layer is a multi-...,It allows the model to focus on different part...,1.0
6,What does each sub-layer in the Transformer mo...,Each sub-layer in the Transformer model uses L...,[We employ a residual connection around each o...,Layer normalization.,0.0
7,What is added to the input embeddings and outp...,Positional encodings are added to the input em...,[We add positional encodings to the input embe...,Positional encodings.,1.0
8,Why are positional encodings used in the Trans...,Positional encodings are used in the Transform...,[Since our model contains no recurrence and no...,To inject information about the relative or ab...,1.0
9,How are positional encodings generated in the ...,Positional encodings are generated using sinus...,[We use sine and cosine functions of different...,Using sine and cosine functions of different f...,1.0


In [33]:
# Look up the 'faithfulness' entry of the result at full float precision.
score['faithfulness']

0.8636363636363636