In [8]:
import os
import pickle
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Load variables from a local .env file (if present) into the process
# environment before the key is read below.
load_dotenv()

# Groq credential consumed by load_llm(); None when GROQ_API_KEY is not set.
api_key = os.environ.get('GROQ_API_KEY')

In [9]:
def load_llm(model):
    """Build a Groq chat client for the given model name.

    Args:
        model: Model identifier, passed straight through to ``ChatGroq``.

    Returns:
        A ``ChatGroq`` instance configured with the module-level ``api_key``
        and a temperature of 0.2.
    """
    return ChatGroq(
        groq_api_key=api_key,
        model=model,
        temperature=0.2,
    )

In [10]:
def prepare_retriever(pdf_path=None):
    """Index a PDF in a Chroma vector store and return a retriever over it.

    The PDF is loaded with ``PyPDFLoader``, split into chunks of at most 1024
    characters with a 64-character overlap, embedded on CPU with
    ``sentence-transformers/all-mpnet-base-v2`` (normalized embeddings) and
    stored through ``Chroma.from_documents``.

    Args:
        pdf_path: Optional path of the PDF to index. Defaults to
            ``Attention-is-all-you-need.pdf`` inside the ``pdf`` directory,
            relative to the current working directory.

    Returns:
        The object returned by ``Chroma.as_retriever()`` with default settings.
    """
    if pdf_path is None:
        # Built with os.path.join rather than a 'pdf\Attention...' literal:
        # the backslash form contains an invalid escape sequence ('\A') and
        # only acts as a directory separator on Windows.
        pdf_path = os.path.join('pdf', 'Attention-is-all-you-need.pdf')

    pages = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    chunks = splitter.split_documents(pages)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # Stable ids derived from the source path and chunk position. Without
    # explicit ids every call gets fresh random ones, so rebuilding the index
    # in the same kernel (this function runs once per evaluated model) would
    # append duplicate chunks if the default in-memory collection is shared
    # between calls. NOTE(review): confirm the sharing behaviour against the
    # installed chromadb version; stable ids are harmless either way.
    chunk_ids = [f'{pdf_path}#{index}' for index in range(len(chunks))]

    db = Chroma.from_documents(documents=chunks, embedding=embeddings, ids=chunk_ids)
    return db.as_retriever()

In [11]:
def generate_history_aware_retriever(llm, retriever):
    """Wrap ``retriever`` with a question-rewriting step that uses chat history.

    Builds a chat prompt (system instruction, ``chat_history`` placeholder,
    human ``{input}``) that asks the model to restate the latest question so it
    can be understood without the history, and passes it together with ``llm``
    and ``retriever`` to ``create_history_aware_retriever``.

    Args:
        llm: Chat model used to reformulate the question.
        retriever: Underlying document retriever.

    Returns:
        The runnable returned by ``create_history_aware_retriever``.
    """
    # Adjacent string literals are joined verbatim, so every fragment that is
    # followed by another one must end with its own separating space.
    contextualize_q_system_prompt = (
        'Taking into account the chat history and the latest user question that may be referencing the chat history, '
        'generate a new question that can be understood without the chat history. DO NOT answer that question, '
        'just reformulate it if needed and otherwise return it as is.'
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [12]:
def create_qa_chain(llm, history_aware_retriever):
    """Assemble the retrieval + question-answering chain.

    The answering prompt consists of a system message (instructions followed by
    the ``{context}`` slot), the ``chat_history`` placeholder and the human
    ``{input}``. It is combined with ``llm`` through
    ``create_stuff_documents_chain`` and then joined with
    ``history_aware_retriever`` through ``create_retrieval_chain``.

    Args:
        llm: Chat model that produces the answer.
        history_aware_retriever: Retriever supplying the documents for ``{context}``.

    Returns:
        The runnable returned by ``create_retrieval_chain``.
    """
    # Adjacent string literals are joined verbatim, so each sentence carries
    # its own trailing space; otherwise the prompt reads "questions.Use ...".
    # The quoted reply "I dont know." is kept exactly as written because it is
    # the literal text the model is told to emit.
    system_prompt = (
        'You are a helpful assistant that answers questions. '
        'Use the retrieved context to answer the question. '
        'If you do not know the answer your reply should be "I dont know." '
        'Try to keep the answers short unless otherwise specified by the question.'
        '\n\n'
        '{context}'
    )
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    return create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [13]:
def create_model_eval_set(model, data):
    """Answer every question in ``data`` with ``model`` and pickle the result.

    A fresh LLM, retriever and conversational RAG chain are built on each call.
    Every entry of ``data['question']`` is sent through the chain in order, the
    replies are collected in ``data['answer']``, and the whole dict is written
    to ``<model>.pkl`` inside the ``data`` directory (created if missing).

    Args:
        model: Model name handed to ``load_llm``; also used as the chat session
            id and as the stem of the output file name.
        data: Dict containing a ``'question'`` list. Its ``'answer'`` entry is
            replaced in place, so the caller's dict is modified.

    Returns:
        None.

    Note:
        Every question is invoked with the same session id, so all of them
        share one ``ChatMessageHistory`` for the duration of the call.
    """
    # Resolve the output location first so that a path problem surfaces before
    # any model calls are made. os.path.join replaces the former 'data\{model}'
    # literal, which held an invalid escape sequence and was Windows-only.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{model}.pkl')

    store = {}
    llm = load_llm(model)
    retriever = prepare_retriever()
    history_aware_retriever = generate_history_aware_retriever(llm, retriever)
    rag_chain = create_qa_chain(llm, history_aware_retriever)

    def get_session_history(session_id: str) -> BaseChatMessageHistory:
        """Return the history for ``session_id``, creating it on first use."""
        if session_id not in store:
            store[session_id] = ChatMessageHistory()
        return store[session_id]

    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )

    session_config = {"configurable": {"session_id": model}}

    # Reset any answers left over from a previous call on the same dict.
    data['answer'] = []

    for question in data['question']:
        result = conversational_rag_chain.invoke({"input": question}, config=session_config)
        data['answer'].append(result["answer"])

    with open(output_path, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [14]:
# Model names to evaluate; each one is passed to create_model_eval_set().
models = [
    'llama3-70b-8192',
    'llama3-8b-8192',
    'mixtral-8x7b-32768',
    'gemma-7b-it',
    'gemma2-9b-it',
]

# Hard-coded evaluation fixture. 'question', 'contexts' and 'ground_truth' are
# parallel lists with ten entries each (entry i of each list belongs together).
# NOTE(review): the key names look like the column layout of a downstream
# evaluation step — confirm what consumes the pickled dicts.
data_samples = {
    # Prompts sent to the chain, in order, by create_model_eval_set().
    'question': 
        [
            'What is the primary architectural innovation introduced in the "Attention is All You Need" paper?', 
            'How does the Transformer model handle the sequential nature of input data without recurrence?',
            'What is the main advantage of the Transformer model over RNNs?',
            'What are multi-head attention mechanisms in the Transformer model?',
            'How does the Transformer model utilize residual connections?',
            'What optimization method is used to train the Transformer model?',
            'How does the self-attention mechanism work in the Transformer model?',
            'What regularization technique is applied to the Transformer model during training?',
            'How is the final output of the Transformer model produced?',
            'What datasets were used to evaluate the Transformer model?'

        ],
    # Placeholder; create_model_eval_set() replaces this list with the model's replies.
    'answer': 
        [],
    # One single-element list of passage text per question. These are fixed
    # here and never overwritten by create_model_eval_set(), so they are not
    # the chunks the retriever actually returned.
    'contexts' :
        [
            ['In this work, we propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.'], 
            ['To handle the sequential nature of the data, the model adds positional encodings to the input embeddings at the bottoms of the encoder and decoder stacks.'],
            ['The Transformer allows for significantly more parallelization, which allows training on much more data than is possible for RNNs, and reduces the training time considerably.'],
            ['Instead of performing a single attention function with d_model-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dv and dq dimensions, respectively.'],
            ['We employ a residual connection around each of the two sub-layers, followed by layer normalization.'],
            ['We use the Adam optimizer with β1=0.9, β2=0.98 and ε=10−9.'],
            ['In the self-attention mechanism, each position in the sequence attends to all positions, which allows it to draw global dependencies between input and output.'],
            ['We apply dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized.'],
            ["The decoder generates the output sequence one token at a time, with each generated token being conditioned on the previously generated tokens and the encoder's output."],
            ['We evaluate our models on the WMT 2014 English-to-German and English-to-French translation tasks.']
        ],
    # Expected answer text for each question (presumably the reference used
    # for scoring — confirm against the evaluation code).
    'ground_truth' :
        [
            'The Transformer architecture, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.',
            'The Transformer model handles the sequential nature of input data without recurrence by adding positional encodings to the input embeddings at the bottoms of the encoder and decoder stacks.',
            'The main advantage of the Transformer model is significantly more parallelization, allowing for training on much more data and reducing the training time considerably.',
            'Multi-head attention mechanisms involve linearly projecting the queries, keys, and values multiple times with different learned projections to allow the model to jointly attend to information from different representation subspaces.',
            'The Transformer model employs a residual connection around each of the two sub-layers, followed by layer normalization.',
            'The optimization method used to train the Transformer model is the Adam optimizer with β1=0.9, β2=0.98, and ε=10−9.',
            'Each position in the sequence attends to all positions, allowing it to draw global dependencies between input and output.',
            'Dropout is applied to the output of each sub-layer before it is added to the sub-layer input and normalized.',
            "The decoder generates the output sequence one token at a time, conditioned on previously generated tokens and the encoder's output.",
            'The datasets used to evaluate the Transformer model were the WMT 2014 English-to-German and English-to-French translation tasks.',
        ]
}

In [15]:
# Build and pickle one evaluation set per model. The same data_samples dict is
# passed every time; create_model_eval_set() resets its 'answer' list on each
# call, so after the loop it holds the answers of the last model only.
for model in models:
    create_model_eval_set(model, data_samples)