In [1]:
import os
import nest_asyncio
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import SeleniumURLLoader, PyPDFLoader, DirectoryLoader
from langchain.chains import RetrievalQA

In [2]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read the Groq credential once; every ChatGroq instance below receives it.
api_key = os.getenv('GROQ_API_KEY')

# Fail fast with an actionable message instead of letting a missing or empty
# key surface later as an opaque error inside a model constructor.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [3]:
# Instantiate the Groq-hosted chat model (llama3-70b-8192) with temperature 0,
# authenticating with the key read from the environment.
llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0,
    groq_api_key=api_key,
)

In [4]:
# Read the files matching *.pdf in the 'pdf' directory, one PyPDFLoader per file.
loader = DirectoryLoader(path='pdf', glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Copy each document's 'source' metadata entry into a 'filename' entry.
for document in documents:
    document.metadata.update(filename=document.metadata['source'])

# Split the documents into chunks of up to 1024 characters with a 64-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts_chunks = text_splitter.split_documents(documents)

# Embed the chunks with a sentence-transformers model and build the Chroma
# vector store, persisting it to the local "db" directory.
# NOTE(review): re-running this cell against an existing "db" directory may
# insert the same chunks again — confirm, and clear or reuse the store if so.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
db = Chroma.from_documents(texts_chunks, embeddings, persist_directory="db")

In [5]:
# Prompt text sent to the model: it instructs the model to answer from the
# supplied context and to reply "I do not know" otherwise. The {context} and
# {question} placeholders are filled in at query time.
template = """Use the provided context to answer the user's question.
If you don't know the answer, respond with "I do not know".

Context: {context}
Question: {question}
Answer:
"""

# Wrap the raw text in a PromptTemplate that declares both placeholders.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=template,
)

In [6]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Patch asyncio so nested event loops are allowed.
# NOTE(review): presumably needed because ragas runs its own event loop inside
# the notebook's already-running loop — confirm.
nest_asyncio.apply()

# Test-set generation uses two Groq-hosted Llama 3 chat models (a smaller
# generator and a larger critic) plus HuggingFace sentence embeddings.
generator_llm = ChatGroq(model_name="llama3-8b-8192", groq_api_key=api_key)
critic_llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=api_key)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Request 10 samples from the loaded documents: half simple questions, a
# quarter reasoning questions and a quarter multi-context questions.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions={
        simple: 0.5,
        reasoning: 0.25,
        multi_context: 0.25,
    },
)

embedding nodes:   0%|          | 0/44 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

max retries exceeded for MultiContextEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=60, max_retries=15, max_wait=90, max_workers=16, exception_types=<class 'Exception'>)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x000001C4341095A0>, nodes=[Node(page_content='Deep Reinforcement Learning with Double Q-learning\nHado van Hasselt andArthur Guez andDavid Silver\nGoogle DeepMind\nAbstract\nThe popular Q-learning algorithm is known to overestimate\naction values under certain conditions. It was not previously\nknown whether, in practice, such overestimations are com-\nmon, whether they harm performance, and whether they can\ngenerally be prevented. In this paper, we answer all these\nquestions afﬁrmatively. In particular, we ﬁrst show that the\nrecent DQN algorithm, which combines Q-learning with a\ndeep neural network, suffers from substantial overestimations\nin some games in the Atari 2600 domain. We then 

In [7]:
# from llama_index import download_loader
# from ragas.testset.evolutions import simple, reasoning, multi_context
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# SemanticScholarReader = download_loader("SemanticScholarReader")
# loader = SemanticScholarReader()
# query_space = "large language models"
# documents = loader.load_data(query=query_space, limit=100)

# # generator with openai models
# generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
# critic_llm = ChatOpenAI(model="gpt-4")
# embeddings = OpenAIEmbeddings()

# generator = TestsetGenerator.from_langchain(
#     generator_llm,
#     critic_llm,
#     embeddings
# )


# distributions = {
#     simple: 0.5,
#     multi_context: 0.4,
#     reasoning: 0.1
# }

# # generate testset
# testset = generator.generate_with_llamaindex_docs(documents, 100,distributions)
# testset.to_pandas()


In [8]:
# Render the generated test set as a pandas DataFrame for inspection.
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,Here is a question that can be fully answered ...,[Deep Reinforcement Learning with Double Q-lea...,Overestimations in Q-learning can negatively a...,simple,True
1,Here is a question that can be fully answered ...,[Deep Reinforcement Learning with Double Q-lea...,Overestimations in Q-learning can negatively a...,simple,True
2,Here is a question that can be fully answered ...,[Deep Reinforcement Learning with Double Q-lea...,Overestimations in Q-learning can negatively a...,simple,True
3,Here is a question that can be fully answered ...,[Deep Reinforcement Learning with Double Q-lea...,The potential problem with the Q-learning algo...,simple,True
4,Here is a question that can be fully answered ...,[Deep Reinforcement Learning with Double Q-lea...,The potential problem with the Q-learning algo...,simple,True
5,Here is a rewritten version of the question th...,[Deep Reinforcement Learning with Double Q-lea...,,reasoning,True
6,Here is a rewritten version of the question th...,[Deep Reinforcement Learning with Double Q-lea...,The key limitation of Q-learning that Double Q...,reasoning,True
7,Here is a rewritten version of the question th...,[Deep Reinforcement Learning with Double Q-lea...,,reasoning,True


In [12]:
# Show the raw repr of the generated test set object.
# NOTE(review): this cell's execution count (12) is out of sequence with the
# cells around it; restart the kernel and run all cells before sharing.
testset

TestDataset(test_data=[DataRow(question='Here is a question that can be fully answered from the given context:\n\n"What is the potential impact of overestimations in Q-learning on the quality of the resulting policy?"\n\nThis question is relevant to the topic of overestimations in Q-learning and their impact on policy performance, and can be answered by referencing the text, which discusses the potential negative effects of overestimations on policy performance, as well as the benefits of reducing overestimations through the use of algorithms like Double Q-learning.', contexts=['Deep Reinforcement Learning with Double Q-learning\nHado van Hasselt andArthur Guez andDavid Silver\nGoogle DeepMind\nAbstract\nThe popular Q-learning algorithm is known to overestimate\naction values under certain conditions. It was not previously\nknown whether, in practice, such overestimations are com-\nmon, whether they harm performance, and whether they can\ngenerally be prevented. In this paper, we answe

In [9]:
from datasets import Dataset 
import nest_asyncio
import os
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

# Hand-written question/answer samples for exercising the ragas metrics.
# Each key is a column; the i-th entries across columns form one sample.
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts': [
        ['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
        ['The Green Bay Packers...Green Bay, Wisconsin.', 'The Packers compete...Football Conference'],
    ],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)

# Patch asyncio so nested event loops are allowed before running the evaluation.
nest_asyncio.apply()

# Pass the notebook's own Groq chat model and HuggingFace embeddings as the
# judge. When `llm`/`embeddings` are omitted, ragas falls back to its OpenAI
# defaults, and this notebook does not configure an OpenAI key explicitly
# (the only assignment was commented out).
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_correctness],
    llm=llm,
    embeddings=embeddings,
)
score.to_pandas()

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Exception in thread Thread-11:
Traceback (most recent call last):
  File "C:\Users\irvin\anaconda3\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 96, in run
    results = self.loop.run_until_complete(self._aresults())
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\nest_asyncio.py", line 98, in run_until_complete
    return f.result()
  File "C:\Users\irvin\anaconda3\lib\asyncio\futures.py", line 201, in result
    raise self._exception.with_traceback(self._exception_tb)
  File "C:\Users\irvin\anaconda3\lib\asyncio\tasks.py", line 232, in __step
    result = coro.send(None)
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 84, in _aresults
    raise e
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 79, in _aresults
    r = awai

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.