In [1]:
import os

import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import SeleniumURLLoader, PyPDFLoader, DirectoryLoader
from langchain_groq import ChatGroq
from ragas.run_config import RunConfig
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator

In [2]:
# Load variables via python-dotenv, then read the Groq API key from the
# process environment. `api_key` is None when GROQ_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")

In [3]:
# Chat model used for answering: Llama 3 70B served through Groq, with a low
# temperature setting.
llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0.1,
    groq_api_key=api_key,
)

In [4]:
# Load every PDF found directly under ./pdf.
loader = DirectoryLoader(path='pdf', glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Mirror each document's 'source' metadata under a 'filename' key.
# NOTE(review): presumably needed by the ragas test-set generation cell,
# which consumes `documents` — confirm.
for document in documents:
    document.metadata['filename'] = document.metadata['source']

# Split the documents into overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts_chunks = text_splitter.split_documents(documents)

# Embedding model shared by the vector store and later cells.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Build the persisted vector store once. Calling `Chroma.from_documents` on
# every run would embed and insert all chunks again into the same persisted
# directory, so re-running this cell duplicated the index contents. When a
# non-empty store already exists we reopen it instead.
# Delete the `db` directory to force a rebuild after the PDFs change.
persist_directory = "db"
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    db = Chroma.from_documents(texts_chunks, embeddings, persist_directory=persist_directory)

In [5]:
# Prompt sent to the model: it must answer from the supplied context only and
# fall back to "I do not know" otherwise. `{context}` and `{question}` are the
# two placeholders filled in at query time.
template = (
    "Use the provided context to answer the user's question.\n"
    "If you don't know the answer, respond with \"I do not know\".\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:\n"
)

prompt = PromptTemplate(input_variables=['context', 'question'], template=template)

In [6]:
# Generate a synthetic evaluation test set from the loaded PDFs with ragas.
# TestsetGenerator, RunConfig, simple, reasoning and multi_context are
# imported in the notebook's first cell.

# Generator and critic are both Groq-hosted Llama 3 models (8B and 70B).
generator_llm = ChatGroq(groq_api_key=api_key, model_name="llama3-8b-8192")
critic_llm = ChatGroq(groq_api_key=api_key, model_name="llama3-70b-8192")

# Reuse `embeddings` from the vector-store cell: it is the same
# all-mpnet-base-v2 model, so there is no need to load it a second time.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Throttle ragas' job runner. The previous run of this cell died with
# ExceptionInRunner; the saved traceback is truncated, so the root cause is
# not visible. NOTE(review): presumably provider rate limiting under ragas'
# default parallelism — confirm, and tune max_workers accordingly.
run_config = RunConfig(max_workers=2)

# Mix of question types; the weights sum to 1.
distributions = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions=distributions,
    run_config=run_config,
)

embedding nodes:   0%|          | 0/44 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

Exception in thread Thread-9:
Traceback (most recent call last):
  File "C:\Users\irvin\anaconda3\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 96, in run
    results = self.loop.run_until_complete(self._aresults())
  File "C:\Users\irvin\anaconda3\lib\asyncio\base_events.py", line 649, in run_until_complete
    return future.result()
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 84, in _aresults
    raise e
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 79, in _aresults
    r = await future
  File "C:\Users\irvin\anaconda3\lib\asyncio\tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "c:\Users\irvin\Documents\GitHub\support-chatbot\venv\lib\site-packages\ragas\executor.py", line 38, in sema_coro
    return 

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [7]:
# Render the generated test set as a DataFrame (last expression is displayed).
testset_df = testset.to_pandas()
testset_df

NameError: name 'testset' is not defined