In [36]:
import chromadb
import os
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import os
from langchain.document_loaders import DirectoryLoader
from dotenv import load_dotenv

In [37]:
# Load every .txt article from the local folder as a LangChain document.
loader = DirectoryLoader(
    './new_articles/',
    glob="./*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

# Split the documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Quick sanity check: show the second chunk.
print(texts[1])

page_content='1.   Pair up with a partner. Assign one person as the interviewer and the other as the candidate\n 2.   Interviewer – take about 5 minutes to read through the case in your head. Then read the prompt to the\n      candidate\n 3.   Candidate – Note on paper important details from the prompt. Ask clarifying questions, then ask the\n      interviewer if you can take about 1 minute to draw up a structured problem-solving approach. Then talk\n      through your structure with the interviewer and ask for any additional information you think might be helpful\n 4.   Interviewer – Listen carefully to candidate’s structure and logic. Are there any crucial pieces he/she is\n      missing? Is he/she going down the right track? If not, try to lead the candidate in the right direction. Provide\n      additional information only when the candidate asks for them. Then go through the questions one by one,\n      providing the exhibits as appropriate' metadata={'source': 'new_articles/Yale_

In [38]:
# Parallel lists, one entry per chunk, in the order the splitter produced them.
metadata_list = [chunk.metadata for chunk in texts]
text_content_list = [chunk.page_content for chunk in texts]
# Ids are 1-based: "doc1", "doc2", ...
id_list = [f"doc{position}" for position in range(1, len(texts) + 1)]

In [39]:

class Rag:
    """Hold the selected embedding/LLM names and build the embedding function.

    Both values are stored as given; the embedding name is compared against
    the literal labels "GoogleGenAI Embedding", "OpenAI Embedding" and
    "jinaAi Embedding" when building the embedding function.
    """

    def __init__(self, embedding_name, llm):
        """Store the chosen embedding label and LLM label."""
        self.embedding_name = embedding_name
        self.llm = llm

    def get_embedding_name(self):
        """Return the embedding label passed to the constructor."""
        return self.embedding_name

    def get_llm(self):
        """Return the LLM label passed to the constructor."""
        return self.llm

    def generate_embedding(self):
        """Build the Chroma embedding function matching ``embedding_name``.

        API keys are read from the environment after ``load_dotenv()`` has
        run (variables ``GOOGLE_API_KEY``, ``OPENAI_API_KEY``, ``Jina_Api``).

        Returns:
            The embedding function for the selected provider, or ``None``
            when ``embedding_name`` is not one of the known labels.
        """
        load_dotenv()

        if self.embedding_name == "GoogleGenAI Embedding":
            return embedding_functions.GoogleGenerativeAiEmbeddingFunction(
                api_key=os.getenv("GOOGLE_API_KEY")
            )

        if self.embedding_name == "OpenAI Embedding":
            return embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.getenv("OPENAI_API_KEY"),
                model_name="text-embedding-ada-002",
            )

        if self.embedding_name == "jinaAi Embedding":
            # Return the Jina function directly; the previous code built it
            # under one name and returned a different, unassigned local.
            return embedding_functions.JinaEmbeddingFunction(
                api_key=os.getenv("Jina_Api"),
                model_name="jina-embeddings-v2-base-en",
            )

        return None






In [23]:
from ipywidgets import widgets

# Labels offered by the two selection widgets.
embd_options = ["GoogleGenAI Embedding", "OpenAI Embedding", "jinaAi Embedding"]
llm_options = ["Gemini", "OpenAI"]

dropdown_1 = widgets.Dropdown(description='Select Embedding:', options=embd_options)
dropdown_2 = widgets.Dropdown(description='Select LLm:', options=llm_options)

# Last recorded selections; written by on_dropdowns_change.
selected_embedding = selected_llm = None

def on_dropdowns_change(change):
    """Copy the current value of both dropdowns into the module-level selections.

    ``change`` is the widget event payload; it is not inspected because both
    dropdowns are re-read on every call.
    """
    global selected_embedding, selected_llm
    selected_embedding, selected_llm = dropdown_1.value, dropdown_2.value

def display_dropdowns():
    """Render the embedding dropdown followed by the LLM dropdown."""
    for widget in (dropdown_1, dropdown_2):
        display(widget)

def access_selected_values():
    """Return the last recorded ``(selected_embedding, selected_llm)`` pair."""
    current_selection = (selected_embedding, selected_llm)
    return current_selection

# Re-record the selections whenever either dropdown's value changes.
dropdown_1.observe(on_dropdowns_change, names='value')
dropdown_2.observe(on_dropdowns_change, names='value')

# Set default values for dropdowns
dropdown_1.value = embd_options[0]
dropdown_2.value = llm_options[0]

# A Dropdown already starts on its first option, so the assignments above do
# not change the value and the observers are not notified. Record the defaults
# explicitly so access_selected_values() does not return (None, None) when the
# notebook is run top to bottom without touching the widgets.
on_dropdowns_change(None)

display_dropdowns()

Dropdown(description='Select Embedding:', options=('GoogleGenAI Embedding', 'OpenAI Embedding', 'jinaAi Embedd…

Dropdown(description='Select LLm:', options=('Gemini', 'OpenAI'), value='Gemini')

In [47]:
# Build the configuration object from the current dropdown selections.
embd, llm = access_selected_values()
rag_instance = Rag(embedding_name=embd, llm=llm)

print("Embedding data:", rag_instance.get_embedding_name())
print("LLm data:", rag_instance.get_llm())

Embedding data: GoogleGenAI Embedding
LLm data: Gemini


In [41]:
# Module-level LLM label; RAG_from_scratch.generate_completion compares it
# against "Gemini" / "OpenAI" to pick the backend. This is the same value the
# earlier cell already unpacked into `llm` from access_selected_values().
llm=rag_instance.get_llm()



ChromaDB Creation


In [42]:
# Chroma client used for every collection operation in the cells below.
# NOTE(review): created with no arguments, so presumably nothing is persisted
# to disk between kernel sessions; confirm that is intended.
chroma_client = chromadb.Client()

In [43]:
# Start from a clean slate: drop the collection if a previous run created it.
collection_name = "Articles"
existing_collections = [
    existing.name for existing in chroma_client.list_collections()
]

if collection_name not in existing_collections:
    print("No collection found")
else:
    chroma_client.delete_collection(collection_name)
    print(f"Info: Existing collection '{collection_name}' deleted.")

Info: Existing collection 'Articles' deleted.


In [44]:

# Embedding function chosen through the Rag configuration object.
article_embedding_fn = rag_instance.generate_embedding()

vector_store = chroma_client.get_or_create_collection(
    name="Articles",
    embedding_function=article_embedding_fn,
)

# Index every chunk together with its id and source metadata.
vector_store.add(
    ids=id_list,
    documents=text_content_list,
    metadatas=metadata_list,
)

In [59]:
# results = vector_store.query(
#         query_texts="What is the amount of floss used in America",
#         n_results=2
#     )
# print(results)

{'ids': [['doc81', 'doc71']], 'distances': [[0.48240000009536743, 0.5079999566078186]], 'metadatas': [[{'source': 'new_articles/Yale_YGCC_Life_Sciences_Casebook_2014_1.txt'}, {'source': 'new_articles/Yale_YGCC_Life_Sciences_Casebook_2014_1.txt'}]], 'embeddings': None, 'documents': [['Market Penetration\nPercent of Revenue\n\n\n\n\n                                                  Net Profit                          35%\n                     70%                65%       Distribution                        30%\n                     60%                          Production                          25%\n                              30%\n                     50%                          R&D                                 20%\n\n                     40%                                                              15%', 'Market                                                                                                      Competitive                 Culture\n         for Drug           

Evaluation with TruLens

In [45]:
import openai
from openai import OpenAI

# Key is read from the environment; it is None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [46]:
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
# TruLens workspace object; a later cell uses it to launch the dashboard
# via tru.run_dashboard().
tru = Tru()

In [31]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


class RAG_from_scratch:
    """Minimal retrieve-then-generate pipeline with TruLens-instrumented steps.

    Depends on two notebook globals: ``vector_store`` (the Chroma collection
    that is queried) and ``llm`` (the backend label, "Gemini" or "OpenAI").
    """

    @staticmethod
    def _build_prompt(query: str, context_str: list) -> str:
        """Return the prompt text embedding the retrieved context and the question."""
        return (
            f"We have provided context information below. \n"
            f"---------------------\n"
            f"{context_str}"
            f"\n---------------------\n"
            f"Given this information, please answer the question: {query}"
        )

    @instrument
    def retrieve(self, query: str) -> list:
        """
        Fetch the two closest chunks for ``query`` from the vector store.
        """
        hits = vector_store.query(query_texts=query, n_results=2)
        # Results are nested one list per query text; a single query was sent.
        return hits['documents'][0]

    @instrument
    def generate_completion(self, query: str, context_str: list) -> str:
        """
        Answer ``query`` from ``context_str`` using the backend named by ``llm``.

        Prints "no llm" and returns None when ``llm`` is neither "Gemini"
        nor "OpenAI".
        """
        if llm == "Gemini":
            import google.generativeai as genai
            prompt = self._build_prompt(query, context_str)
            gemini = genai.GenerativeModel('gemini-pro')
            reply = gemini.generate_content([{"role": "user", "parts": [prompt]}])
            return reply.text

        if llm == "OpenAI":
            import openai
            openai.api_key = OPENAI_API_KEY
            prompt = self._build_prompt(query, context_str)
            chat = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=[{"role": "user", "content": prompt}],
            )
            return chat.choices[0].message.content

        print("no llm")

    @instrument
    def query(self, query: str) -> str:
        """Retrieve context for ``query`` and return the generated answer."""
        retrieved_chunks = self.retrieve(query)
        return self.generate_completion(query, retrieved_chunks)


rag = RAG_from_scratch()

In [65]:
# rag_instance = RAG_from_scratch()
# my_query = "how to manage project"
# result = rag_instance.query(my_query)
# print(result)

In [32]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np

# Initialize provider class
# Feedback provider backed by OpenAI, shared by all three metrics.
fopenai = fOpenAI()
grounded = Groundedness(groundedness_provider=fopenai)

# Selectors into the recorded `retrieve` call: its `query` argument and the
# collected return values (the retrieved chunks).
question_selector = Select.RecordCalls.retrieve.args.query
context_selector = Select.RecordCalls.retrieve.rets.collect()

# Groundedness: retrieved context vs. the final answer.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(context_selector)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: the question vs. the final answer.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(question_selector)
    .on_output()
)

# Context relevance: the question vs. each retrieved chunk, averaged.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(question_selector)
    .on(context_selector)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.app.retrieve.args.query .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.app.retrieve.args.query .
✅ In Context Relevance, input statement will be set to __record__.app.retrieve.rets.collect() .


In [33]:
from trulens_eval import TruCustomApp
# Wrap the RAG app so each recorded call is scored by the three feedbacks.
tru_rag = TruCustomApp(
    rag,
    app_id='RAG v3',
    feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance],
)

In [34]:
# Run one question through the app inside the TruCustomApp context (bound to
# `recording`); the returned answer is not kept here.
with tru_rag as recording:
    rag.query("What is the amount of floss used in America")

In [35]:
# Launch the TruLens dashboard from the `tru` workspace object; the cell
# output below reports the URL it is served at.
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.10:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>