In [96]:
import os
from kaggle import api 
from dotenv import load_dotenv
from app.pipelines.workflow import setup_preloaded_chroma
from app.modules.qa_chain_composer.utils import create_multi_modal_query

from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from tqdm import tqdm

In [28]:
load_dotenv()
os.environ['ROOT_DIR'] = "./"

In [5]:
if not os.path.exists("./data/doc_dataset"):
    api.dataset_download_files('ivanhusarov/the-batch-articles-initial', path='./data/doc_dataset', unzip=True)
if not os.path.exists("./data/image_descriptions_dataset"):
    api.dataset_download_files('ivanhusarov/the-batch-articles-image-descriptions', path='./data/image_descriptions_dataset', unzip=True)
if not os.path.exists("./chroma_preloaded"):
    api.dataset_download_files('ivanhusarov/the-batch-rag-chroma', path='chroma_preloaded', unzip=True)

In [29]:
vector_store = setup_preloaded_chroma()

In [31]:
llm = ChatOpenAI(model='gpt-4o', temperature=0)

retriever = vector_store.as_retriever(search_kwargs={"k": 10})

compressor = CohereRerank(model="rerank-english-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [32]:

template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""
prompt = PromptTemplate(
    template=template, 
    input_variables=[
        'context', 
        'question',
    ]
)

qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                    retriever=compression_retriever,     
                                    return_source_documents=True,
                                    chain_type_kwargs={"prompt": prompt}
)

In [39]:
# question = "Explain the misclassification in AI, mentioning the bias in ML models?"
# query = create_multi_modal_query(question=question, image=None)
# answer = qa_chain.invoke({"query": query})
# answer

In [46]:
import random
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from datasets import Dataset
from pydantic import BaseModel, Field

# Define the schema for output
class QAFormat(BaseModel):
    question: str = Field(default=None, description="Generated question about the context")
    answer: str = Field(default=None, description="Generated answer on question from the context")

qa_template = """\
You are a University Professor creating a test for advanced students. Create a question that is specific to the context. Avoid creating generic or general questions. Don't say I don't know.
Consider the following context: {context}


Return your response as a JSON object with the following structure:
{{
    "question": a question about the context.
    "answer": an answer from the context.
}}

Always return your response as a JSON object.
"""

full_doc_list = vector_store.get()["documents"]
context = full_doc_list[0]

parser = JsonOutputParser(pydantic_object=QAFormat)

prompt = PromptTemplate(
    template=qa_template,
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

{'question': 'What was the surprising outcome of the auction for the painting created by the AI-powered robot Ai-Da?',
 'answer': 'A painting of mathematician Alan Turing created by the AI-powered robot Ai-Da sold at Sotheby’s for $1.1 million, far exceeding initial estimates.'}

In [52]:
from random import sample

random.seed(42)

full_doc_list = vector_store.get()["documents"]
processed_sample = sample(full_doc_list, 20)

synthetic_questions = []
synthetic_ground_truths = []
contexts = []

for sample_entry in processed_sample:
    result = chain.invoke({"context": sample_entry})
    synthetic_ground_truths.append(result["answer"])
    synthetic_questions.append(result["question"])
    contexts.append(sample_entry)

questions = synthetic_questions
ground_truths = synthetic_ground_truths
# references = processed_sample
# answers = []
# contexts = []

# # # Inference
# # for query in questions:
# #     qa_response = qa_chain.invoke({"query": query})
# #     answers.append(qa_response['result'])
# #     contexts.append([docs.page_content for docs in qa_response['source_documents']])

# # To dict
data = {
    "question": questions,
    # "answer": answers,
    # "reference": references,
    "contexts": contexts,
    "ground_truth": ground_truths
}

# # Convert dict to dataset
dataset = Dataset.from_dict(data)

In [53]:
dataset[3]

{'question': 'What is the key advancement in the new architecture that allows it to outperform a fully trained AlexNet model with only a few labeled samples?',
 'contexts': 'The power of deep learning is blunted in domains where labeled training data is scarce.But that may be changing, thanks to a new architecture that recognizes images with high accuracy based on few labeled samples.What’s new: Researchers devised a network that, given a small number of labeled images, can learn enough from unlabeled images to outperform a fully trained AlexNet model.',
 'ground_truth': 'The new architecture can learn enough from unlabeled images to achieve high accuracy, even with a small number of labeled images.'}

In [115]:
import dspy
from dspy.teleprompt import BootstrapFewShot
from typing import List, Dict
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate import SemanticF1, f1_score, DecompositionalSemanticRecallPrecision


In [103]:
from dspy import OpenAI

model = OpenAI(model='gpt-4o', max_tokens=400)
dspy.settings.configure(lm=model)


In [62]:
# class QuestionAnswerSignature(dspy.Signature):
#     """Retrieve the most relevant answer to a given question using available context."""
#     question = dspy.InputField()
#     context = dspy.InputField(desc="Supporting information to answer the question")
#     answer = dspy.OutputField(desc="Concise and accurate answer to the question")



In [57]:
STARTING_SYSTEM_PROMPT = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""


In [104]:
qa_signature = dspy.Signature("question, context -> answer")
QA_program = dspy.ChainOfThought(qa_signature)

In [124]:
class NewDecompositionalSemanticRecallPrecision(DecompositionalSemanticRecallPrecision):
    recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response. Please, return only float number without text and explanation.")
    precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth. Please, return only float number without text and explanation.")

class NewSemanticF1(SemanticF1):
    def __init__(self, threshold=0.66, decompositional=False):
        self.threshold = threshold
        self.module = dspy.ChainOfThought(NewDecompositionalSemanticRecallPrecision)


    def forward(self, example, pred, trace=None):
        scores = self.module(question=example["question"], ground_truth=example["response"], system_response=pred["response"])
        score = f1_score(float(scores.precision), float(scores.recall))

        return score if trace is None else score >= self.threshold

In [141]:
rag_prompt = """You are an advanced retrieval and generation assistant, trained to provide **detailed, accurate, and contextually relevant answers**. Leverage the provided **context**, user inputs, and embedded memory to construct logically sound and comprehensive responses.

### Task:
1. **Understand the Query**: Analyze user inputs to discern intent and determine the best retrieval and synthesis strategy.
2. **Employ Chain of Thought (CoT)**: Break down reasoning step by step to ensure depth, accuracy, and coherence in the response.
3. **Leverage Context**: Integrate contextual information with retrieved knowledge to ensure the answer aligns with the user's requirements.
4. **Generate Outputs Dynamically**: Tailor the response to the input format (text, image description, or both).

### Input Schema:
- **Context**: {context}
- **Text Input**: {text_input} (optional)
- **Image Description**: {image_description} (optional)

### Instructions by Scenario:
1. **Only Text Provided**:
   - Focus exclusively on the text input.
   - Extract key details from the context to complement the analysis.
2. **Only Image Description Provided**:
   - Analyze the described image thoroughly.
   - Use the context to enhance the interpretation of the visual elements.
3. **Both Text and Image Description Provided**:
   - Synthesize information from both inputs for a cohesive and nuanced response.

### Chain of Thought (CoT) Reasoning:
- **Step 1**: Analyze all inputs to determine the user’s intent.
- **Step 2**: Identify the required knowledge domains and retrieve relevant information.
- **Step 3**: Verify the sufficiency of retrieved information against the context and inputs.
- **Step 4**: Formulate a detailed, coherent, and accurate response.

### Final Response:
- Structure: Ensure the output is clear, precise, and aligned with user intent.
- Depth: Provide additional insights or actionable recommendations where appropriate.
- Clarity: Avoid unnecessary jargon while maintaining technical accuracy.

---

### Example Queries for Different Scenarios

#### **Scenario 1: User Inputs Only Text**
**Context**: Recent advancements in quantum computing.
**Text Input**: "Explain the impact of qubits on computational speed."
**Response**: (Use CoT reasoning to address the question in depth, referencing the context on quantum computing.)

#### **Scenario 2: User Inputs Only Image Description**
**Context**: Environmental conservation initiatives.
**Image Description**: "A detailed image of a river surrounded by deforested land."
**Response**: (Analyze the environmental implications using the context and interpret the visual information.)

#### **Scenario 3: User Inputs Both Text and Image Description**
**Context**: Climate change and renewable energy.
**Text Input**: "What are the benefits of solar panels in urban areas?"
**Image Description**: "An image of a rooftop covered with solar panels."
**Response**: (Synthesize both inputs to create a comprehensive answer about urban solar panel benefits.)

---

Return your response as a JSON object with the following structure:
{{
    "answer": an answer from the context.
}}

Always return your response as a JSON object.
"""

In [142]:
class QAFormat_final(BaseModel):
    answer: str = Field(default=None, description="Generated answer on question from the context")

In [143]:
parser = JsonOutputParser(pydantic_object=QAFormat_final)

prompt = PromptTemplate(
    template=rag_prompt,
    input_variables=["context", "text_input", "image_description"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

In [145]:
chain.invoke({"context": dataset[0]['contexts'], "text_input": dataset[0]['question'], "image_description": ""})

{'answer': 'The Allen Institute for AI offers CoViz, an interactive network that visualizes relationships among concepts present in the COVID-19 Open Research Dataset.'}

In [146]:
metric = NewSemanticF1()

f1_metric_arr = []
for entry in tqdm(dataset):

    response = chain.invoke({"context": entry['contexts'], "text_input": entry['question'], "image_description": ""})
    # response = QA_program(question=entry['question'], context=entry['contexts'])
    f1_metric_arr.append(metric({"question": entry['question'], "response": entry['ground_truth']}, {"response": response['answer']} ))

100%|██████████| 20/20 [02:02<00:00,  6.11s/it]


In [147]:
import numpy as np

np.average(f1_metric_arr)

0.7556905019294613