## Connect to LLM and get example question

In [2]:
import os
from mistralai import Mistral

# Read the key from the environment so it never appears in the notebook.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-medium-latest"

client = Mistral(api_key=api_key)

# Single-turn conversation: one user message, no system prompt.
chat_response = client.chat.complete(
    model=model,
    messages=[{"role": "user", "content": "What is the largest lake in California?"}],
)

# Keep the text of the first choice in `answer` and show it.
answer = chat_response.choices[0].message.content
print(answer)

The largest lake in California is **Salton Sea**, located in the southern part of the state.

- **Surface Area:** Approximately 343 square miles (though it fluctuates due to evaporation and water inflow).
- **Formation:** It was created accidentally in 1905 when an irrigation canal breach caused the Colorado River to flood the Salton Basin for nearly two years.
- **Current Status:** The Salton Sea is a saline lake with no natural outlet, and its water levels have been declining due to reduced agricultural runoff and drought conditions.

While **Lake Tahoe** is more famous and deeper, the Salton Sea holds the title for the largest lake by surface area in California.


In [1]:
from dataclasses import dataclass

from jinja2 import Template


@dataclass
class Prompts:
    """Store prompt constants using jinja2 templates for dynamic content generation.

    Both attributes are annotated so that ``dataclass`` registers each of them as
    a field; a template can therefore be overridden per instance, e.g.
    ``Prompts(consistency=Template("..."))``.

    Attributes:
        self_reflection: Template for the two-round self-reflection prompt. It
            renders the ``question`` and ``answer`` variables and asks for a
            verdict letter: (A) Correct, (B) Incorrect, (C) I am not sure.
        consistency: Template that asks for an answer in the
            ``explanation: ..., answer: ...`` format. It renders only the
            ``question`` variable.
    """

    self_reflection: Template = Template("""
    1. Question: {{ question }}, Proposed Answer: {{ answer }}
    Is the proposed answer: (A) Correct (B) Incorrect (C) I am not sure.
    The output should strictly use the following template:
    explanation: [insert analysis], answer: [choose one letter from among choices A through C]

    2. Question: {{ question }}, Proposed Answer: {{ answer }}
    Are you really sure the proposed answer is correct?
    Choose again: (A) Correct (B) Incorrect (C) I am not sure.
    The output should strictly use the following template:
    explanation: [insert analysis], answer: [choose one letter from among choices A through C]
    """)

    # Annotated like `self_reflection`; without the annotation this would be a
    # plain class attribute that the dataclass machinery ignores.
    consistency: Template = Template("""
    Please strictly use the following template to provide answer:
    explanation: [insert step-by-step analysis], answer: [provide
    your answer] + Question: {{ question }}
    """)


In [None]:
from typing import List
import numpy as np
import os
from mistralai import Mistral
import re
import ollama


class NLIClient:
    """Wrapper around a Hugging Face text-classification pipeline used for NLI.

    Instances are callable: ``client(text)`` forwards ``text`` to the wrapped
    pipeline, so the client can be used wherever a bare pipeline is expected.

    Attributes:
        model: Short name of the NLI model in use.
        device: Device argument handed to ``transformers.pipeline``.
        supported_models: Short model names this client knows how to load.
        pipeline: The loaded ``transformers`` pipeline.
    """

    # Short model name -> Hugging Face Hub model id.
    _HF_MODEL_IDS = {"nli-deberta-v3-small": "cross-encoder/nli-deberta-v3-small"}

    def __init__(self, model: str = "nli-deberta-v3-small", device=None):
        """Validate ``model`` and load the corresponding pipeline.

        Args:
            model: Short model name; must be one of ``supported_models``.
            device: Passed through to ``transformers.pipeline``. ``None``
                (default) leaves the choice of device to transformers instead
                of pinning a specific accelerator ordinal.

        Raises:
            NotImplementedError: If ``model`` is not supported.
        """
        self.supported_models = list(self._HF_MODEL_IDS)
        # Validate before loading: building the pipeline is the expensive step,
        # so an unsupported name should fail without paying for it.
        if model not in self.supported_models:
            raise NotImplementedError(
                f"NLI Pipeline not implemented for model: {model!r}. "
                f"Supported models: {self.supported_models}"
            )
        self.model: str = model
        self.device = device
        self.pipeline = self.get_nli_pipeline(model)

    def get_nli_pipeline(self, model="nli-deberta-v3-small"):
        """Build the text-classification pipeline for ``model``.

        Args:
            model: Short model name (a key of ``_HF_MODEL_IDS``).

        Returns:
            A ``transformers`` pipeline created with ``top_k=None``.

        Raises:
            NotImplementedError: If ``model`` has no known Hub id.
        """
        if model not in self._HF_MODEL_IDS:
            raise NotImplementedError(
                f"NLI Pipeline not implemented for model: {model!r}. "
                f"Supported models: {list(self._HF_MODEL_IDS)}"
            )

        # Imported lazily so that merely defining the class does not require
        # transformers to be importable.
        from transformers import pipeline

        return pipeline(
            "text-classification",
            model=self._HF_MODEL_IDS[model],
            device=getattr(self, "device", None),
            top_k=None,
        )

    def __call__(self, text, **kwargs):
        """Run the wrapped pipeline on ``text``.

        Args:
            text: Input forwarded unchanged to the pipeline.
            **kwargs: Extra call-time options forwarded to the pipeline
                (for example tokenizer options such as ``truncation=True``).

        Returns:
            Whatever the wrapped pipeline returns for ``text``.
        """
        return self.pipeline(text, **kwargs)


class LLMClient:
    """Send a single-turn prompt to an LLM backend and return the reply text.

    Attributes:
        client: Name of the backend, ``"mistral"`` or ``"ollama"``.
        model: Model identifier passed to the backend.
        get_temp: Per-backend mapping from temperature level (``"min"`` /
            ``"max"``) to the numeric sampling temperature.
        supported_clients: Backend names accepted by this class.
    """

    def __init__(self, client: str = "mistral", model: str = "mistral-medium-latest"):
        """Configure the backend.

        Args:
            client: Backend name; must be one of ``supported_clients``.
            model: Model identifier for that backend. The default is a Mistral
                model name, so pass an explicit model when using ``"ollama"``.

        Raises:
            NotImplementedError: If ``client`` is not supported.
        """
        self.client: str = client
        self.model: str = model
        self.get_temp: dict = {
            "mistral": {"min": 0.0, "max": 1.0},
            "ollama": {"min": 0.0, "max": 1.0},
        }
        self.supported_clients = ["mistral", "ollama"]
        if self.client not in self.supported_clients:
            raise NotImplementedError(f"Client {self.client} is not implemented.")

    def chat(self, question: str, temperature: str):
        """Ask ``question`` at the named temperature level.

        Args:
            question: Prompt text sent as the user message.
            temperature: Temperature level, a key of ``get_temp[client]``
                (``"min"`` or ``"max"``).

        Returns:
            The reply text produced by the backend.

        Raises:
            NotImplementedError: If ``self.client`` has no backend handler.
            KeyError: If ``temperature`` is not a known level for the backend.
        """
        levels = self.get_temp.get(self.client)
        if levels is None:
            # Fail loudly instead of silently returning None.
            raise NotImplementedError(f"Client {self.client} is not implemented.")
        if temperature not in levels:
            raise KeyError(
                f"Unknown temperature level {temperature!r} for client "
                f"{self.client!r}; expected one of {sorted(levels)}"
            )
        value = levels[temperature]

        if self.client == "mistral":
            return self.get_mistral_answer(question, value)
        if self.client == "ollama":
            return self.get_ollama_answer(question=question, temperature=value)
        raise NotImplementedError(f"Client {self.client} is not implemented.")

    def get_ollama_answer(self, question: str, temperature: float):
        """Generate a reply with a local Ollama model.

        Args:
            question: Prompt text.
            temperature: Numeric sampling temperature.

        Returns:
            The ``response`` entry of the Ollama generate result.
        """
        # Sampling parameters are passed through `options`; `generate` has no
        # direct `temperature` keyword.
        response = ollama.generate(
            model=self.model,
            prompt=question,
            options={"temperature": temperature},
        )
        return response["response"]

    # Backward-compatible alias for the original (misspelled) method name.
    get_ollama_anser = get_ollama_answer

    def get_mistral_answer(self, question: str, temperature: float):
        """Generate a reply with the Mistral chat-completion API.

        The API key is read from the ``MISTRAL_API_KEY`` environment variable
        at call time.

        Args:
            question: Prompt text sent as the single user message.
            temperature: Numeric sampling temperature.

        Returns:
            The message content of the first returned choice.
        """
        client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
        chat_response = client.chat.complete(
            model=self.model,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": question,
                },
            ],
        )
        return chat_response.choices[0].message.content


class NoPinocchio:
    """Score how much confidence to place in an LLM's answer to a question.

    ``get_confidence`` combines two signals:

    * observed consistency ``O``: the mean, over ``k`` answers sampled at the
      "max" temperature, of ``alpha * s_i + (1 - alpha) * r_i`` where ``s_i``
      is derived from NLI contradiction probabilities in both directions and
      ``r_i`` is 1 when the sampled answer equals the reference answer exactly;
    * self-reflection ``S``: the mean of the verdict letters the LLM gives when
      asked whether its own answer is correct (A=1.0, B=0.0, C=0.5).

    The final score is ``beta * O + (1 - beta) * S``.

    NOTE(review): ``get_consistency_prompt`` is not used by
    ``_get_k_responses``; sampled answers are generated from the raw question.
    """

    def __init__(
        self,
        prompts: "Prompts",
        llm_client: "LLMClient",
        nli_client: "NLIClient",
        question: str,
        k: int = 5,
        alpha: float = 0.5,
        beta: float = 0.5,
    ):
        """Store the collaborators and weighting parameters.

        Args:
            prompts: Object exposing ``self_reflection`` and ``consistency``
                templates with a ``render`` method.
            llm_client: Object exposing ``chat(question=..., temperature=...)``.
            nli_client: Either a callable taking one string, or an object
                whose ``pipeline`` attribute is such a callable.
            question: The question whose answer is being scored.
            k: Number of answers to sample for the consistency score.
            alpha: Weight of the NLI term in the pairwise consistency.
            beta: Weight of observed consistency in the final confidence.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.prompts = prompts
        self.llm_client = llm_client
        self.nli_client = nli_client
        self.k: int = k
        self.alpha: float = alpha
        self.beta: float = beta
        self.question: str = question

    def _get_self_reflection_prompt(self):
        """Render the self-reflection prompt for the question and ``self.answer``.

        ``self.answer`` is set by ``get_confidence``.
        """
        return self.prompts.self_reflection.render(
            question=self.question, answer=self.answer
        )

    def get_consistency_prompt(self, sampled_answer: str):
        """Render the consistency prompt for the question and ``sampled_answer``."""
        return self.prompts.consistency.render(
            question=self.question, answer=sampled_answer
        )

    def _get_k_responses(self) -> List[str]:
        """Sample ``k`` answers to the question at the "max" temperature."""
        return [
            self.llm_client.chat(question=self.question, temperature="max")
            for _ in range(self.k)
        ]

    def _calc_selfreflection_score(self) -> float:
        """Ask the LLM to judge its own answer and average the verdicts.

        Returns:
            The mean of the parsed verdict scores (A=1.0, B=0.0, C=0.5), or
            0.5 when no verdict can be parsed - the same value as an explicit
            "I am not sure".
        """
        prompt = self._get_self_reflection_prompt()
        sr_answer = self.llm_client.chat(question=prompt, temperature="min")

        # The trailing \b requires a stand-alone letter, so words that merely
        # start with a/b/c ("answer: Based on ...") are not read as verdicts.
        # Leading *, ( or [ are skipped to tolerate "**A**" or "(A)".
        letters = re.findall(
            r"answer:\s*[\*\(\[]*([ABC])\b", sr_answer, flags=re.IGNORECASE
        )
        if not letters:
            return 0.5

        letter_scores = {"A": 1.0, "B": 0.0, "C": 0.5}
        return float(np.mean([letter_scores[letter.upper()] for letter in letters]))

    def _contradiction_probability(self, text: str) -> float:
        """Return the score of the "contradiction" label the NLI client gives ``text``.

        The client's result is indexed as ``result[0]`` and expected to be an
        iterable of ``{"label": ..., "score": ...}`` dicts.

        Raises:
            ValueError: If no "contradiction" label is present.
        """
        # Accept both a directly callable client and a wrapper that only
        # exposes the underlying pipeline.
        run_nli = (
            self.nli_client if callable(self.nli_client) else self.nli_client.pipeline
        )
        label_scores = run_nli(text)[0]
        for entry in label_scores:
            if entry["label"].lower() == "contradiction":
                return entry["score"]
        raise ValueError(
            f"NLI output contains no 'contradiction' label: {label_scores!r}"
        )

    def _calc_pairwise_consistency(self, sampled_answer) -> float:
        """Score the agreement between ``self.answer`` and one sampled answer.

        Returns:
            ``alpha * s_i + (1 - alpha) * r_i`` with
            ``s_i = 0.5 * ((1 - p) + (1 - p'))``, where ``p`` and ``p'`` are the
            contradiction probabilities for the two orderings of the pair.
        """
        p_contra = self._contradiction_probability(
            f"{self.answer} [SEP] {sampled_answer}"
        )
        p_contra_prime = self._contradiction_probability(
            f"{sampled_answer} [SEP] {self.answer}"
        )
        si = 0.5 * ((1 - p_contra) + (1 - p_contra_prime))
        # Indicator function accounts for shortcomings of NLI.
        ri = 1 if self.answer == sampled_answer else 0
        return self.alpha * si + (1 - self.alpha) * ri

    def _calc_observed_consistency(self, k_sampled_answers):
        """Average the pairwise consistency over all sampled answers.

        Raises:
            ValueError: If ``k_sampled_answers`` is empty.
        """
        if len(k_sampled_answers) == 0:
            raise ValueError("k_sampled_answers must contain at least one answer")
        cons_scores = [
            self._calc_pairwise_consistency(sampled_answer=sampled)
            for sampled in k_sampled_answers
        ]
        return np.mean(cons_scores)

    def _calc_confidence_score(
        self, observed_consistency: float, self_reflection: float
    ) -> float:
        """Blend both signals: ``beta * O + (1 - beta) * S``."""
        return self.beta * observed_consistency + (1 - self.beta) * self_reflection

    def get_confidence(self):
        """Answer the question and return the confidence score for that answer.

        The reference answer is generated at the "min" temperature and stored
        on ``self.answer`` before the two component scores are computed.
        """
        self.answer: str = self.llm_client.chat(
            question=self.question, temperature="min"
        )
        k_sampled_answers = self._get_k_responses()
        o = self._calc_observed_consistency(k_sampled_answers=k_sampled_answers)
        s = self._calc_selfreflection_score()
        return self._calc_confidence_score(observed_consistency=o, self_reflection=s)


In [None]:

# End-to-end run: build the scorer with default clients and prompts, then
# compute the confidence for a single question.
test_question = "How many survivors were there on the Titanic?"
nop = NoPinocchio(
    llm_client=LLMClient(),
    nli_client=NLIClient(),
    prompts=Prompts(),
    question=test_question,
)
conf = nop.get_confidence()

Device set to use mps:1


SDKError: API error occurred: Status 401
{
  "message":"Unauthorized",
  "request_id":"d472795294df8b50b6853a7563beb043"
}

### Getting outputs for score

In [6]:
# NOTE(review): get_initial_answer and get_k_responses are not defined in any
# cell visible in this notebook - confirm they still exist before relying on
# Restart & Run All.
test_question = "How many Rs are in the word strawberry?"
answer = get_initial_answer(test_question)
consistency_responses = get_k_responses(question=test_question, k=5, temperature=1.0)

In [7]:
# Show every sampled response between divider lines, numbered from 1.
divider = "------------------------"
for number, response in enumerate(consistency_responses, start=1):
    print(divider)
    print(f"Response {number}: {response}")
    print(divider)

------------------------
Response 1: Alright, let's tackle the question: **"How many Rs are in the word 'strawberry'?"**

### Understanding the Question
First, I need to understand what the question is asking. It's asking for the count of the letter "R" (both uppercase and lowercase, but since the word is in lowercase, we'll focus on lowercase "r") in the word "strawberry."

### Writing Down the Word
Let me write down the word to visualize it better:

**strawberry**

### Identifying Each Letter
Now, I'll list out each letter in the word along with its position to keep track:

1. s
2. t
3. r
4. a
5. w
6. b
7. e
8. r
9. r
10. y

### Counting the 'r's
Now, I'll go through each letter one by one to see if it's an 'r':

1. **s** - Not an 'r'.
2. **t** - Not an 'r'.
3. **r** - This is an 'r'. (First 'r' found)
4. **a** - Not an 'r'.
5. **w** - Not an 'r'.
6. **b** - Not an 'r'.
7. **e** - Not an 'r'.
8. **r** - This is an 'r'. (Second 'r' found)
9. **r** - This is an 'r'. (Third 'r' found)
1

In [8]:
# Show the initial answer for comparison with the sampled responses printed above.
print(f"Initial answer: {answer}")

Initial answer: Alright, let's tackle the question: **"How many Rs are in the word 'strawberry'?"**

### Understanding the Question
First, I need to understand what the question is asking. It's asking for the count of the letter "R" (both uppercase and lowercase, but since the word is in lowercase, we'll focus on lowercase "r") in the word "strawberry."

### Writing Down the Word
Let me write down the word to visualize it better:

**strawberry**

### Identifying Each Letter
Now, I'll list out each letter in the word one by one and see if it's an "r":

1. s
2. t
3. r
4. a
5. w
6. b
7. e
8. r
9. r
10. y

### Counting the Rs
Now, let's go through each letter:

1. **s** - Not an "r."
2. **t** - Not an "r."
3. **r** - This is an "r." (Count: 1)
4. **a** - Not an "r."
5. **w** - Not an "r."
6. **b** - Not an "r."
7. **e** - Not an "r."
8. **r** - This is an "r." (Count: 2)
9. **r** - This is an "r." (Count: 3)
10. **y** - Not an "r."

### Verifying the Count
From the above, the "r"s are at p

### Calculate scores

In [10]:
# NOTE(review): get_self_reflection_score is not defined in any cell visible in
# this notebook - confirm it exists before a fresh-kernel run.
score = get_self_reflection_score(test_question, answer)


Self-Reflection Response:
 explanation: The proposed answer provides a detailed and accurate analysis of the word "strawberry," correctly identifying and counting the letter "r" three times. The step-by-step breakdown confirms that the letter "r" appears in the 3rd, 8th, and 9th positions of the word, totaling three occurrences. The verification steps and alternative approach further validate this count, ensuring no miscounting or oversight. Therefore, the proposed answer is correct., answer: A


In [11]:
# Display the self-reflection score computed in the previous cell.
score

1.0

### Next action: observed consistency score

In [15]:
# Try the NLI model on a single string containing two conflicting statements.
# NOTE(review): `nli` is not defined in any cell visible in this notebook -
# presumably a transformers pipeline like NLIClient.pipeline; confirm.
res = nli("the apple is red <-> the apple is green")


In [None]:
# Score of the "contradiction" label for the input above; `res[0]` is iterated
# as a sequence of {"label": ..., "score": ...} dicts.
[i["score"] for i in res[0] if i["label"] == "contradiction"][0]

0.9989809393882751

In [None]:
# Observed consistency of the initial answer against the sampled responses.
# NOTE(review): calc_observed_consistency is not defined in any cell visible in
# this notebook - confirm it exists before a fresh-kernel run.
calc_observed_consistency(
    reference_answer=answer, k_sampled_answers=consistency_responses
)

Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1243 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1321 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1169 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 512). Running this sequence through the mo

np.float64(0.9765633953269571)

# TODO - next steps, in order:
# 2. Fix the NLI token limitation (inputs exceed the model's 512-token maximum).
# 3. Add a quality score.
# 4. Add an evaluation study.