## Connect to LLM and get example question

In [2]:
import os
from mistralai import Mistral

api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-medium-latest"

client = Mistral(api_key=api_key)

chat_response = client.chat.complete(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "What is the largest lake in California?",
        },
    ],
)
answer = chat_response.choices[0].message.content
print(chat_response.choices[0].message.content)

The largest lake in California is **Salton Sea**, located in the southern part of the state.

- **Surface Area:** Approximately 343 square miles (though it fluctuates due to evaporation and water inflow).
- **Formation:** It was created accidentally in 1905 when an irrigation canal breach caused the Colorado River to flood the Salton Basin for nearly two years.
- **Current Status:** The Salton Sea is a saline lake with no natural outlet, and its water levels have been declining due to reduced agricultural runoff and drought conditions.

While **Lake Tahoe** is more famous and deeper, the Salton Sea holds the title for the largest lake by surface area in California.


In [3]:
from jinja2 import Template

PROMPT_SELF_REFLECTION = Template("""
1. Question: {{ question }}, Proposed Answer: {{ answer }}
Is the proposed answer: (A) Correct (B) Incorrect (C) I am not sure.
The output should strictly use the following template:
explanation: [insert analysis], answer: [choose one letter from among choices A through C]

2. Question: {{ question }}, Proposed Answer: {{ answer }}
Are you really sure the proposed answer is correct?
Choose again: (A) Correct (B) Incorrect (C) I am not sure.
The output should strictly use the following template:
explanation: [insert analysis], answer: [choose one letter from among choices A through C]
""")


In [4]:
PROMPT_CONSITENCY = Template("""
Please strictly use the following template to provide answer:
explanation: [insert step-by-step analysis], answer: [provide
your answer] + Question: {{ question }}
""")

In [5]:
def get_self_reflection_prompt(question, answer):
    return PROMPT_SELF_REFLECTION.render(question=question, answer=answer)


def get_consistency_prompt(question, answer):
    return PROMPT_CONSITENCY.replace("[User Provided]", question).replace(
        "[provide your answer]", answer
    )


def get_initial_answer(
    question, temperature=0.0, client="mistral", model="mistral-medium-latest"
):
    if client == "mistral":
        return get_mistral_answer(question, temperature, model)
    else:
        raise NotImplementedError(f"Client {client} is not implemented.")


def get_mistral_answer(question, temperature=0.0, model="mistral-medium-latest"):
    client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    chat_response = client.chat.complete(
        model=model,
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": question,
            },
        ],
    )
    return chat_response.choices[0].message.content


def get_k_responses(
    question, k=5, temperature=1.0, client="mistral", model="mistral-medium-latest"
):
    responses = []
    for _ in range(k):
        response = get_initial_answer(question, temperature)
        responses.append(response)
    return responses


### Getting outputs for score

In [6]:
test_question = "How many survivors were there on the Titanic?"
test_question = "How many Rs are i the word strawberry?"
answer = get_initial_answer(test_question)
consistency_responses = get_k_responses(question=test_question, k=5, temperature=1.0)

In [7]:
# print(f"Initial answer: {answer}")
for i, response in enumerate(consistency_responses):
    print("------------------------")
    print(f"Response {i + 1}: {response}")
    print("------------------------")

------------------------
Response 1: Alright, let's tackle the question: **"How many Rs are in the word 'strawberry'?"**

### Understanding the Question
First, I need to understand what the question is asking. It's asking for the count of the letter "R" (both uppercase and lowercase, but since the word is in lowercase, we'll focus on lowercase "r") in the word "strawberry."

### Writing Down the Word
Let me write down the word to visualize it better:

**strawberry**

### Identifying Each Letter
Now, I'll list out each letter in the word along with its position to keep track:

1. s
2. t
3. r
4. a
5. w
6. b
7. e
8. r
9. r
10. y

### Counting the 'r's
Now, I'll go through each letter one by one to see if it's an 'r':

1. **s** - Not an 'r'.
2. **t** - Not an 'r'.
3. **r** - This is an 'r'. (First 'r' found)
4. **a** - Not an 'r'.
5. **w** - Not an 'r'.
6. **b** - Not an 'r'.
7. **e** - Not an 'r'.
8. **r** - This is an 'r'. (Second 'r' found)
9. **r** - This is an 'r'. (Third 'r' found)
1

In [8]:
print(f"Initial answer: {answer}")

Initial answer: Alright, let's tackle the question: **"How many Rs are in the word 'strawberry'?"**

### Understanding the Question
First, I need to understand what the question is asking. It's asking for the count of the letter "R" (both uppercase and lowercase, but since the word is in lowercase, we'll focus on lowercase "r") in the word "strawberry."

### Writing Down the Word
Let me write down the word to visualize it better:

**strawberry**

### Identifying Each Letter
Now, I'll list out each letter in the word one by one and see if it's an "r":

1. s
2. t
3. r
4. a
5. w
6. b
7. e
8. r
9. r
10. y

### Counting the Rs
Now, let's go through each letter:

1. **s** - Not an "r."
2. **t** - Not an "r."
3. **r** - This is an "r." (Count: 1)
4. **a** - Not an "r."
5. **w** - Not an "r."
6. **b** - Not an "r."
7. **e** - Not an "r."
8. **r** - This is an "r." (Count: 2)
9. **r** - This is an "r." (Count: 3)
10. **y** - Not an "r."

### Verifying the Count
From the above, the "r"s are at p

### Calc scores

In [9]:
import re


def get_self_reflection_score(
    question, answer, client="mistral", model="mistral-medium-latest"
):
    prompt = get_self_reflection_prompt(question, answer)
    if client == "mistral":
        response = get_mistral_answer(prompt, temperature=0.0, model=model)
    else:
        raise NotImplementedError(f"Client {client} is not implemented.")

    print("\nSelf-Reflection Response:\n", response)

    # Extract "answer: X" from each question (expecting two)
    def extract_answers(text):
        return re.findall(r"answer:\s*([ABC])", text, flags=re.IGNORECASE)

    def map_to_score(letter):
        return {"A": 1.0, "B": 0.0, "C": 0.5}.get(letter.upper(), None)

    answers = extract_answers(response)
    scores = [map_to_score(a) for a in answers if map_to_score(a) is not None]

    if scores:
        return sum(scores) / len(scores)
    else:
        return None  # Could not parse answers

In [10]:
score = get_self_reflection_score(test_question, answer)


Self-Reflection Response:
 explanation: The proposed answer provides a detailed and accurate analysis of the word "strawberry," correctly identifying and counting the letter "r" three times. The step-by-step breakdown confirms that the letter "r" appears in the 3rd, 8th, and 9th positions of the word, totaling three occurrences. The verification steps and alternative approach further validate this count, ensuring no miscounting or oversight. Therefore, the proposed answer is correct., answer: A


In [11]:
score

1.0

### Next Action Observed Consistency Score

In [27]:
def get_nli_pipeline(model="nli-deberta-v3-small"):
    from transformers import pipeline
    if not model == "nli-deberta-v3-small":
        raise NotImplementedError(f"NLI Pipeline not implemented for model: {model}.")
    return pipeline(
        "text-classification",
        model="cross-encoder/nli-deberta-v3-small",
        device=1,
        top_k=None,
    )

In [15]:
res = nli("the apple is red <-> the apple is green")


In [None]:
type(res)
res[0][1]
[i["score"] for i in res[0] if i["label"] == "contradiction"][0]

0.9989809393882751

In [None]:
def calc_pairwise_consistency(reference_answer, sampled_answer, alpha = 0.5):
    y_yi = f"{reference_answer} [SEP] {sampled_answer}"
    yi_y = f"{sampled_answer} [SEP] {reference_answer}"
    nli = get_nli_pipeline()
    res_y_yi = nli(y_yi)
    res_yi_y = nli(yi_y)
    p_contra = [i["score"] for i in res_y_yi[0] if i["label"] == "contradiction"][0]
    p_contra_prime = [i["score"] for i in res_yi_y[0] if i["label"] == "contradiction"][0]
    si = 0.5 * ((1-p_contra) + (1-p_contra_prime))
    ri = 1 if reference_answer == sampled_answer else 0 # indicator function accounts for shortcomings of NLI
    return alpha * si + (1 - alpha) * ri



In [32]:
import numpy as np
def calc_observed_consistency(reference_answer, k_sampled_answers):
    cons_scores = []
    for answer in k_sampled_answers:
        s = calc_pairwise_consistency(reference_answer=reference_answer, sampled_answer=answer)
        cons_scores.append(s)
    return np.mean(cons_scores)




In [33]:
calc_observed_consistency(reference_answer=answer, k_sampled_answers=consistency_responses)

Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1243 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1321 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1169 > 512). Running this sequence through the model will result in indexing errors
Device set to use mps:1
Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 512). Running this sequence through the mo

np.float64(0.9765633953269571)

In [None]:
def confidence_score(observed_consistency, self_reflection, beta=0.5):
    return beta * observed_consistency + (1 - beta) * self_reflection
