In [None]:
# TODO: NLI implementation is wrong — we don't want to compare the prompt to the answer, but rather compare different prompts (refer to the paper)
# TODO: next action — print the CoT candidates, the result of each self-reflection, and the NLI results
# TODO: second action — better understand the confidence score calculation

## Connect to LLM and get example question

In [5]:
import os
from mistralai import Mistral

# Read the API key from the environment and pick the model for the example call.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-medium-latest"

client = Mistral(api_key=api_key)

# Send one single-turn user question and keep the text of the first choice.
chat_response = client.chat.complete(
    model=model,
    messages=[{"role": "user", "content": "What is the largest lake in California?"}],
)
answer = chat_response.choices[0].message.content
print(answer)

The largest lake in California is **Salton Sea**, located in the southern part of the state.

- **Surface Area:** Approximately 343 square miles (though it fluctuates due to evaporation and water inflow).
- **Formation:** It was created accidentally in 1905 when the Colorado River breached an irrigation canal and flooded the Salton Basin.
- **Salinity:** The lake is saltier than the ocean due to high evaporation rates and agricultural runoff.

While **Lake Tahoe** is more famous and deeper, the Salton Sea holds the title for the largest lake by surface area in California. However, it faces significant environmental challenges, including shrinking water levels and ecological concerns.


In [6]:
from jinja2 import Template

# Self-reflection prompt (Jinja2 template with `question` and `answer` variables).
# It presents the same question / proposed-answer pair twice -- the second time
# asking the model to reconsider -- and requests, for each, a line of the form
# "explanation: ..., answer: <letter>" where A = Correct, B = Incorrect,
# C = I am not sure.
PROMPT_SELF_REFLECTION = Template("""
1. Question: {{ question }}, Proposed Answer: {{ answer }}
Is the proposed answer: (A) Correct (B) Incorrect (C) I am not sure.
The output should strictly use the following template:
explanation: [insert analysis], answer: [choose one letter from among choices A through C]

2. Question: {{ question }}, Proposed Answer: {{ answer }}
Are you really sure the proposed answer is correct?
Choose again: (A) Correct (B) Incorrect (C) I am not sure.
The output should strictly use the following template:
explanation: [insert analysis], answer: [choose one letter from among choices A through C]
""")


In [7]:
# Consistency prompt (Jinja2 template). `question` is its only template
# variable; the model is asked to reply in the form
# "explanation: ..., answer: ...".
PROMPT_CONSITENCY = Template("""
Please strictly use the following template to provide answer:
explanation: [insert step-by-step analysis], answer: [provide
your answer] + Question: {{ question }}
""")

In [8]:
def get_self_reflection_prompt(question, answer):
    """Fill the self-reflection template for one question/answer pair.

    Args:
        question: The question that was asked.
        answer: The proposed answer the model should judge.

    Returns:
        The rendered text of ``PROMPT_SELF_REFLECTION``.
    """
    template_values = {"question": question, "answer": answer}
    return PROMPT_SELF_REFLECTION.render(**template_values)


def get_consistency_prompt(question, answer=None):
    """Render the consistency prompt for a question.

    The previous implementation called ``str.replace`` on ``PROMPT_CONSITENCY``,
    which is a ``jinja2.Template`` and has no such method, so every call raised
    ``AttributeError``. The template exposes a single ``{{ question }}``
    variable, so it is rendered the same way as the self-reflection prompt.

    Args:
        question: The question to embed in the prompt.
        answer: Unused. Kept (now optional) so existing two-argument callers
            keep working; the template has no placeholder for an answer
            because the model is asked to produce it.

    Returns:
        The rendered prompt text.
    """
    return PROMPT_CONSITENCY.render(question=question)


def get_initial_answer(
    question, temperature=0.0, client="mistral", model="mistral-medium-latest"
):
    """Answer a question with the selected LLM backend.

    Args:
        question: Text sent to the model.
        temperature: Sampling temperature passed to the backend.
        client: Backend name; only ``"mistral"`` is supported.
        model: Model identifier passed to the backend.

    Returns:
        Whatever ``get_mistral_answer`` returns for the question.

    Raises:
        NotImplementedError: If ``client`` is anything other than ``"mistral"``.
    """
    # Reject unknown backends up front so the supported path reads straight through.
    if client != "mistral":
        raise NotImplementedError(f"Client {client} is not implemented.")
    return get_mistral_answer(question, temperature, model)


def get_mistral_answer(question, temperature=0.0, model="mistral-medium-latest"):
    """Send a single user message to Mistral and return the reply text.

    A new ``Mistral`` client is created on every call, using the
    ``MISTRAL_API_KEY`` environment variable.

    Args:
        question: Content of the single user message.
        temperature: Sampling temperature for the completion.
        model: Mistral model identifier.

    Returns:
        The message content of the first choice in the chat response.

    Raises:
        KeyError: If ``MISTRAL_API_KEY`` is not set in the environment.
    """
    user_message = {"role": "user", "content": question}
    mistral = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    completion = mistral.chat.complete(
        model=model,
        temperature=temperature,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def get_k_responses(
    question, k=5, temperature=1.0, client="mistral", model="mistral-medium-latest"
):
    """Collect ``k`` independently sampled answers to the same question.

    ``client`` and ``model`` are forwarded to ``get_initial_answer``.
    Previously they were accepted but dropped, so every sample used the
    default backend and model regardless of what the caller requested.

    Args:
        question: Text sent to the model on every call.
        k: Number of answers to collect; ``0`` yields an empty list.
        temperature: Sampling temperature used for every call.
        client: Backend name forwarded to ``get_initial_answer``.
        model: Model identifier forwarded to ``get_initial_answer``.

    Returns:
        A list of ``k`` answers in the order they were obtained.

    Raises:
        NotImplementedError: Propagated from ``get_initial_answer`` for an
            unsupported ``client`` (when ``k`` > 0).
    """
    return [
        get_initial_answer(question, temperature, client=client, model=model)
        for _ in range(k)
    ]


### Getting outputs for score

In [9]:
# Sample question: one answer with the default settings, plus five answers
# sampled at temperature 1.0.
test_question = "How many survivors were there on the Titanic?"
answer = get_initial_answer(question=test_question)
consistency_responses = get_k_responses(test_question, k=5, temperature=1.0)

In [None]:
# Show every sampled response, numbered from 1, between separator rules.
for i, response in enumerate(consistency_responses):
    print(
        "------------------------",
        f"Response {i + 1}: {response}",
        "------------------------",
        sep="\n",
    )

In [16]:
# Show the answer obtained with the default settings.
print("Initial answer:", answer)

Initial answer: The exact number of survivors from the sinking of the RMS Titanic on April 15, 1912, is approximately **706** out of the estimated **2,224** passengers and crew on board.

Here’s a breakdown:
- **Total on board:** ~2,224 (1,317 passengers + 907 crew)
- **Survivors:** ~706 (492 passengers + 214 crew)
- **Fatalities:** ~1,517

The survival rate was heavily influenced by factors such as class, gender, and age, with women and children in first and second class having higher survival rates. The lifeboats, which could accommodate about 1,178 people, were not filled to capacity, leading to many unnecessary deaths.

Would you like more details on specific groups (e.g., by class or gender)?


### Calc scores

In [15]:
import re


def get_self_reflection_score(
    question, answer, client="mistral", model="mistral-medium-latest"
):
    """Ask the model to grade a proposed answer and average its verdicts.

    The self-reflection prompt is sent at temperature 0.0. Every
    ``answer: <letter>`` verdict found in the reply is mapped to a score
    (A = 1.0 correct, B = 0.0 incorrect, C = 0.5 not sure) and the scores
    are averaged. The prompt asks two questions, so two verdicts are
    expected, but any number that can be parsed is averaged.

    Args:
        question: The original question.
        answer: The proposed answer to be judged.
        client: Backend name; only ``"mistral"`` is supported.
        model: Model identifier passed to the backend.

    Returns:
        The mean score as a float, or ``None`` when no verdict could be
        parsed from the reply.

    Raises:
        NotImplementedError: If ``client`` is anything other than ``"mistral"``.
    """
    prompt = get_self_reflection_prompt(question, answer)
    if client != "mistral":
        raise NotImplementedError(f"Client {client} is not implemented.")
    response = get_mistral_answer(prompt, temperature=0.0, model=model)

    print("\nSelf-Reflection Response:\n", response)

    letter_scores = {"A": 1.0, "B": 0.0, "C": 0.5}
    # The verdict letter must stand alone (trailing \b). Without the boundary,
    # ordinary words after "answer:" were misread as verdicts, e.g.
    # "answer: correct" -> C or "Proposed Answer: About ..." -> A.
    # Optional leading "*", "(" or "[" lets wrapped verdicts such as
    # "answer: **A**" or "answer: (B)" be recognised.
    verdicts = re.findall(
        r"answer:\s*[*(\[]*\s*([ABC])\b", response, flags=re.IGNORECASE
    )
    scores = [letter_scores[letter.upper()] for letter in verdicts]

    if not scores:
        return None  # Could not parse any verdict
    return sum(scores) / len(scores)

In [16]:
# Grade the default-settings answer to the sample question.
score = get_self_reflection_score(question=test_question, answer=answer)


Self-Reflection Response:
 1. **First Question:**
explanation: The proposed answer provides a well-researched and accurate breakdown of the Titanic's survivors, fatalities, and total passengers/crew. The numbers align with historical records, and the additional context about survival factors (class, gender, lifeboat protocols) is correct and relevant. The offer to provide more details is also appropriate for further inquiry.
answer: A

2. **Second Question:**
explanation: The proposed answer remains consistent with verified historical data. The numbers (706 survivors, 2,224 total on board) are widely accepted by historians, and the explanation of survival disparities by class and gender is accurate. There is no reason to doubt the correctness of this answer.
answer: A


In [14]:
# Display the value returned by get_self_reflection_score.
score

1.0

### Next action: observed consistency score