In [None]:
import os
import getpass

# Ask for each credential only when it is not already present in the environment,
# so re-running this cell (or running the notebook with the variables already
# exported) neither re-prompts nor overwrites a configured key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key:")
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your HF API key: ")

# OpenAI model name kept as a notebook-level setting.
open_ai_model = "gpt-5-nano-2025-08-07"

## Self-Consistency Prompting in Language Models

Self-consistency prompting was introduced in the March 2022 paper ["Self-Consistency Improves Chain of Thought Reasoning in Language Models"](https://arxiv.org/pdf/2203.11171.pdf) by Xuezhi Wang et al.

# 🤔 **What is Self-Consistency Prompting?**

- Focuses on exploring different reasoning paths for complex problems.

- Aims for reliable answers by checking consistency across various thought processes.

### 🌟 **Differences from Other Prompting Techniques**

- Traditional CoT: Generates short sentences mimicking human reasoning for solving tasks.

- Self-Consistency: Samples multiple reasoning paths and finds the most consistent answer.

- It's unsupervised and works with pre-trained models, unlike methods needing extra training or human annotations.

### **🛠️ Constructing a Self-Consistency Prompt**


<figure>
  <img src="https://pbs.twimg.com/media/Ff73Y-RaEAANaqT.jpg" alt="Image Description" style="width:100%">
  <figcaption>The self-consistency method contains three steps: (1) prompt a language model using chain-of-thought (CoT) prompting; (2) replace the “greedy decode” in CoT prompting by sampling from the language model’s decoder to generate a diverse set of reasoning paths; and (3) marginalize out the reasoning paths and aggregate by choosing the most consistent answer in the final answer set.</figcaption>
  <a href="https://twitter.com/denny_zhou/status/1584978661668966401">Image Source</a>
</figure>


1. **Start with CoT**: Use chain-of-thought prompting as the base.

2. **Sample Diverse Paths**: Use "sample-and-marginalize" decoding to get various reasoning paths.

3. **Marginalize for Consistency**: Find the most consistent answer from these different paths.

### 🔍 **Examples in Action**

- **Chain-of-Thought**: For the question "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?", instead of directly answering "5", the model might respond with the reasoning: "There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5."

- **Self-Consistency**: For a question about how much Janet makes from selling eggs, the model might generate multiple reasoning paths like:
  1. "She has 16 - 3 - 4 = 9 eggs left. So she makes $2 * 9 = $18 per day."

  2. "This means she sells the remainder for $2 * (16 - 4 - 3) = $26 per day."
  
  3. "She eats 3 for breakfast, so she has 16 - 3 = 13 left. Then she bakes muffins, so she has 13 - 4 = 9 eggs left. So she has 9 eggs * $2 = $18."

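  The model then aggregates these paths to determine the most consistent answer, which in this case is "$18 per day." Note that the second path makes an arithmetic slip (2 * 9 is 18, not 26); because the other two paths agree on $18, the majority answer still comes out correct.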


🚀 **Impact of Self-Consistency Prompting**
- Enhances model reasoning by considering multiple paths.
- Shown to boost performance in arithmetic and commonsense reasoning tasks.

# Begin by setting up CoT prompts

We're following the same pattern from the Chain of Thought lesson.

1. Downloading CoT prompt datasets from HuggingFace

2. Loading an embeddings model (this notebook uses OpenAI's `text-embedding-3-small`; a HuggingFace alternative is left commented out in the code)

3. Creating prompt template for CoT

4. Creating an example selector

5. Constructing the prompt

In [None]:
from datasets import load_dataset

# Stream the CoT Collection rather than downloading it up front (this avoids
# the file structure issue), then pull a fixed-size slice into memory.
dataset_stream = load_dataset(
    "kaist-ai/CoT-Collection",
    token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    streaming=True,
)

# Materialize the first 10,000 examples of the training split as a Python list.
dataset = list(dataset_stream["train"].take(10_000))  # type: ignore



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import pandas as pd

# Build a DataFrame from the materialized examples and discard the 'task' and
# 'type' columns; errors="ignore" makes the drop a no-op for absent columns.
dataset_df = pd.DataFrame(dataset).drop(columns=["task", "type"], errors="ignore")  # type: ignore

# One dict per row, keyed by column name.
selected_examples = dataset_df.to_dict(orient="records")

# Preview the frame. This has to be the cell's final expression: a bare
# expression in the middle of a cell is evaluated but never displayed.
dataset_df.head()

In [7]:
from langchain_openai import OpenAIEmbeddings

# Embeddings client, configured with an explicitly named OpenAI embedding model.
embedding_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embedding_model_name)



# from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# model_name = "BAAI/bge-base-en-v1.5"

# encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# embeddings = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs={'device': 'cuda'},
#     encode_kwargs=encode_kwargs
# )

In [8]:
from langchain_core.example_selectors import MaxMarginalRelevanceExampleSelector
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Header placed before the few-shot examples in the assembled prompt.
prefix = "Consider the following as examples of how to reason:"

# Layout of a single example; the three placeholders are filled per example.
examples_template = """Query: {source}

Rationale: {rationale}

Response: {target}
"""

# Instructions placed after the examples; {input} receives the user's question.
# The text ends with a plain sentence: there is no closing double quote, since
# no quotation is ever opened in this template.
suffix = """Using a similar reasoning approach, answer the users question which is delimited by triple backticks.

User question: ```{input}```

Take a deep breath, break down the user's query step-by-step, and provide a clear chain of thought in your response.
"""

# Prompt that renders one example; its variables mirror the placeholders
# used in examples_template.
examples_prompt = PromptTemplate(
    template=examples_template,
    input_variables=["source", "rationale", "target"],
)

In [9]:
# Build a max-marginal-relevance example selector from the example records,
# using the embeddings client and FAISS as the vector store class; k=5 is the
# number of examples requested per query.
example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
    selected_examples, embeddings, FAISS, k=5
)

# Few-shot prompt: prefix, then the selected examples rendered with
# examples_prompt, then the suffix carrying the {input} question.
mmr_prompt = FewShotPromptTemplate(
    prefix=prefix,
    example_selector=example_selector,
    example_prompt=examples_prompt,
    suffix=suffix,
    input_variables=["input"],
)

In [10]:
# The question to answer. The text keeps its mid-sentence line break and its
# trailing newline exactly as written.
query = (
    "There were nine computers in the server room. Five more computers were installed each day, from\n"
    "monday to thursday. How many computers are now in the server room?\n"
)
# Render the few-shot prompt with the question supplied as the `input` variable.
prompt = mmr_prompt.format(input=query)

In [11]:
# Show the fully rendered prompt text.
print(prompt)

Consider the following as examples of how to reason:

Query: In this task, you need to answer the given multiple-choice question on the gain. Gain is the value by which to multiply the input. Classify your answers into 'a', 'b', 'c', 'd', and 'e'.

Problem: running at the same rate , 8 identical machines can produce 560 paperclips a minute . at this rate , how many paperclips could 18 machines produce in 6 minutes ? 
Options: a ) 1344 , b ) 3360 , c ) 7560 , d ) 50400 , e ) 67200

Rationale: The machines produce 560 paperclips a minute when there are 8 of them. Since each machine produces the same amount, we can say that each one produces 560 / 8 = 70 paperclips a minute.\nThere are 18 machines producing at this rate, so they produce in total 18 * 70 = 1260 paperclips per minute. In 6 minutes, they will have produced 6 * 1260 = 7560 paperclips.

Response: c


Query: In this task, you need to count the number of words in a sentence that contain the given letter

Sentence: 'two computer 

# Self-Consistency Prompt

In [12]:
# Aggregation prompt: shows the original query plus the sampled responses and
# asks for the response that occurs most frequently.
sc_template = (
    "Based on the responses (delimited by < >) to the following query, "
    "(delimited by triple backticks) return the response that occurs most frequently.\n"
    "\n"
    "Query: ```{query}```\n"
    "\n"
    "Responses: <{responses}>\n"
)

# Build the aggregation prompt from the template string above.
sc_prompt = PromptTemplate.from_template(sc_template)

### Generating multiple responses


Use the `n` parameter to generate alternative responses. Increase `n` to explore different variations.



In [13]:
from langchain_openai import OpenAI

# No model name is passed, so the default model is used
# (gpt-3.5-turbo-instruct); n=5 requests five completions per prompt.
num_reasoning_paths = 5
llm = OpenAI(n=num_reasoning_paths)

# A single call with a one-element prompt list.
generations = llm.generate([prompt])

In [14]:
# Inspect the raw completions returned for the prompt.
generations.generations

[[Generation(text='\nRationale: The statement indicates that there were 9 computers in the server room initially. Then, 5 more computers were installed each day from Monday to Thursday. This means that on Monday, there were 9 + 5 = 14 computers, on Tuesday there were 14 + 5 = 19 computers, on Wednesday there were 19 + 5 = 24 computers, and on Thursday there were 24 + 5 = 29 computers. Therefore, there are now 29 computers in the server room.\n\nResponse: 29', generation_info={'finish_reason': 'stop', 'logprobs': None}),
  Generation(text='\nRationale: The sentence states that there were nine computers in the server room. Each day, five more computers were installed from Monday to Thursday. So, in total, there were 9 + 5 + 5 + 5 + 5 = 29 computers installed in those four days. Therefore, there are now 9 + 29 = 38 computers in the server room.\n\nResponse: 38', generation_info={'finish_reason': 'stop', 'logprobs': None}),
  Generation(text='\nProblem: There were nine computers in the ser

In [15]:
# Collect the text that follows the first "Response: " marker in each sampled
# completion (whitespace-trimmed); completions without the marker are skipped.
answer_marker = "Response: "
responses = [
    answer.strip()
    for _, marker_found, answer in (
        generation.text.partition(answer_marker)
        for generation in generations.generations[0]
    )
    if marker_found
]

In [16]:
# Show the extracted answers.
responses

['29', '38', 'd', '29']

In [17]:
# Rebind llm to a client created with default settings for the aggregation step.
llm = OpenAI()

# Fill the aggregation prompt with the question and the stringified answer list.
responses_text = str(responses)
final_prompt = sc_prompt.format(responses=responses_text, query=query)

print(final_prompt)

Based on the responses (delimited by < >) to the following query, (delimited by triple backticks) return the response that occurs most frequently.

Query: ```There were nine computers in the server room. Five more computers were installed each day, from
monday to thursday. How many computers are now in the server room?
```

Responses: <['29', '38', 'd', '29']>



In [18]:
# Send the aggregation prompt to the model and print its reply.
final_answer = llm.invoke(final_prompt)
print(final_answer)


29
