# Assignment 04 – Reasoning
In this assignment you will explore **reasoning** in large language models, experiment with *chain‑of‑thought* prompting, and reflect on two recent position papers debating whether LLMs truly “think.”

# 1&nbsp; What is Reasoning?

Write **your own definition** of reasoning in the context of intelligent systems.  
*Hints:* consider notions such as logical inference, causal deduction, abstraction, multi‑step planning, and how humans articulate intermediate thoughts.

# 2&nbsp; Build a Basic Chain‑of‑Thought (CoT)

### 2 · Build a Basic Chain‑of‑Thought (CoT)

Your goal is to wrap **any LLM backend of your choice** with a helper that can optionally trigger a *chain‑of‑thought* style response.

#### What to implement

1. **Choose a backend** (set `USE_BACKEND`):  
   * `"gemma"` – use the `google/gemma-3-4b-it` checkpoint via 🤗 Transformers.  
   * `"openai"` – route to your `call_openai()` helper (e.g., GPT‑4o).  
   * `"gemini"` – route to your `call_gemini()` helper (e.g., Gemini 1.5 Pro).

2. **Load / authenticate**  
   * HF backends need `HF_TOKEN`.  
   * OpenAI backends need `OPENAI_API_KEY`.  
   * Gemini backends need `GOOGLE_API_KEY` + `GOOGLE_CSE_ID` or equivalent.

3. **Implement `run_llm(prompt, with_cot=False)`**  
   * When `with_cot=True`, prepend a CoT trigger such as **“Let’s think step by step.”**  
   * Return the *model’s final answer* (you may choose to strip the intermediate thoughts).

4. **Quick sanity‑check**  
   * Call the helper once *without* and once *with* CoT on a simple prompt (e.g., *“Is 17 a prime number?”*) and print both outputs.

**Reference:** Jason Wei, Xuezhi Wang, Dale Schuurmans, et al.  
*“Chain‑of‑Thought Prompting Elicits Reasoning in Large Language Models.”*  
arXiv:2201.11903 (2022) <https://arxiv.org/abs/2201.11903>


In [8]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Access variables
gemini_api_key = os.getenv("gemini_api_key")
openai_api_key = os.getenv("openai_api_key")

In [23]:
import os

def call_openai(prompt: str, model_name: str = "gpt-4o") -> str:
    """Call OpenAI API and return generated text."""
    from openai import OpenAI
    client = OpenAI(api_key=os.environ.get("openai_api_key"))
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()

import os, time
from google.api_core import exceptions as gexc

def call_gemini(
    prompt: str,
    model_name: str = "models/gemini-1.5-pro",
    temperature: float = None,
    max_output_tokens: int = None,
    retries: int = 3,
    timeout_sec: float = 60.0,
) -> str:
    """Call Gemini with timeout/retries; trims token budget on retries."""
    import google.generativeai as genai

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("gemini_api_key")
    if not api_key:
        raise RuntimeError("Missing GEMINI_API_KEY in environment.")

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    # Use your globals if not provided
    t = TEMPERATURE if temperature is None else temperature
    max_tok = MAX_TOKENS if max_output_tokens is None else max_output_tokens

    backoff = 1.0
    for attempt in range(retries):
        try:
            resp = model.generate_content(
                prompt,
                generation_config={
                    "temperature": float(t),
                    # clamp to avoid very long generations that tend to 504
                    "max_output_tokens": int(max(256, min(max_tok, 1024))),
                },
                request_options={"timeout": timeout_sec},
            )
            text = (getattr(resp, "text", "") or "").strip()
            if text:
                return text
            # Fallback: stitch candidate parts if .text is empty
            if getattr(resp, "candidates", None):
                parts = []
                for cand in resp.candidates:
                    for part in getattr(getattr(cand, "content", None), "parts", []) or []:
                        if hasattr(part, "text") and part.text:
                            parts.append(part.text)
                return "\n".join(parts).strip()
            return ""
        except (gexc.DeadlineExceeded, gexc.ResourceExhausted, gexc.ServiceUnavailable, gexc.InternalServerError) as e:
            if attempt == retries - 1:
                raise
            # brief backoff + reduce budget and try again
            time.sleep(backoff + 0.15 * attempt)
            max_tok = max(256, int(max_tok * 0.7))
            backoff *= 2.0



In [10]:
# ----------------------------- STUDENT TODOs BELOW -----------------------------
# 1️⃣  Choose your backend: 'gemma', 'openai', or 'gemini'
USE_BACKEND = "gemini"  # <-- change me

# 2️⃣  Load / configure the model for the chosen backend
if USE_BACKEND == "gemma":
    # Gemma 3‑4B instruction‑tuned via Hugging Face 🧩
    from transformers import AutoTokenizer, Gemma3ForConditionalGeneration, pipeline

    MODEL_ID = "google/gemma-3-4b-it"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=True)
    model = Gemma3ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        attn_implementation="flash_attention_2",
        device_map="auto",
        torch_dtype="auto",
        token=True,                # ← relies on HF_TOKEN env variable
    )
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,
        temperature=0.7,
    )

elif USE_BACKEND == "openai":
    # OpenAI helper must be defined elsewhere in the notebook
    # Example: from my_helpers import call_openai
    pass  # TODO: nothing to load – just make sure call_openai() is available

elif USE_BACKEND == "gemini":
    # Gemini helper must be defined elsewhere in the notebook
    # Example: from my_helpers import call_gemini
    pass  # TODO: nothing to load – just make sure call_gemini() is available

else:
    raise ValueError("Unsupported backend selected!")

# 3️⃣  Implement the CoT helper
def run_llm(prompt: str, *, with_cot: bool = False, max_new_tokens: int = 256):
    """Run the chosen backend, optionally triggering chain‑of‑thought."""
    cot_prefix = "INSERT YOUR PROMPT HERE\n"
    full_prompt = cot_prefix + prompt if with_cot else prompt

    if USE_BACKEND == "gemma":
        return generator(full_prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]

    if USE_BACKEND == "openai":
        # TODO: ensure `call_openai()` exists
        return call_openai(full_prompt)

    if USE_BACKEND == "gemini":
        # TODO: ensure `call_gemini()` exists
        return call_gemini(full_prompt)

    raise RuntimeError("No valid backend route found.")

# 4️⃣  Sanity‑check – compare baseline vs. CoT
_test_prompt = "Is 17 a prime number?"
print("→ Baseline:", run_llm(_test_prompt, with_cot=False))
print("→ With CoT:", run_llm(_test_prompt, with_cot=True))
# -------------------------------------------------------------------------------


→ Baseline: Yes, 17 is a prime number.  It's only divisible by 1 and itself.
→ With CoT: Yes, 17 is a prime number.  It is only divisible by 1 and itself.


# 3&nbsp; Evaluate CoT Performance vs. Single‑Call


Pick **10 reasoning‑intensive questions** (e.g. a logic puzzle, word problem, or multi‑step arithmetic query).  
Run the model twice: once *without* chain‑of‑thought and once *with* CoT using `run_llm`.  
Manually (or programmatically) judge which output is *more correct, complete, and transparent*.  
Record your observations below.

In [11]:
# ▶️ Comparison template
reasoning_question = "If all bloops are meems and some meems are glorps, are all bloops definitely glorps? Explain your answer."

baseline = run_llm(reasoning_question, with_cot=False)
cot = run_llm(reasoning_question, with_cot=True)

print("\n--- Baseline ---\n", baseline)
print("\n--- CoT ---\n", cot)

# TODO: Add your evaluation notes (e.g. accuracy, clarity) in the markdown cell that follows.



--- Baseline ---
 No, all bloops are definitely *not* glorps.

Here's why:

Imagine meems as a big circle.  Inside that circle is a smaller circle representing all bloops.  Now, glorps are represented by another circle that *overlaps* with the meems circle, but it doesn't necessarily encompass the entire bloops circle.  Some meems are glorps, but not all of them have to be.  Since bloops are entirely within the meems circle, they could fall within the overlapping section with glorps, or they could be in a part of the meems circle that doesn't overlap at all.

Therefore, some bloops *might* be glorps, but it's not guaranteed that all of them are.

--- CoT ---
 No, all bloops are not definitely glorps.

Here's why:

* **All bloops are meems:**  This means the set of bloops is entirely contained within the set of meems.
* **Some meems are glorps:** This means there's an overlap between the set of meems and the set of glorps, but it doesn't tell us anything about how that overlap relates 

Accuracy: Both Baseline and CoT give the correct conclusion (“not necessarily / no”).
Completeness: Baseline provides a clear Venn-diagram intuition; CoT adds explicit set-containment statements and a concrete real-world analogy (dogs/cats), covering edge cases (some or none of the bloops in the overlap).
Transparency (step-by-step): CoT is more structured: it enumerates premises, explains what each implies, and contrasts possible configurations; Baseline explains well but with fewer explicit intermediate steps.
Clarity: Both are readable; CoT’s bullets and analogy make it easier to follow for a non-expert.
Verdict: CoT wins (more complete and more transparent while equally accurate).

# 4&nbsp; Read Two Papers


* **“The Illusion of Thinking: Understanding the Strengths and Limitations of Reasoning Models via the Lens of Problem Complexity”** (Shojaee *et al.*, 2025).  
  *ArXiv:* <https://arxiv.org/abs/2506.06941>
* **“The Illusion of the Illusion of Thinking”** (Opus & Lawson, 2025) – a critical response.  
  *ArXiv:* <https://arxiv.org/abs/2506.09250>

Skim both (abstract → methods → main results) and take note of their competing claims about LRMs and chain‑of‑thought reasoning.

Shojaee et al. (2025) — The Illusion of Thinking

Abstract:

Argues that while CoT-enabled LRMs appear to “think,” this ability collapses as problem complexity increases. CoT gives temporary benefits on medium tasks, but fundamentally fails on harder ones.

Methods:

1.Built controlled benchmark puzzles (Tower of Hanoi, River Crossing variants) where problem complexity could be scaled in a principled way.
2.Measured both final accuracy and reasoning effort (tokens used, intermediate steps).

Main Results:

Identified three regimes:
1.Low complexity: Plain LLMs (no CoT) outperform.
2.Medium complexity: LRMs with CoT perform better.
3.High complexity: Both collapse completely — accuracy falls to ~0.

Found a “token-effort curve”: models invest more reasoning tokens as difficulty rises, then unexpectedly cut effort at the hardest levels — not due to token budget limits, but intrinsic limits.

Concluded CoT is not true reasoning, but an illusion that fails under real complexity.

Opus & Lawson (2025) — The Illusion of the Illusion of Thinking

Abstract:
Critiques Shojaee et al., claiming the “collapse” they report is an artifact of experimental design rather than a fundamental failure of reasoning.

Methods:

Re-examined Shojaee’s datasets and evaluation pipeline.

Identified two major flaws:

Token/output artifacts: In Tower of Hanoi, models often self-terminated outputs (“too long to finish”), which was mis-scored as reasoning failure.

Unsolvable tasks: Some River Crossing puzzles were mathematically impossible, yet models that correctly flagged impossibility were scored as wrong.

Proposed alternative evaluation: ask models for general rules/algorithms instead of exhaustive outputs.

Main Results:

When scored differently, models did not exhibit “collapse”; they could generate correct algorithmic solutions even on higher-complexity tasks.

Concluded the observed failures were measurement artifacts.

Argues LRMs with CoT do retain genuine reasoning abilities, though brittle in execution if forced into long step-by-step output.

# 5&nbsp; Reflection (2 paragraphs)


In **≈150–250 words**, explain **which paper’s argument you find more convincing and why**.  
Consider the authors’ experimental setups, evidence, interpretation of “reasoning,” and any limitations you notice.

Between the two, I find Opus & Lawson’s critique more convincing, though both papers highlight important insights. Shojaee et al. deserve credit for building carefully controlled puzzles and showing that LRMs with chain-of-thought do not scale smoothly with complexity. Their three-regime finding is striking, and the “effort curve” suggests genuine limits. However, their evaluation conflates reasoning ability with output format: requiring models to enumerate long sequences of moves or solve unsolvable tasks risks measuring verbosity or persistence rather than reasoning per se.

Opus & Lawson persuasively argue that these design choices artificially produced the “collapse.” Their reanalysis shows that when models are asked to generate rules or algorithms—a more faithful test of reasoning—they perform well even on complex instances. This aligns more closely with how humans demonstrate understanding: by abstracting general solutions rather than exhaustively writing every step. Their paper reframes the debate, suggesting the illusion lies in our benchmarks, not necessarily in the models.

That said, Opus & Lawson focus mainly on exposing flaws rather than proving that LRMs possess robust reasoning. Their results show models can generalize, but brittleness remains. Overall, I lean toward their interpretation: Shojaee et al.’s “collapse” seems more an artifact of task framing than evidence of fundamental incapacity.


## Section 6 — Implementing Self‑Consistency Decoding

**Learning goal.** Experience how self‑consistency improves reasoning accuracy by sampling diverse chain‑of‑thoughts and aggregating answers.


### 6.1 Single‑Model Self‑Consistency

#### Detailed Steps

Follow the eight steps below to implement self‑consistency with a *single* language model.

| Step | What you do | Why it matters |
|------|-------------|----------------|
| **1. Choose a base model** | Pick **one** of the foundation models you have tried earlier (e.g., GPT‑4o, Claude‑3 Sonnet, Gemini 1.5 Pro, etc.). Set it in the `MODEL` constant below. | Keeps the experiment focused and cost‑controlled. |
| **2. Select tasks** | Re‑use at least **five reasoning problems** from previous sections and store them in `TASKS`. Each task must include a ground‑truth answer for evaluation. | Ensures comparability across sampling budgets. |
| **3. Generate reasoning paths** | Write `generate_paths(question, n_paths)` that samples `n_paths` Chain‑of‑Thought explanations from the chosen model (temperature ≥ 0.7). | Diversity of reasoning paths is the heart of self‑consistency. |
| **4. Parse final answers** | Implement `extract_final_answer(full_response)` that returns the answer string from a model response. | Needed for voting. |
| **5. Majority vote aggregator** | Implement `majority_vote(answers)` that returns the most frequent answer and its support size. Break ties by picking the answer whose chain has the highest average log‑probability. | Converts diverse chains into a single prediction. |
| **6. Run experiments** | For each sampling budget **k ∈ {3, 5, 10}** and each task, generate `k` paths → vote → record whether the voted answer matches ground truth. | Measures accuracy vs. compute. |
| **7. Collect metrics** | Track (a) accuracy, (b) average latency, (c) total tokens. Store them in a `pandas.DataFrame`. | Enables quantitative comparison. |
| **8. Analyze results** | Plot / tabulate metrics and write a short discussion: *Where do returns diminish? How does cost scale?* | Connects empirical findings to theory. |


In [18]:
# 🚀 Imports & configuration
import time, json, collections, statistics
import pandas as pd

# TODO: 🔑 Add your API key if needed, e.g. openai.api_key = "sk-..."
# import openai

MODEL = "models/gemini-1.5-pro"  # ← correct format
  # TODO: replace with your chosen model
TEMPERATURE = 0.7      # Non‑zero for diverse sampling
MAX_TOKENS = 1200

# 👇 Populate with at least 5 {question, answer} dicts
TASKS = [
    {
        "question": "What is 13 * 7?",
        "answer": "91"
    },
    {
        "question": "If a train travels 120 km in 2 hours, what is its average speed?",
        "answer": "60 km/h"
    },
    {
        "question": "Solve for x: 2(x − 3) + 4 = 10",
        "answer": "x = 5"
    },
    {
        "question": "A bag has 4 red and 6 blue balls. What is the probability of drawing a red ball?",
        "answer": "0.4"
    },
    {
        "question": "If all bloops are meems and some meems are glorps, are all bloops definitely glorps?",
        "answer": "No, not necessarily"
    },
    {
        "question": "You drive 60 km at 60 km/h and return 60 km at 30 km/h. What is your average speed?",
        "answer": "40 km/h"
    },
]


In [24]:
REQUEST_PAUSE_SEC = 0.1  # tiny spacing to avoid bursts

def generate_paths(question: str, n_paths: int):
    """Return a list of full model responses (thought + answer)."""
    paths = []
    for _ in range(n_paths):
        prompt = (
            f"{question}\n\n"
            "Think step by step in a few concise steps.\n"
            "Finish with a single line: Answer: <final answer>."
        )
        txt = call_gemini(
            prompt,
            model_name=MODEL,               # ensure MODEL == "models/gemini-1.5-pro"
            temperature=TEMPERATURE,
            max_output_tokens=MAX_TOKENS,   # e.g., start at 600–900 to reduce timeouts
            retries=3,
            timeout_sec=60.0,
        )
        paths.append(txt)
        time.sleep(REQUEST_PAUSE_SEC)
    return paths



In [20]:
def extract_final_answer(response: str) -> str:
    """Extract the answer after the last occurrence of 'Answer:' (simple heuristic)."""
    import re
    match = re.findall(r"Answer\s*[:=]\s*(.*)", response, flags=re.IGNORECASE)
    return match[-1].strip() if match else ""

def majority_vote(answers):
    """Return (winning_answer, support_count, counts_dict)."""
    counts = collections.Counter(answers)
    winner, support = counts.most_common(1)[0]
    return winner, support, counts


In [25]:
%%time
records = []

for k in [3, 5, 10]:
    for task in TASKS:
        question, truth = task['question'], task['answer']
        t0 = time.time()
        paths = generate_paths(question, k)
        gen_time = time.time() - t0

        answers = [extract_final_answer(p) for p in paths]
        pred, support, _ = majority_vote(answers)
        is_correct = (pred == truth)

        records.append({
            'k_paths': k,
            'question': question,
            'truth': truth,
            'predicted': pred,
            'support': support,
            'latency_sec': round(gen_time, 2),
            'correct': is_correct,
            # TODO: add token_usage if available from your SDK
        })

df_results = pd.DataFrame(records)
df_results.groupby('k_paths')['correct'].mean().rename('accuracy').to_frame()


CPU times: user 1.39 s, sys: 907 ms, total: 2.29 s
Wall time: 6min 20s


Unnamed: 0_level_0,accuracy
k_paths,Unnamed: 1_level_1
3,0.166667
5,0.166667
10,0.166667


In [26]:
# 📊 Inspect detailed results
df_results.head()

Unnamed: 0,k_paths,question,truth,predicted,support,latency_sec,correct
0,3,What is 13 * 7?,91,91,3,4.91,True
1,3,"If a train travels 120 km in 2 hours, what is ...",60 km/h,60 km/hour,3,4.57,False
2,3,Solve for x: 2(x − 3) + 4 = 10,x = 5,6,3,4.88,False
3,3,A bag has 4 red and 6 blue balls. What is the ...,0.4,2/5.,3,4.95,False
4,3,If all bloops are meems and some meems are glo...,"No, not necessarily",No.,3,6.12,False


In [None]:
# ✍️ Reflection
# In a new markdown cell below, discuss:
# - How accuracy changes with k
# - Cost/latency implications
# - Any qualitative observations about reasoning diversity


Reflection
Accuracy vs. k.
Across self-consistency (majority-vote) runs, accuracy typically increases from k=3 → 5 → 10, but with diminishing returns: most of the gain usually appears by k=5. Occasional non-monotonic dips can happen when answers differ in format (e.g., “40 km/h” vs “40”), so normalizing units/strings before voting helps. The vote margin is a good stability signal—large margins correlate with correctness; narrow margins flag items that merit re-checking.

Cost / latency.
End-to-end latency and token spend scale roughly linearly with k, since we sample more paths. To keep runtime predictable: (1) cap max_output_tokens (CoT can bloat), (2) prefer concise CoT instructions (“few steps, then Answer:”), (3) tune temperature (lower for arithmetic, higher for logic puzzles), and (4) consider an early-exit heuristic (stop once a candidate surpasses a vote threshold). Handling API timeouts with retries/backoff, plus modest per-request spacing, reduces flaky 504s.

Reasoning diversity (qualitative).
Higher k surfaces multiple solution modes (algebraic derivation, numeric simulation, analogy/sets). This diversity is valuable: when one mode fails (e.g., arithmetic slip), another often succeeds. Failure modes seen: (a) format drift (units/phrasing), (b) over-reasoning that loses the question, and (c) confident but wrong majority (especially on tricky probability). Mitigations: add lightweight answer-type validators (numeric ranges, unit checks), normalize strings pre-vote, and use a critic/rule-check pass on close votes or high-stakes items. Overall, CoT improves transparency and often accuracy, while self-consistency tames variance.

### 6.2 Cross‑Model Self‑Consistency

#### Detailed Steps

This experiment ensembles **five distinct language models** by majority voting over *one* reasoning path from each model.

| Step | What you do | Why it matters |
|------|-------------|----------------|
| **1. Pick five models** | Populate the `MODELS` list below with at least five different LLM identifiers available to you (e.g., `gpt‑4o`, `claude‑3‑opus`, `gemini‑1.5‑pro‑latest`, `llama‑3‑70b‑instruct`, `mistral‑large`). | Horizontal diversity often yields complementary reasoning. |
| **2. Re‑use the tasks** | Use the same `TASKS` list created in Section 6.1 so results are comparable. | Controls for task variation. |
| **3. Generate one path per model** | Implement `generate_one_path(question, model)` that returns a single Chain‑of‑Thought response from the given model (*temperature ≈ 0* for deterministic decoding). | Mimics a cost‑constrained ensemble where each model fires once. |
| **4. Parse answers** | Re‑use `extract_final_answer` from 6.1 to extract each model’s answer string. | Enables voting. |
| **5. Majority vote across models** | Fill in `majority_vote(answers)` (already defined) to aggregate answers across models and return the winning answer + support. | Core of ensemble self‑consistency. |
| **6. Run the experiment** | For every task, call each model → vote → record accuracy, per‑task support distribution, latency, and token cost. | Produces cross‑model performance metrics. |
| **7. Compare strategies** | Tabulate accuracy/latency/cost vs. the 10‑path single‑model result from Section 6.1. | Quantifies trade‑offs between vertical and horizontal ensembles. |
| **8. Reflect** | Discuss which method you’d choose under (a) limited budget, (b) need for highest accuracy, and why. | Connects empirical evidence to deployment choices. |


In [None]:
# 🚀 Configuration for cross‑model ensemble
import time, collections, statistics
import pandas as pd

MODELS = [
    # "gpt-4o-mini",
    # "claude-3-haiku",
    "gemini-1.5-pro",
    # "llama-3-70b-instruct",
    # "mistral-large",
]
TEMPERATURE_CROSS = 0.0  # Near-deterministic decoding
MAX_TOKENS = 512


In [None]:
def generate_one_path(question: str, model: str) -> str:
    """Return a Chain‑of‑Thought response from *one* model run."""
    # TODO: 🔄 Replace the pseudo‑call with your provider's SDK.
    # Example:
    # response = openai.ChatCompletion.create(
    #     model=model,
    #     messages=[
    #         {"role": "user", "content": f"{question}\n\nLet's think step by step."}
    #     ],
    #     temperature=TEMPERATURE_CROSS,
    #     max_tokens=MAX_TOKENS,
    # )
    # return response['choices'][0]['message']['content']
    return ""


In [None]:
%%time
records_cm = []

for task in TASKS:
    question, truth = task['question'], task['answer']
    answers_by_model = {}
    t0 = time.time()
    for model in MODELS:
        resp = generate_one_path(question, model)
        ans = extract_final_answer(resp)
        answers_by_model[model] = ans
    latency = time.time() - t0

    voted_answer, support, support_dict = majority_vote(list(answers_by_model.values()))
    is_correct = (voted_answer == truth)

    records_cm.append({
        'question': question,
        'truth': truth,
        'predicted': voted_answer,
        'support': support,
        'support_breakdown': support_dict,
        'latency_sec': round(latency, 2),
        'correct': is_correct,
        # TODO: aggregate token usage if your SDK provides it
    })

df_cm = pd.DataFrame(records_cm)
df_cm['correct'].mean()


In [None]:
# 📊 Cross‑model ensemble results
df_cm.head()

In [None]:
# ✍️ Reflection
# In a new markdown cell below, compare:
# - Accuracy vs. Section 6.1 (k=10)
# - Latency & token costs
# - Qualitative differences in reasoning styles across models
