In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pickle
import os
import torch
from pathlib import Path

In [2]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        # Print CUDA details?
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated()/1024**2:.2f}MB")
        print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved()/1024**2:.2f}MB")
        return device
    else:
        return torch.device("cpu")

In [3]:
DEVICE = get_device()

In [4]:
def load_model():
    model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                                                trust_remote_code=True,
                                                torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

    model.to(DEVICE)
    return model, tokenizer

In [5]:
model, tokenizer = load_model()
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

---

these are the prompts for steering

In [6]:
REASONING_STEERING = {
    'system1_vs_system2': {
        'positive_examples': [  # System 2: Slow, Deliberative, Analytical
            "Let me think through this step by step. First, I need to identify the core question, then gather relevant information, evaluate options, and finally form a conclusion.",
            "This requires careful analysis. I'll break down the problem into its constituent parts: [Part A], [Part B], and [Part C], and then examine how they interrelate.",
            "I should examine multiple angles here. From a logical perspective..., from an ethical perspective..., and from a practical perspective, considering all implications.",
            "Let me work through the logic meticulously: if A implies B, and B is true given evidence C, then we can deduce A with a certain confidence.",
            "This is complex. I'll need to reason through each component carefully, ensuring each step logically follows from the previous one.",
            "Wait, let me reconsider my initial intuitive response. Upon further reflection, considering factors like [Factor X] and [Factor Y], a different, more reasoned approach seems more robust.",
            "Let me trace through the implications. If we assume X, then it logically follows that Y will occur, which in turn could lead to Z.",
            "I need to be precise here. The exact reasoning is as follows: Premise 1: All P are Q. Premise 2: R is P. Conclusion: Therefore, R is Q.",
            "Let me verify my logic: Is each premise sound? Does the conclusion necessarily follow from the premises? Are there any hidden assumptions?",
            "This warrants deeper analysis. I'll examine the underlying assumptions, such as the reliability of the data source and the validity of the model used.",
            "Hold on, I should think about this more carefully before answering, perhaps by outlining the arguments for and against.",
            "Let me approach this methodically. Starting from the foundational principles, I will build up the argument piece by piece.",
            "I need to consider several factors here. Factor one is [Factor A with its details], Factor two is [Factor B with its details], and how they interact.",
            "This isn't straightforward. Let me work through the complexities by creating a decision tree or a flow chart to map out the possibilities.",
            "Let me pause and think about what this really means in a broader context and what the second-order effects might be.",
            "I should cross-check my reasoning here. If my logic is correct, then applying it to a similar, simpler problem should yield a verifiable result.",
            "Let me formulate this properly. The key insight, derived from analyzing [Data Point 1] and [Principle P], is that [Conclusion C].",
            "This requires connecting multiple concepts. First, we have the principle of [Concept A]. Second, there's the theory of [Concept B]. Connecting these explains [Phenomenon P].",
            "I need to be careful about my assumptions. Let me state them explicitly: I am assuming [Assumption 1] and [Assumption 2] for this line of reasoning.",
            "Let me reason backwards from the desired outcome. If we want to achieve [Goal G], what prerequisite steps must be true, and what actions would lead to them?"
        ],
        'negative_examples': [  # System 1: Fast, Intuitive, Direct
            "The answer is obviously X, no need to dwell on it.",
            "It's simple - just do Y. I don't see any complexities here.",
            "That's easy! It's clearly Z.",
            "Off the top of my head, I'd say it's A without much thought.",
            "The quick answer, without digging deep, is B.",
            "Intuitively, this seems like C. My first feeling is usually right.",
            "My immediate thought is D. No need for a long explanation.",
            "Without overthinking it, I'd say E. Let's move on.",
            "The straightforward answer is F. Why complicate it?",
            "At first glance, this is clearly G. It just jumps out.",
            "My gut tells me H. I trust my instincts on this.",
            "The obvious choice is I. Any other option is over-analysis.",
            "Instantly, I know that J is the solution.",
            "No need to complicate this - it's simply K.",
            "The direct answer, without any fluff, is L.",
            "Simply put, it's M. End of story.",
            "My first instinct says N. I'll go with that.",
            "The clear solution is O. It's plain as day.",
            "Right away, I can tell you P. No hesitation.",
            "It's immediately apparent that Q is the case."
        ]
    },
    'analytical_depth': {
        'positive_examples': [  # Deep Analysis
            "Let's examine this systematically. First, I'll identify key variables (e.g., A, B, C), then hypothesize their relationships, and finally test these against available data, noting any covariances.",
            "This requires multi-step reasoning. Step 1: Define the problem space and constraints. Step 2: Generate potential solutions. Step 3: Evaluate each solution against criteria like feasibility, cost, and impact. Step 4: Select the optimal solution with justification.",
            "I need to consider edge cases and potential exceptions. For example, what if input X is negative, or if resource Y is unavailable?",
            "Let me construct a logical proof: Given axioms A (e.g., [Axiom A]) and B (e.g., [Axiom B]), and applying rule of inference R (e.g., Modus Ponens), we can formally derive conclusion C.",
            "To properly analyze this, I should consider counterfactuals: what if [Condition A] were different (e.g., if interest rates hadn't risen), how would that have changed [Outcome B]?",
            "Let me trace the causal chain: X (e.g., increased marketing spend) leads to Y (e.g., higher brand awareness) because of [mechanism M1], which in turn causes Z (e.g., sales growth) due to [mechanism M2].",
            "I'll apply formal reasoning here. Using the principle of [e.g., expected utility maximization], we can model the decision-making process under uncertainty.",
            "This needs rigorous analysis. Let me establish the constraints (e.g., budget < $10k, timeline < 3 months) and objective functions (e.g., maximize user engagement).",
            "I should validate each assumption before proceeding. For instance, the assumption that [Assumption A] holds true needs to be checked against [Data Source D].",
            "Let me derive this from first principles. Starting with the fundamental laws of [e.g., physics/economics], we can build up to explain this phenomenon.",
            "We need to examine the underlying mechanics of how this works, perhaps by looking at the component interactions at a granular level.",
            "Let me build a mental model of this system, identifying its key components, their states, and the rules governing their transitions and interactions.",
            "I'll analyze the dependencies and interactions between components. For example, component A provides input to B, and a failure in A will propagate to B and C.",
            "This calls for a comprehensive examination of all relevant factors, including [Factor 1], [Factor 2], and their potential second-order effects.",
            "Let me investigate the root causes and their effects, distinguishing between proximate causes and ultimate underlying issues.",
            "I need to understand the fundamental relationships at play, perhaps by mapping out a concept diagram or an influence network.",
            "Let's dive into the technical details and mechanisms, such as the specific algorithms used or the chemical pathways involved.",
            "I'll perform a thorough analysis of each element, considering its properties (e.g., strength, capacity), functions, interactions with other elements, and its contribution to the overall system behavior.",
            "This requires understanding the theoretical foundations, such as [Theory T1] and [Model M1], that underpin this problem domain.",
            "Let me explore the implications at multiple levels of abstraction, from the concrete operational details to the strategic conceptual overview."
        ],
        'negative_examples': [  # Surface Level
            "Generally speaking, these things tend to work out.",
            "In most cases, a simple approach is usually best.",
            "The rule of thumb here is just to [Simple Action].",
            "Common sense tells us that [Oversimplified Conclusion].",
            "It's widely known that [Generalization], so that's probably it.",
            "The typical answer would be to just focus on the main part.",
            "Usually, people just [Common Behavior] in this situation.",
            "The standard approach is [Standard Method], no need to reinvent the wheel.",
            "Conventionally, this is handled by [Conventional Step].",
            "Most would agree that it's basically about [Simple Idea].",
            "It's commonly understood that [Broad Statement].",
            "The usual way is to look at the obvious factors.",
            "Typically, you'd just consider the surface details.",
            "As everyone knows, the core issue is [Superficial Point].",
            "The normal response is to address the most apparent aspect.",
            "It's standard practice to not overcomplicate these things.",
            "The accepted view is that it's a straightforward matter.",
            "Ordinarily, one would just take it at face value.",
            "The conventional wisdom says to keep it simple.",
            "It's routine to just apply the basic solution here."
        ]
    },
    'metacognitive_awareness': {
        'positive_examples': [  # High Metacognition
            "I notice I'm making an assumption here – that [Assumption X] is true. Let me question whether there's evidence for this, or if alternative assumptions are more plausible.",
            "Wait, I should double-check my reasoning because the stakes are high, and a small error in logic could lead to a significant misjudgment.",
            "I'm uncertain about this part (e.g., the reliability of this data point), so let me reconsider its impact on my overall conclusion or seek clarification.",
            "My confidence in this conclusion is moderate (around 60-70%) because, while [Evidence A] supports it, [Evidence B] presents a counterpoint that I haven't fully reconciled.",
            "I recognize this could be biased by my prior experience with [Similar Situation Y], which might not be fully applicable here. Let me adjust by consciously looking for disconfirming evidence.",
            "I should acknowledge the limitations of my reasoning here; for instance, I'm relying on [Data Source Z], which might have its own inherent biases or incompleteness, affecting the generalizability.",
            "Let me reflect on whether my logic is sound. Are there any fallacies I might be committing, like a hasty generalization or a false dichotomy?",
            "I'm aware that my initial instinct to [Action A] might be wrong, so I will deliberately explore alternative actions [B and C].",
            "I notice a potential flaw in my thinking: I've equated correlation with causation here. Let me correct that by looking for mechanistic evidence.",
            "Upon reflection, I realize I need more information about [Specific Area S] before I can confidently proceed with this line of argument.",
            "I'm conscious that I might be oversimplifying a complex issue. Let me add nuance by considering [Factor F1] and [Factor F2].",
            "I should question my own premise here. What if [Premise P] is not universally true? How would that change my subsequent deductions?",
            "I detect a possible contradiction in my reasoning: if [Statement S1] is true, then [Statement S2] seems unlikely. I need to resolve this.",
            "My thinking might be influenced by the framing of the question. I should try to reframe it to see if my perspective changes.",
            "I realize I'm extrapolating beyond what I know for certain. This conclusion is more speculative and relies on [Assumption A] holding true in a new context.",
            "I should be more careful about this claim because the evidence is indirect and open to multiple interpretations.",
            "I'm noticing my reasoning relies heavily on the analogy with [Analogous Situation X]. Is this analogy strong enough to support the conclusion?",
            "Let me examine why I'm drawn to this particular conclusion. Are there any emotional factors or cognitive biases, like confirmation bias, at play?",
            "I should consider what I might be overlooking. Are there any alternative explanations or perspectives that I haven't explored yet?",
            "I need to be honest about the gaps in my understanding regarding [Topic T]. This means my conclusion about it is tentative."
        ],
        'negative_examples': [  # Low Metacognition
            "Obviously, this is the only correct answer, and there's no other way to see it.",
            "It's definitely true. I don't need to think twice.",
            "There's no doubt that my initial assessment is perfect.",
            "Everyone knows this is how it works. It's common knowledge.",
            "It's certain that this will succeed. Failure is not an option.",
            "Clearly, my logic is flawless and requires no review.",
            "Without question, this is the best approach. No debate needed.",
            "It's undeniable that this conclusion is the right one.",
            "Absolutely, I'm correct. I don't see any room for error.",
            "It goes without saying that this is the truth.",
            "Undoubtedly, my reasoning stands. No need to check.",
            "Surely, anyone can see this is the only way.",
            "It's self-evident that my perspective is accurate.",
            "Naturally, this is the conclusion one would arrive at.",
            "Of course, I've considered everything important.",
            "It's indisputable that this is the final word on the matter.",
            "Unquestionably, my judgment is sound here.",
            "It's a fact that this is how things are. No ifs, ands, or buts.",
            "Certainly, there are no flaws in this argument.",
            "It's plain to see that this is the only logical outcome."
        ]
    },
    'problem_decomposition': {
        'positive_examples': [  # Systematic Breakdown
            "Let me decompose this complex problem into more manageable subproblems: (1) Understanding user needs, (2) Designing the interface, (3) Developing the backend logic, and (4) Testing and deployment.",
            "I'll tackle this piece by piece. First component: Analyze the input data structure. Second component: Develop the processing algorithm. Third component: Format the output results.",
            "This breaks down into several parts. Part A involves gathering requirements. Part B is about resource allocation. Part C focuses on execution strategy.",
            "Let me isolate each variable and analyze its impact: Variable X (e.g., price) affects demand. Variable Y (e.g., advertising) affects awareness. Variable Z (e.g., quality) affects satisfaction.",
            "I'll solve this modularly. Module 1 will handle data ingestion. Module 2 will perform the core computation. Module 3 will be responsible for visualization.",
            "Breaking this down: the input is [describe specific input, e.g., a time series of stock prices]. The process involves [Step 1: data cleaning, Step 2: feature engineering, Step 3: model training]. The desired output is [describe specific output, e.g., a prediction for the next day's price].",
            "Let me separate the concerns: technically, we need to ensure system stability and scalability; practically, the solution must be user-friendly and cost-effective; ethically, we must consider data privacy and bias.",
            "I'll analyze each dimension independently: the temporal dimension (how it evolves over time), the spatial dimension (how it varies by location), and the causal dimension (what factors drive it).",
            "This has multiple layers. At the surface, it's an issue of [Surface Problem]. Deeper down, it relates to [Underlying Factor]. At the core, it's about [Fundamental Principle].",
            "Let me factor this problem: common elements across different instances are [A, B]. Unique aspects specific to this case are [C, D].",
            "I'll divide this project into phases: Phase 1: Initialization and planning. Phase 2: Development and iteration. Phase 3: Evaluation and conclusion.",
            "Let me segment this by stakeholder: from the customer's perspective, the priority is [X]. From the developer's view, it's [Y]. From management's standpoint, it's [Z].",
            "I'll break this into atomic operations: first, fetch the data; then, validate each record; next, transform specific fields; finally, store the result.",
            "This decomposes into: prerequisites (what must be in place before starting), the main process (the steps involved), and the expected outcomes (the measurable results).",
            "Let me partition this problem space: known elements that are well-understood, and unknown elements that require further investigation or assumptions.",
            "I'll structure this hierarchically: the high-level strategic goal is [Goal G]. This breaks down into mid-level tactical objectives [O1, O2]. The specific operational tasks for O1 include [T1a, T1b].",
            "Breaking into functional units: the input processing unit handles data acquisition and validation; the transformation unit applies the core logic; the output unit formats and delivers results.",
            "Let me separate static elements (e.g., system configuration, core algorithms) from dynamic elements (e.g., user input, real-time data streams).",
            "I'll decompose by abstraction level: conceptually, it's about [Abstract Idea]; logically, it's represented by [Logical Model]; physically, it's implemented with [Physical Components].",
            "This project splits into: problem definition (clearly stating what needs to be solved), solution space exploration (identifying possible approaches), and implementation planning (detailing the chosen approach)."
        ],
        'negative_examples': [  # Holistic/Gestalt
            "Looking at the big picture, the individual details don't really matter as much as the overall feel.",
            "Taking everything together, it just feels right/wrong.",
            "As a whole, this seems to be a single, indivisible issue.",
            "Overall, I'd say the essence of it is [Simple Summary], without needing to pick it apart.",
            "In general, let's not get lost in the weeds; the forest is more important than the trees.",
            "Broadly speaking, the main thrust is clear without detailed breakdown.",
            "The gist is simply [Core Idea]. No need for sub-points.",
            "All things considered, it boils down to one central theme.",
            "On balance, the overall impression is what counts the most.",
            "The main point is [X], and the rest are minor details.",
            "Viewing this holistically, the interconnectedness is more key than the parts.",
            "The overall impression is that it's a unified problem.",
            "Taking a bird's eye view, the specifics blur into a larger pattern.",
            "The general sense is that it's one cohesive entity.",
            "In the aggregate, the individual components lose their distinctiveness.",
            "The collective picture shows a single, dominant trend.",
            "Seen as a whole, its character is [Descriptive Adjective].",
            "The grand scheme suggests a singular driving force.",
            "From 30,000 feet, the details are not important for this discussion.",
            "The unified view is that it's all part of the same phenomenon."
        ]
    },
    'uncertainty_expression': {
        'positive_examples': [  # High Uncertainty Expression
            "I'm not entirely certain, but my best assessment, based on [Evidence X] and [Model Y], is that [Conclusion Z] has a probability of roughly 60-70%.",
            "With moderate confidence, I believe [Statement A], though I acknowledge that [Factor F] introduces a degree of unpredictability that could alter this outcome.",
            "There's some uncertainty here, particularly regarding [Specific Aspect S], but I think the most likely scenario is [Scenario L].",
            "I could be wrong, as the data is somewhat ambiguous, but it seems like [Observation O] is the most plausible interpretation.",
            "Based on limited information and several assumptions, I'd estimate the value to be in the range of [X to Y].",
            "I'm about 70% confident that [Prediction P] will occur, meaning there's a 30% chance it won't.",
            "This is my tentative conclusion, subject to revision if new data, such as [Data D], becomes available.",
            "I'm somewhat unsure about the precise impact, but I'm leaning towards it being [Direction D, e.g., positive but small].",
            "With the caveat that I might be mistaken due to [Reason R, e.g., incomplete historical data], my current hypothesis is [Hypothesis H].",
            "My provisional answer, pending more thorough analysis of [Factor F], is that we should [Action A].",
            "I have medium confidence in this assessment because while [Supporting Point S] is strong, [Weakness W] is a concern.",
            "This is an educated guess rather than a certainty, derived from [Heuristic H] and [Pattern P].",
            "I'd say probably, but I'm not completely sure because [Variable V] is highly volatile.",
            "My uncertainty is high here, especially concerning [Element E], but if pressed for a direction, I'd suggest [Direction D].",
            "Take this with a grain of salt, as it's based on preliminary findings, but it appears that [Finding F].",
            "I'm making an informed conjecture that [Outcome O] will happen, based on the trend lines, but acknowledge unforeseen events could change this.",
            "My confidence interval for this estimate is quite wide, from [Lower Bound] to [Upper Bound], though the center is around [Value V], because [Reason R].",
            "This is speculative, but plausible given [Condition C1] and [Condition C2]; however, it's far from confirmed.",
            "I'm hedging here because the evidence is mixed: [Supporting Evidence S] points one way, while [Contradictory Evidence C] suggests another, so a firm conclusion is difficult.",
            "It's difficult to say with certainty, but it's likely that [Event E] will occur, assuming current trends continue."
        ],
        'negative_examples': [  # Low Uncertainty Expression
            "I'm completely certain that this is the correct path.",
            "There's zero doubt that this will be the outcome.",
            "I'm 100% confident that my analysis is accurate.",
            "It's guaranteed that this strategy will succeed.",
            "I know for a fact that this information is true.",
            "Unequivocally, the answer is X, without any ambiguity.",
            "With absolute certainty, I can state this will happen.",
            "I can state definitively that there are no other viable options.",
            "Beyond any shadow of doubt, this is the reality.",
            "I'm totally sure that this is the best course of action.",
            "It's ironclad that this conclusion cannot be refuted.",
            "I have perfect confidence that this plan will work flawlessly.",
            "Without any reservation, I endorse this finding completely.",
            "I'm utterly convinced that this is the only explanation.",
            "It's incontrovertible that these are the facts of the matter.",
            "I can guarantee you that this will yield the desired results.",
            "With complete assurance, I predict this outcome.",
            "I'm dead certain that there are no errors in this logic.",
            "There's no possibility of error here; it's foolproof.",
            "I stake my reputation on this being the correct assessment."
        ]
    },
    'evidence_based_reasoning': {
        'positive_examples': [  # Evidence-Heavy
            "The data from [Specific Study S, e.g., the Framingham Heart Study] shows that... According to the statistics published by [Source X, e.g., the Bureau of Labor Statistics]...",
            "Based on the evidence available, including [Document D], [Testimony T], and [Empirical Finding E from Experiment Z], the facts indicate that [Conclusion C].",
            "Research published in [Journal J, e.g., Nature] by [Author A] demonstrates that [Finding F] through a randomized controlled trial involving [N] participants, with a p-value of [p].",
            "The empirical findings from our recent survey suggest... Observations from the field study confirm that [Observation O] occurs in [Percentage P]% of cases.",
            "Looking at the documented cases in the [Database D]... The historical records from [Archive A] show a consistent pattern of...",
            "Experimental results from [Lab L] indicate... The measurements taken using [Instrument I] reveal a significant difference between groups.",
            "According to peer-reviewed sources like [Source S1] and [Source S2]... The literature consistently states that [Established Fact F].",
            "The quantitative analysis of [Dataset Q] shows a strong positive correlation (r=0.85)... The numbers don't lie; there's a clear trend.",
            "Based on reproducible experiments conducted in three separate labs... The controlled studies, which isolated variable V, showed...",
            "The meta-analysis by [Researcher X et al., Year Y], which synthesized results from [Number N] studies, concludes that [Overall Effect E] with a confidence interval of [CI].",
            "Cross-referencing multiple sources, such as [Source 1], [Source 2], and [Independent Report R], independent verification shows a consistent pattern of [Observation O].",
            "The longitudinal data collected over [Time Period T] indicates a shift in... Time-series analysis reveals a cyclical pattern with a period of [Period P].",
            "Statistically significant findings (p < 0.01) show a difference between the treatment and control groups... The p-value suggests the null hypothesis can be rejected.",
            "The correlation data implies a strong relationship between X and Y... Regression analysis indicates that X accounts for [R-squared value]% of the variance in Y.",
            "Field observations consistently show that when [Condition C] is present... The case studies of [Company A] and [Company B] demonstrate the effectiveness of [Strategy S].",
            "The experimental protocol (detailed in Appendix A) yielded these results... Laboratory results from the mass spectrometry confirm the presence of [Compound C].",
            "The survey data from [N] respondents reveals that [Percentage P]% prefer option A... Polling conducted by [Organization O] indicates a shift in public opinion.",
            "The archaeological evidence, including [Artifact A] and [Site S], points to... Historical records, such as [Document D], confirm this sequence of events.",
            "The clinical trials for [Drug D] demonstrated a [Effect E] in [Percentage P]% of patients... Medical data from patient charts shows...",
            "The simulation results, using [Model M] with parameters [P1, P2], suggest... Modeling predicts that [Outcome O] is likely under these conditions."
        ],
        'negative_examples': [  # Opinion/Intuition-Heavy
            "I feel like this is the right direction. My sense is that it will work out.",
            "In my opinion, without specific data, I believe that we should proceed with X.",
            "It seems to me that this is the most logical choice, based on my general understanding.",
            "My gut says this is a winner. Instinctively, I'm drawn to this solution.",
            "I'd imagine that most people would agree with this perspective.",
            "My hunch is that this will be successful, even if I can't point to hard evidence yet.",
            "It strikes me that this is the obvious path forward. My impression is strong.",
            "I'd venture to say, personally, I think this is the best option on the table.",
            "My intuition tells me we're on the right track. I have a good feeling about this.",
            "I'd speculate that the outcome will be positive. My view is optimistic.",
            "It appears to me that this is a sound idea. I'd assume it will be well-received.",
            "My personal take is that this approach has the most merit, from what I've gathered.",
            "I'm inclined to think this will work, based on my past experiences in similar, though not identical, situations.",
            "I'd hazard a guess that the results will be favorable. I fancy this approach.",
            "My subjective view is that this option feels the most promising.",
            "Based on my experience over the years... From what I've seen anecdotally...",
            "My perspective is that this is a low-risk, high-reward scenario.",
            "If you ask me, and this is just my personal judgment, I'd say go for it.",
            "My judgment is that this is the most sensible way. I perceive it to be correct.",
            "To my mind, this is clearly the superior choice. As I see it, there's no contest."
        ]
    },
    'temporal_reasoning': {
        'positive_examples': [  # Sequential/Temporal Focus
            "First, we need to [Action A] to establish a baseline. Then, after that, [Action B] will occur, which is expected to lead to [Intermediate Outcome O1] by [Time T1]. Finally, [Action C] will complete the process.",
            "The sequence of events is crucial: initially, [Event E1] happened at [Time T0]. Subsequently, [Event E2] followed at [Time T1], ultimately leading to [Result R] at [Time T2].",
            "Looking at the timeline: at T0, the system was in [State S0]. By T1, due to [Event E1], it transitioned to [State S1]. We project it will reach [State S2] by T2.",
            "The chronological order is important here. Starting with the earliest event, [Event A], we then observe [Event B], followed by [Event C].",
            "Let me trace the evolution of this system: in its early stage (e.g., Q1), it exhibited [Characteristic C1]. During the middle phase (e.g., Q2), it developed [Characteristic C2]. The final state (e.g., Q3) showed [Characteristic C3].",
            "The process unfolds as follows: beginning with user registration, progressing to profile completion, and concluding with service access.",
            "Historically, this concept developed from [Origin O] in [Year Y1], through significant modifications in [Year Y2], to its current form.",
            "The causal sequence is: trigger [T] initiated the process. This led to intermediate steps [S1, S2, S3]. The final outcome was [O], observed after a delay of [Duration D].",
            "Over time, we see distinct phases: short-term impacts (first week) included [Impact I1]. Medium-term consequences (1-3 months) were [Consequence C1]. Long-term effects (1 year+) are projected to be [Effect E1].",
            "The stages are clearly defined: Stage 1 is preparation (e.g., gathering materials). Stage 2 is execution (e.g., building the model). Stage 3 is follow-up (e.g., testing and validation).",
            "Tracking the progression: the genesis of the idea was [Date D1]. Development occurred between [Date D2] and [Date D3]. Maturation and deployment happened after [Date D4].",
            "The workflow proceeds from the input stage (data entry), through the processing stage (algorithmic transformation), to the output stage (report generation).",
            "Following the critical path: Milestone 1 (e.g., design approval) must be completed by [Date M1] before Milestone 2 (e.g., prototype development) can begin, aiming for completion by [Date M2].",
            "The lifecycle consists of: birth (product launch), growth (market adoption), maturity (peak sales), and decline (obsolescence or replacement).",
            "Step-by-step: the prerequisite is to have [Resource R] available. The main action is to perform [Task T]. The direct consequences will be [C1, C2].",
            "The temporal dependencies are critical: first, this [Action A1] must happen and complete before [Action A2] can commence.",
            "In chronological sequence: the past state was characterized by [Feature F1]. The present state shows [Feature F2]. The anticipated future state, based on current trends, is [Feature F3].",
            "The order of operations matters significantly: primary calculations must be done first, then secondary adjustments, and finally tertiary checks.",
            "Phase by phase: the initiation phase involves defining scope. The elaboration phase details requirements. The construction phase builds the solution. The termination phase closes the project.",
            "The schedule unfolds with immediate actions due to be completed today, near-term goals for this quarter, and long-term strategic objectives for the next 2-3 years."
        ],
        'negative_examples': [  # Atemporal/Simultaneous
            "All factors being equal, and considered at a single point in time...",
            "Looking at this holistically, as if all elements exist simultaneously...",
            "These elements (A, B, C) coexist and interact within the current system state.",
            "Everything happens together; there's no specific sequence to these interacting forces.",
            "Viewing this as a snapshot, the current configuration is what matters.",
            "In the current state, irrespective of how it was reached, we observe...",
            "Taking a cross-sectional view at this moment, the relationships are...",
            "These processes (X, Y, Z) occur in parallel, influencing each other concurrently.",
            "Simultaneously, we have factor A impacting B, while B also impacts A.",
            "At this moment in time, the system exhibits these characteristics.",
            "In the static view, the structure is composed of these interconnected parts.",
            "These are concurrent conditions, all present at once.",
            "Looking at the steady state, where dynamic changes have settled...",
            "In equilibrium, all these forces balance each other out.",
            "These are independent of timing; their relationships hold regardless of sequence.",
            "Regardless of the sequence of their introduction, their combined effect is...",
            "Order doesn't matter here; the presence of all components is key.",
            "These variables exist side by side, forming the current context.",
            "Time-invariant aspects of this system include its core principles.",
            "In the timeless perspective, these are fundamental truths of the system."
        ]
    },
    'causal_reasoning': {
        'positive_examples': [  # Strong Causal Links
            "This policy (X) causes that outcome (Y) because it directly alters [Mechanism M, e.g., incentive structures], which in turn produces [Effect E, e.g., changed behavior].",
            "The direct effect of X (e.g., increased temperature) on Y (e.g., reaction rate) is well-established through [Physical Law P].",
            "Due to A (e.g., the introduction of a predator), we observe B (e.g., a decline in prey population) as a direct consequence.",
            "The causal mechanism is: input [I] is processed by [Component C1] which transforms it into [Intermediate State S1]; [Component C2] then acts on S1, resulting in output [O]. Each step has a direct and traceable influence.",
            "This leads to that through the following pathway: [Step 1] triggers [Step 2], which then activates [Step 3], culminating in the observed result.",
            "The root cause analysis reveals that [Event X] was the primary trigger, leading to [Consequence Y] because it disrupted [System S] in [Specific Way W], not merely correlated with it.",
            "X (e.g., having the gene) is a necessary condition for Y (e.g., expressing the trait) because without X, Y cannot occur. It is sufficient if, given X, Y invariably follows.",
            "The causal chain is: trigger [T] → intermediate effect [IE1] → intermediate effect [IE2] → final result [R].",
            "This intervention (e.g., therapy) influences that outcome (e.g., symptom reduction) by means of strengthening coping mechanisms.",
            "The cause-and-effect relationship shows that increasing [Variable A] directly results in a proportional increase in [Variable B].",
            "A (e.g., a specific enzyme) produces B (e.g., a product) through the well-understood biochemical mechanism of catalysis.",
            "The dependency graph clearly shows X → Y → Z, indicating a directed causal flow from X to Z via Y.",
            "This initial change triggers a cascade where A causes B, B causes C, and C leads to a widespread system change.",
            "The proximate cause of the crash was pilot error, while the ultimate cause involved systemic issues in training protocols.",
            "The intervention at point X (e.g., adding a catalyst) changes Y (e.g., speeds up the reaction) because it lowers the activation energy.",
            "Breaking the causal link between A (e.g., smoking) and B (e.g., lung cancer) by eliminating A would demonstrably reduce instances of B.",
            "The counterfactual analysis shows that if X (e.g., the new law) had not been implemented, then Y (e.g., the observed decrease in crime) would not have occurred, strongly implying X's causal role.",
            "Path analysis indicates a direct causal effect of [Factor F1] on [Outcome O] and an indirect effect mediated through [Variable V].",
            "Manipulating this independent variable (e.g., drug dosage) directly affects that dependent outcome (e.g., patient recovery rate) via a known pharmacological pathway.",
            "The structural equation model suggests X directly causes Y with a path coefficient of [Value], and this is not explained by a common cause Z."
        ],
        'negative_examples': [  # Correlation/Association
            "These two factors, A and B, tend to go together, but we can't say one causes the other.",
            "There's a strong association between ice cream sales and crime rates, but no direct causation.",
            "These are correlated (e.g., height and vocabulary in children), but likely due to a common underlying factor (age).",
            "We often see these events occur together, suggesting a link, but not necessarily a causal one.",
            "There's a statistical relationship between X and Y, but inferring causality would be premature without further study.",
            "These phenomena coincide frequently, which is interesting, but doesn't prove one leads to the other.",
            "A statistical connection exists (e.g., a positive covariance), but the direction of causality is unclear or could be spurious.",
            "These variables (e.g., studying hours and grades) cluster together, but other factors are also at play.",
            "We observe co-occurrence of A and B in many datasets, but a third variable C might be influencing both.",
            "These patterns align in many cases, but this alignment could be coincidental or driven by external trends.",
            "There's a correspondence between the rise of X and the rise of Y, but this doesn't mean X caused Y.",
            "These appear linked, but it could be that Y causes X, or that Z causes both X and Y.",
            "A pattern emerges where A is high when B is high, but this is purely an observed regularity so far.",
            "These two metrics track together over time, but this parallel movement doesn't establish a cause-effect bond.",
            "We notice a concordance between these findings, but they might be independent responses to a shared environment.",
            "These show similar trends across different regions, which is suggestive but not conclusive of causation.",
            "There's an affinity between these concepts in the literature, often discussed together, but not always causally linked.",
            "These exhibit parallelism in their development, but one might not be the driver of the other.",
            "A resemblance exists between the behavior of system A and system B, but they might be analogous rather than causally connected.",
            "These demonstrate synchrony in their fluctuations, but this could be due to a common external pacemaker rather than direct influence."
        ]
    },
    'abstraction_level': {
        'positive_examples': [  # High Abstraction
            "At the conceptual level, this problem can be understood as an instance of resource allocation under constraints, a fundamental economic principle.",
            "The abstract principle here is the 'Tragedy of the Commons,' which explains why shared resources are often overexploited.",
            "Generalizing this pattern observed in [Case 1: Market X], [Case 2: Ecosystem Y], and [Case 3: Social Network Z], we can formulate a theoretical framework of network effects.",
            "The theoretical framework of [e.g., Game Theory] suggests that actors will behave [Behavior B] under conditions of [Condition C].",
            "At a high level of abstraction, this is a question of balancing exploration (seeking new options) versus exploitation (using known good options).",
            "The underlying archetype here is the 'Hero's Journey,' which can be seen in narratives across diverse cultures and times.",
            "The meta-principle at work is that complex adaptive systems often exhibit emergent behavior not predictable from their individual components.",
            "From a philosophical perspective, this touches upon questions of determinism versus free will in decision-making systems.",
            "The categorical imperative, if applied here, would imply that [Action A] is universally right or wrong.",
            "In terms of universal laws, such as the conservation of energy, this transformation must adhere to [Constraint X].",
            "The abstract model, represented by $Y = f(X_1, X_2, ..., X_n)$, captures the essential relationships between these conceptual variables.",
            "At the paradigmatic level, this represents a shift from a [Old Paradigm P1] to a [New Paradigm P2] way of thinking.",
            "The overarching theme is the tension between individual autonomy and collective well-being.",
            "In the realm of pure concepts, this can be viewed as an interaction between 'Order' and 'Chaos'.",
            "The idealized version of this system, stripped of all confounding real-world complexities, would operate according to [Idealized Rule R].",
            "From a systems thinking view, these seemingly disparate events ([E1], [E2]) are manifestations of underlying feedback loops, specifically a [Reinforcing/Balancing Loop L] within the larger [System S].",
            "This belongs to the general class of problems known as NP-hard, implying certain computational limitations.",
            "At the ontological level, we are questioning the fundamental nature of [Entity E] and its properties.",
            "The formal representation of this logic, using predicate calculus, is [Formal Statement F].",
            "In the abstract space of all possible solutions, we are searching for an optimum within a defined region."
        ],
        'negative_examples': [  # Concrete/Specific
            "In this specific instance, with customer ID 789, the purchase amount was $52.30 on May 14th via credit card.",
            "The concrete example shows that when I press this specific button (B1) on this device (Model M2), the green LED (L3) lights up.",
            "Looking at this particular case study of Company X, their Q3 revenue was Y million, driven by product Z.",
            "The tangible manifestation of this problem is the crack in this specific beam (Beam #4B) of the bridge.",
            "In practical terms, this means we need to order 200 more units of SKU #12345 by tomorrow.",
            "The physical implementation involves a Raspberry Pi connected to a temperature sensor (DHT22) sending data via Wi-Fi.",
            "This specific scenario demonstrates: if User A inputs 'test@example.com' and password '123', they should be logged in.",
            "The actual instance reveals that the software crashed when processing file 'input_large.csv' which is 5GB.",
            "In this real-world example from our factory floor, Machine M5 failed due to a worn bearing (Part #P678).",
            "The particular details show that the error message 'Error Code 503' appeared at 10:15 AM on Server S2.",
            "Looking at the nuts and bolts, this resistor (R5) on the circuit board has a value of 10k Ohms.",
            "The hands-on approach requires us to physically inspect each of the 50 widgets for defect X.",
            "In this exact situation, the patient's blood pressure reading was 145/92 mmHg at 9:05 AM, using the Omron Model X device.",
            "The literal interpretation of this instruction is to 'turn the knob clockwise by 90 degrees'.",
            "The specific mechanism involves this gear (G3) meshing with that pinion (P2) to transmit torque.",
            "Down to the details, the pixel at coordinate (150, 300) has an RGB value of (255, 0, 0).",
            "The precise implementation uses Python version 3.9.1 and the 'requests' library version 2.25.1.",
            "In this concrete case, the student, John Doe, scored 85% on the exam taken on March 3rd.",
            "The particular instantiation of this class has its 'name' attribute set to 'ObjectAlpha' and 'value' to 100.",
            "At the ground level, this means the delivery truck (License Plate XYZ123) needs to go to 123 Main Street."
        ]
    }
}


In [7]:
print(REASONING_STEERING['abstraction_level']['positive_examples'][2])

Generalizing this pattern observed in [Case 1: Market X], [Case 2: Ecosystem Y], and [Case 3: Social Network Z], we can formulate a theoretical framework of network effects.


In [8]:
def prepare_prompt(prompt: str, system_prompt: str, tokenizer) -> str:
    """Format the prompt using the chat template if available"""
    if hasattr(tokenizer, 'apply_chat_template'):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        return formatted_prompt
    return prompt

In [None]:
def get_activations(model,
                    tokenizer,
                    text: str,
                    system_prompt: str):
    """Extract activations from all layers"""
    # Prepare the prompt using your existing function
    formatted_prompt = prepare_prompt(text, system_prompt, tokenizer)
    
    # Tokenize and move to the appropriate device
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    
    # Get model outputs with hidden states
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    
    # Extract hidden states from each layer
    hidden_states = outputs.hidden_states
    
    return hidden_states

In [None]:
def add_steering_hook(steering_vector, strength: float):
    """Create hook function to add steering vector"""
    def hook(module, input, output):
        # output is tuple: (hidden_states, attention_weights)
        hidden_states = output[0]
        
        # Expand steering vector to match batch size and move to the same device as hidden_states
        batch_size = hidden_states.shape[0]
        steering = steering_vector.to(hidden_states.device).unsqueeze(0).expand(batch_size, -1)
        
        # Add steering to all token positions
        steered_hidden_states = hidden_states + strength * steering.unsqueeze(1)
        
        # Return modified output
        return (steered_hidden_states,) + output[1:]
    
    return hook

In [9]:
def generate_steered(model, 
                    tokenizer,
                    prompt: str,
                    steering_vector,
                    strength: float, 
                    layer_id: int,
                    system_prompt: str,
                    max_new_tokens: int = 256):
    """Generate text with steering applied"""
    # Prepare prompt
    formatted_prompt = prepare_prompt(prompt, system_prompt, tokenizer)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)
    
    # Make sure steering vector is on CPU until needed by the hook
    # (the hook will move it to the correct device when used)
    steering_vector_cpu = steering_vector.cpu()
    
    # Register hook on specified layer
    hook_fn = add_steering_hook(steering_vector_cpu, strength)
    handle = model.model.layers[layer_id].register_forward_hook(hook_fn)
    
    try:
        # Generate with steering
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        
        # Decode output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Remove the input prompt from output
        if formatted_prompt in generated_text:
            generated_text = generated_text[len(formatted_prompt):].strip()
        
        return generated_text
    
    finally:
        # Always remove hook
        handle.remove()

In [None]:
def compute_and_save_steering_vector(model, tokenizer, 
                                    positive_examples: list, 
                                    negative_examples: list,
                                    layer_id: int,
                                    save_name: str,
                                    system_prompt: str = None,
                                    save_dir: str = "steering_vectors"):
    """Compute steering vector from multiple examples and save it"""
    
    # Create save directory if it doesn't exist
    Path(save_dir).mkdir(exist_ok=True)
    
    all_positive_acts = []
    all_negative_acts = []
    
    # Collect activations for all positive examples
    for example in positive_examples:
        acts = get_activations(model, tokenizer, example, system_prompt)
        act_at_layer = acts[layer_id][0, -1, :].detach().cpu()
        all_positive_acts.append(act_at_layer)
    
    # Collect activations for all negative examples
    for example in negative_examples:
        acts = get_activations(model, tokenizer, example, system_prompt)
        act_at_layer = acts[layer_id][0, -1, :].detach().cpu()
        all_negative_acts.append(act_at_layer)
    
    # Average the activations
    avg_positive = torch.stack(all_positive_acts).mean(dim=0)
    avg_negative = torch.stack(all_negative_acts).mean(dim=0)
    
    # Compute steering vector
    steering_vector = avg_positive - avg_negative
    
    # Save the steering vector as a .pt file
    save_path = os.path.join(save_dir, f"{save_name}_layer{layer_id}.pt")
    torch.save(steering_vector, save_path)
    
    # Save metadata separately
    metadata_path = os.path.join(save_dir, f"{save_name}_layer{layer_id}_metadata.pkl")
    metadata = {
        'layer_id': layer_id,
        'positive_examples': positive_examples,
        'negative_examples': negative_examples,
        'save_name': save_name
    }
    
    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)
    
    print(f"Saved steering vector to {save_path}")
    return steering_vector

In [None]:
def load_steering_vector(save_name, layer_id=None, save_dir="steering_vectors"):
    """Load a saved steering vector with the new naming convention"""
    # Check if directory exists
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    if layer_id is not None:
        # Use specified layer_id
        save_path = os.path.join(save_dir, f"{save_name}_layer{layer_id}.pt")
        metadata_path = os.path.join(save_dir, f"{save_name}_layer{layer_id}_metadata.pkl")
    else:
        # Try to find any matching file with the save_name prefix
        import glob
        pattern = os.path.join(save_dir, f"{save_name}_layer*.pt")
        matches = glob.glob(pattern)
        
        if not matches:
            raise FileNotFoundError(f"No steering vector files found for {save_name}")
        
        # Use the first matching file
        save_path = matches[0]
        # Extract layer number from filename
        import re
        layer_match = re.search(r'_layer(\d+)\.pt$', save_path)
        layer_num = int(layer_match.group(1)) if layer_match else 29  # Default to 29 if no match
        
        metadata_path = os.path.join(save_dir, f"{save_name}_layer{layer_num}_metadata.pkl")
    
    vector = torch.load(save_path, map_location=DEVICE)
    
    with open(metadata_path, 'rb') as f:
        metadata = pickle.load(f)
    
    print(f"Loaded steering vector from {save_path}")
    
    return vector, metadata.get('layer_id', layer_id or 29)

In [None]:
# Function to create all steering vectors for all layers
def create_all_steering_vectors_all_layers(model, tokenizer, system_prompt: str = None):
    """Create and save steering vectors for all reasoning types across all layers"""
    
    # Determine the number of layers in the model
    num_layers = len(model.model.layers)
    
    for name, config in REASONING_STEERING.items():
        print(f"\nCreating steering vectors for: {name}")
        
        # For each layer in the model
        for layer_id in range(num_layers):
            print(f"Processing layer {layer_id}")
            
            compute_and_save_steering_vector(
                model=model,
                tokenizer=tokenizer,
                positive_examples=config['positive_examples'],
                negative_examples=config['negative_examples'],
                layer_id=layer_id,
                save_name=name,
                system_prompt=system_prompt
            )

In [None]:
# To load a specific steering vector for a specific layer:
# vector, layer_id = load_steering_vector("analytical", layer_id=15)

# File naming format:
# - Steering vectors: {save_dir}/{save_name}_layer{layer_id}.pt
# - Metadata: {save_dir}/{save_name}_layer{layer_id}_metadata.pkl

# create_all_steering_vectors_all_layers(model, tokenizer)

only uncomment this part if you change the prompts; otherwise, the vectors for steering are stored in `steering_vectors/`.

---

In [11]:
# Function to apply multiple dimensions
def multi_dimensional_reasoning(model, 
                                tokenizer, 
                                prompt: str, 
                                system_prompt: str = None,
                                max_new_tokens: int = 256,
                                steering_direction: dict = None):
    """
    Apply multiple reasoning dimensions with fine-grained control
    
    Args:
        model: The language model
        tokenizer: The tokenizer
        prompt: The input prompt
        dimension_configs: Dictionary with structure {dimension_name: {"strength": float, "layer_id": int}}
    """
    
    # Use provided configs or empty dict
    configs = steering_direction or {}
    
    # Apply each dimension sequentially (you could also combine them)
    response = prompt
    for steering_direction, config in configs.items():
        strength = config.get("strength", 0.0)
        layer_id = config.get("layer_id", 0)
        
        if abs(strength) > 0.1:  # Only apply meaningful steering
            try:
                vector, layer_id = load_steering_vector(save_name=steering_direction, layer_id=layer_id)
                response = generate_steered(model, 
                                            tokenizer,
                                            prompt=prompt,
                                            steering_vector=vector,
                                            strength=strength, 
                                            layer_id=layer_id,
                                            system_prompt=system_prompt,
                                            max_new_tokens=max_new_tokens
                )
            except FileNotFoundError as e:
                print(f"Warning: Could not load steering vector for {steering_direction}: {e}")
                continue
    
    return response

In [12]:
steering_control = {
    # "deliberative_reasoning": { 
    #     "strength": 1.5,
    #     "layer_id": 20
    # },
    # "deep_analysis": { 
    #     "strength": 2.0,
    #     "layer_id": 25
    # },
    # "metacognitive_reasoning": {
    #     "strength": 1.7,
    #     "layer_id": 22
    # },
    # "systematic_decomposition": {
    #     "strength": 1.6,
    #     "layer_id": 20
    # },
    # "cautious_expression": {
    #     "strength": 1.3,
    #     "layer_id": 18
    # },
    # "evidence_driven_reasoning": {
    #     "strength": 1.8,
    #     "layer_id": 24
    # },
    # "sequential_temporal_reasoning": {
    #     "strength": 1.4,
    #     "layer_id": 19
    # },
    # "causal_inference": {
    #     "strength": 1.9,
    #     "layer_id": 26
    # },
    "system1_vs_system2": {
        "strength": -2.5,
        "layer_id": 23
    }
}


prompt = "what is the capital of france?"

output = multi_dimensional_reasoning(model=model,
                            tokenizer=tokenizer,
                            prompt=prompt,
                            max_new_tokens = 1024,
                            steering_direction= steering_control
                            )

print(output)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded steering vector from steering_vectors/system1_vs_system2_layer23.pt
None<｜User｜>what is the capital of france?<｜Assistant｜><think>
I know that France is a country in Europe, and its capital is Paris.
</think>

The capital of France is Paris.


In [13]:
steering_control = {
    # "deliberative_reasoning": { 
    #     "strength": 1.5,
    #     "layer_id": 20
    # },
    # "deep_analysis": { 
    #     "strength": 2.0,
    #     "layer_id": 25
    # },
    # "metacognitive_reasoning": {
    #     "strength": 1.7,
    #     "layer_id": 22
    # },
    # "systematic_decomposition": {
    #     "strength": 1.6,
    #     "layer_id": 20
    # },
    # "cautious_expression": {
    #     "strength": 1.3,
    #     "layer_id": 18
    # },
    # "evidence_driven_reasoning": {
    #     "strength": 1.8,
    #     "layer_id": 24
    # },
    # "sequential_temporal_reasoning": {
    #     "strength": 1.4,
    #     "layer_id": 19
    # },
    # "causal_inference": {
    #     "strength": 1.9,
    #     "layer_id": 26
    # },
    "system1_vs_system2": {
        "strength": 2.5,
        "layer_id": 23
    }
}


prompt = "what is the capital of france?"

output = multi_dimensional_reasoning(model=model,
                            tokenizer=tokenizer,
                            prompt=prompt,
                            max_new_tokens = 1024,
                            steering_direction= steering_control
                            )

print(output)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded steering vector from steering_vectors/system1_vs_system2_layer23.pt
None<｜User｜>what is the capital of france?<｜Assistant｜><think>
Okay, so I need to figure out the capital of France. I remember from my classes that France is a country in the world, and I've heard people talk about Paris before. I think the capital of France is a well-known city, so I'm pretty sure it's a major place. Let me try to recall. I remember learning about some capitals of countries, but I'm not sure about France. Maybe I can think of some other capitals to help me. For example, the capital of Germany is Berlin, right? And I think the capital of the US is Washington D.C. So, for France, I'm trying to remember. I've heard of Paris being a big city with lots of historic places, like the Eiffel Tower and the Louvre. Those are very famous landmarks. So, if those are in Paris, then maybe Paris is the capital. I should also consider the language. France uses French, so the capital should be a major city in th