# PRefLexOR Inference

In [1]:
!pip install git+https://github.com/lamm-mit/PRefLexOR.git --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [17]:
!pip install -U transformers



In [2]:
import importlib.util
import os

import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from PRefLexOR import *

# Opening and closing markers of the thinking section
think_start, think_end = '<|thinking|>', '<|/thinking|>'

### Load model

In [None]:
# Hugging Face Hub repository of the PRefLexOR model used for generation
model_name = 'lamm-mit/PRefLexOR_ORPO_DPO_EXO_10242024'

# Load the weights in bfloat16; device_map="auto" leaves device placement to the loader
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # optional, currently disabled
    device_map="auto",
    trust_remote_code=True,
)

# The tokenizer is pulled from its own repository; the slow (non-fast) variant is requested
tokenizer = AutoTokenizer.from_pretrained(
    'lamm-mit/meta-llama-Meta-Llama-3.2-3B-Instruct-Reasoning-Tokenizer',
    trust_remote_code=True,
    use_fast=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

### Inference: Conventional

In [None]:
# Question plus an explicit instruction to use the opening thinking token
txt = (
    'What is the relationship between materials and music? Brief answer.'
    + f' Use {think_start}.'
)

# Single-turn generation: empty system prompt and no prior conversation history
output_text, messages = generate_local_model(
    # model and tokenizer
    model=model,
    tokenizer=tokenizer,
    # conversation inputs
    prompt=txt,
    system_prompt='',
    messages=[],
    # sampling settings
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1.0,
    num_return_sequences=1,
    max_new_tokens=2024,
)

print(output_text)

#### Extract thinking or other sections from the output

In [None]:
# Text delimited by the opening and closing thinking tokens; element [0] of the result is used
thinking = extract_text(
    output_text,
    thinking_start=think_start,
    thinking_end=think_end,
)[0].strip()

# Text starting at the closing thinking token; "NONE" is passed as the end marker
# (presumably meaning "until the end of the output" -- TODO confirm against extract_text)
answer_only = extract_text(
    output_text,
    thinking_start=think_end,
    thinking_end="NONE",
).strip()

In [None]:
# Show the extracted thinking section
print("THINKING:\n\n", thinking)

In [None]:
# Show the text that follows the thinking section
print("ANSWER:\n\n", answer_only)

### Inference: Recursive using multi-agent reasoning

![image.png](attachment:93e29a12-e4b9-44a8-9291-b67fd31f5a62.png)

In [None]:
from PRefLexOR import recursive_response_from_thinking

#### Load a second model to act as the critic agent

In [None]:
# Base instruct model that plays the critic role in the recursive workflow.
# NOTE(review): meta-llama repositories on the Hub may require authentication -- confirm access.
model_name_base = "meta-llama/Llama-3.2-3B-Instruct"

# Request FlashAttention-2 only when the flash_attn package is importable.
# This notebook never installs it, and loading with attn_implementation="flash_attention_2"
# fails when the package is missing; otherwise fall back to the library default.
critic_attn_kwargs = (
    {"attn_implementation": "flash_attention_2"}
    if importlib.util.find_spec("flash_attn") is not None
    else {}
)

# Same dtype and device placement settings as the main model
critic_model = AutoModelForCausalLM.from_pretrained(
    model_name_base,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    **critic_attn_kwargs,
)

In [None]:
# Recursive reasoning run: `model` generates, `critic_model` is passed as the critic.
# Three values come back: a text, a list of outputs, and an integrated response.
output_text, output_list, output_text_integrated = recursive_response_from_thinking(
    # generator
    model=model,
    tokenizer=tokenizer,
    system_prompt="You are a helpful assistant.",
    temperature=0.1,
    # critic (reuses the generator's tokenizer)
    model_critic=critic_model,
    tokenizer_critic=tokenizer,
    system_prompt_critic="You carefully improve responses, with attention to detail, and following all directions.",
    temperature_improvement=0.1,
    # task and run settings
    question="Develop an idea of how graphene can be combined with silk fibers to create a filtration membrane.",
    N=3,
    verbatim=False,
)

In [None]:
# Print every entry of output_list under a numbered separator line
for i, entry in enumerate(output_list):
    print(f"i={i}", "-" * 64)
    print(entry)

In [None]:
# Show the integrated response returned by the recursive run, under a heading line
print("INTEGRATED RESPONSE:", output_text_integrated, sep="\n")