<a href="https://colab.research.google.com/github/Jalam1001/huggingface/blob/main/Copy_of_LFM2_MoE_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qqq git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6 --progress-bar off

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer.
# device_map="auto" lets the library decide where the weights are placed;
# the weights are loaded in bfloat16.
model_id = "LiquidAI/LFM2-8B-A1B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype="bfloat16",
#    attn_implementation="flash_attention_2" <- uncomment on compatible GPU
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Generate answer
prompt = "What is C. elegans?"

# Ask for a dict explicitly so the result always carries both `input_ids`
# and `attention_mask`, independent of the library's default return type.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,
    return_tensors="pt",
    tokenize=True,
    return_dict=True,
).to(model.device)

# Forward the whole encoding so generate() receives the attention mask
# instead of having to infer it from the token ids.
output = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.3,
    min_p=0.15,
    repetition_penalty=1.05,
    max_new_tokens=512,
)

# Decodes the full sequence (prompt included) and keeps the special tokens,
# so the raw chat-template markup is visible in the printed text.
print(tokenizer.decode(output[0], skip_special_tokens=False))

In [None]:
from textwrap import fill
from IPython.display import Markdown, display

def run_chat(prompt,
             system_prompt="You are a helpful assistant.",
             max_new_tokens=512,
             temperature=0.3,
             wrap_width=100,
             return_text=False):
    """
    Clean chat wrapper for LFM2 in Colab.

    Sends a single-turn conversation to the notebook-level ``model`` and
    ``tokenizer`` and renders the exchange as Markdown. Only the newly
    generated tokens are decoded, so the prompt is not echoed in the reply.

    Args:
        prompt: User message; surrounding whitespace is stripped before it
            is sent to the model (the displayed copy is left as given).
        system_prompt: System message. ``None`` or a blank string omits the
            system turn entirely.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding, because sampling requires a strictly positive
            temperature.
        wrap_width: Column width used to wrap replies that contain no
            Markdown markers.
        return_text: When True, return the reply string. The default
            returns None so Colab does not echo the text a second time.

    Returns:
        The assistant reply as a string if ``return_text`` is True,
        otherwise None.
    """
    # Build messages; skip the system turn when there is nothing to say.
    messages = []
    if system_prompt is not None and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    messages.append({"role": "user", "content": prompt.strip()})

    # Encode with the chat template. Requesting a dict explicitly yields
    # both `input_ids` and `attention_mask`, independent of the library's
    # default return type.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
        return_dict=True,
    ).to(model.device)

    # A non-positive temperature is invalid for sampling, so treat it as a
    # request for deterministic (greedy) decoding instead of failing.
    if temperature is not None and temperature <= 0:
        sampling = {"do_sample": False}
    else:
        sampling = {"do_sample": True, "temperature": temperature, "min_p": 0.15}

    # Generate; **inputs forwards the attention mask along with the ids.
    output = model.generate(
        **inputs,
        **sampling,
        repetition_penalty=1.05,
        max_new_tokens=max_new_tokens,
    )

    # Decode only the *new* tokens (everything after the prompt).
    prompt_len = inputs["input_ids"].shape[-1]
    gen_tokens = output[0, prompt_len:]
    text = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # Render nicely (markdown if present, else wrapped plain text)
    if any(x in text for x in ("#", "* ", "- ", "`", "\n\n")):
        md = f"**User:** {prompt}\n\n**Assistant:**\n\n{text}"
    else:
        md = f"**User:** {prompt}\n\n**Assistant:**\n\n{fill(text, width=wrap_width)}"
    display(Markdown(md))

    # By default, do NOT return text to avoid Colab echo.
    if return_text:
        return text
    return None

In [None]:
# Example prompt: uses run_chat's default system prompt and generation
# settings; the reply is displayed as Markdown and nothing is returned.
run_chat("Explain edge computing and give one practical example.")

# You can reuse this cell with new prompts:
# run_chat("What is the difference between AI and machine learning?")
# run_chat("Summarize the history of quantum computers.")

In [None]:
# Single-turn query with run_chat's default settings (512 new tokens max).
run_chat("describe sarcobesity,  how to get rid off it")

In [None]:
# Raise the generation budget to 1024 new tokens (run_chat defaults to 512)
# to leave room for a longer answer.
run_chat("How to address and reverse sarcopenia?", max_new_tokens=1024)

In [None]:
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True --inplace Copy_of_LFM2_MoE_Inference.ipynb