# Install Dependencies

In [1]:
pip install langchain langchain-huggingface sentence-transformers transformers


Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


# Load a Small Instruction Model

In [5]:
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline

# Tiny instruction-following model (~80 MB)
pipe = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=32)
llm = HuggingFacePipeline(pipeline=pipe)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


# Define Tools

In [6]:
# Simple Calculator
def calculator_tool(expression: str) -> str:
    try:
        return str(eval(expression))
    except:
        return "Error"

# Simple Summarizer
def summarize_tool(text: str) -> str:
    return text.split(".")[0] + "." if "." in text else text

tools = {
    "Calculator": calculator_tool,
    "Summarizer": summarize_tool
}


# Build a Lightweight Tool-Selecting Agent

In [7]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Prompt instructing the model to select a tool and give the final answer
template = """You are an AI assistant with two abilities:
1. Calculator: performs math expressions.
2. Summarizer: summarizes text.

Question: {question}
Decide which tool to use and give the final answer directly.
Answer:"""

prompt = PromptTemplate.from_template(template)
chain = LLMChain(llm=llm, prompt=prompt)

# Agent runner function
def run_agent(question: str):
    response = chain.run(question)
    # Apply tool if detected in response
    if "Calculator" in response:
        expr = ''.join(filter(lambda c: c in "0123456789+-*/()", question))
        return tools["Calculator"](expr)
    elif "Summarizer" in response:
        return tools["Summarizer"](question)
    else:
        return response


  chain = LLMChain(llm=llm, prompt=prompt)


# Define Test Cases

In [8]:
test_cases = [
    {"question": "What is 25 * 4?", "expected": "100"},
    {"question": "Summarize: LangChain helps build LLM-powered apps with memory and tools.",
     "expected": "LangChain helps build LLM-powered apps."},
    {"question": "What is 12 + 18?", "expected": "30"}
]


# Run Agent on Test Cases

In [9]:
predictions = []
for case in test_cases:
    result = run_agent(case["question"])
    print(f"Q: {case['question']}\nPredicted: {result}\nExpected: {case['expected']}\n")
    predictions.append(result)


  response = chain.run(question)


Q: What is 25 * 4?
Predicted: 1.
Expected: 100

Q: Summarize: LangChain helps build LLM-powered apps with memory and tools.
Predicted: Find the LLM app you want to use. Find the LLM app you want to use. Find the LLM app you want to use. Find the
Expected: LangChain helps build LLM-powered apps.

Q: What is 12 + 18?
Predicted: 1.
Expected: 30



# Evaluate Using Semantic Similarity

In [10]:
from sentence_transformers import SentenceTransformer, util

sim_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # ~80 MB

scores = []
for pred, case in zip(predictions, test_cases):
    ref_emb = sim_model.encode(case["expected"], convert_to_tensor=True)
    pred_emb = sim_model.encode(pred, convert_to_tensor=True)
    sim = util.cos_sim(ref_emb, pred_emb).item()
    scores.append(sim)
    print(f"Similarity Score: {sim:.2f}\n")

print(f"Average Agent Similarity Score: {sum(scores)/len(scores):.2f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity Score: 0.46

Similarity Score: 0.51

Similarity Score: 0.47

Average Agent Similarity Score: 0.48


This is exactly the limitation of using tiny instruction models like flan-t5-small. They can follow simple text instructions, but they struggle with structured reasoning and tool selection, which is why your predictions are off:

Math outputs: They default to “1.” because they can’t reliably parse the expression or perform calculations.

Summarization: They repeat text or hallucinate because they aren’t instruction-tuned enough to pick the summarizer correctly.