### [Query Classification with TransformersTextRouter and TransformersZeroShotTextRouter](https://haystack.deepset.ai/tutorials/41_query_classification_with_transformerstextrouter_and_transformerszeroshottextrouter)

In [None]:
#! Solo es para haystack sepa que tutorial se esta ejecutando
from haystack.telemetry import tutorial_running
tutorial_running(41)

### 1. Probando TransformersTextRouter

In [1]:
from haystack.components.routers import TransformersTextRouter

text_router = TransformersTextRouter(model="shahrukhx01/bert-mini-finetune-question-detection")
text_router.warm_up()

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [2]:
queries = [
    "Arya Stark father",  # Keyword Query
    "Who was the father of Arya Stark",  # Interrogative Query
    "Lord Eddard was the father of Arya Stark",  # Statement Query
]

result = text_router.run(text=queries[0])
next(iter(result))



'LABEL_0'

In [8]:
import pandas as pd

results = {"Query": [], "Output Branch": [], "Class": []}

for query in queries:
    result = text_router.run(text=query)
    results["Query"].append(query)
    results["Output Branch"].append(next(iter(result)))
    results["Class"].append("Keyword Query" if next(iter(result)) == "LABEL_0" else "Question/Statement")

pd.DataFrame.from_dict(results)


Unnamed: 0,Query,Output Branch,Class
0,Arya Stark father,LABEL_0,Keyword Query
1,Who was the father of Arya Stark,LABEL_1,Question/Statement
2,Lord Eddard was the father of Arya Stark,LABEL_1,Question/Statement


In [9]:
# A continuación, ilustrará una pregunta frente a una afirmación con TransformersTextRouter usando shahrukhx01/question-vs-statement-classifier . Para esta tarea, debe inicializar un nuevo enrutador de texto con este modelo de clasificación.

text_router = TransformersTextRouter(model="shahrukhx01/question-vs-statement-classifier")
text_router.warm_up()

queries = [
    "Who was the father of Arya Stark",  # Interrogative Query
    "Lord Eddard was the father of Arya Stark",  # Statement Query
]

results = {"Query": [], "Output Branch": [], "Class": []}

for query in queries:
    result = text_router.run(text=query)
    results["Query"].append(query)
    results["Output Branch"].append(next(iter(result)))
    results["Class"].append("Question" if next(iter(result)) == "LABEL_1" else "Statement")

pd.DataFrame.from_dict(results)

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


Unnamed: 0,Query,Output Branch,Class
0,Who was the father of Arya Stark,LABEL_1,Question
1,Lord Eddard was the father of Arya Stark,LABEL_0,Statement


### 2. Casos de uso personalizados para la clasificación de texto

In [10]:
text_router = TransformersTextRouter(model="cardiffnlp/twitter-roberta-base-sentiment")
text_router.warm_up()

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
queries = [
    "What's the answer?",  # neutral query
    "Would you be so lovely to tell me the answer?",  # positive query
    "Can you give me the damn right answer for once??",  # negative query
]

sent_results = {"Query": [], "Output Branch": [], "Class": []}
for query in queries:
    result = text_router.run(text=query)
    sent_results["Query"].append(query)
    sent_results["Output Branch"].append(next(iter(result)))
    sent_results["Class"].append({"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2":"positive"}.get(next(iter(result)), "Unknown"))

pd.DataFrame.from_dict(sent_results)



Unnamed: 0,Query,Output Branch,Class
0,What's the answer?,LABEL_1,neutral
1,Would you be so lovely to tell me the answer?,LABEL_2,positive
2,Can you give me the damn right answer for once??,LABEL_0,negative


### 3. Clasificación Zero-Shot con TransformersZeroShotTextRoute

In [12]:
from haystack.components.routers import TransformersZeroShotTextRouter

text_router = TransformersZeroShotTextRouter(labels=["music", "cinema"])
text_router.warm_up()

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0


In [13]:
queries = [
    "In which films does John Travolta appear?",  # cinema
    "What is the Rolling Stones first album?",  # music
    "Who was Sergio Leone?",  # cinema
]


sent_results = {"Query": [], "Output Branch": []}
for query in queries:
    result = text_router.run(text=query)
    sent_results["Query"].append(query)
    sent_results["Output Branch"].append(next(iter(result)))

pd.DataFrame.from_dict(sent_results)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Query,Output Branch
0,In which films does John Travolta appear?,cinema
1,What is the Rolling Stones first album?,music
2,Who was Sergio Leone?,cinema


In [15]:
# De manera similar al ejemplo anterior, podemos utilizar la clasificación de texto de tipo zero-shot para agrupar las preguntas en preguntas relacionadas con “Juego de Tronos”, “Star Wars” y “El Señor de los Anillos”. ¡Tú decides la cantidad de etiquetas!

from haystack.components.routers import TransformersZeroShotTextRouter

text_router = TransformersZeroShotTextRouter(labels=["Game of Thrones", "Star Wars", "Lord of the Rings"])
text_router.warm_up()

queries = [
    "Who was the father of Arya Stark",  # Game of Thrones
    "Who was the father of Luke Skywalker",  # Star Wars
    "Who was the father of Frodo Baggins",  # Lord of the Rings
]

results = {"Query": [], "Output Branch": []}

for query in queries:
    result = text_router.run(text=query)
    results["Query"].append(query)
    results["Output Branch"].append(next(iter(result)))

pd.DataFrame.from_dict(results)


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Query,Output Branch
0,Who was the father of Arya Stark,Game of Thrones
1,Who was the father of Luke Skywalker,Star Wars
2,Who was the father of Frodo Baggins,Lord of the Rings


### 4. Clasificación de consultas por palabras clave o preguntas o declaraciones

In [16]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

In [17]:
# Obtener los datos
from datasets import load_dataset
from haystack import Document

dataset = load_dataset("bilgeyucel/seven-wonders", split="train")
docs = [Document(content=doc["content"], meta=doc["meta"]) for doc in dataset]

In [18]:
# Initalize a Document Embedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()



In [19]:
# Escribir documentos en DocumentStore
docs_with_embeddings = doc_embedder.run(docs)
document_store.write_documents(docs_with_embeddings["documents"])

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

151

In [20]:
# 2) Initialize Retrievers, TextEmbedder and TransformersTextRouter
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.joiners import DocumentJoiner

text_router = TransformersTextRouter(model="shahrukhx01/bert-mini-finetune-question-detection")
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedding_retriever = InMemoryEmbeddingRetriever(document_store)
bm25_retriever = InMemoryBM25Retriever(document_store)
document_joiner = DocumentJoiner()

In [21]:
# 3) Definir el pipeline
from haystack import Pipeline

query_classification_pipeline = Pipeline()
query_classification_pipeline.add_component("text_router", text_router)
query_classification_pipeline.add_component("text_embedder", text_embedder)
query_classification_pipeline.add_component("embedding_retriever", embedding_retriever)
query_classification_pipeline.add_component("bm25_retriever", bm25_retriever)
query_classification_pipeline.add_component("document_joiner", document_joiner)

query_classification_pipeline.connect("text_router.LABEL_0", "text_embedder")
query_classification_pipeline.connect("text_embedder", "embedding_retriever")
query_classification_pipeline.connect("text_router.LABEL_1", "bm25_retriever")
query_classification_pipeline.connect("bm25_retriever", "document_joiner")
query_classification_pipeline.connect("embedding_retriever", "document_joiner")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7e1f781150>
🚅 Components
  - text_router: TransformersTextRouter
  - text_embedder: SentenceTransformersTextEmbedder
  - embedding_retriever: InMemoryEmbeddingRetriever
  - bm25_retriever: InMemoryBM25Retriever
  - document_joiner: DocumentJoiner
🛤️ Connections
  - text_router.LABEL_0 -> text_embedder.text (str)
  - text_router.LABEL_1 -> bm25_retriever.query (str)
  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])
  - embedding_retriever.documents -> document_joiner.documents (List[Document])
  - bm25_retriever.documents -> document_joiner.documents (List[Document])

In [22]:
# 4) Ejecutar el pipeline
# Useful for framing headers
equal_line = "=" * 30

# Run only the dense retriever on the full sentence query
res_1 = query_classification_pipeline.run({"text_router": {"text": "Who is the father of Arya Stark?"}})
print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}")
print(res_1)

# Run only the sparse retriever on a keyword based query
res_2 = query_classification_pipeline.run({"text_router": {"text": "arya stark father"}})
print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}")
print(res_2)


Device set to use cuda:0




QUESTION QUERY RESULTS
{'document_joiner': {'documents': [Document(id=4c82325818ccd91af8d68fec37108ce7a93696392f315bd0497ad3a8903d0b45, content: 'The Masonic House of the Temple of the Scottish Rite, Washington, DC, designed by John Russell Pope,...', meta: {'url': 'https://en.wikipedia.org/wiki/Mausoleum_at_Halicarnassus', '_split_id': 18}, score: 8.192663165691801, embedding: vector of size 384), Document(id=4a988f268c10bbb6af9a18063a14460b7e0126c7ed1befb2be17c9cbbc4bb064, content: 'The earliest pharaonic name of seal impressions is that of Khufu, the latest of Pepi II. Worker graf...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_Giza', '_split_id': 4}, score: 6.652366985087608, embedding: vector of size 384), Document(id=9c1d7b92058a18bd101c037c87519e9225983c1dbb9386d51412895d5101d096, content: '[68]
The most notable account of this legend was given by Al-Masudi (896–956) in his Akbar al-zaman,...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_Gi



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



KEYWORD QUERY RESULTS
{'document_joiner': {'documents': [Document(id=8d83c5906c44567371940fa0a00dfd5da94a2f0e93001c08013fa21978705df4, content: 'Conquest[edit]
In the 4th century BC, Halicarnassus was the capital of a small regional kingdom of C...', meta: {'url': 'https://en.wikipedia.org/wiki/Mausoleum_at_Halicarnassus', '_split_id': 1}, score: 0.22343472174809592), Document(id=15ba19be181b99cedeada16c282ea812d83b35f744ea5066530d40fbf784546c, content: 'Various sources describe this as a vainglorious act of arson by a man, Herostratus, who set fire to ...', meta: {'url': 'https://en.wikipedia.org/wiki/Temple_of_Artemis', '_split_id': 6}, score: 0.1953121461798527), Document(id=de0480cf304ee42ec382ba29752d16015e5b656e84a59518afe59549643e77cc, content: 'In the 7th century BC, a flood[7] destroyed the temple, depositing over half a meter of sand and flo...', meta: {'url': 'https://en.wikipedia.org/wiki/Temple_of_Artemis', '_split_id': 3}, score: 0.18115355996707005), Document(id=7737c2

### 5. Pipeline with Question vs. Statement Query Classifier

In [None]:
# 1) Definir la tubería y los componentes
from haystack.components.readers import ExtractiveReader

query_classification_pipeline = Pipeline()
query_classification_pipeline.add_component("bm25_retriever_0", InMemoryBM25Retriever(document_store))
query_classification_pipeline.add_component("bm25_retriever_1", InMemoryBM25Retriever(document_store))
query_classification_pipeline.add_component("text_router", TransformersTextRouter(model="shahrukhx01/question-vs-statement-classifier"))
query_classification_pipeline.add_component("reader", ExtractiveReader())

query_classification_pipeline.connect("text_router.LABEL_0", "bm25_retriever_0")
query_classification_pipeline.connect("bm25_retriever_0", "reader")
query_classification_pipeline.connect("text_router.LABEL_1", "bm25_retriever_1")


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7e332f5d50>
🚅 Components
  - bm25_retriever_0: InMemoryBM25Retriever
  - bm25_retriever_1: InMemoryBM25Retriever
  - text_router: TransformersTextRouter
  - reader: ExtractiveReader
🛤️ Connections
  - bm25_retriever_0.documents -> reader.documents (List[Document])
  - text_router.LABEL_0 -> bm25_retriever_0.query (str)
  - text_router.LABEL_1 -> bm25_retriever_1.query (str)

In [24]:
# 2) Ejecutar el pipeline
# Useful for framing headers
equal_line = "=" * 30

# Run the retriever + reader on the question query
query = "Who is the father of Arya Stark?"
res_1 = query_classification_pipeline.run({"text_router": {"text": query}, "reader": {"query": query}})
print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}")
print(res_1)

# Run only the retriever on the statement query
query = "Arya Stark was the daughter of a Lord"
res_2 = query_classification_pipeline.run({"text_router": {"text": query}, "reader": {"query": query}})
print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}")
print(res_2)


Device set to use cuda:0




QUESTION QUERY RESULTS
{'bm25_retriever_1': {'documents': [Document(id=4c82325818ccd91af8d68fec37108ce7a93696392f315bd0497ad3a8903d0b45, content: 'The Masonic House of the Temple of the Scottish Rite, Washington, DC, designed by John Russell Pope,...', meta: {'url': 'https://en.wikipedia.org/wiki/Mausoleum_at_Halicarnassus', '_split_id': 18}, score: 8.192663165691801, embedding: vector of size 384), Document(id=4a988f268c10bbb6af9a18063a14460b7e0126c7ed1befb2be17c9cbbc4bb064, content: 'The earliest pharaonic name of seal impressions is that of Khufu, the latest of Pepi II. Worker graf...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_Giza', '_split_id': 4}, score: 6.652366985087608, embedding: vector of size 384), Document(id=9c1d7b92058a18bd101c037c87519e9225983c1dbb9386d51412895d5101d096, content: '[68]
The most notable account of this legend was given by Al-Masudi (896–956) in his Akbar al-zaman,...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_G