Model: Qwen3 Embedding

1. Install

In [1]:
!pip -q install transformers[torch] scipy

zsh:1: no matches found: transformers[torch]


2. Imports

In [3]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cdist

# Use the GPU when PyTorch can see a CUDA device, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


Device: cpu


3. Load Model (Qwen3 Embedding Model)

In [4]:
# Hugging Face Hub identifier of the embedding checkpoint.
MODEL = "Qwen/Qwen3-Embedding-0.6B"

# Tokenizer for the checkpoint (fast implementation requested explicitly).
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

# Load the weights, move them to the selected device, and switch to evaluation mode.
model = AutoModel.from_pretrained(MODEL)
model = model.to(device)
model.eval()

# Helper: compute mean-pooled embedding (example; model card may recommend specific pooling)
def get_embedding(text, tokenizer, model, device, max_length=128):
    """Return an L2-normalised, mean-pooled embedding for a single text.

    Args:
        text: Text to embed. Only the first sequence of the tokenized batch is
            pooled, so pass one string per call.
        tokenizer: Callable that tokenizes ``text`` and returns a mapping with an
            ``attention_mask`` entry and a ``.to(device)`` method.
        model: Callable returning an object with ``last_hidden_state``.
        device: Device the tokenized inputs are moved to before the forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        1-D float32 ``numpy`` array of unit length (all zeros if the attention
        mask selects no tokens).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model(**inputs, return_dict=True)

    # First (only) sequence of the batch, shape (seq_len, dim). Pool in float32:
    # half-precision sums lose accuracy and numpy cannot represent bfloat16.
    last_hidden = outputs.last_hidden_state[0].float()

    # Cast the integer mask to float so the multiply and the clamp below are
    # well-defined regardless of how torch promotes int tensors with float bounds.
    mask = inputs["attention_mask"][0].unsqueeze(-1).to(last_hidden.dtype)

    # Mean over real tokens only; padded positions contribute zero.
    summed = (last_hidden * mask).sum(dim=0)
    denom = mask.sum(dim=0).clamp(min=1e-9)  # guard against an all-zero mask
    emb = (summed / denom).cpu().numpy()

    # Normalise to unit length so dot product equals cosine similarity.
    emb = emb / (np.linalg.norm(emb) + 1e-10)
    return emb

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

4. Small Dataset - Demo Retrieval

In [5]:
# Small in-memory corpus to rank against the query.
docs = [
    "Khashayar lives in Boston and pursue his Ph.D at Northeastern University.",
    "Professor Barut is a AI engineer who is intersted in Neural Networks.",
    "Carol is a data scientist focusing on NLP and embeddings.",
    "This document explains how to cook pasta in 10 minutes.",
]

query = "Who lives in Boston?"

# Embed the query and every document with the same helper so the vectors are comparable.
query_emb = get_embedding(query, tokenizer, model, device)
doc_embs = [get_embedding(d, tokenizer, model, device) for d in docs]

# cdist yields cosine *distance*; similarity is 1 - distance. Row 0 belongs to the single query.
sims = 1 - cdist([query_emb], doc_embs, metric="cosine")[0]

# Pair each document with its score and order best match first.
ranked = sorted(zip(docs, sims), key=lambda pair: pair[1], reverse=True)

print("Ranked results (highest similarity first):")
for doc, score in ranked:
    print(f"{score:.4f}  ->  {doc}")

Ranked results (highest similarity first):
0.7225  ->  Khashayar lives in Boston and pursue his Ph.D at Northeastern University.
0.5736  ->  Professor Barut is a AI engineer who is intersted in Neural Networks.
0.5460  ->  Carol is a data scientist focusing on NLP and embeddings.
0.4912  ->  This document explains how to cook pasta in 10 minutes.


5. Reranking Example: Combine dense (FAISS) retrieval + Embedding Rerank

In [7]:
!pip install faiss-cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp313-cp313-macosx_14_0_arm64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [14]:
# Retrieval + Reranking Example

from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import torch

# Load model & tokenizer
# NOTE(review): this rebinds the global `tokenizer` and `model` created in the
# "Load Model" cell; the model loaded here is not moved to `device`.
model_id = "Qwen/Qwen3-Embedding-0.6B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model
model = AutoModel.from_pretrained(model_id)


# Example documents: the small corpus that is embedded and added to the FAISS index.
documents = [
    "Qwen embeddings are powerful for semantic search.",
    "Pix2Struct can parse screenshots into structured text.",
    "Transformers have revolutionized NLP in the last decade.",
    "FAISS is used for efficient similarity search.",
    "Qwen3 improves multilingual understanding significantly."
]

# Embed documents
def get_embeddings(texts):
    """Embed a batch of texts with the global ``tokenizer`` and ``model``.

    Qwen3 is a decoder-only model with causal attention, so the hidden state at
    position 0 only reflects the first token (or a pad token under left padding)
    and is not a sentence representation. The embedding is therefore taken from
    the last *real* token of each sequence, as the model card recommends.

    Args:
        texts: List of strings to embed.

    Returns:
        ``numpy`` float32 array of shape (len(texts), dim) with L2-normalised
        rows, so L2 distance and cosine similarity give the same ranking.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # Run on whatever device the model currently lives on.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state  # (batch, seq_len, dim)
    mask = inputs["attention_mask"]     # (batch, seq_len), 1 = real token

    if bool((mask[:, -1] != 0).all()):
        # Left padding (or no padding): the final position is a real token in every row.
        pooled = hidden[:, -1, :]
    else:
        # Right padding: index of the last real token per row (clamped for empty rows).
        last_idx = mask.sum(dim=1).clamp(min=1) - 1
        rows = torch.arange(hidden.shape[0], device=hidden.device)
        pooled = hidden[rows, last_idx]

    # FAISS expects float32; normalising makes L2 search equivalent to cosine ranking.
    pooled = torch.nn.functional.normalize(pooled.to(torch.float32), p=2, dim=1)
    return pooled.cpu().numpy()

# Embed the corpus once; each row of the result is one document vector.
doc_embeddings = get_embeddings(documents)

# Flat FAISS index ranked by L2 distance, sized to the embedding width.
dim = doc_embeddings.shape[-1]
index = faiss.IndexFlatL2(dim)
index.add(doc_embeddings)

# Embed the query as a batch of one so it has the same 2-D layout as the corpus.
query = "Which models are good for screenshot parsing?"
query_emb = get_embeddings([query])

# Number of nearest neighbours to retrieve (top-K)
k = 3


