In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from typing import cast

import numpy as np
import torch
from rich import print as rprint  # noqa: F401
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions


In [None]:
# --- Text generation demo with GPT-2 ---
# Fetch the pretrained tokenizer together with the causal language model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Encode the prompt as PyTorch tensors (token ids plus attention mask).
prompt = "Hello, my name is"
inputs = tokenizer(prompt, return_tensors="pt")

# Continue the prompt. The tokenizer's end-of-sequence id is passed as the
# padding id for generation.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=30,
)

# Turn the first returned sequence back into text, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# --- Inspecting BERT hidden states ---
# output_hidden_states=True makes the model return the hidden states of every
# layer in addition to the final one.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

# Model architecture info
hidden_dim = model.config.hidden_size
print(f"Model hidden dimension: {hidden_dim}")

# Tokenization: index [0] selects the single sequence in the batch.
text = "Hello, world!"
inputs = tokenizer(text, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
num_tokens = len(tokens)

print("\nTokenization:")
print(f"  Input text: '{text}'")
print(f"  Tokens: {tokens}")
print(f"  Number of tokens: {num_tokens}")

# Get all hidden states from all layers (inference only, so no gradients).
with torch.no_grad():
    outputs: BaseModelOutputWithPoolingAndCrossAttentions = model(**inputs)

# hidden_states holds one tensor per layer, so the tuple is variable-length:
# tuple[torch.Tensor, ...] rather than the 1-tuple type tuple[torch.Tensor].
all_hidden_states = cast(tuple[torch.Tensor, ...], outputs.hidden_states)
last_hidden_state = cast(torch.Tensor, outputs.last_hidden_state)

print("\nModel outputs:")
print(f"  Number of layers (including embedding): {len(all_hidden_states)}")
print(f"  Last hidden state shape: {last_hidden_state.shape}")
print("    [batch_size, sequence_length, hidden_dim]")

In [None]:
class SimpleSemanticSearch:
    """Minimal semantic search over texts using mean-pooled BERT embeddings.

    Every document is embedded as the mean of the last-hidden-state token
    vectors of ``bert-base-uncased``. Queries are embedded the same way and
    documents are ranked by cosine similarity to the query.
    """

    def __init__(self):
        """Load the tokenizer and encoder and start with an empty index."""
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        # Indexed texts, in the same order as the rows of ``self.embeddings``.
        self.documents = []
        # NumPy array with one row per document, or None while the index is empty.
        self.embeddings = None

    def _embed(self, text):
        """Encode ``text`` and return its mean-pooled last hidden state as a tensor."""
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Mean pooling over the token axis (one vector per text, not per token)
        return outputs.last_hidden_state.mean(dim=1)

    def add_documents(self, documents):
        """Index ``documents``, replacing whatever was indexed before.

        Despite the name, this does not append: each call rebuilds the index
        from scratch, which keeps re-running the calling cell idempotent.

        Args:
            documents: Iterable of strings to index. An empty iterable clears
                the index.
        """
        # Copy so that later mutation of the caller's list cannot put
        # ``self.documents`` out of step with the stored embeddings.
        docs = list(documents)

        if not docs:
            # torch.cat rejects an empty list, so clear the index explicitly.
            self.documents = []
            self.embeddings = None
            return

        vectors = [self._embed(doc) for doc in docs]

        # Stack all embeddings into a single array, then publish both
        # attributes together so a failure above leaves the old index intact.
        self.embeddings = torch.cat(vectors, dim=0).numpy()
        self.documents = docs

    def search(self, query: str, top_k: int = 3):
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            A list of ``(document, similarity)`` pairs, best match first. The
            list is empty when nothing has been indexed.
        """
        # Without this guard, cosine_similarity would receive Y=None, compare
        # the query with itself, and the lookup below would fail with an
        # opaque IndexError. A negative top_k would also slice from the end.
        if self.embeddings is None or not self.documents or top_k <= 0:
            return []

        query_embedding = self._embed(query).numpy()

        # Compute similarities; row 0 is the single query.
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Get top-k results (argsort is ascending, so reverse before slicing)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [(self.documents[i], similarities[i]) for i in top_indices]


In [None]:
# Example usage: a small mixed-topic corpus to index.
sample_documents = [
    "Python is a high-level programming language.",
    "Machine learning models can recognize patterns in data.",
    "The Eiffel Tower is located in Paris, France.",
    "Neural networks consist of interconnected nodes.",
    "Coffee is a popular caffeinated beverage.",
]

# Build the search engine and embed every document.
search_engine = SimpleSemanticSearch()
search_engine.add_documents(sample_documents)


In [None]:
# Run one query and list the three best matches, highest similarity first.
query = "Tell me about AI and programming"
results = search_engine.search(query, top_k=3)
for document, similarity in results:
    print(f"[{similarity:.4f}] {document}")