In [1]:
from transformers import AutoTokenizer, AutoModel
import torch 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model and tokenizer.
# The checkpoint name lives in one constant so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# This notebook only runs inference, so make evaluation mode explicit.
# .eval() returns the module itself; chaining it keeps the cell from ending
# on a bare expression that would print the whole model repr as output.
model = AutoModel.from_pretrained(MODEL_NAME).eval()

In [8]:
# Function to get embeddings
def get_embeddings(text):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        NumPy array holding, for each input sequence, the last-layer hidden
        state at sequence position 0 (the [CLS] token).
    """
    # Tokenize into PyTorch tensors; anything beyond 512 tokens is truncated
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Pure inference: no gradients are needed
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    # Keep only the first token ([CLS]) of every sequence
    return last_hidden[:, 0, :].numpy()

# Example usage: embed a single sentence. `embedding` stays in the notebook
# namespace and is printed by the following cell.
text = "This is a sample text for embedding."
embedding = get_embeddings(text)

In [9]:
# Dump the raw embedding values. get_embeddings slices out sequence position 0
# of the last hidden state, so this is a 2-D array (one row per input sequence).
print(embedding)

[[-2.18663171e-01 -2.08945274e-01 -6.90583140e-03 -2.15081334e-01
  -1.61385104e-01 -3.26366812e-01 -3.13127041e-03  2.62152016e-01
  -1.40656844e-01 -2.01556444e-01 -2.77407587e-01 -1.70141056e-01
  -2.79470891e-01 -2.01073512e-02 -4.55363393e-02  2.69975960e-01
  -4.52183113e-02  1.72342658e-01  1.62025258e-01 -7.95195624e-02
  -5.47638685e-02 -1.33595735e-01 -2.37895593e-01 -1.14592150e-01
   1.94997385e-01 -1.97259486e-01  1.17188461e-01 -1.73791707e-01
  -1.08223058e-01 -1.50153311e-02 -7.21194968e-02  1.90517053e-01
   8.22267681e-03 -2.30405957e-01  2.17052996e-01 -3.34180519e-02
   2.49833673e-01 -3.43498439e-02  9.53244418e-02  1.42528191e-01
  -1.97392002e-01 -8.31406862e-02  2.47124702e-01  1.60481647e-01
   4.64079380e-02 -1.28229946e-01 -2.26021743e+00  7.33323544e-02
  -2.58545637e-01 -3.71066183e-01 -2.52734065e-01  3.85058671e-02
   2.19901845e-01  6.82838500e-01  1.39514908e-01  1.61702037e-01
  -2.57581204e-01  2.95286417e-01  7.57375285e-02  1.09272853e-01
   1.54228

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch
import PyPDF2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load model and tokenizer (only need to do this once).
# A single constant names the checkpoint so tokenizer and model always match.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Inference only: make evaluation mode explicit (.eval() returns the module).
model = AutoModel.from_pretrained(MODEL_NAME).eval()

def read_first_page(pdf_path):
    """Extract the text of the first page of a PDF, never raising.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text extracted from page 0. An empty string is returned when the
        document has no pages or when any exception occurs while opening or
        parsing the file; in both cases a message is printed instead of
        raising. A warning is also printed when the extracted text is blank,
        but that text is still returned unchanged.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pages = PyPDF2.PdfReader(pdf_file).pages

            # Nothing to extract from an empty document
            if len(pages) == 0:
                print(f"Warning: {pdf_path} has no pages!")
                return ""

            page_text = pages[0].extract_text()

            # Blank text is reported but still handed back to the caller
            if not page_text.strip():
                print(f"Warning: {pdf_path} first page extracted empty text!")

            return page_text
    except Exception as e:
        # Best-effort reader: report the problem and fall back to empty text
        print(f"Error reading {pdf_path}: {str(e)}")
        return ""

def get_embedding(text):
    """Return the [CLS]-position embedding of ``text`` as a NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``. Input is padded,
    truncated to at most 512 tokens, and run without gradient tracking.
    """
    tokenizer_kwargs = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = tokenizer(text, **tokenizer_kwargs)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Sequence position 0 is the CLS token
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

# Load and embed books: read each PDF's first page, then embed that text.
# Both pages are read before either is embedded.
book1_text, book2_text = (
    read_first_page(pdf_path)
    for pdf_path in ('./data/embedded.pdf', './data/SRE.pdf')
)

book1_embedding, book2_embedding = map(get_embedding, (book1_text, book2_text))

# Function to find most similar book
def find_similar_book(query_text, book_embeddings):
    """Pick the book whose embedding is closest to the query.

    Args:
        query_text: Text to embed with ``get_embedding`` and compare.
        book_embeddings: Sequence of book embeddings, each compared to the
            query with ``cosine_similarity``.

    Returns:
        Tuple ``(index, score)``: the ``np.argmax`` position of the best
        match and the ``[0][0]`` entry of its similarity matrix.
    """
    query_vec = get_embedding(query_text)

    # One similarity matrix per book, kept in the same order as the input
    scores = []
    for candidate in book_embeddings:
        scores.append(cosine_similarity(query_vec, candidate))

    # Position of the highest similarity
    best = np.argmax(scores)
    return best, scores[best][0][0]

# Example usage: match a free-text query against the two embedded books
query = "give me the book that talks devops"
book_embeddings = [book1_embedding, book2_embedding]
best_match_idx, similarity_score = find_similar_book(query, book_embeddings)

# Report the 1-based book number and its similarity score, one per line
print(
    f"Most similar book: Book {best_match_idx + 1}",
    f"Similarity score: {similarity_score:.4f}",
    sep="\n",
)

Most similar book: Book 2
Similarity score: 0.8459
