In [1]:
# Ignore warnings: install a global "ignore" filter so warnings raised by
# later cells are not shown in cell outputs.
# NOTE(review): this hides every warning category, including ones that may
# point at real problems - consider narrowing the filter when debugging.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Install required libraries (run once if not installed).
# Use %pip rather than !pip so packages install into the running kernel's environment.
#%pip install transformers sentencepiece torch scikit-learn pandas numpy

In [3]:
# Import libraries
import torch
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Select the compute device: use the GPU when CUDA is usable, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [5]:
# Fetch the pretrained checkpoint together with its matching tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights and place them on the selected device in one step
model = AutoModel.from_pretrained(model_name).to(device)
# Switch to inference mode (disables dropout); the cell displays the model
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [6]:
# Sentence embedding function
def get_embedding(text):
    """Embed text with the loaded model using attention-mask-aware mean pooling.

    Args:
        text: A single string or a list of strings. Lists are padded to the
            longest sequence in the batch by the tokenizer.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        dimension of the model.
    """
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling over real tokens only. A plain .mean(dim=1) would also
    # average the vectors at padding positions, which skews the embedding of
    # every sentence shorter than the longest one in a batch.
    token_embeddings = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts
    return embeddings.cpu().numpy()

In [7]:
# Sample text data (replace with your dataset if needed).
# Each entry is one document; its list position becomes the row index of the
# embedding matrix and of the results table built in later cells.
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks",
    "Flask is a web framework in Python",
    "PyTorch is used for deep learning models",
    "Natural language processing deals with text data"
]

In [8]:
# Embed each document on its own, then stack the per-document rows into a
# single (n_documents, hidden_size) matrix
doc_embeddings = np.vstack(list(map(get_embedding, documents)))
doc_embeddings.shape

(5, 384)

In [9]:
# User query: the free-text search string to compare against the documents
query = "deep learning with neural networks"
# Embed the query with the same function used for the documents so the
# vectors are directly comparable.
query_embedding = get_embedding(query)

In [10]:
# Cosine similarity between the query vector and every document vector;
# the result has one row per query, so take the first (only) row
similarities = cosine_similarity(X=query_embedding, Y=doc_embeddings)[0]

In [11]:
# Build the results table: one row per document with its similarity score
results = pd.DataFrame({"Document": documents}).assign(
    **{"Similarity Score": similarities}
)

In [12]:
# Rank documents from most to least similar to the query and display them
results = results.sort_values("Similarity Score", ascending=False)
results

Unnamed: 0,Document,Similarity Score
1,Deep learning uses neural networks,0.729503
3,PyTorch is used for deep learning models,0.51474
0,Machine learning is a subset of artificial int...,0.376984
4,Natural language processing deals with text data,0.154756
2,Flask is a web framework in Python,-0.054375
