In [3]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-11.0.0-cp312-cp312-win_amd64.whl.metadata (9.3 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
   ---------------------------------------- 0.0/268.8 kB ? eta -:--:--
   ---------------------------------------- 268.8/268.8 kB 5.5 MB/s eta 0:00:00
Downloading pillow-11.0.0-cp312-cp

In [4]:
# Toy corpus of eight short sentences; every later ranking step scores these
# sentences against the query. The wording is data, so it is left exactly as is.
documents = [
    "This is a list which containing sample documents.",
    "keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extractions",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [5]:
from sentence_transformers import SentenceTransformer

## Dense Retrieval Model (Sentence-Transformer embeddings)

In [25]:
# Identifier of the pretrained sentence-transformers checkpoint that is loaded
# in the next cell to embed the documents and the query.

model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

In [7]:
# Load the sentence-embedding model identified by `model_name`.
model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Embed all documents in a single call; one vector comes back per sentence.
document_embedding = model.encode(documents)

# Number of embeddings (one per document).
len(document_embedding)

8

In [10]:
len(document_embedding[0]) # each document is encoded as a 768-dimensional vector

768

In [13]:
# Search query that the documents are ranked against.
query = "Natural language processing techniques enhance keyword extraction efficiency."

# Embed the query with the same model that embedded the documents.
query_embedding = model.encode(query)

# Length of the query vector (same dimensionality as the document vectors).
len(query_embedding)

768

In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# cosine_similarity works on 2-D inputs, so the query vector is presented as a
# single-row matrix; the result holds one row with one score per document.
similarity = cosine_similarity(query_embedding.reshape(1, -1), document_embedding)

In [16]:
# Show the cosine similarity of the query to each document.
similarity

array([[0.16948141, 0.4626166 , 0.5446862 , 0.44123265, 0.55409193,
        0.75214124, 0.550352  , 0.7448165 ]], dtype=float32)

In [22]:
# Document indices ordered from the highest to the lowest similarity score
# (ascending argsort, then reversed).
sorted_similarity = np.flip(np.argsort(similarity[0]))
sorted_similarity

array([5, 7, 4, 6, 2, 1, 3, 0], dtype=int64)

In [23]:
# (document text, similarity score) pairs, best match first.
ranked_document = [
    (documents[doc_idx], similarity[0, doc_idx]) for doc_idx in sorted_similarity
]

In [24]:
# Show the documents with their scores in ranked order.
ranked_document

[('Efficient keyword extraction enhances search accuracy.', 0.75214124),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.7448165),
 ('Understanding document structure aids in keyword extractions', 0.55409193),
 ('Semantic similarity improves document retrieval performance.', 0.550352),
 ('Document analysis involves extracting keywords', 0.5446862),
 ('keywords are important for keyword-based search.', 0.4626166),
 ('Keyword-based search relies on sparse embeddings.', 0.44123265),
 ('This is a list which containing sample documents.', 0.16948141)]

In [26]:
!pip install rank_bm25



### Reranking

In [27]:
from rank_bm25 import BM25Okapi

In [38]:
# Keep only the text of the four best-ranked documents as rerank candidates.
top_4_doc = [text for text, _score in ranked_document[:4]]
len(top_4_doc)

4

In [39]:
# Whitespace tokenisation of each candidate document for BM25.
# NOTE(review): a plain split keeps case and trailing punctuation (tokens such
# as 'accuracy.' and 'Efficient'), so only exact surface forms can match the query.
tokenized_top_4_doc = list(map(str.split, top_4_doc))

tokenized_top_4_doc

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extractions'],
 ['Semantic',
  'similarity',
  'improves',
  'document',
  'retrieval',
  'performance.']]

In [40]:
# Tokenise the query the same way as the documents (whitespace split).
query_token = query.split()
query_token

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [41]:
# Build the BM25 index over the four tokenised candidate documents only.
# NOTE(review): presumably the term statistics therefore come from just these
# four documents rather than the full corpus; confirm this is intended.
bm25= BM25Okapi(tokenized_top_4_doc)

In [42]:
# BM25 score of the query against each candidate, in the order of top_4_doc.
bm25_score = bm25.get_scores(query_token)
bm25_score

array([0.18419519, 0.16152501, 0.17211681, 0.        ])

In [43]:
# Candidate indices ordered from the highest to the lowest BM25 score
# (ascending argsort, then reversed).
sorted_rerank_score = np.flip(np.argsort(bm25_score))
sorted_rerank_score

array([0, 2, 1, 3], dtype=int64)

In [46]:
# Print the candidates in BM25 order, one "<document><TAB>- <score>" line each.
for idx in sorted_rerank_score:
    print(f"{top_4_doc[idx]}\t- {bm25_score[idx]}")

Efficient keyword extraction enhances search accuracy.	- 0.18419518704069648
Understanding document structure aids in keyword extractions	- 0.1721168141199951
Machine learning algorithms can optimize keyword extraction methods.	- 0.16152501017414925
Semantic similarity improves document retrieval performance.	- 0.0


## From scratch: scoring a sentence pair with BERT

In [47]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch


In [49]:
# NOTE(review): this rebinds `model`, which until now referred to the
# SentenceTransformer. Re-running the earlier `model.encode(...)` cells after
# this one will therefore use the BERT classifier instead.
# The warning printed by this cell states that the classifier weights are newly
# initialised, so the predictions below are not meaningful until the model is
# trained on a downstream task.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# The two sentences that are scored together as one pair.
sentence_a, sentence_b = "The movie was fantastic!", "I really enjoy the film."

In [58]:
# Encode both sentences together as a single sequence pair and return
# PyTorch tensors.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [52]:
# Show the encoded pair: input_ids, token_type_ids and attention_mask.
inputs

{'input_ids': tensor([[  101,  1996,  3185,  2001, 10392,   999,   102,  1045,  2428,  5959,
          1996,  2143,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [59]:
# model output
# Inference only: autograd is disabled so the forward pass does not build a
# computation graph that is never used.
with torch.no_grad():
    output = model(**inputs)

In [54]:
# Show the raw classifier output object (its logits are used next).
output

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.6578, -0.5613]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [55]:
# Unnormalised class scores produced by the classification head.
logits = output.logits

In [60]:
# Turn the logits into class probabilities by normalising over the class
# dimension (dim 1).
probs = logits.softmax(dim=1)

In [61]:
# Show the class probabilities.
probs

tensor([[0.7719, 0.2281]], grad_fn=<SoftmaxBackward0>)

In [62]:
# Probability of class index 1 for the single input pair, as a Python float.
# NOTE(review): the classifier head has not been trained, so this value is
# only a demonstration of the mechanics, not a real similarity.
similarity_score = probs[0, 1].item()

In [63]:
# Show the extracted score.
similarity_score

0.22809046506881714

## Cross Encoder
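- The input of the model always consists of a data pair, for example two sentences, and the output is a single **similarity score** for that pair. Depending on the model, this score is either mapped to a value between **0 and 1** or returned as a raw logit; the `cross-encoder/ms-marco-MiniLM-L-6-v2` model used below returns unbounded logits (higher means more similar), which is why some of the scores in the output are negative.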


In [64]:
from sentence_transformers import CrossEncoder

In [65]:
# Load the pretrained cross-encoder checkpoint used to score (query, document) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [67]:
# The four candidate documents from the embedding-based ranking, best first.
top_4_doc

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extractions',
 'Semantic similarity improves document retrieval performance.']

In [68]:
# One [query, document] pair per candidate; the query is repeated in every pair.
pairs = [[query,doc] for doc in top_4_doc]

In [69]:
# Show the pairs that are passed to the cross-encoder.
pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extractions'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Semantic similarity improves document retrieval performance.']]

In [70]:
# Score every pair; the result has one score per pair, in the order of `pairs`.
scores = cross_encoder.predict(pairs)

In [71]:
# Show the cross-encoder scores.
scores

array([ 3.1378734 ,  0.84216833, -2.8850963 , -8.293585  ], dtype=float32)

In [72]:
# Pair each score with its document. The zip is materialised into a list
# because a bare zip object is a one-shot iterator: once the sort cell has
# consumed it, re-running that cell alone would silently produce an empty list.
scored_docs = list(zip(scores, top_4_doc))

In [73]:
# Order the (score, document) tuples from the highest to the lowest score.
reranked_document = list(scored_docs)
reranked_document.sort(reverse=True)

In [74]:
# Show the final cross-encoder ranking.
reranked_document

[(3.1378734, 'Efficient keyword extraction enhances search accuracy.'),
 (0.84216833,
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (-2.8850963, 'Understanding document structure aids in keyword extractions'),
 (-8.293585, 'Semantic similarity improves document retrieval performance.')]