## Semantic Search with Sentence Transformers

I followed the [Semantic Search guide on sbert.net](https://www.sbert.net/examples/applications/semantic-search/README.html#) to implement this code.

In [None]:
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
import pypdf
import random
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if device.type == 'cuda':
    # Report the name and current memory footprint of GPU 0, in GiB.
    bytes_per_gb = 1024**3
    allocated_gb = round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1)
    reserved_gb = round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1)

    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated_gb, 'GB')
    print('Cached:   ', reserved_gb, 'GB')

In [None]:
# Smoke test: create a small random tensor and move it to the selected device.
probe = torch.rand(10)
probe.to(device)

In [None]:
# Relative path of the PDF report that will be indexed.
path_to_pdf: str = '../data/rag_report.pdf'

# Read the PDF into a list of LangChain Document objects.
loader: PyPDFLoader = PyPDFLoader(path_to_pdf)
dataset: list[Document] = loader.load()

# Preview the first five loaded documents.
dataset[:5]

In [None]:
# List the public (non-underscore) attribute names of the first document.
[name for name in dir(dataset[0]) if not name.startswith('_')]

In [None]:
# Show the raw text held by the third loaded document.
third_document: Document = dataset[2]
third_document.page_content

In [None]:
# Report how many Document objects were loaded from the PDF.
document_count = len(dataset)
print(document_count)

In [None]:
# Split the loaded documents into overlapping chunks of at most ~500 characters.
#
# The splitter tries the separators in order, so they must run from the
# coarsest boundary to the finest: paragraph break, hard line break, newline,
# then word and character fallbacks. Listing '\n' first would consume every
# newline up front, leaving the later separators nothing to match, and a long
# line without newlines would end up as a chunk larger than chunk_size.
text_splitter: RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '  \n', '\n', ' ', ''],
    chunk_size=500,
    chunk_overlap=100,
)

chunks = text_splitter.split_documents(dataset)

# Print each chunk's length to check the size limit, then show the chunk count.
print([len(c.page_content) for c in chunks])
len(chunks)

In [None]:
# Load the pretrained sentence-embedding model by name.
model_name = 'all-MiniLM-L6-v2'
embedder: SentenceTransformer = SentenceTransformer(model_name)

In [None]:
# Embed the text of every chunk in a single call, returning a tensor.
# The progress bar is requested from encode() itself: wrapping the list
# comprehension in tqdm only timed the (instant) list construction, so the bar
# finished before any embedding work had started.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedder.encode(chunk_texts, convert_to_tensor=True, show_progress_bar=True)

In [None]:
# Display the chunk embeddings returned by embedder.encode(..., convert_to_tensor=True).
embeddings

In [None]:
# Sample questions used to exercise the semantic search below.
# Strings containing an apostrophe are double-quoted so no escaping is needed.
queries = [
    "What is Microsoft's strategy for cloud computing in the next five years?",
    'How does Microsoft plan to innovate in the field of artificial intelligence?',
    "What are Microsoft's future plans for the Windows operating system?",
    'How will Microsoft enhance its cybersecurity measures in the coming years?',
    'What new features can we expect in future versions of Microsoft Office?',
    "What is Microsoft's vision for the future of remote work and collaboration tools?",
    'How does Microsoft plan to expand its gaming division, including Xbox and Game Pass?',
    "What are Microsoft's goals for sustainability and reducing its carbon footprint?",
    'How will Microsoft integrate emerging technologies like quantum computing into its products?',
    'What partnerships and acquisitions is Microsoft planning to strengthen its market position?',
]

In [None]:
# Pick one query at random. random.choice always returns a valid element,
# whereas random.randint(0, len(queries)) includes its upper bound and would
# occasionally index one past the end of the list.
query = random.choice(queries)

# Number of hits to report, clamped because torch.topk raises when k exceeds
# the number of candidates.
k = min(3, len(chunks))

# Embed the query with the same model that produced the chunk embeddings.
query_embedding = embedder.encode(query, convert_to_tensor=True)

# Score the query against every chunk; the result has one row per query,
# so [0] selects the scores for this single query.
similarity_scores = embedder.similarity(query_embedding, embeddings)[0]
scores, indices = torch.topk(similarity_scores, k=k)

print(f'\nQuery: {query}')
print(f'Top {k} most similar sentences in corpus:\n')

# Convert to plain Python numbers so indexing and formatting do not rely on
# implicit tensor conversions.
for score, idx in zip(scores.tolist(), indices.tolist()):
    print(f'\n{chunks[idx].page_content} (Score: {score:.4f}, Index: {idx})\n')

In [None]:
# Bar chart of the last query's similarity score against every chunk.
# Convert the tensor to a plain list first: matplotlib cannot read a tensor
# that lives on the GPU.
score_values = similarity_scores.tolist()

plt.figure(figsize=(16, 9))
plt.bar(range(len(score_values)), score_values, color='skyblue')
# The x axis enumerates chunks and the y axis carries the score; the original
# labels were swapped.
plt.xlabel('Document Index')
plt.ylabel('Similarity Score')
plt.title('Sentence Similarities')
plt.show()

In [None]:
# Run the semantic search for every sample query and print the best matches.
# top_k is clamped because torch.topk raises when k exceeds the corpus size.
top_k = min(3, len(chunks))

for query in queries:
    # Embed the query with the same model that produced the chunk embeddings.
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # One row of scores per query; [0] selects the row for this query.
    similarity_scores = embedder.similarity(query_embedding, embeddings)[0]
    scores, indices = torch.topk(similarity_scores, k=top_k)

    print(f'\nQuery: {query}')
    # Report top_k, the value actually used above, rather than the unrelated
    # `k` defined in a different cell.
    print(f'Top {top_k} most similar sentences in corpus:\n')

    for score, idx in zip(scores.tolist(), indices.tolist()):
        print(f'\n{chunks[idx].page_content} (Score: {score:.4f}, Index: {idx})\n')