In [24]:
from sentence_transformers import SentenceTransformer
import numpy as np
print("Libraries imported sucessfully")

Libraries imported sucessfully


In [41]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Exercise 1: Understanding Embedding Similarity

2. Generate embeddings for all sentences using `all-MiniLM-L6-v2`

3. Calculate similarity scores between:
   - Sentence 1 and all others
   - Sentence 4 and all others

4. Answer these questions:
   - Which sentences are most similar to "The dog is playing in the park"?
   - Which sentences are most similar to "Python is a programming language"?
   - What similarity threshold would you use to filter unrelated content?


In [44]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    
    Returns a score between -1 and 1 (higher = more similar)
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

In [16]:
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# generating embeddings for the given sentences
embedding = model.encode(sentences)

def compute_similarity(query_index):
    query_sentence = sentences[query_index]
    query_embedding = embedding[query_index]

    similarities = []

    for i, sentence in enumerate(sentences):
        score = cosine_similarity(query_embedding, embedding[i])
        similarities.append((sentence, score))

    # Sort by similarity score (descending)
    similarities.sort(key=lambda x: x[1], reverse=True)

    # for senten, score in similarities:
    #     print(f"{score:.4f} \n{senten}")
    #     print("-" * 60)
    return similarities

In [22]:
sentence1 = compute_similarity(query_index=0)
sentence4 = compute_similarity(query_index=3)

print(f'Query: "{sentences[0]}"')
print(f"Most similar: {sentence1[1][0]}")
print(f"Least similar: {sentence1[-1][0]}")

print("")

print(f'Query: "{sentences[3]}"')
print(f"Most similar: {sentence4[1][0]}")
print(f"Least similar: {sentence4[-1][0]}")


Query: "The dog is playing in the park"
Most similar: A puppy is running outside
Least similar: Machine learning models need data

Query: "Python is a programming language"
Most similar: I love coding in Python
Least similar: The cat is sleeping on the couch


I would choose a similarity threshold of 0.3 to filter out unrelated content, as chunks with similarity scores <0.3 are unrelated to the query <br>

Recommended similarity threshold: 0.3

# Exercise2: Chunk Size Impact on Retrieval

In [None]:
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""

1504

Save the various chunk sizes in multiple variable of single variable e.g sizes = 100, 200, 300
write a function that chunks, saves , and embeds each chunk size
Compare the query embedding to each chunk embedding, retrieve top 3 chunks from each comparison

In [77]:
# small_chunk, medium_chunk, large_chunk = 100, 200, 400

# def chunk_embed(document, chunk_size, overlap=20):
#     chunks = []
#     start = 0

#     #Character chunking
#     while start < len(document):
#         end = start + chunk_size
#         chunk = document[start:end]
#         chunks.append(chunk)
#         start +=chunk_size-overlap
    

#     # word chunking
#     # words = document.split()
#     # for i in range(0, len(words), chunk_size):
#     #     chunk = ' '.join(words[i:i+chunk_size])
#     #     chunks.append(chunk)
#     return chunks

def chunk_embed_retrieve(document, chunk_sizes, query, top_k=3):
    query_embedding = model.encode(query)
    
    for size in chunk_sizes:
        
        print(f"\nChunk size: {size} characters")
        print("-"*80)
        
        chunks = [
            document[i:i+ size] for i in range(0, len(document), size)
        ]
        print(f"Number of chunks for {size} characters: {len(chunks)}")

        doc_embeddings = model.encode(chunks)
        scores = [cosine_similarity(doc_embeddings, query_embedding)[0]]

        results = sorted(zip(chunks, scores), key=lambda x:x[1], reverse= True)[:top_k]

        for rank, (chunk, score) in enumerate(results, 1):
            
            print(f"{rank}. (Score: {score:.3f})")
            print(chunk.strip())
        


In [78]:
chunk_sizes = [100, 200, 400]
query = "What is machine learning?"

chunk_embed_retrieve(document, chunk_sizes, query)


Chunk size: 100 characters
--------------------------------------------------------------------------------
Number of chunks for 100 characters: 16
1. (Score: 0.123)
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural i

Chunk size: 200 characters
--------------------------------------------------------------------------------
Number of chunks for 200 characters: 8
1. (Score: 0.176)
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of i

Chunk size: 400 characters
--------------------------------------------------------------------------------
Number of chunks for 400 characters: 4
1. (Score: 0.327)
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of int

In [79]:
# # from sentence_transformers import SentenceTransformer, util

# # # Load SentenceTransformer model
# # model = SentenceTransformer("all-MiniLM-L6-v2")

# # Short document
# document = """
# Machine learning is a field of artificial intelligence that focuses on enabling computers
# to learn from data without being explicitly programmed. It is widely used in applications
# such as recommendation systems, image recognition, natural language processing, and fraud
# detection. Machine learning models improve their performance as they are exposed to more data.
# """

# def chunk_embed_and_retrieve(document, chunk_sizes, query, top_k=3):
#     """
#     Chunks a document using different chunk sizes, embeds the chunks,
#     and retrieves top-k relevant chunks for a query.
#     """

#     query_embedding = model.encode(query)

#     for size in chunk_sizes:
#         print(f"\nðŸ”¹ Chunk Size: {size} characters")
#         print("-" * 60)

#         # Step 1: Chunking
#         chunks = [
#             document[i:i + size]
#             for i in range(0, len(document), size)
#         ]

#         # Step 2: Create embeddings
#         chunk_embeddings = model.encode(chunks)

#         # Step 3: Similarity search
#         scores = [cosine_similarity(chunk_embeddings, query_embedding)[0]]

#         # Step 4: Retrieve top-k chunks
#         top_results = sorted(
#             zip(chunks, scores),
#             key=lambda x: x[1],
#             reverse=True
#         )[:top_k]

#         for rank, (chunk, score) in enumerate(top_results, start=1):
#             print(f"\nRank {rank} | Score: {score:.4f}")
#             print(chunk.strip())


# # Run for all chunk sizes
# chunk_sizes = [100, 200, 400]
# query = "What is machine learning?"

# chunk_embed_and_retrieve(document, chunk_sizes, query)

