In [4]:
!pip install faiss-cpu



In [5]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Sample text corpus: the sentences that will be embedded and searched
documents = [
    "Artificial Intelligence is transforming the world.",
    "Machine learning is a subset of AI.",
    "Natural language processing enables machines to understand text.",
    "Deep learning is driving advances in AI research.",
    "AI applications are widespread across various industries.",
]

# Step 1: Load the pre-trained embedding model
# ('all-MiniLM-L6-v2' is a lightweight and efficient model)
model = SentenceTransformer('all-MiniLM-L6-v2')



In [6]:
# Step 2: Convert text data into embeddings
embeddings = model.encode(documents)

# FAISS is handed a float32 copy of the embeddings
embedding_matrix = np.array(embeddings, dtype=np.float32)

# Step 3: Create a FAISS index for fast similarity search
# The index is sized by the embedding width (second axis of the matrix).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean distance) index

# Add embeddings to the index
index.add(embedding_matrix)

print(f"Number of documents indexed: {index.ntotal}")



Number of documents indexed: 5


In [7]:
# Step 4: Perform a search query
query = "What is machine learning?"
# encode() is given a one-element list so the result is a batch of one query
query_embedding = model.encode([query])

# Search the index
k = 2 # Number of top results to retrieve
distances, indices = index.search(np.array(query_embedding, dtype=np.float32), k)

# Display the results
print("\nTop matches:")
# indices has one row per query; row 0 belongs to our single query
for idx in indices[0]:
    # FAISS fills missing neighbours with -1 when fewer than k results exist.
    # Python would treat -1 as "last element" and print a bogus match, so skip it.
    if idx < 0:
        continue
    print(f"- {documents[idx]}")


Top matches:
- Machine learning is a subset of AI.
- Artificial Intelligence is transforming the world.
