<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model shared by every later cell that turns text into vectors
model = SentenceTransformer("all-MiniLM-L6-v2")

# Toy corpus. A sentence's position in this list is what later cells use to
# translate a search hit (an integer id) back into text.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
    "into its corresponding embedding",
    "using a pre-trained Sentence Transformer model.",
]

# Encode the whole list in one call; rows follow the order of `sentences`
embeddings = model.encode(sentences)

print("Embeddings generated. Shape:", embeddings.shape)

In [3]:
import faiss

# The index must be created with the vector width, i.e. the number of columns
# of the embedding matrix
embedding_dimension = embeddings.shape[1]

# IndexFlatL2 keeps the full (uncompressed) vectors and does exact search.
# Note: the "distances" it reports are *squared* Euclidean (L2) distances.
index = faiss.IndexFlatL2(embedding_dimension)

# Vectors receive sequential ids in insertion order, so id i <-> sentences[i]
index.add(embeddings)

print(f"FAISS index created with {index.ntotal} vectors")

FAISS index created with 4 vectors


In [4]:
# Additional sentences to append to the already-built index
new_sentences = [
    "This is a new sentence",
    "Adding more data to the index",
]

# Encode them with the same model that produced the indexed vectors
new_embeddings = model.encode(new_sentences)

# Append the vectors and the texts together so that position i in `sentences`
# keeps matching vector id i in the index.
# Both operations append: running this cell a second time adds the same
# entries again.
index.add(new_embeddings)
sentences += new_sentences

print(f"Added {len(new_sentences)} new vectors to the index. Total vectors: {index.ntotal}")

Added 2 new vectors to the index. Total vectors: 6


In [5]:
# Define a query sentence
query_sentence = "This is a query sentence"

# Encode the query as a one-element list so the result is a batch of one
# vector, matching the batch form that index.search takes
query_embedding = model.encode([query_sentence])

# Number of nearest neighbors to retrieve
k = 2

# Search the index; both results have one row per query and k columns.
# For IndexFlatL2 the reported distances are squared L2 (smaller = closer).
distances, indices = index.search(query_embedding, k)

print(f"Nearest neighbors for '{query_sentence}':")
# Walk over what was actually returned for the single query (row 0).
for neighbor_id, distance in zip(indices[0], distances[0]):
    # FAISS pads the result with id -1 when fewer than k vectors are available;
    # sentences[-1] would silently print the last sentence, so skip those slots.
    if neighbor_id < 0:
        continue
    print(f"  Sentence: '{sentences[neighbor_id]}' (Distance: {distance})")

Nearest neighbors for 'This is a query sentence':
  Sentence: 'This is an example sentence' (Distance: 0.8553394079208374)
  Sentence: 'This is a new sentence' (Distance: 1.0013540983200073)


In [None]:
import faiss
import numpy as np

# Gather every vector whose text is in `sentences`: the original batch AND the
# batch added later. Using only `embeddings` would leave the newly added
# sentences out of the cosine-similarity index even though they are in
# `sentences` and in the L2 index. Row order must match `sentences`.
all_embeddings = np.vstack([embeddings, new_embeddings])

# L2-normalize each row so that the inner product equals cosine similarity
normalized_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)

# Dimension of the embeddings
embedding_dimension = normalized_embeddings.shape[1]

# Build a new exact index that ranks by inner product (IndexFlatIP)
index_ip = faiss.IndexFlatIP(embedding_dimension)

# Add the normalized embeddings; ids follow row order, so id i <-> sentences[i]
index_ip.add(normalized_embeddings)

print(f"FAISS IndexFlatIP created with {index_ip.ntotal} normalized vectors")

In [None]:
# Normalize the query embedding the same way as the indexed vectors, so the
# inner product computed by the index is a cosine similarity
normalized_query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

# Number of nearest neighbors to retrieve
k = 2

# Search the IndexFlatIP for the nearest neighbors (using inner product, which is cosine similarity here)
distances_ip, indices_ip = index_ip.search(normalized_query_embedding, k)

print(f"Nearest neighbors for '{query_sentence}' using cosine similarity (IndexFlatIP):")
# Note: for IndexFlatIP with normalized vectors the returned "distances" are
# inner products, so a HIGHER value means a closer match.
for neighbor_id, similarity in zip(indices_ip[0], distances_ip[0]):
    # FAISS pads the result with id -1 when fewer than k vectors are available;
    # sentences[-1] would silently print the last sentence, so skip those slots.
    if neighbor_id < 0:
        continue
    print(f"  Sentence: '{sentences[neighbor_id]}' (Cosine Similarity: {similarity})")

In [None]:
# Destination file for the serialized index (relative to the working directory)
index_filename = "my_faiss_index.index"

# Persist the L2 index (`index`) to disk; only the vectors are stored here,
# the `sentences` list is not part of the file
faiss.write_index(index, index_filename)

print(f"FAISS index saved to {index_filename}")

In [None]:
# Read the index back from the file written by the save step
loaded_index = faiss.read_index(index_filename)

print(f"FAISS index loaded from {index_filename} with {loaded_index.ntotal} vectors")

In [None]:
# Actually verify the round trip instead of only printing: the index read from
# disk must hold as many vectors, of the same dimension, as the in-memory index
# it was saved from.
assert loaded_index.ntotal == index.ntotal, "loaded index has a different number of vectors"
assert loaded_index.d == index.d, "loaded index has a different vector dimension"

# Number of vectors in the loaded index
print(f"Number of vectors in the loaded index: {loaded_index.ntotal}")

# Dimension of the vectors in the loaded index
print(f"Dimension of vectors in the loaded index: {loaded_index.d}")