In [1]:
import faiss
import os
import pandas as pd
from pprint import pprint
from sentence_transformers import SentenceTransformer


names = ['all-miniLM-L6-v2', 'bge-base-en-v1.5', 'e5-base-v2', 'instructor-base']

def load_model(model_name: str) -> SentenceTransformer:
    """Load a SentenceTransformer model by name."""
    if model_name == 'all-miniLM-L6-v2':
        model_name = 'sentence-transformers/all-miniLM-L6-v2'
    elif model_name == 'bge-base-en-v1.5':
        model_name = 'BAAI/bge-base-en-v1.5'
    elif model_name == 'e5-base-v2':
        model_name = 'intfloat/e5-base-v2'
    elif model_name == 'instructor-base':
        model_name = 'hkunlp/instructor-base'

    return SentenceTransformer(model_name, device='cuda')

  from .autonotebook import tqdm as notebook_tqdm





## Build FAISS Index
Create a FAISS index for efficient similarity search

In [2]:
def create_embeddings(model, text_list):
    embeddings = model.encode(text_list, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    return embeddings

def get_faiss_index(name: str, load_model_ = False):
    """Returns the FAISS index, texts, and embedding dimension for the specified model name."""
    
    save_path = f'Data/FAISSINDEX/{name}.bin'
    model = None
    
    if os.path.exists(save_path):
        index = faiss.read_index(save_path)
        print(f"FAISS index loaded from {save_path} with {index.ntotal} vectors.")
        dataset = pd.read_excel(f'Data/{name}_dataset.xlsx')
        text1 = dataset.text1
        text2 = dataset.text2[dataset.text2.notnull()]
        texts = pd.concat([text1, text2]).tolist()
        dimension = index.d
        if load_model:
            model = load_model(name)
        return index, texts, dimension, model

    dataset = pd.read_excel(f'Data/{name}_dataset.xlsx')
    text1 = dataset.text1
    text2 = dataset.text2[dataset.text2.notnull()]
    texts = pd.concat([text1, text2]).tolist() 

    model = load_model(name)
    embeddings = create_embeddings(model, texts)
    dimension = embeddings.shape[1]
    
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f"FAISS index created for model: {name} with {index.ntotal} vectors.")

    if not os.path.exists('Data/FAISSINDEX'):
        os.makedirs('Data/FAISSINDEX')

    faiss.write_index(index, save_path)
    print(f"FAISS index saved to {save_path}")

    return index, texts, dimension, model

## Test Similarity Search
Search for similar texts using a query

In [3]:
# Function to search for similar texts
def search_similar_texts(model, index, texts, query, k=5):
    """
    Search for k most similar texts to the query
    
    Args:
        query: Query text string
        k: Number of similar results to return
    
    Returns:
        DataFrame with similar texts and their distances
    """
    # Encode the query
    query_embedding = model.encode([query], convert_to_numpy=True)
    
    # Search the index
    distances, indices = index.search(query_embedding, k)
    
    # Create results dataframe
    results = pd.DataFrame({
        'rank': range(1, k+1),
        'text': [texts[idx] for idx in indices[0]],
        'distance': distances[0],
        'index': indices[0]
    })
    
    return results

## Save FAISS Index (Optional)
Save the index and embeddings for later use

In [4]:
indexes = {}
for name in names:
    index, texts, dimension, model = get_faiss_index(name, load_model_=False)
    indexes[name] = {
        'index': index,
        'texts': texts,
        'dimension': dimension,
        'model': model
    }

FAISS index loaded from Data/FAISSINDEX/all-miniLM-L6-v2.bin with 123565 vectors.
FAISS index loaded from Data/FAISSINDEX/bge-base-en-v1.5.bin with 115254 vectors.
FAISS index loaded from Data/FAISSINDEX/bge-base-en-v1.5.bin with 115254 vectors.
FAISS index loaded from Data/FAISSINDEX/e5-base-v2.bin with 115254 vectors.
FAISS index loaded from Data/FAISSINDEX/e5-base-v2.bin with 115254 vectors.
FAISS index loaded from Data/FAISSINDEX/instructor-base.bin with 116245 vectors.
FAISS index loaded from Data/FAISSINDEX/instructor-base.bin with 116245 vectors.


### Key benchmark is on the ground truth samples

Evaluate with difference tolerance: 1, 3, 5.
We have only 7085 rows of true duplicate samples, so we have to test on those for accuracy.

Clustering based evaluation can be performed on the negative samples

In [50]:
# Mapping for the values
tolerances = [1, 3, 5, 10]

name = 'instructor-base'
index, texts, dimension, model = indexes[name].values()

dataset = pd.read_excel(f'Data/{name}_dataset.xlsx')

start1 = dataset[dataset.text2.notnull()].index[0]
start2 = len(dataset)
indices = dataset[dataset.is_duplicate.apply(lambda x: True if x > 0 else False)].index


texts[start1 + 6], texts[start2 + 6]

('Which are the best books to read on Digital Marketing?',
 'What are some good books on Digital marketing?')

In [51]:
"""Only accuracy is considered, since precision and recall is not applicable if there are no classes"""
from tqdm import tqdm

save_path = f"Evaluation/FAISS/{name}.json"

results = {}

for tolerance in tolerances:
    total = 0
    correct = 0
    for idx in tqdm(indices, desc=f"Evaluating tolerance {tolerance}"):
        query = texts[idx]
        result = search_similar_texts(model, index, texts, query, k=tolerance + 1)
        if idx - start1 + start2 in result['index'].values:
            correct += 1
        total += 1
    accuracy = correct / total
    results[f'tolerance_{tolerance}'] = {
        'total': total,
        'correct': correct,
        'accuracy': accuracy
    }
    pprint(results)

Evaluating tolerance 1: 100%|██████████| 7085/7085 [06:00<00:00, 19.64it/s]


{'tolerance_1': {'accuracy': 0.638955539872971, 'correct': 4527, 'total': 7085}}


Evaluating tolerance 3: 100%|██████████| 7085/7085 [05:49<00:00, 20.26it/s]


{'tolerance_1': {'accuracy': 0.638955539872971, 'correct': 4527, 'total': 7085},
 'tolerance_3': {'accuracy': 0.8056457304163727,
                 'correct': 5708,
                 'total': 7085}}


Evaluating tolerance 5: 100%|██████████| 7085/7085 [05:36<00:00, 21.07it/s]


{'tolerance_1': {'accuracy': 0.638955539872971, 'correct': 4527, 'total': 7085},
 'tolerance_3': {'accuracy': 0.8056457304163727,
                 'correct': 5708,
                 'total': 7085},
 'tolerance_5': {'accuracy': 0.8585744530698659,
                 'correct': 6083,
                 'total': 7085}}


Evaluating tolerance 10: 100%|██████████| 7085/7085 [05:07<00:00, 23.05it/s]

{'tolerance_1': {'accuracy': 0.638955539872971, 'correct': 4527, 'total': 7085},
 'tolerance_10': {'accuracy': 0.9110797459421313,
                  'correct': 6455,
                  'total': 7085},
 'tolerance_3': {'accuracy': 0.8056457304163727,
                 'correct': 5708,
                 'total': 7085},
 'tolerance_5': {'accuracy': 0.8585744530698659,
                 'correct': 6083,
                 'total': 7085}}





In [52]:
import json
with open(save_path, 'w') as f:
    json.dump(results, f, indent=4)

### Perform text deduplication
- Option:
    - Using some consine similarity threshold

In [56]:
import numpy as np

def deduplicate_texts_with_mapping(model, index, texts, threshold=0.9):
    """
    Deduplicate texts using FAISS index and cosine similarity threshold.
    Returns:
        unique_texts: list of unique texts
        mapping: dict {unique_index: [represented_indices]}
    """
    from tqdm import tqdm
    unique_indices = []
    seen = set()
    mapping = {}
    for i, text in tqdm(enumerate(texts), total=len(texts), desc="Deduplicating"):
        if i in seen:
            continue
        # Search for top 10 similar texts (including itself)
        query_embedding = model.encode([text], convert_to_numpy=True)
        distances, indices_ = index.search(query_embedding, 500)
        group = [i]
        for dist, idx in zip(distances[0], indices_[0]):
            if idx != i:
                if dist > threshold:
                    seen.add(idx)
                    group.append(idx)
                else: break
        else:
            print("All index checked there might be even more duplicates")
        unique_indices.append(i)
        mapping[i] = group
    unique_texts = [texts[i] for i in unique_indices]
    return unique_texts, mapping


In [None]:
index, texts, dimension, model = indexes[name].values()
unique_texts, mapping = deduplicate_texts_with_mapping(model, index, texts, threshold=0.9)

Deduplicating: 100%|██████████| 116245/116245 [49:52<00:00, 38.84it/s] 


In [None]:
unique_texts[0], mapping[0]
"dasdas" (0)
[0, 1234, 5678, 213, 1]  # 

NameError: name 'unique_texts' is not defined

In [71]:
mapping[86]

KeyError: 86

In [70]:
path = "Evaluation/FAISS/Deduplication dataset"
os.path.exists(path)

dedup_pd = pd.DataFrame()
dedup_pd['unique_text'] = unique_texts
dedup_pd['represented_indices'] = dedup_pd.index.map(lambda x: mapping[x])

KeyError: 86