In [1]:
import faiss
import torch
from datasets import Dataset, DatasetDict, load_dataset
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm

In [2]:
# Load only the validation split of the locally downloaded TinyStories data.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
ds = load_dataset(
    path="/home/pranav-pc/projects/OpenTransformer/multiformer/data/downloads/TinyStories",
    split="validation",
)

In [4]:
# Embedding model; see the MTEB leaderboard for alternatives:
# https://huggingface.co/spaces/mteb/leaderboard
model = "mixedbread-ai/mxbai-embed-large-v1"
sentence_model = SentenceTransformer(model_name_or_path=model)

README.md:   0%|          | 0.00/113k [00:00<?, ?B/s]

In [5]:
def _embed_batch(batch):
    """Encode the "text" entries of one batch and return them as "embedding"."""
    return {"embedding": sentence_model.encode(batch["text"])}


# batched=True hands the helper a batch of rows per call instead of one row.
ds = ds.map(_embed_batch, batched=True)
# Switch the dataset's output format to PyTorch ("pt").
ds.set_format("pt")

Map:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [6]:
def normalize_embedding(example):
    """Scale every embedding in a batch to unit L2 norm.

    Args:
        example: Batch mapping whose "embedding" entry is a 2-D tensor with
            one embedding per row (the norm is taken along dim=1).

    Returns:
        dict with a single "embedding" key holding the row-normalized tensor.
        Rows whose norm is zero come back as zeros rather than NaN.
    """
    embedding = example["embedding"]
    # F.normalize divides by max(norm, eps), so an all-zero row no longer
    # triggers a 0/0 division that would poison the index with NaN scores.
    normalized_embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
    return {"embedding": normalized_embedding}


# Normalize the embedding column in batches of 10,000 rows per call.
ds = ds.map(normalize_embedding, batch_size=10_000, batched=True)

Map:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [7]:
# Flat (exhaustive) inner-product index sized from the first stored embedding.
dim = len(ds[0]["embedding"])
index = faiss.IndexFlatIP(dim)
# Add every embedding in the dataset to the index.
index.add(ds["embedding"])

In [8]:
# Trial query: the two best matches for the first ten rows only.
D, I = index.search(ds[:10]["embedding"], k=2)

In [19]:
# Similarity cut-off that later cells compare the scores in D against.
threshold = 0.975

print("Filtering out near-duplicates...")
# Two best matches (scores D, row ids I) for every row in the dataset.
D, I = index.search(ds["embedding"], k=2)

Filtering out near-duplicates...


In [11]:
import pandas as pd

# Wrap the score matrix in a DataFrame so it can be filtered row-wise.
df = pd.DataFrame(data=D)

In [20]:
# Neighbour ids for the rows whose two returned scores both exceed the threshold.
I[df.index[(df > threshold).sum(axis=1).eq(2)]]

array([[13492, 18007],
       [18007, 13492]])

In [15]:
# Print the story text stored at row 13492, the first id of the pair shown above.
print(ds[13492]["text"])

Once upon a time, there was a little boy named Timmy. Timmy liked to climb trees. One day, he saw a big green tree and wanted to climb it. He said to his mom, "Mommy, can I climb that big green tree?" His mom said, "No Timmy, that tree is too high and it's bad for you to climb it." Timmy was sad but he listened to his mom.

The next day, Timmy saw a smaller tree that was also green. He asked his mom, "Mommy, can I climb that small green tree?" His mom said, "Yes Timmy, that tree is not too high and it's safe for you to climb it." Timmy was happy and climbed the tree. He felt like a big adventurer.

When Timmy got to the top of the tree, he shouted down to his mom, "Mommy, I climbed the tree!" His mom smiled and said, "Good job Timmy, you are a great climber!" Timmy felt proud of himself and couldn't wait to climb more trees.


In [24]:
# Print the story text stored at row 18007, the second id of the pair shown above.
print(ds[18007]["text"])

Once upon a time, there was a little boy named Timmy. Timmy loved to climb trees. One day, Timmy saw a really high tree and he wanted to climb it. 

Timmy's mom said, "Be careful Timmy, that tree is really high." 

Timmy said, "I can do it, Mommy!" 

So, Timmy climbed and climbed until he reached the top of the tree. He looked down and saw his mom tapping her foot. 

"Come down, Timmy," she said. 

Timmy climbed back down and said, "That was so much fun! Can we climb another tree tomorrow?" 

His mom smiled and said, "Sure, Timmy. But let's find a shorter one next time."


In [36]:
# Greedy near-duplicate filter over the search results (scores D, ids I).
#
# Rows are visited in index order. A row is kept unless it is linked, in
# either direction, to an already-kept row by a score >= threshold. The
# result `to_keep` is an ascending list of row indices to retain.
#
# Differences from a plain "look at column 1" check:
#   * every returned neighbour is inspected and the row itself is skipped by
#     id, so ties between exact duplicates (where the row may not be listed
#     first) are handled and any k works, not just k=2;
#   * when a row is kept, its above-threshold neighbours are suppressed, so
#     a pair (a, b) is not kept together just because b's own best match
#     happens to be a third row;
#   * membership tests use sets, making the loop linear instead of quadratic.
kept = set()  # same contents as to_keep, for O(1) lookups
suppressed = set()  # rows already known to duplicate a kept row
to_keep = []
for i in tqdm(range(len(D)), desc="Filtering"):
    if i in suppressed:
        continue
    # Neighbours of i (excluding i itself) at or above the threshold.
    # Negative ids are skipped; FAISS is expected to use -1 as the
    # "no result" marker -- TODO confirm for the index type in use.
    duplicates = [
        int(j)
        for score, j in zip(D[i], I[i])
        if j != i and j >= 0 and score >= threshold
    ]
    if any(j in kept for j in duplicates):
        # A near-duplicate of this row has already been retained.
        continue
    to_keep.append(i)
    kept.add(i)
    suppressed.update(duplicates)

Filtering:   0%|          | 0/21990 [00:00<?, ?it/s]

In [39]:
# Rows searched minus rows retained, i.e. how many rows were dropped.
I.shape[0] - len(to_keep)

1

In [132]:
# Preview the dataset restricted to the retained rows (result is not stored).
ds.select(indices=to_keep)

Dataset({
    features: ['text', 'embedding'],
    num_rows: 21864
})