## CPU Deduplication

In [1]:
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules import ExactDuplicates
from nemo_curator.modules.modify import Modify
from nemo_curator import AddId
import time
import numpy as np

In [2]:
# Read the JSON input with the default backend.
# add_filename=True presumably records each row's source file — TODO confirm.
dataset = DocumentDataset.read_json(
    '/code/data/ready',
    add_filename=True,
)

# Attach an "id" field to every document; the deduplication step below
# refers to documents through this field.
add_id = AddId(id_field="id")
id_dataset = add_id(dataset)

Reading 3 files


In [3]:
# Time the exact-duplicate search together with the row count printed below.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be skewed by system wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")

# Find the duplicates
duplicates = deduplicator(id_dataset)
# Per partition, keep the first row of each hash and flag the remaining rows.
# NOTE(review): duplicated() only sees one partition at a time, so this relies on
# rows with equal hashes living in the same partition — confirm against
# ExactDuplicates' output layout.
docs_to_remove = duplicates.df.map_partitions(
    lambda x: x[x._hashes.duplicated(keep="first")]
)
# Only the ids are used downstream, so select that column before materialising
# instead of computing the whole frame and discarding the rest.
duplicate_ids = list(docs_to_remove["id"].compute())
dataset_df = id_dataset.df
# Build the membership mask once and reuse it for both the count and the filter.
is_duplicate = dataset_df.id.isin(duplicate_ids)
print(len(dataset_df[is_duplicate]))
deduped = dataset_df[~is_duplicate]
deduped_dd = DocumentDataset(deduped)
end_time = time.perf_counter()
print(f"Time elapsed: {np.round(end_time - start_time, 2)} s")

12099
Time elapsed: 26.45 s


## GPU Deduplication

In [8]:
# Read the same JSON input as the CPU run, this time with the cuDF backend.
# add_filename=True presumably records each row's source file — TODO confirm.
dataset_gpu = DocumentDataset.read_json(
    '/code/data/ready',
    add_filename=True,
    backend='cudf',
)

# Attach an "id" field to every document; the deduplication step below
# refers to documents through this field.
add_id = AddId(id_field="id")
id_dataset_gpu = add_id(dataset_gpu)

Reading 3 files


In [9]:
# Time the exact-duplicate search together with the row count printed below.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be skewed by system wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5")

# Find the duplicates
duplicates = deduplicator(id_dataset_gpu)
# Per partition, keep the first row of each hash and flag the remaining rows.
# NOTE(review): duplicated() only sees one partition at a time, so this relies on
# rows with equal hashes living in the same partition — confirm against
# ExactDuplicates' output layout.
docs_to_remove = duplicates.df.map_partitions(
    lambda x: x[x._hashes.duplicated(keep="first")]
)
# Only the ids are used downstream, so select that column before materialising;
# this way just the id column is converted to pandas rather than the whole frame.
duplicate_ids = list(docs_to_remove["id"].compute().to_pandas())
dataset_df = id_dataset_gpu.df
# Build the membership mask once and reuse it for both the count and the filter.
is_duplicate = dataset_df.id.isin(duplicate_ids)
print(len(dataset_df[is_duplicate]))
deduped = dataset_df[~is_duplicate]
deduped_dd = DocumentDataset(deduped)
end_time = time.perf_counter()
print(f"Time elapsed: {np.round(end_time - start_time, 2)} s")

12099
Time elapsed: 8.73 s
