In [3]:
import pandas as pd
import os 
from tqdm import tqdm 
from datasets import load_dataset
from datasketch import MinHashLSH, MinHash

In [4]:
dataset = load_dataset("./data_cleaned", streaming=True)

Resolving data files:   0%|          | 0/10001 [00:00<?, ?it/s]

In [5]:
def calculate_min_hash(text, num_perm=128): 
    text = text["text"].lower()
    minhash = MinHash(num_perm=num_perm)
    for word in text.split(): 
        minhash.update(word.encode("utf-8"))

    return {"hash": minhash,}

def calculate_lsh(hash_batch, query, threshold=0.9, num_perm=128): 
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i, batch in enumerate(hash_batch): 
        lsh.insert(i, batch["hash"])

    result = lsh.query(query["hash"])
    returns = {"duplicate": []}

    for i in range(len(hash_batch)): 
        returns["duplicate"].append(True if i in result else False)

    return returns

def duplicate_filter(batch): 
    return [item["duplicate"] != True for item in batch]

In [6]:
dataset = dataset.map(calculate_min_hash)

In [7]:
def save_texts_to_file(unique_items, base_file_path="filtered_dataset", file_path='unique_items.txt'):
    with open(os.path.join(base_file_path, file_path), 'w', encoding='utf-8') as f:
        for item in unique_items:
            f.write(item + '\n')
    print(f"Saved {len(unique_items)} unique items to {file_path}")

In [9]:
lsh = MinHashLSH(threshold=0.9, num_perm=128)
unique_items = []
current_size = 0
file_counter = 0
MAX_SIZE = 500000

for i, item in tqdm(enumerate(dataset["train"])): 
    minhash = item["hash"]

    if len(lsh.query(minhash)) == 0: 
        lsh.insert(i, minhash) 
        unique_items.append(item["text"])
        current_size += 1

        if current_size >= MAX_SIZE: 
            print(f"Refreshing LSH...")
            save_texts_to_file(unique_items=unique_items, file_path=f"cleaned_{str(file_counter).zfill(5)}.txt")

            unique_items = []
            lsh = MinHashLSH(threshold=0.9, num_perm=128)

            current_size = 0
            file_counter += 1
    else: 
        pass

0it [00:00, ?it/s]

790538it [06:51, 1840.99it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00000.txt


1571698it [13:39, 1336.45it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00001.txt


2361893it [20:36, 1285.53it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00002.txt


3158756it [27:36, 1342.91it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00003.txt


3930108it [34:35, 1354.20it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00004.txt


4714781it [41:33, 1978.67it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00005.txt


5506259it [48:29, 1934.03it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00006.txt


6291213it [55:23, 1296.67it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00007.txt


7085763it [1:02:38, 1837.08it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00008.txt


7886815it [1:09:56, 1287.48it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00009.txt


8693339it [1:17:16, 1284.31it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00010.txt


9484591it [1:24:23, 1416.67it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00011.txt


10284557it [1:31:15, 2028.53it/s]

Refreshing LSH...
Saved 500000 unique items to cleaned_00012.txt


11026756it [1:37:48, 1878.87it/s]


In [None]:
save_texts_to_file(unique_items=unique_items, file_path=f"cleaned_{str(13).zfill(5)}.txt")


In [10]:
new_dataset = dataset["train"]
for i, example in tqdm(enumerate(dataset["train"])): 
    new_dataset = new_dataset.map(calculate_lsh, batched=True, fn_kwargs={"query": example, "threshold": 0.9, "num_perm": 128})
    new_dataset = new_dataset.filter(duplicate_filter, batched=True)

14670it [00:08, 1749.84it/s]


KeyboardInterrupt: 

In [9]:
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH

def calculate_min_hash(text, num_perm=128):
    text = text["text"]
    minhash = MinHash(num_perm=num_perm)
    for word in text.split():
        minhash.update(word.encode("utf-8"))
    return {"hash": minhash}

def calculate_lsh_for_stream(example, lsh, threshold=0.9, num_perm=128):
    result = lsh.query(example["hash"])
    is_duplicate = len(result) > 0
    if not is_duplicate:
        lsh.insert(len(lsh), example["hash"])
    return {"duplicate": is_duplicate}

def duplicate_filter(row):
    return not row["duplicate"]

# Initialize LSH for streaming processing
lsh = MinHashLSH(threshold=0.9, num_perm=128)

# Process the dataset in a streaming manner
new_dataset = []
for example in tqdm(dataset["train"]):
    # Calculate the MinHash for the current example
    hashed_example = calculate_min_hash(example)
    
    # Calculate LSH to find duplicates and decide whether to insert
    lsh_result = calculate_lsh_for_stream(hashed_example, lsh)
    
    # If not a duplicate, keep the example
    if duplicate_filter(lsh_result):
        new_dataset.append(example)


0it [00:00, ?it/s]


TypeError: object of type 'MinHashLSH' has no len()

In [None]:
filtered_dataset = new_dataset['train'].remove_columns([col for col in dataset['train'].column_names if col != 'text'])

In [None]:
filtered_dataset.save_to_disk("filtered_dataset")