## Hi y'all, this is Team-5, and here we've documented all the deduplication methods and pipelines used in our assignment.

## The primary common step of reading the input folders, writing the output folders and normalizing text

We have one parent folder which in turn stores a number of folders, one for each unique source. Each of these folders contains a list of .txt files. Through our deduplication methodologies, we read through all the .txt files across all the folders and remove the duplicate .txt files, i.e. those whose similarity score with an already-kept file is above a pre-set threshold.

The following code helps in reading and storing all the text files across all the input folders; and writing the unique text files into the new output folders in the same manner.

In [None]:
import os

First of all, we'll load all the text files from our folder structure

In [None]:
def load_documents_from_folders(root_folder):
    """Collect every ``.txt`` file found anywhere under *root_folder*.

    Returns a ``(documents, file_paths)`` pair of parallel lists:
    ``documents[i]`` is the UTF-8 decoded content of ``file_paths[i]``.
    Files are listed in the order ``os.walk`` yields them.
    """
    # Gather the paths first; only .txt files are considered.
    file_paths = [
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_folder)
        for name in names
        if name.endswith(".txt")
    ]

    # Then read each file's content in the same order.
    documents = []
    for path in file_paths:
        with open(path, 'r', encoding='utf-8') as handle:
            documents.append(handle.read())

    return documents, file_paths

Here, we're writing the unique documents back to the output folder, while preserving our document structure as well

In [None]:
def _document_index(entry):
    """Return the integer position of *entry* in the loaded-documents list.

    Accepts the three forms the pipelines in this notebook produce: a dict
    carrying an 'index' key, a plain integer, or a 'doc_<n>' string key.
    """
    if isinstance(entry, dict):
        return entry['index']
    if isinstance(entry, str):
        return int(entry.rsplit('_', 1)[-1])
    return int(entry)


def save_unique_documents(unique_docs, original_file_paths, output_folder):
    """Copy the files selected as unique into *output_folder*.

    Args:
        unique_docs: iterable of selected documents; each item may be a dict
            with an 'index' key, an integer index, or a 'doc_<n>' string.
        original_file_paths: list of source paths, indexed by document index.
        output_folder: root folder that receives the copies.

    The path of each file relative to the module-level ``input_folder`` is
    recreated under *output_folder*, so ``input_folder`` must be defined
    before this function is called.
    """
    # Imported here because the notebook only imports copyfile in a later cell.
    from shutil import copyfile

    for doc in unique_docs:
        original_file_path = original_file_paths[_document_index(doc)]
        # Recreate the original folder structure in the output folder
        relative_path = os.path.relpath(original_file_path, start=input_folder)
        output_file_path = os.path.join(output_folder, relative_path)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Copy the original file to the new folder
        copyfile(original_file_path, output_file_path)

Storing the paths to our input and output parent folder, also loading the documents and file paths.

In [None]:
# NOTE(review): absolute paths specific to the author's machine; point these at your own folders before running.
# input_folder is the parent folder that is read; output_folder receives the deduplicated copy.
input_folder = "/Users/utkarshsrivastava/Downloads/parentfolder"
output_folder = "/Users/utkarshsrivastava/Downloads/parentfolder_deduplicated"

In [None]:
# documents and file_paths are parallel lists: documents[i] is the text read from file_paths[i].
documents, file_paths = load_documents_from_folders(input_folder)

Now, we've successfully managed to load all the .txt files.

We'll go on with a basic text normalization (lowercasing and removing extra spaces) to improve the deduplication.

In [None]:
def normalize_text(text):
    """Lowercase *text* and collapse every run of whitespace into one space.

    Leading and trailing whitespace is dropped as well.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return ' '.join(tokens)

## Deduplication Pipelines

## Technique 1 - MinHash+LSH

In [None]:
from datasketch import MinHash, MinHashLSH

Now, the major step comes in of creating a MinHash object and an LSH model to group similar documents

In [None]:
def compute_minhash(doc):
    """Build a 128-permutation MinHash from the whitespace-separated words of *doc*.

    The text is normalized first, so case and spacing differences do not
    change the tokens fed into the signature.
    """
    signature = MinHash(num_perm=128)
    tokens = normalize_text(doc).split()
    for token in tokens:
        signature.update(token.encode('utf8'))
    return signature

# Similarity threshold handed to the LSH index for near-duplicate detection
threshold = 0.8
lsh = MinHashLSH(threshold=threshold, num_perm=128)

# One MinHash per document, keyed by the document's position in `documents`
minhashes = {idx: compute_minhash(doc) for idx, doc in enumerate(documents)}

# Register every signature in the LSH index under the key 'doc_<position>'
for idx, m in minhashes.items():
    lsh.insert(f'doc_{idx}', m)

The next step is pretty straightforward: we'll keep track of unique and duplicate documents and ensure that the copy of the parent folder we're creating as output only stores the unique ones (the first occurrence in each group of duplicates).

In [None]:
unique_docs = set()
duplicates = set()

# Walk the documents in load order so that, within a group of near-duplicates,
# the first one encountered is the one that survives.
for idx in range(len(documents)):
    doc_key = f'doc_{idx}'

    # Already claimed as a duplicate of an earlier document: skip it, otherwise
    # it would in turn mark that earlier document as a duplicate and the whole
    # group (including its first occurrence) would be filtered out.
    if doc_key in duplicates:
        continue

    unique_docs.add(doc_key)

    #Querying similar documents from LSH
    for sim_doc in lsh.query(minhashes[idx]):
        # Ignore the document's own key, and never demote a document that
        # has already been kept.
        if sim_doc != doc_key and sim_doc not in unique_docs:
            duplicates.add(sim_doc)

#Filtering out duplicates
filtered_docs = [f'doc_{idx}' for idx in range(len(documents)) if f'doc_{idx}' not in duplicates]

Here we've used a slightly different function to save the unique documents in the output folder, following the same folder structure as the input folder.

In [None]:
def _document_index(entry):
    """Return the integer position of *entry* in the loaded-documents list.

    Accepts the three forms the pipelines in this notebook produce: a
    'doc_<n>' string key, a plain integer, or a dict carrying an 'index' key.
    """
    if isinstance(entry, dict):
        return entry['index']
    if isinstance(entry, str):
        return int(entry.rsplit('_', 1)[-1])
    return int(entry)


def save_unique_documents(unique_docs, original_file_paths, output_folder):
    """Write the documents selected as unique into *output_folder*.

    Args:
        unique_docs: iterable of selected documents; each item may be a
            'doc_<n>' string, an integer index, or a dict with an 'index' key.
        original_file_paths: list of source paths, indexed by document index.
        output_folder: root folder that receives the files.

    The content written is taken from the module-level ``documents`` list, and
    each file's path relative to the module-level ``input_folder`` is
    recreated under *output_folder*; both must be defined before the call.
    """
    for entry in unique_docs:
        doc_index = _document_index(entry)
        original_file_path = original_file_paths[doc_index]

        #recreating the original folder structure in the output folder
        relative_path = os.path.relpath(original_file_path, start=input_folder)
        output_file_path = os.path.join(output_folder, relative_path)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        #copying the content to the new file
        with open(output_file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(documents[doc_index])

# filtered_docs holds 'doc_<n>' keys for the documents that were not marked as duplicates.
save_unique_documents(filtered_docs, file_paths, output_folder)

In [None]:
# Report where the MinHash+LSH output was written.
print(f"Deduplicated documents saved to {output_folder}")

Voila!

## Technique 2: Fuzzy Matching with Levenshtein distance

In [None]:
from fuzzywuzzy import fuzz
from shutil import copyfile

here we'll normalize in the beginning

In [None]:
# Normalized copy of every document, in the same order as `documents`.
normalized_documents = [normalize_text(doc) for doc in documents]

Keeping track of unique documents: a document is considered a duplicate only if its similarity score with an already-kept document is greater than 85.

In [None]:
unique_docs = []
duplicates = set()

for idx, (doc, normalized_doc) in enumerate(zip(documents, normalized_documents)):
    # A document is a duplicate as soon as one already-kept document scores
    # above the similarity threshold of 85; any() stops at the first match.
    is_duplicate = any(
        fuzz.ratio(normalized_doc, kept['normalized']) > 85
        for kept in unique_docs
    )
    if is_duplicate:
        continue

    unique_docs.append({'content': doc, 'normalized': normalized_doc, 'index': idx})

Saving the unique documents to the output folder, maintaining folder structure


In [None]:
# unique_docs here is the list of dicts built by the fuzzy-matching loop; each dict carries the document's 'index'.
# NOTE(review): the save_unique_documents defined in the MinHash+LSH section parses 'doc_<n>' strings — confirm the definition in scope accepts these dicts.
save_unique_documents(unique_docs, file_paths, output_folder)

print(f"Unique documents saved to {output_folder}")

Voila!

## Technique 3: Cosine Similarity

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

here we'll normalize in the beginning

In [None]:
# Normalized copy of every document, in the same order as `documents`.
normalized_documents = [normalize_text(doc) for doc in documents]

Using TF-IDF to convert documents into vectors and computing cosine similarity matrix

In [None]:
# Turn each normalized document into a TF-IDF vector (one row per document).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(normalized_documents)

# Pairwise cosine similarity between all documents; the deduplication loop
# reads entry [i, j] as the similarity of document i and document j.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

Keeping track of unique documents: a document is considered a duplicate only if its cosine similarity with an already-kept document is greater than 0.91.

In [None]:
similarity_threshold = 0.91

# Identify unique documents based on cosine similarity
unique_docs = []
duplicates = set()

total_docs = len(documents)
for i in range(total_docs):
    # Documents already matched to an earlier one are neither kept nor compared.
    if i in duplicates:
        continue

    unique_docs.append(i)
    # Every later document that is too similar to this kept one is a duplicate.
    duplicates.update(
        j for j in range(i + 1, total_docs)
        if cosine_sim_matrix[i, j] > similarity_threshold
    )

Saving the unique documents to the output folder, maintaining folder structure

In [None]:
# unique_docs here is the list of integer document positions kept by the cosine-similarity loop.
# NOTE(review): the save_unique_documents defined in the MinHash+LSH section parses 'doc_<n>' strings — confirm the definition in scope accepts integers.
save_unique_documents(unique_docs, file_paths, output_folder)

print(f"Unique documents saved to {output_folder}")

Voila!

## Technique 4: CSV Data Storage + SimHash + N-gram Sampling


Hash Generation from Text Files:

->We start with a parent folder, which contains multiple subfolders. Each subfolder contains several text files.

->The first step involves processing each subfolder in the parent folder. For every text file within these subfolders, we apply a hash function (SimHash or MinHash) to generate a unique hash for the content of each file.

->These generated hashes, along with corresponding file names, are then written to a CSV file.

Deduplication:

->Use the CSV to identify duplicates based on hash values, log the duplicates, and remove them.

Transfer Deduplicated Files:

->Move the unique (non-duplicate) files to the output folder, ensuring only deduplicated files are stored.

->We can enhance the deduplication process by incorporating more robust techniques to compare text files across folders

Step 1: Use Simhash or MinHash

Code for Simhash

In [None]:
import os
import re
from simhash import Simhash
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count


def features_extract(s):
    """Split *s* into overlapping 3-character substrings.

    The input is lowercased and every non-word character is removed first.
    At least one feature is always returned: a cleaned string shorter than
    three characters (including the empty string) is returned as the single
    feature.
    """
    width = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    window_count = max(len(cleaned) - width + 1, 1)
    return [cleaned[start:start + width] for start in range(window_count)]


def compute_simhash(file_path):
    """Read *file_path* as UTF-8 and return ``(file_path, simhash_value)``.

    The Simhash is computed over the features produced by
    ``features_extract`` for the file's content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    fingerprint = Simhash(features_extract(text)).value
    return file_path, fingerprint


def process_directory(args):
    """Compute the SimHash of every file directly inside one directory.

    Args:
        args: a ``(path, parent)`` tuple; ``path`` is the directory to
            process and ``parent`` the root used to derive the CSV name.

    Returns:
        A list of ``{"file": ..., "hash": ...}`` dicts, one per file. The
        list is empty when the directory's CSV already exists or the
        directory holds no files. When non-empty, it is also written to CSV.
    """
    path, parent = args
    data = []
    relative_path = os.path.relpath(path, parent)

    # CSV name: the directory's path relative to parent, flattened with '_'.
    s = relative_path.replace(os.sep, '_')

    # Placeholder location: replace the directory part with wherever the hash
    # CSVs should be stored. The same path is used for the skip check and the
    # write so the two cannot drift apart.
    output_file = os.path.join("path where csv needs to be stored", f"{s}.csv")

    # Skip processing if the output CSV already exists
    if os.path.exists(output_file):
        print(f"Skipping {path}, CSV already exists")
        return data

    for file in tqdm(os.listdir(path), leave=False, desc=s):
        file_path = os.path.join(path, file)
        if os.path.isfile(file_path):
            file_path, simhash_value = compute_simhash(file_path)
            data.append({
                "file": file_path,
                "hash": simhash_value
            })
    if data:
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
    return data


if __name__ == '__main__':
    parent = "Parent folder path"     # put all the data under one parent folder and set its path here
    # One (directory, parent) work item for every sub-directory found at any depth below parent.
    directories = [(os.path.join(path, d), parent) for path, dirs, files in os.walk(parent) for d in dirs]

    # Use all available CPUs (one cpu per csv)
    with Pool(cpu_count()) as pool:
        pool.map(process_directory, directories)

Code for MinHash

In [None]:
import os
import re
from datasketch import MinHash, MinHashLSH  # Import MinHash and LSH from datasketch
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count


def features_extract(s):
    """Return the overlapping 3-character n-grams of *s*.

    *s* is lowercased and stripped of all non-word characters before
    slicing. A cleaned string shorter than three characters (even an empty
    one) is returned as a single feature, so the result is never empty.
    """
    width = 3
    compact = re.sub(r'[^\w]+', '', s.lower())
    # Index of the last window start; 0 when the text is shorter than width.
    last_start = max(len(compact) - width, 0)
    return [compact[i:i + width] for i in range(last_start + 1)]


def compute_minhash(file_path):
    """Read *file_path* as UTF-8 and return ``(file_path, MinHash)``.

    The MinHash is updated with the UTF-8 bytes of each n-gram that
    ``features_extract`` produces for the file's content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    signature = MinHash()
    for shingle in features_extract(text):
        signature.update(shingle.encode('utf-8'))

    return file_path, signature


def process_directory(args):
    """Compute the MinHash signature of every file directly inside one directory.

    Args:
        args: a ``(path, parent)`` tuple; ``path`` is the directory to
            process and ``parent`` the root used to derive the CSV name.

    Returns:
        A list of dicts with keys ``"file"``, ``"hash"`` and
        ``"hash_values"``, one per file. The list is empty when the
        directory's CSV already exists or the directory holds no files.
        When non-empty, it is also written to CSV.
    """
    path, parent = args
    data = []
    relative_path = os.path.relpath(path, parent)

    # CSV name: the directory's path relative to parent, flattened with '_'.
    s = relative_path.replace(os.sep, '_')

    # Placeholder location: replace the directory part with wherever the hash
    # CSVs should be stored. The same path is used for the skip check and the
    # write so the two cannot drift apart.
    output_file = os.path.join("path where csv needs to be stored", f"{s}.csv")

    # Skip processing if the output CSV already exists
    if os.path.exists(output_file):
        print(f"Skipping {path}, CSV already exists")
        return data

    for file in tqdm(os.listdir(path), leave=False, desc=s):
        file_path = os.path.join(path, file)
        if os.path.isfile(file_path):
            file_path, minhash = compute_minhash(file_path)
            # Export the signature as plain integers: a MinHash object written
            # to CSV would only be stored as its object repr, which is useless
            # for comparing files afterwards.
            hash_values = minhash.digest().tolist()
            data.append({
                "file": file_path,
                # Single comparable value: identical signatures give identical strings.
                "hash": "-".join(str(value) for value in hash_values),
                "hash_values": hash_values
            })

    # Create DataFrame and save to CSV
    if data:
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
    return data


if __name__ == '__main__':
    parent = "Parent folder path"  # Specify the parent folder path
    # One (directory, parent) work item for every sub-directory found at any depth below parent.
    directories = [(os.path.join(path, d), parent) for path, dirs, files in os.walk(parent) for d in dirs]

    # Use all available CPUs (one cpu per csv)
    with Pool(cpu_count()) as pool:
        pool.map(process_directory, directories)


Step 2: Similarity check

In [None]:
import os
import pandas as pd
from tqdm import tqdm

path = "-------"        # folder holding the per-directory hash CSVs
output_path = "---"     # where the CSVs are saved after deduplication
log_file_path = "------"        # every duplicate that is found is logged here

# Only the hash CSVs are processed: any other entry in the folder (hidden
# files, sub-directories) would make read_csv fail.
files = sorted(x for x in os.listdir(path) if x.endswith('.csv'))
print("Number of files:", len(files), files)
files = [os.path.join(path, x) for x in files]

with open(log_file_path, 'a', encoding='utf-8') as log_file:
    for file in tqdm(files):
        print(file)
        df = pd.read_csv(file)

        # Identify duplicates: rows whose 'hash' value is exactly equal to
        # that of another row in the same CSV (keep=False flags all of them).
        duplicates = df[df.duplicated(subset='hash', keep=False)]

        if not duplicates.empty:
            log_file.write(f"Duplicates in {file}:\n")
            for index, row in duplicates.iterrows():
                log_file.write(f"\tFile: {row['file']} Hash: {row['hash']}\n")
            log_file.write("\n")

        # Remove duplicates, keeping the first row of each group
        df = df.drop_duplicates(subset='hash', keep="first")

        # Write the deduplicated CSV under the same file name in output_path
        s = os.path.basename(file)
        print(s)
        df.to_csv(os.path.join(output_path, s), index=False)
        print("Deduplication done for", file)

print(f"Duplicate log has been saved to {log_file_path}")

Step 3: Code to transfer all deduplicated files in another path

In [None]:

import os
import shutil
import pandas as pd
import concurrent.futures
from tqdm import tqdm

# Define the directories
csv_folder = "-----"        # folder where the CSVs were saved after deduplication
target_dir = "---"       # Directory where the files will be copied to
original_dir = "----------"       #Directory containing the original files that need to be copied based on the paths extracted from the CSVs.

def copy_file(file):
    """Copy a file to the target directory while preserving the relative path.

    The path of *file* relative to the module-level ``original_dir`` is
    recreated under the module-level ``target_dir``.

    Returns:
        A status string: "Copied ..." on success, "Error copying ..." when
        any step fails. Failures are reported, never raised.
    """
    # Defined before the try so the error message below can always be built,
    # even when the failure happens before the destination is computed.
    target_path = None
    try:
        relative_path = os.path.relpath(file, original_dir)
        target_path = os.path.join(target_dir, relative_path)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        shutil.copy(file, target_path)
        return f"Copied {file} to {target_path}"
    except Exception as e:
        # Best-effort by design: one bad file must not stop the batch copy.
        destination = target_dir if target_path is None else target_path
        return f"Error copying {file} to {destination}: {str(e)}"

def process_csv(csv_file):
    """Return the values of the 'file' column of one CSV in ``csv_folder``.

    An empty list is returned (and a message printed) when the CSV has no
    'file' column or cannot be processed.
    """
    csv_path = os.path.join(csv_folder, csv_file)
    try:
        frame = pd.read_csv(csv_path)
        # Guard clause: nothing to collect without the 'file' column.
        if 'file' not in frame.columns:
            print(f"CSV {csv_file} does not contain 'file' column.")
            return []
        return frame['file'].tolist()
    except Exception as e:
        print(f"Error processing {csv_file}: {str(e)}")
        return []

def main():
    """Copy every file listed in the deduplicated CSVs into ``target_dir``.

    The CSVs in ``csv_folder`` are loaded in a process pool, then each listed
    file that exists under ``original_dir`` is copied in a thread pool. The
    status string returned by each copy is printed as it completes.
    """
    # Gather all file paths from the CSV files using multiprocessing
    csv_files = [name for name in os.listdir(csv_folder) if name.endswith('.csv')]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_csv, csv_files), total=len(csv_files), desc="Loading CSVs"))

    # Flatten the per-CSV lists into one list of paths
    all_files = [entry for file_list in results for entry in file_list]

    # Use ThreadPoolExecutor to copy files in parallel
    futures = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for file_path in tqdm(all_files, desc="Submitting copy tasks"):
            source = os.path.join(original_dir, file_path)
            # Paths that no longer exist are silently skipped.
            if not os.path.exists(source):
                continue
            futures[executor.submit(copy_file, source)] = file_path

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Copying files"):
            print(future.result())

# Run the copy pipeline only when this code is executed as the main program.
if __name__ == "__main__":
    main()
