## Checking for duplicates

The record of how duplicates are handled is kept in a spreadsheet containing all
submissions. This is downloaded and saved to JSON format in the next steps.


### Remove duplicates via comparing doc hashes


In [None]:
import hashlib
import os

def hash_file(filepath):
    """Return the hexadecimal SHA256 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte blocks so
    that it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def find_duplicate_pdfs(folder_path):
    """Find byte-identical PDFs in a folder.

    Every entry of ``folder_path`` whose name ends with '.pdf' is hashed with
    ``hash_file``. The first file seen with a given hash is treated as the
    original; every later file with the same hash is reported as a duplicate.

    Returns a list of ``(duplicate_filename, original_filename)`` tuples.
    """
    first_seen = {}  # file hash -> first filename seen with that hash
    duplicates = []

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of "original" versus "duplicate" stable from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue
        file_hash = hash_file(os.path.join(folder_path, filename))
        if file_hash in first_seen:
            duplicates.append((filename, first_seen[file_hash]))
        else:
            first_seen[file_hash] = filename

    return duplicates

folder_path = './static/submissions'
duplicates = find_duplicate_pdfs(folder_path)

# Report every byte-identical pair, or state that nothing matched.
if not duplicates:
    print("No duplicates found.")
else:
    print("Found duplicates:")
    for duplicate_name, original_name in duplicates:
        print(f"{duplicate_name} is a duplicate of {original_name}")


### List documents from the same author for checking


In [None]:
import os
import shutil

def extract_name_from_filename(filename):
    """Derive a normalised, lower-case author name from a submission filename.

    The text before the first '-' is discarded (presumably a submission id;
    verify against the naming scheme), the remaining hyphen-separated parts
    are joined with spaces, and everything from the first '.' onward is
    dropped. Any name containing 'anonymous', in any letter case, collapses
    to exactly 'anonymous' so callers can filter on that single value.

    A filename without any '-' yields an empty string.
    """
    # Lower-case before the 'anonymous' test so "Anonymous-2" is recognised too.
    name = ' '.join(filename.split('-')[1:]).split('.')[0].lower()
    if 'anonymous' in name:
        return 'anonymous'
    return name

def find_duplicate_names(folder_path):
    """Group the PDFs in ``folder_path`` by author name and keep repeated authors.

    Names come from ``extract_name_from_filename``; files whose name resolves
    to 'anonymous' are ignored. Returns a dict mapping each name that occurs
    on more than one file to the list of those filenames.
    """
    files_by_author = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith('.pdf'):
            continue
        author = extract_name_from_filename(entry)
        if author == 'anonymous':
            continue
        files_by_author.setdefault(author, []).append(entry)

    # Keep only authors with at least two submissions.
    repeated = {}
    for author, entries in files_by_author.items():
        if len(entries) > 1:
            repeated[author] = entries
    return repeated

def copy_files_to_subfolders(folder_path, duplicates):
    """Copy files that share an author name into per-name subfolders.

    A 'duplicates' directory is created inside ``folder_path`` (even when
    ``duplicates`` is empty), with one subfolder per key of ``duplicates``.
    Each listed file is copied from ``folder_path`` into the subfolder of its
    name; the source files are left in place.

    ``duplicates`` maps a name to a list of filenames relative to
    ``folder_path``.
    """
    duplicates_dir = os.path.join(folder_path, 'duplicates')
    # exist_ok avoids the check-then-create race and makes reruns harmless.
    os.makedirs(duplicates_dir, exist_ok=True)

    for name, files in duplicates.items():
        subfolder_path = os.path.join(duplicates_dir, name)
        os.makedirs(subfolder_path, exist_ok=True)

        for file in files:
            shutil.copy(
                os.path.join(folder_path, file),
                os.path.join(subfolder_path, file),
            )

folder_path = './static/submissions'
duplicates = find_duplicate_names(folder_path)

# Gather every repeated author's files under <folder_path>/duplicates/<name>/
# so they can be checked side by side.
copy_files_to_subfolders(folder_path, duplicates)

print(f'Copied files for {len(duplicates)} duplicate names into subfolders.')

### Check for duplicates by comparing chunks within submissions for overlap


In [None]:
import os
import shutil
from collections import defaultdict
import hashlib
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using PyMuPDF.

    Returns the text of all pages concatenated in page order. The document
    is closed before returning, so file handles do not accumulate when many
    PDFs are processed in a loop.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Pieces do not overlap; the last one may be shorter. Empty text yields an
    empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

def hash_chunk(chunk):
    """Return the hexadecimal SHA256 digest of a text chunk encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(chunk.encode('utf-8'))
    return digest.hexdigest()

def find_related_pdfs(folder_path, chunk_size=500, threshold=0.1):
    """Find PDFs that share chunks of text.

    Each '.pdf' entry in ``folder_path`` has its text extracted, split into
    fixed-size chunks of ``chunk_size`` characters, and each chunk hashed.
    Two documents are related when the number of distinct chunk hashes they
    have in common, divided by the number of distinct chunk hashes of the
    first document, is greater than ``threshold``.

    Returns a list of ``(doc, other_doc, shared_count)`` tuples. The ratio is
    computed relative to ``doc``, so a pair can appear in one direction, in
    both, or not at all.
    """
    chunk_owners = defaultdict(set)  # chunk hash -> filenames containing it
    doc_chunks = {}                  # filename -> set of its chunk hashes

    # Sorted so the order of the result does not depend on os.listdir.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue
        text = extract_text_from_pdf(os.path.join(folder_path, filename))
        hashes = {hash_chunk(chunk) for chunk in chunk_text(text, chunk_size)}
        doc_chunks[filename] = hashes
        # Register each distinct chunk once per document. Recording a file
        # once per occurrence would let a chunk repeated inside one document
        # inflate the shared count of every other document containing it.
        for chunk_hash in hashes:
            chunk_owners[chunk_hash].add(filename)

    # Identify documents with a significant share of chunks in common.
    related_docs = []
    for doc, hashes in doc_chunks.items():
        shared = defaultdict(int)
        for chunk_hash in hashes:
            for other_doc in chunk_owners[chunk_hash]:
                if other_doc != doc:
                    shared[other_doc] += 1

        # ``shared`` is non-empty only when ``hashes`` is non-empty, so the
        # division below cannot be by zero.
        for other_doc, count in sorted(shared.items()):
            if count / len(hashes) > threshold:
                related_docs.append((doc, other_doc, count))

    return related_docs

def merge_groups_with_shared_documents(related_pdfs):
    """Collapse pairwise relations into groups of connected documents.

    Only the first two items of each relation (the two document names) are
    used. Returns a list of sets; documents linked directly or through a
    chain of relations end up in the same set.
    """
    clusters = []
    for relation in related_pdfs:
        pair = {relation[0], relation[1]}
        # Attach the pair to the first cluster already holding either document.
        home = next((cluster for cluster in clusters if cluster & pair), None)
        if home is None:
            clusters.append(pair)
        else:
            home |= pair

    # The pass above can leave overlapping clusters (a later pair may bridge
    # two earlier ones), so fold overlaps together until nothing changes.
    while True:
        merged_any = False
        for left in range(len(clusters)):
            for right in range(left + 1, len(clusters)):
                if clusters[left] & clusters[right]:
                    clusters[left] = clusters[left] | clusters[right]
                    clusters[right] = set()
                    merged_any = True
        # Drop the clusters emptied by this pass.
        clusters = [cluster for cluster in clusters if cluster]
        if not merged_any:
            break

    return clusters

def create_subfolders_for_duplicates(related_pdfs, folder_path, duplicates_folder="01. duplicates_chunked"):
    """Copy each group of related PDFs into its own numbered subfolder.

    ``related_pdfs`` is first reduced to groups by
    ``merge_groups_with_shared_documents``. For the i-th group (counting from
    1), the folder ``<folder_path>/<duplicates_folder>/group_<i>`` is created
    and every file of the group is copied into it from ``folder_path``.
    Source files are copied, not moved, so the originals stay in place.
    """
    groups = merge_groups_with_shared_documents(related_pdfs)
    duplicates_path = os.path.join(folder_path, duplicates_folder)

    # exist_ok avoids the check-then-create race and makes reruns harmless.
    os.makedirs(duplicates_path, exist_ok=True)

    for index, group in enumerate(groups, start=1):
        subfolder_path = os.path.join(duplicates_path, f"group_{index}")
        os.makedirs(subfolder_path, exist_ok=True)

        for file in sorted(group):
            src_path = os.path.join(folder_path, file)
            dest_path = os.path.join(subfolder_path, file)
            # Use shutil.move(src_path, dest_path) to move instead of copy.
            shutil.copy(src_path, dest_path)

folder_path = './static/submissions'
related_pdfs = find_related_pdfs(folder_path)

# Sort overlapping submissions into group folders, or report that none overlap.
if not related_pdfs:
    print("No related PDFs found.")
else:
    print(f"Found {len(related_pdfs)} related PDFs based on shared text chunks:")
    create_subfolders_for_duplicates(related_pdfs, folder_path)
    print("Duplicates have been organized into subfolders.")
