# In [1]:
import pandas as pd
import fitz  # PyMuPDF for PDFs
from docx import Document
import random
import os


# Directory that receives every generated test file. The path is relative, so
# it is resolved against the current working directory at run time.
output_dir = "test_documents"
# exist_ok=True: re-running this cell does not fail if the directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Sample sentences (in Portuguese) used as the content of the generated files.
#
# sensitive_data: lines that read like patient-identifying or clinical details
# (patient name, birth date, visit number, diagnoses, prescriptions, history).
sensitive_data = [
    "Paciente diagnosticado com pneumonia severa e sepse.",
    "Nome do paciente: João Silva",
    "Número de atendimento: 123456",
    "Diagnóstico: Insuficiência cardíaca crônica",
    "Prescrição: Morfina 10mg, Fentanil 5mg",
    "História Clínica: Paciente com histórico de hipertensão e diabetes",
    "Data de nascimento: 12/05/1985",
]

# non_sensitive_data: general hospital/administrative statements that carry no
# patient-specific details (bed counts, equipment, stock, meetings, studies).
non_sensitive_data = [
    "O hospital possui 200 leitos disponíveis.",
    "Relatório de equipamentos: Ventiladores mecânicos em funcionamento.",
    "Estoque atualizado de medicamentos.",
    "Aviso: Reunião da equipe médica agendada para segunda-feira.",
    "Novo estudo publicado sobre eficiência de tratamentos para COVID-19.",
]

# Function to generate a single-page PDF containing the given text
def create_pdf(file_name, content):
    """Write *content* into a new one-page PDF inside ``output_dir``.

    Args:
        file_name: Name of the PDF file to create (joined onto ``output_dir``).
        content: Text placed on the page at point (50, 50). This function
            does no wrapping or pagination of its own.

    Returns:
        The path of the saved PDF, i.e. ``os.path.join(output_dir, file_name)``.
    """
    pdf_path = os.path.join(output_dir, file_name)
    doc = fitz.open()  # no arguments: start from an empty in-memory document
    try:
        page = doc.new_page()
        page.insert_text((50, 50), content)
        doc.save(pdf_path)
    finally:
        # Always release the document, even if inserting text or saving fails;
        # otherwise each call leaves an open document behind.
        doc.close()
    return pdf_path

# Function to generate a DOCX file holding the given text as one paragraph
def create_docx(file_name, content):
    """Save *content* as the only paragraph of a new DOCX in ``output_dir``.

    Args:
        file_name: Name of the DOCX file to create (joined onto ``output_dir``).
        content: Text added to the document as a single paragraph.

    Returns:
        The path of the saved document.
    """
    target = os.path.join(output_dir, file_name)
    document = Document()
    document.add_paragraph(content)
    document.save(target)
    return target

# Function to generate a CSV file with the given rows in a single "Text" column
def create_csv(file_name, data):
    """Write *data* as a one-column (``Text``) CSV inside ``output_dir``.

    Args:
        file_name: Name of the CSV file to create (joined onto ``output_dir``).
        data: Values for the ``Text`` column, one row per item.

    Returns:
        The path of the written CSV file.
    """
    destination = os.path.join(output_dir, file_name)
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame({"Text": data}).to_csv(destination, index=False, encoding="utf-8")
    return destination

# Generate the test files: five numbered sets, each with a PDF, a DOCX and a CSV
test_files = []
candidate_lines = sensitive_data + non_sensitive_data
for i in range(5):
    # One randomly chosen sentence is shared by all three files of this set.
    content = random.choice(candidate_lines)
    stem = f"test_file_{i+1}"
    test_files.append(create_pdf(f"{stem}.pdf", content))
    test_files.append(create_docx(f"{stem}.docx", content))
    test_files.append(create_csv(f"{stem}.csv", [content]))

# Final expression of the cell: displays the list of generated file paths
test_files


['test_documents\\test_file_1.pdf',
 'test_documents\\test_file_1.docx',
 'test_documents\\test_file_1.csv',
 'test_documents\\test_file_2.pdf',
 'test_documents\\test_file_2.docx',
 'test_documents\\test_file_2.csv',
 'test_documents\\test_file_3.pdf',
 'test_documents\\test_file_3.docx',
 'test_documents\\test_file_3.csv',
 'test_documents\\test_file_4.pdf',
 'test_documents\\test_file_4.docx',
 'test_documents\\test_file_4.csv',
 'test_documents\\test_file_5.pdf',
 'test_documents\\test_file_5.docx',
 'test_documents\\test_file_5.csv']