## Contents

0. Set up 
1. Parser

Before running the parser, set `folder_path` to the folder you want to inspect and `keywords` to the keyword(s) you are looking for; both variables are defined at the top of the Parser cell (section 1).

### 0. Set up

In [None]:
pip install python-docx openpyxl PyMuPDF

In [None]:
import os
import glob
import docx
import openpyxl
import fitz  
from concurrent.futures import ThreadPoolExecutor

### 1. Parser

In [None]:
# Search configuration: edit both values before running this cell.
# folder_path is the root folder to scan; its subfolders are walked as well.
folder_path = r''  # Replace with your folder path (raw string, so backslashes need no escaping)
# keywords are tested as case-sensitive substrings of each line/paragraph/cell.
# An empty string is a substring of any text, so replace the placeholders below.
keywords = ['', ''] # Insert keywords 

# Function to search for keywords in PDF files using PyMuPDF
def search_pdf(file_path, keywords):
    """Return the text lines of a PDF that contain at least one keyword.

    Args:
        file_path: Path of the PDF file to open.
        keywords: Iterable of strings; each is tested as a case-sensitive
            substring of every extracted line.

    Returns:
        A list of matching lines in page order. Each line appears once, even
        when it contains several keywords. An empty list is returned when
        nothing matches or when the file cannot be processed (the error is
        printed, not raised).
    """
    try:
        found_sentences = []
        # The context manager closes the document, releasing its file handle,
        # even if extracting text from one of the pages fails.
        with fitz.open(file_path) as pdf_document:
            for page in pdf_document:
                for line in page.get_text().split('\n'):
                    # any() records the line once, however many keywords it holds
                    if any(keyword in line for keyword in keywords):
                        found_sentences.append(line)
        return found_sentences
    except Exception as e:
        # Best-effort scan: report the failure and let the caller continue
        print(f"Error processing PDF file {file_path}: {e}")
        return []

# Function to search for keywords in Word files using python-docx
def search_word(file_path, keywords):
    """Return the paragraphs of a Word document that contain a keyword.

    Only the paragraphs exposed by ``Document.paragraphs`` are inspected.

    Args:
        file_path: Path of the .docx file to open.
        keywords: Iterable of strings; each is tested as a case-sensitive
            substring of the paragraph text.

    Returns:
        A list of matching paragraph texts in document order. Each paragraph
        appears once, even when it contains several keywords. An empty list
        is returned when nothing matches or when the file cannot be processed
        (the error is printed, not raised).
    """
    try:
        doc = docx.Document(file_path)
        found_sentences = []
        for paragraph in doc.paragraphs:
            text = paragraph.text
            # any() records the paragraph once, however many keywords it holds
            if any(keyword in text for keyword in keywords):
                found_sentences.append(text)
        return found_sentences
    except Exception as e:
        # Best-effort scan: report the failure and let the caller continue
        print(f"Error processing Word file {file_path}: {e}")
        return []

# Function to search for keywords in Excel files
def search_excel(file_path, keywords):
    """Return the cell values of an Excel workbook that contain a keyword.

    Every worksheet is scanned row by row. Cell values are converted with
    ``str()`` before matching, so numbers and dates are searched by their
    string representation.

    Args:
        file_path: Path of the .xlsx file to open.
        keywords: Iterable of strings; each is tested as a case-sensitive
            substring of the stringified cell value.

    Returns:
        A list of matching cell values (as strings). Each cell appears once,
        even when it contains several keywords. An empty list is returned
        when nothing matches or when the file cannot be processed (the error
        is printed, not raised).
    """
    try:
        wb = openpyxl.load_workbook(file_path, read_only=True)
        try:
            found_sentences = []
            for sheet in wb:
                for row in sheet.iter_rows():
                    for cell in row:
                        value = cell.value
                        # Empty cells would stringify to 'None' and falsely
                        # match keywords such as "on" or "None".
                        if value is None:
                            continue
                        text = str(value)
                        # any() records the cell once, however many keywords it holds
                        if any(keyword in text for keyword in keywords):
                            found_sentences.append(text)
            return found_sentences
        finally:
            # Read-only workbooks keep the file open until closed explicitly
            wb.close()
    except Exception as e:
        # Best-effort scan: report the failure and let the caller continue
        print(f"Error processing Excel file {file_path}: {e}")
        return []

# Search for keywords in files within the specified folder and its subfolders
found_files = []  # (file path, matching lines) for every file with at least one hit
problematic_files = []  # To store the paths of problematic files

# Glob pattern -> function that knows how to read that file type
searchers = {
    '*.pdf': search_pdf,
    '*.docx': search_word,
    '*.xlsx': search_excel,
}

# NOTE: each future is awaited right after it is submitted, so files are
# processed one at a time even though a thread pool is used.
# NOTE(review): PyMuPDF's documentation advises against multithreaded use;
# confirm that before changing this loop to run searches concurrently.
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system
    for root, _, _ in os.walk(folder_path):
        # Escape the directory part so folder names containing glob
        # metacharacters ([, ], *, ?) are matched literally instead of
        # being interpreted as a pattern and silently skipped.
        escaped_root = glob.escape(root)
        for file_extension, searcher in searchers.items():
            for file in glob.glob(os.path.join(escaped_root, file_extension)):
                try:
                    # Check if the file exists before attempting to process it
                    if os.path.exists(file):
                        future = executor.submit(searcher, file, keywords)
                        sentences = future.result()
                        if sentences:
                            found_files.append((file, sentences))
                    else:
                        print(f"File not found: {file}")
                except Exception as e:
                    print(f"Error processing file {file}: {e}")
                    problematic_files.append(file)

# Report the files whose processing raised an error, one path per line
if problematic_files:
    print("Problematic files:")
    print(*problematic_files, sep="\n")

# Report every file with hits, followed by the matching text it contained
if not found_files:
    print("No files containing keywords found in the specified folder.")
else:
    print("Files containing keywords:")
    for matched_path, matched_lines in found_files:
        print(f"File: {matched_path}")
        print("Sentences:")
        for matched_line in matched_lines:
            print(matched_line)
        # Blank line separates one file's results from the next
        print()
