#### Search for Text in Word Documents

In [21]:
def search_word_files(directory, keyword):
    import os
    files_found = []
    from docx import Document
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".docx") or file.endswith(".doc"):
                filepath = os.path.join(root, file)
                document = Document(filepath)
                for paragraph in document.paragraphs:
                    if keyword in paragraph.text:
                        files_found.append(filepath)
    return files_found

# Example usage
directory = "C:/Local/"
keyword = "kdeplot"
search_word_files(directory,keyword)

['C:/Local/test.docx']

#### Search for Text in Jupyter Notebooks

In [23]:
def search_ipynb_files(directory,keyword):
    import glob
    # set filepath to search
    path = directory + '**/*.ipynb'
    files_found = []
    # looping through all the filenames returned
    # set recursive = True to look in sub-directories too
    for filename in glob.iglob(path, recursive=True):
        # adding error handling just in case!
        try:
            with open(filename) as f:
                # read the file as a string
                contents = f.read()
                # if the search term is found append to the list of files
                if(keyword in contents):
                    files_found.append(filename)
        except:
            pass
    return files_found
   
directory = "C:/Local/"
keyword = "kdeplot"
search_ipynb_files(directory,keyword)

['C:/Local\\PythonDataBasics_Univariate_Visualization.ipynb',
 'C:/Local\\1-Day_SpatialDataAnalytics\\Workflows\\bootstrap.ipynb',
 'C:/Local\\1-Day_SpatialDataAnalytics\\Workflows\\bootstrap_demo.ipynb',
 'C:/Local\\1-Day_SpatialDataAnalytics\\Workflows\\model_checking.ipynb']

#### Search for Text in PowerPoint Slide Decks

In [26]:
import os
from pptx import Presentation

def search_ppt_files(directory, keyword):
    files_found = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pptx") or file.endswith(".ppt"):
                found = False
                filepath = os.path.join(root, file)
                presentation = Presentation(filepath)
                for slide in presentation.slides:
                    for shape in slide.shapes:
                        if shape.has_text_frame:
                            for paragraph in shape.text_frame.paragraphs:
                                for run in paragraph.runs:
                                    if keyword in run.text:
                                        files_found.append(filepath)
    return files_found
                                        
# Example usage
directory = "C:/Local/"
keyword = "decluster"
search_ppt_files(directory, keyword)

['C:/Local/1-Day_SpatialDataAnalytics\\DataPrep.pptx',
 'C:/Local/1-Day_SpatialDataAnalytics\\DataPrep.pptx',
 'C:/Local/1-Day_SpatialDataAnalytics\\DataPrep.pptx',
 'C:/Local/1Day_ExecutiveCourse\\02_Feature_Engineering.pptx',
 'C:/Local/1Day_ExecutiveCourse\\02_Feature_Engineering.pptx',
 'C:/Local/1Day_ExecutiveCourse\\OriginalPPTX\\02_Inference.pptx',
 'C:/Local/1Day_ExecutiveCourse\\OriginalPPTX\\02_Inference.pptx']

#### Search for Text in Excel Spreadsheets

In [35]:
def search_excel_files(directory, keyword):
    files_found = []
    import os
    from openpyxl import load_workbook
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".xlsx") or file.endswith(".xls"):
                filepath = os.path.join(root, file)
                workbook = load_workbook(filepath)
                for sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    for row in sheet.iter_rows(values_only=True):
                        for cell in row:
                            if isinstance(cell, str) and keyword in cell:
                                files_found.append(filepath)
    return files_found

# Example usage
directory = "C:/Local/"
keyword = "kdeplot"
search_excel_files(directory,keyword)

['C:/Local/test.xlsx']

#### Search for Text in PDF files

In [38]:
def search_pdf_files(directory, keyword):
    files_found = []
    import os
    import PyPDF2
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                filepath = os.path.join(root, file)
                with open(filepath, "rb") as pdf_file:
                    reader = PyPDF2.PdfReader(pdf_file)
                    for page in reader.pages:
                        text = page.extract_text()
                        if keyword in text:
                            files_found.append(filepath)
    return files_found

# Example usage
directory = "C:/Local/"
keyword = "kdeplot"
search_pdf_files(directory,keyword)

['C:/Local/test.pdf']