In [7]:
import os
import PyPDF2  
import docx    
import json

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return its text, one whitespace-normalised line per page.

    Pages for which no text is extracted are skipped. Any exception is
    reported on stdout instead of being raised; the text gathered before the
    failure (an empty string if nothing was read) is still returned.
    """
    page_lines = []
    try:
        with open(pdf_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                raw_text = page.extract_text()
                if not raw_text:
                    continue
                # Collapse every whitespace run (newlines included) into one space.
                page_lines.append(" ".join(raw_text.split()) + "\n")
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
    return "".join(page_lines)

def extract_text_from_word(doc_path):
    """
    Return the text of a Word document, with paragraphs joined by newlines.

    Any exception raised while opening or reading the document is printed
    rather than propagated, and an empty string is returned in that case.
    """
    try:
        paragraphs = docx.Document(doc_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"Error processing {doc_path}: {e}")
        return ""

def _build_content_item(filename, text):
    """Wrap one document's text in the record layout shared by every item."""
    return {
        "video_id": filename,       # Use the filename as an ID
        "frame": "full_text",       # Use a placeholder since it's a full document
        "timestamp": "0:00",        # No timestamp for a full document
        "timestamp_seconds": 0,     # Default value
        "content": text,
    }


def process_documents(folder_path):
    """
    Process all PDF and DOCX files in the folder and convert them into a list of
    content items (dictionaries) in the format expected by the RAG pipeline.

    Args:
        folder_path: Directory whose direct children are scanned. Sub-folders
            are not descended into, and any entry that is not a regular file
            is skipped.

    Returns:
        A list with one dictionary per document that yielded non-blank text,
        ordered by filename. Each dictionary has the keys "video_id" (the
        filename), "frame", "timestamp", "timestamp_seconds" and "content".

    Raises:
        OSError: If folder_path cannot be listed (propagated from os.listdir).
    """
    # Filename suffix -> (label shown in the progress message, extractor).
    handlers = {
        ".pdf": ("PDF", extract_text_from_pdf),
        ".docx": ("Word Document", extract_text_from_word),
    }
    extracted_items = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # generated list (and the JSON written from it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        lowered = filename.lower()
        handler = next(
            (entry for suffix, entry in handlers.items() if lowered.endswith(suffix)),
            None,
        )
        if handler is None:
            continue

        file_path = os.path.join(folder_path, filename)
        # A directory that merely ends in ".pdf"/".docx" is not a document.
        if not os.path.isfile(file_path):
            continue

        label, extractor = handler
        print(f"Processing {label}: {filename}")
        document_text = extractor(file_path)
        # Documents with no usable text (or that failed to parse) are left out.
        if document_text.strip():
            extracted_items.append(_build_content_item(filename, document_text))

    return extracted_items

def main():
    """
    Interactively convert a folder of PDF/DOCX files into one JSON file.

    Prompts for an input folder and an output folder, creates the output
    folder when it is missing, runs process_documents on the input folder and
    writes the resulting list to "extracted_text.json" inside the output
    folder. Problems with either path are printed and the function returns
    early instead of raising.

    Returns:
        None.
    """
    # Prompt for input and output folders
    input_folder = input("Enter the path to the folder containing PDF/DOCX files: ").strip()
    output_folder = input("Enter the path to save the extracted text JSON file: ").strip()

    # Validate paths
    if not os.path.exists(input_folder):
        print(f"Error: The input folder '{input_folder}' does not exist.")
        return
    # A path to a regular file passes the existence check but cannot be
    # listed, so reject it here rather than crash inside process_documents.
    if not os.path.isdir(input_folder):
        print(f"Error: '{input_folder}' is not a folder.")
        return

    # isdir (not exists): if a file already sits at this path, makedirs fails
    # and is reported below, instead of the later open() call crashing.
    if not os.path.isdir(output_folder):
        try:
            os.makedirs(output_folder)
            print(f"Created output folder: {output_folder}")
        except Exception as e:
            print(f"Error creating output folder: {e}")
            return

    # Process documents and get a list of content items in the desired format
    content_items = process_documents(input_folder)
    print(f"Extracted {len(content_items)} content items from all files.")

    # Save the output JSON in the desired format
    output_file = os.path.join(output_folder, "extracted_text.json")
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(content_items, f, indent=4, ensure_ascii=False)

    print(f"Extracted text saved to: {output_file}")

# Entry-point guard: calls main() only when this code executes as the
# top-level module (a script run, or a notebook cell, where __name__ is
# "__main__"); importing the definitions from elsewhere does not prompt.
if __name__ == "__main__":
    main()

Enter the path to the folder containing PDF/DOCX files:  /Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/pdf files
Enter the path to save the extracted text JSON file:  /Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/Data to be considered


Processing PDF: AML Quiz 1 Study Guide.pdf
Extracted 1 content items from all files.
Extracted text saved to: /Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/Data to be considered/extracted_text.json
