In [12]:
import os
import fitz  # PyMuPDF for PDF extraction
import pdfplumber
import camelot
import docx
import pptx
from abc import ABC, abstractmethod
import ipywidgets as widgets
from IPython.display import display
import csv
from PyPDF2 import PdfReader

# Abstract Class for FileLoader
class FileLoader(ABC):
    def __init__(self, file_path):
        self.file_path = file_path
    
    @abstractmethod
    def validate_file(self):
        pass

    @abstractmethod
    def load_file(self):
        pass

# Concrete PDFLoader class
class PDFLoader(FileLoader):
    def __init__(self, file_path):
        super().__init__(file_path)
        self.doc = None

    def validate_file(self):
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File {self.file_path} not found.")
        if not self.file_path.endswith('.pdf'):
            raise ValueError("Invalid file format. Expected a PDF file.")
        return True

    def load_file(self):
        from PyPDF2 import PdfReader
        self.doc = fitz.open(self.file_path)
        print(f"PDF {self.file_path} successfully loaded.")
        return self.doc

    def extract_text(self):
        text = ""
        for page in self.doc.pages:
            text += page.extract_text()
        return text

    def extract_images(self, output_dir):
        for page_num in range(self.doc.page_count):
            page = self.doc.load_page(page_num)
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = self.doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = os.path.join(output_dir, f"image_page{page_num+1}_{img_index}.{image_ext}")
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"Image saved: {image_filename}")

    def extract_links(self):
        links = []
        for page_num, page in enumerate(self.doc.pages):
            if '/Annots' in page:
                annotations = page['/Annots']
                for annotation in annotations:
                    uri = annotation.getObject().get('/A').get('/URI')
                    if uri:
                        links.append({
                            'page': page_num + 1,
                            'uri': uri
                        })
        return links
    
    def extract_tables(self, output_dir):
        tables = camelot.read_pdf(self.file_path, pages="all")
        for i, table in enumerate(tables):
            table_filename = os.path.join(output_dir, f"table_{i+1}.csv")
            table.to_csv(table_filename) 
            print(f"Table saved: {table_filename}")

# Concrete DOCXLoader class
class DOCXLoader(FileLoader):
    def __init__(self, file_path):
        super().__init__(file_path)

    def validate_file(self):
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File {self.file_path} not found.")
        if not self.file_path.endswith('.docx'):
            raise ValueError("Invalid file format. Expected a DOCX file.")
        return True

    def load_file(self):
        self.doc = docx.Document(self.file_path)
        print(f"DOCX {self.file_path} successfully loaded.")
        return self.doc

    def extract_text(self):
        return "\n".join([para.text for para in self.doc.paragraphs])

    def extract_tables(self, output_dir):
        for i, table in enumerate(self.doc.tables):
            table_filename = os.path.join(output_dir, f"table_{i+1}.csv")
            with open(table_filename, "w", newline="") as f:
                writer = csv.writer(f)
                for row in table.rows:
                    writer.writerow([cell.text for cell in row.cells])
            print(f"Table saved: {table_filename}")

# Concrete PPTLoader class
class PPTLoader(FileLoader):
    def __init__(self, file_path):
        super().__init__(file_path)

    def validate_file(self):
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File {self.file_path} not found.")
        if not self.file_path.endswith('.pptx'):
            raise ValueError("Invalid file format. Expected a PPTX file.")
        return True

    def load_file(self):
        from PyPDF2 import PdfReader
        self.doc = pptx.Presentation(self.file_path)
        print(f"PPT {self.file_path} successfully loaded.")
        return self.doc

    def extract_text(self):
        text = ""
        # Use the correct method to access the pages
        for page_num in range(len(self.doc.pages)):  # Use range to iterate through page numbers
            page = self.doc.pages[page_num]  # Access each page
            text += page.extract_text() or ""  # Extract text from the page
        return text

# DataExtractor Class
class DataExtractor:
    def __init__(self, loader: FileLoader):
        self.loader = loader
        self.loader.validate_file()
        self.loader.load_file()

    def extract_text(self):
        return self.loader.extract_text()

    def extract_images(self, output_dir):
        if isinstance(self.loader, PDFLoader):
            return self.loader.extract_images(output_dir)
        else:
            print("Image extraction is only supported for PDF files.")

    def extract_tables(self, output_dir):
        return self.loader.extract_tables(output_dir)

    def extract_links(self):
        if isinstance(self.loader, PDFLoader):
            return self.loader.extract_links()
        else:
            print("Link extraction is only supported for PDF files.")

# File Upload Widget for Jupyter Notebook
def process_file(change):
    output_dir = "Outputs"
    os.makedirs(output_dir, exist_ok=True)

    # Access the uploaded file
    uploaded_file = uploader.value[0]
    uploaded_filename = uploaded_file['name']
    file_content = uploaded_file['content']
    
    # Save the uploaded file to the output directory
    filepath = os.path.join(output_dir, uploaded_filename)
    with open(filepath, 'wb') as f:
        f.write(file_content)
    
    # Determine the file type and process accordingly
    if filepath.endswith('.pdf'):
        loader = PDFLoader(filepath)
    elif filepath.endswith('.docx'):
        loader = DOCXLoader(filepath)
    elif filepath.endswith('.pptx'):
        loader = PPTLoader(filepath)
    else:
        print("Unsupported file type")
        return
    
    extractor = DataExtractor(loader)

    # Extract text
    text = extractor.extract_text()
    text_filename = os.path.join(output_dir, "extracted_text.txt")
    with open(text_filename, "w") as text_file:
        text_file.write(text)
    print(f"Text extracted and saved to {text_filename}")

    # Extract images (only for PDFs)
    extractor.extract_images(output_dir)

    # Extract tables
    extractor.extract_tables(output_dir)

    # Extract links (only for PDFs)
    if filepath.endswith('.pdf'):
        links = extractor.extract_links()
        links_filename = os.path.join(output_dir, "extracted_links.txt")
        with open(links_filename, "w") as links_file:
            for link in links:
                links_file.write(f"Page: {link['page']}, URI: {link['uri']}\n")
        print(f"Links extracted and saved to {links_filename}")

# File Upload Widget in Jupyter
uploader = widgets.FileUpload(accept='.pdf,.docx,.pptx', multiple=False)
uploader.observe(process_file, names='value')
display(uploader)

FileUpload(value=(), accept='.pdf,.docx,.pptx', description='Upload')

PDF Outputs/sample.pdf successfully loaded.


TypeError: 'method' object is not iterable