# 1. Abstract FileLoader Class and Concrete File Loaders

In [3]:
class FileLoader(ABC):
    """Abstract base for loaders that work on a single file path.

    Concrete subclasses must implement both ``validate_file`` and
    ``load_file``; the base class only stores the path.
    """

    def __init__(self, filepath):
        """Remember the path of the file this loader operates on."""
        self.filepath = filepath

    @abstractmethod
    def load_file(self):
        """Load the file at ``self.filepath`` and return the loaded object."""

    @abstractmethod
    def validate_file(self):
        """Report whether ``self.filepath`` is acceptable to this loader."""

In [4]:
# Concrete class for PDF
class PDFLoader(FileLoader):
    """Loader for ``.pdf`` files, backed by ``PdfReader``."""

    def validate_file(self):
        """Return True when the path has a ``.pdf`` extension (any letter case).

        Only the file name is inspected; the file is not opened here.
        """
        # str() lets pathlib.Path objects through; lower() accepts "REPORT.PDF".
        return str(self.filepath).lower().endswith('.pdf')

    def load_file(self):
        """Open the PDF and return the ``PdfReader`` built from it.

        Raises:
            ValueError: if ``validate_file()`` rejects the path.
        """
        if not self.validate_file():
            raise ValueError("Invalid PDF file.")
        return PdfReader(self.filepath)

In [5]:
class DOCXLoader(FileLoader):
    """Loader for ``.docx`` files, backed by ``Document``."""

    def validate_file(self):
        """Return True when the path has a ``.docx`` extension (any letter case).

        Only the file name is inspected; the file is not opened here.
        """
        # str() lets pathlib.Path objects through; lower() accepts "NOTES.DOCX".
        return str(self.filepath).lower().endswith('.docx')

    def load_file(self):
        """Open the document and return the ``Document`` built from it.

        Raises:
            ValueError: if ``validate_file()`` rejects the path.
        """
        if not self.validate_file():
            raise ValueError("Invalid DOCX file.")
        return Document(self.filepath)


In [6]:
# Concrete class for PPT
class PPTLoader(FileLoader):
    """Loader for ``.pptx`` files, backed by ``Presentation``."""

    def validate_file(self):
        """Return True when the path has a ``.pptx`` extension (any letter case).

        Only the file name is inspected; the file is not opened here.
        """
        # str() lets pathlib.Path objects through; lower() accepts "DECK.PPTX".
        return str(self.filepath).lower().endswith('.pptx')

    def load_file(self):
        """Open the presentation and return the ``Presentation`` built from it.

        Raises:
            ValueError: if ``validate_file()`` rejects the path.
        """
        if not self.validate_file():
            raise ValueError("Invalid PPTX file.")
        return Presentation(self.filepath)

In [7]:
# Concrete class for PDF
class PDFLoader(FileLoader):
    """Loader for ``.pdf`` files, backed by ``PdfReader``.

    NOTE: ``PDFLoader`` is also declared in an earlier cell of this file; when
    the cells run in order, this later definition is the one that stays bound.
    """

    def validate_file(self):
        """Return True when the path has a ``.pdf`` extension (any letter case).

        Only the file name is inspected; the file is not opened here.
        """
        # str() lets pathlib.Path objects through; lower() accepts "REPORT.PDF".
        return str(self.filepath).lower().endswith('.pdf')

    def load_file(self):
        """Open the PDF and return the ``PdfReader`` built from it.

        Raises:
            ValueError: if ``validate_file()`` rejects the path.
        """
        if not self.validate_file():
            raise ValueError("Invalid PDF file.")
        return PdfReader(self.filepath)

# Data Extractor Class

In [8]:
class DataExtractor:
    """Pull text, links, images and tables out of a loaded document.

    The document is loaded eagerly in ``__init__`` through
    ``file_loader.load_file()``. The concrete loader type (``PDFLoader``,
    ``DOCXLoader`` or ``PPTLoader``) selects the strategy each method uses.
    """

    def __init__(self, file_loader: "FileLoader"):
        """Store *file_loader* and immediately load the document it points to.

        Raises:
            ValueError: propagated from ``load_file()`` when the loader
                rejects the file.
        """
        self.file_loader = file_loader
        self.file_data = self.file_loader.load_file()

    @staticmethod
    def _resolve_output_dir(output_dir):
        """Return *output_dir*, or the module-level ``output_dir`` when it is None."""
        if output_dir is not None:
            return output_dir
        # The parameter shadows the module-level name, so the global has to be
        # looked up explicitly to keep the old zero-argument calls working.
        try:
            return globals()["output_dir"]
        except KeyError:
            raise ValueError(
                "No output directory given and no global 'output_dir' is defined."
            ) from None

    def extract_text(self):
        """Return the document's text as one string.

        PDF pages are concatenated directly, DOCX paragraphs are joined with
        newlines, and every PPTX shape exposing ``text`` contributes one line.
        An unrecognised loader type yields ``''`` so callers can always write
        the result to a file.
        """
        if isinstance(self.file_loader, PDFLoader):
            # A page without extractable text may give None/''; treat both as ''.
            return ''.join(page.extract_text() or '' for page in self.file_data.pages)

        if isinstance(self.file_loader, DOCXLoader):
            return '\n'.join(para.text for para in self.file_data.paragraphs)

        if isinstance(self.file_loader, PPTLoader):
            lines = []
            for slide in self.file_data.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        lines.append(shape.text + '\n')
            return ''.join(lines)

        return ''

    def extract_links(self):
        """Return a placeholder message; hyperlink extraction is not implemented yet."""
        return "No links found yet."

    def extract_images(self, output_dir=None):
        """Save the images embedded in a PPTX deck and return how many were written.

        Args:
            output_dir: Directory to write ``image_<n>.<ext>`` files into.
                When omitted, the module-level ``output_dir`` is used.

        Returns:
            Number of image files written; 0 for DOCX and PDF, which are not
            handled yet.
        """
        if not isinstance(self.file_loader, PPTLoader):
            # TODO: image extraction for DOCX and PDF is not implemented.
            return 0

        output_dir = self._resolve_output_dir(output_dir)
        image_counter = 0
        for slide in self.file_data.slides:
            for shape in slide.shapes:
                try:
                    image = shape.image
                except (AttributeError, ValueError):
                    # AttributeError: the shape is not a picture.
                    # ValueError: presumably a linked picture with no embedded
                    # image data -- verify against the python-pptx docs.
                    continue
                # python-pptx Image objects expose raw bytes (`blob`) and the
                # real extension (`ext`); they have no save() method.
                image_path = os.path.join(output_dir, f'image_{image_counter}.{image.ext}')
                with open(image_path, 'wb') as image_file:
                    image_file.write(image.blob)
                image_counter += 1
        return image_counter

    def extract_tables(self, output_dir=None):
        """Write each DOCX table to its own CSV file and return the table count.

        Args:
            output_dir: Directory to write ``table_<n>.csv`` files into.
                When omitted, the module-level ``output_dir`` is used.

        Returns:
            Number of tables in the document; 0 for PDF and PPTX, which are
            not handled yet.
        """
        if not isinstance(self.file_loader, DOCXLoader):
            return 0

        output_dir = self._resolve_output_dir(output_dir)
        tables = self.file_data.tables
        for index, table in enumerate(tables):
            rows = [[cell.text for cell in row.cells] for row in table.rows]
            # One file per table; a single shared name would keep only the last one.
            csv_path = os.path.join(output_dir, f'table_{index}.csv')
            pd.DataFrame(rows).to_csv(csv_path, index=False)
        return len(tables)


In [9]:
# Directory that receives uploaded files and every extracted artefact.
# "~" is expanded explicitly because open() and os.makedirs() treat it as a
# literal directory name; a shell-prompt prefix ("user@host:") is not a path.
output_dir = os.path.expanduser('~/Desktop/Assignment_Python/Output')


In [10]:
# Ensure output directory exists. exist_ok=True makes this a single call with
# no window between "check" and "create" in which another process could
# create the directory and cause a spurious failure.
os.makedirs(output_dir, exist_ok=True)


# Abstract Storage Class and Concrete Storages

In [11]:
class FileStorage:
    """Persist everything a data extractor yields into a directory on disk."""

    def __init__(self, data_extractor: "DataExtractor"):
        """Keep a reference to the extractor whose results will be saved."""
        self.data_extractor = data_extractor

    def save(self, output_dir):
        """Write the extracted text, links, images and tables under *output_dir*.

        Text goes to ``extracted_text.txt`` and links to ``extracted_links.txt``,
        both encoded as UTF-8. Images and tables are written by the extractor,
        which is expected to accept the destination directory as its single
        positional argument and to return the number of items it saved.

        Args:
            output_dir: Existing directory that receives all output files.
        """
        # Extract first so a failing extraction does not leave an empty file
        # behind, and tolerate an extractor that has no text to offer.
        text = self.data_extractor.extract_text() or ''
        # Document text is frequently non-ASCII; the platform default encoding
        # cannot always represent it, so pin UTF-8.
        with open(os.path.join(output_dir, 'extracted_text.txt'), 'w', encoding='utf-8') as f:
            f.write(text)

        # Saving extracted links
        links = self.data_extractor.extract_links() or ''
        with open(os.path.join(output_dir, 'extracted_links.txt'), 'w', encoding='utf-8') as f:
            f.write(links)

        # Saving extracted images
        num_images = self.data_extractor.extract_images(output_dir)
        print(f"Saved {num_images} images.")

        # Saving extracted tables
        num_tables = self.data_extractor.extract_tables(output_dir)
        print(f"Saved {num_tables} tables as CSV.")

In [12]:
# Single-file upload widget restricted to the three supported document types.
uploader = widgets.FileUpload(
    multiple=False,
    accept='.pdf,.docx,.pptx',
)

TypeError: tuple indices must be integers or slices, not Bunch

In [13]:
def handle_upload(change):
    """Save each uploaded file and run the extraction pipeline on it.

    Intended as the observer callback for the ``value`` trait of ``uploader``.
    Every uploaded file is written into ``output_dir``, matched to a loader by
    that loader's own ``validate_file()``, and its extracted content is stored
    in ``output_dir`` as well. Unsupported files are reported and skipped.

    Args:
        change: Change notification passed by the widget; unused, the current
            uploads are read from ``uploader.value``.
    """
    uploads = uploader.value
    if isinstance(uploads, dict):
        # Older ipywidgets (7.x) presumably expose {filename: {'content': ...}}
        # -- verify against the installed version.
        entries = [(name, info['content']) for name, info in uploads.items()]
    else:
        # Newer ipywidgets expose a tuple of dict-like records; iterate the
        # records themselves instead of using them as indices into the tuple.
        entries = [(info['name'], info['content']) for info in uploads]

    for name, content in entries:
        # The name is supplied by the browser: keep only its last component so
        # it cannot point outside output_dir.
        filepath = os.path.join(output_dir, os.path.basename(name))

        # Save uploaded file to output directory
        with open(filepath, 'wb') as f:
            f.write(content)

        # Let each loader decide whether it accepts the file, so the rule for
        # "is this a PDF/DOCX/PPTX" lives in exactly one place.
        loader = None
        for loader_cls in (PDFLoader, DOCXLoader, PPTLoader):
            candidate = loader_cls(filepath)
            if candidate.validate_file():
                loader = candidate
                break
        if loader is None:
            print("Unsupported file type.")
            continue  # keep processing any remaining uploads

        # Extract data
        extractor = DataExtractor(loader)
        storage = FileStorage(extractor)
        storage.save(output_dir)
        print(f"File processing completed. Outputs saved in: {output_dir}")


In [14]:
# Register handle_upload as the observer for changes to the widget's 'value' trait.
uploader.observe(handle_upload, names='value')

In [15]:
# Display the uploader widget in the cell output so a file can be chosen
display(uploader)

FileUpload(value=(), accept='.pdf,.docx,.pptx', description='Upload')