# In [8]:
import os
import fitz  # PyMuPDF for PDFs
import csv
from PIL import Image
from docx import Document  # For DOCX files
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import pptx
from abc import ABC, abstractmethod
import csv
from PyPDF2 import PdfReader

# Output directory path: every extracted artifact (text, links, images,
# tables) is written below this directory.
output_dir = '/home/shtlp_0126/Desktop/Assignment_Python/Output2'
# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Path to the sample file
file_path = '/home/shtlp_0126/Downloads/sample.pdf'  # Update this for docx or pdf

class PDFLoader:
    """Extract text, links, images and table-like pages from a PDF via PyMuPDF.

    The document is opened by ``load_file()``; every ``extract_*`` method
    opens it on demand if that has not happened yet.  Call ``close()`` (or use
    the loader as a context manager) to release the underlying file handle.
    """

    def __init__(self, file_path):
        """Remember *file_path*; the PDF itself is not opened yet."""
        self.file_path = file_path
        self.doc = None

    def __enter__(self):
        """Open the document (if needed) and return the loader itself."""
        self._require_doc()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the document; exceptions from the ``with`` body propagate."""
        self.close()
        return False

    def load_file(self):
        """Open the PDF with PyMuPDF, store it on ``self.doc`` and return it."""
        self.doc = fitz.open(self.file_path)
        print(f"PDF {self.file_path} successfully loaded.")
        return self.doc

    def close(self):
        """Close the open document, if any.  Safe to call more than once."""
        if self.doc is not None:
            self.doc.close()
            self.doc = None

    def _require_doc(self):
        """Return the open document, loading it first when necessary."""
        if self.doc is None:
            self.load_file()
        return self.doc

    def _pages(self):
        """Yield ``(page_number, page)`` pairs with 1-based page numbers."""
        return enumerate(self._require_doc(), start=1)

    def extract_text(self):
        """Return the plain text of all pages concatenated in page order."""
        # join() avoids the repeated string copies of += in a loop.
        return "".join(page.get_text("text") for _, page in self._pages())

    def extract_links(self):
        """Return ``{"page": n, "uri": url}`` dicts for every link with a URI.

        Link entries without a ``"uri"`` value (presumably internal jumps
        within the document) carry no URL and are skipped.
        """
        links = []
        for page_number, page in self._pages():
            for link in page.get_links():
                uri = link.get("uri")
                if uri:
                    links.append({"page": page_number, "uri": uri})
        return links

    def extract_images(self, output_directory=None):
        """Write every image referenced by each page to disk.

        Args:
            output_directory: Directory to write into.  Defaults to the
                module-level ``output_dir``.  It is created if missing.

        Returns:
            The list of written file paths, named
            ``image_page_<page>_<index>.<ext>``.
        """
        target_dir = output_dir if output_directory is None else output_directory
        os.makedirs(target_dir, exist_ok=True)

        doc = self._require_doc()
        images = []
        for page_number, page in self._pages():
            for image_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                img_path = os.path.join(
                    target_dir,
                    f"image_page_{page_number}_{image_index}.{base_image['ext']}",
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
                images.append(img_path)
        return images

    def extract_tables(self):
        """Return pages whose text mentions the word "table".

        This is only a heuristic: no table structure is parsed.  Each result
        is ``{"page": n, "content": full_page_text}``.
        """
        tables = []
        for page_number, page in self._pages():
            text = page.get_text("text")
            if "table" in text.lower():
                tables.append({"page": page_number, "content": text})
        return tables


class DOCXLoader:
    """Extract text, hyperlinks, images and tables from a DOCX file.

    The document is opened with python-docx as soon as the loader is created.
    Links and images are discovered through the relationships of the main
    document part.
    """

    def __init__(self, file_path):
        """Open the DOCX document at *file_path*."""
        self.file_path = file_path
        self.doc = Document(file_path)

    def extract_text(self):
        """Return the text of all body paragraphs joined by newlines."""
        return "\n".join(para.text for para in self.doc.paragraphs)

    def extract_links(self):
        """Return the target of every hyperlink relationship.

        The relationship *type* identifies a hyperlink; the target URL itself
        rarely contains the word "hyperlink", so it must not be used as the
        filter.
        """
        return [
            rel.target_ref
            for rel in self.doc.part.rels.values()
            if rel.reltype.endswith("/hyperlink")
        ]

    def extract_images(self, output_directory=None):
        """Write every embedded image to disk and return the written paths.

        Args:
            output_directory: Directory to write into.  Defaults to the
                module-level ``output_dir``.  It is created if missing.

        Returns:
            The list of written file paths; each file keeps the base name it
            has inside the DOCX package (e.g. ``image1.png``).
        """
        target_dir = output_dir if output_directory is None else output_directory
        os.makedirs(target_dir, exist_ok=True)

        images = []
        for rel in self.doc.part.rels.values():
            if not rel.reltype.endswith("/image"):
                continue
            if rel.is_external:
                # Linked (not embedded) picture: the package holds no bytes
                # for it, so there is nothing to write.
                continue
            img_path = os.path.join(target_dir, os.path.basename(rel.target_ref))
            with open(img_path, "wb") as img_file:
                img_file.write(rel.target_part.blob)
            images.append(img_path)
        return images

    def extract_tables(self):
        """Return every table as a list of rows, each row a list of cell texts."""
        return [
            [[cell.text for cell in row.cells] for row in table.rows]
            for table in self.doc.tables
        ]


def save_text_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing file."""
    sink = open(file_path, mode="w", encoding="utf-8")
    with sink:
        sink.write(text)

def save_links_to_file(links, file_path):
    """Write *links* to *file_path* as a one-column CSV with a ``URL`` header.

    Args:
        links: Iterable of links.  Each item is either the URL itself or a
            dict carrying the URL under the ``"uri"`` key (the shape produced
            by ``PDFLoader.extract_links``).  Dict items without a URI are
            skipped instead of being written as an empty row.
        file_path: Destination CSV path; an existing file is overwritten.
    """
    # newline="" lets the csv module control line endings; without it rows
    # are separated by blank lines on platforms that translate "\n".
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        for link in links:
            if isinstance(link, dict):
                # Write the URL, not the repr of the whole dict.
                link = link.get("uri")
                if link is None:
                    continue
            writer.writerow([link])

def save_tables_to_csv(tables, directory=None):
    """Write each table to its own ``table_<n>.csv`` file (n starts at 1).

    Args:
        tables: Iterable of tables.  A table is either a sequence of rows
            (each row a sequence of cell values) or a dict, which is written
            as a header row of its keys followed by one row of its values.
        directory: Directory to write into.  Defaults to the module-level
            ``output_dir``.
    """
    target_dir = output_dir if directory is None else directory
    for index, table in enumerate(tables, start=1):
        csv_path = os.path.join(target_dir, f"table_{index}.csv")
        # newline="" lets the csv module control line endings.
        with open(csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            if isinstance(table, dict):
                # writerows() on a dict would iterate its keys and split each
                # key string into one-character cells.
                writer.writerow(list(table.keys()))
                writer.writerow(list(table.values()))
            else:
                writer.writerows(table)

# Determine if file is PDF or DOCX and process accordingly.
# The extension is compared case-insensitively so e.g. "REPORT.PDF" is
# handled as well.
extension = os.path.splitext(file_path)[1].lower()

loader = None
if extension == '.pdf':
    # Processing the PDF
    pdf_loader = PDFLoader(file_path)
    pdf_loader.load_file()
    loader = pdf_loader
elif extension == '.docx':
    # Processing the DOCX
    docx_loader = DOCXLoader(file_path)
    loader = docx_loader

if loader is None:
    # Do not claim success when nothing was processed.
    print(f"Unsupported file type: {file_path}")
else:
    # Both loaders expose the same extract_* methods, so one pipeline
    # serves either format.

    # Extract text and save to a text file
    text = loader.extract_text()
    save_text_to_file(text, os.path.join(output_dir, "extracted_text.txt"))

    # Extract links and save to a CSV file
    links = loader.extract_links()
    save_links_to_file(links, os.path.join(output_dir, "extracted_links.csv"))

    # Extract images and save to the output directory
    images = loader.extract_images()
    print(f"Images extracted and saved at: {images}")

    # Extract tables and save to CSV files
    tables = loader.extract_tables()
    save_tables_to_csv(tables)

    print("File processing complete!")



# Output:
# PDF /home/shtlp_0126/Downloads/sample.pdf successfully loaded.
# Images extracted and saved at: ['/home/shtlp_0126/Desktop/Assignment_Python/Output2/image_page_3_0.png', '/home/shtlp_0126/Desktop/Assignment_Python/Output2/image_page_3_1.png', '/home/shtlp_0126/Desktop/Assignment_Python/Output2/image_page_4_0.png', '/home/shtlp_0126/Desktop/Assignment_Python/Output2/image_page_4_1.png', '/home/shtlp_0126/Desktop/Assignment_Python/Output2/image_page_4_2.png']
# File processing complete!
