In [4]:
import os
import fitz  # PyMuPDF for PDFs
import csv
from PIL import Image
from docx import Document  # For DOCX files
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import pptx
from abc import ABC, abstractmethod
import csv
from PyPDF2 import PdfReader
import json

# # Output directory path
# output_dir = os.path.join(os.getcwd(), 'output')
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)


# Path to the input document. The driver code at the bottom of this cell
# checks its extension ('.pdf' or '.docx') to decide which loader to use.
file_path = 'Sample_file/sample.pdf'  # Update this for docx or pdf
# Function to create segregated output directories
def create_output_dirs(output_base_dir, file_type):
    """Create the per-file-type output tree and return its root directory.

    The root is ``<output_base_dir>/Output_<file_type>`` and contains one
    subfolder per artifact kind: text, images, tables, links and metadata.
    Calling this again for an already existing tree changes nothing.

    Args:
        output_base_dir: Directory under which the output root is created.
        file_type: Suffix of the root folder name (e.g. ``'pdf'``).

    Returns:
        Path of the output root directory.
    """
    output_dir = os.path.join(output_base_dir, f"Output_{file_type}")
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could raise FileExistsError if the directory appeared between the two
    # calls. Creating each subfolder also creates the root itself.
    for folder in ('text', 'images', 'tables', 'links', 'metadata'):
        os.makedirs(os.path.join(output_dir, folder), exist_ok=True)
    return output_dir

class PDFLoader:
    """Extract text, links, images, table-like pages and font data from a PDF.

    Artifacts are written beneath ``output_dir`` in the ``text``, ``links``,
    ``images`` and ``tables`` subfolders, which are created on demand.
    """

    def __init__(self, file_path, output_dir):
        """Store the source path and the output root; the PDF is not opened yet.

        Args:
            file_path: Path of the PDF to read.
            output_dir: Root directory that receives the extracted artifacts.
        """
        self.file_path = file_path
        self.output_dir = output_dir
        self.doc = None  # set by load_file()

    def load_file(self):
        """Open the PDF with PyMuPDF, keep it on ``self.doc`` and return it."""
        self.doc = fitz.open(self.file_path)
        print(f"PDF {self.file_path} successfully loaded.")
        return self.doc

    def _iter_pages(self):
        """Yield ``(page_number, page)`` pairs; page numbers start at 1."""
        # Open the document on first use so the extract_* methods do not fail
        # with an opaque TypeError when load_file() was not called beforehand.
        doc = self.doc if self.doc is not None else self.load_file()
        for index in range(len(doc)):
            yield index + 1, doc.load_page(index)

    def _subfolder(self, name):
        """Return ``<output_dir>/<name>``, creating the directory if needed."""
        folder = os.path.join(self.output_dir, name)
        os.makedirs(folder, exist_ok=True)
        return folder

    def extract_text(self):
        """Write the text of all pages to ``text/extracted_text.txt``.

        Returns:
            The concatenated text of every page, in page order.
        """
        # join() avoids the quadratic cost of repeated string concatenation.
        text = "".join(page.get_text("text") for _, page in self._iter_pages())

        text_path = os.path.join(self._subfolder('text'), 'extracted_text.txt')
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Text extracted and saved at: {text_path}")
        return text

    def extract_links(self):
        """Collect URI links and write them to ``links/extracted_links.csv``.

        Link entries without a ``uri`` value are skipped.

        Returns:
            A list of ``{"page": <1-based page number>, "uri": <str>}`` dicts.
        """
        links = []
        for page_number, page in self._iter_pages():
            for link in page.get_links():
                uri = link.get("uri")
                if uri:
                    links.append({"page": page_number, "uri": uri})

        link_csv_path = os.path.join(self._subfolder('links'), "extracted_links.csv")
        # newline="" is required by the csv module; without it, platforms that
        # translate "\n" would turn the writer's "\r\n" into "\r\r\n".
        with open(link_csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Page", "URL"])
            writer.writerows([link["page"], link["uri"]] for link in links)
        print(f"Links extracted and saved at: {link_csv_path}")
        return links

    def extract_images(self):
        """Save every image referenced by a page into the ``images`` folder.

        Files are named ``image_page_<page>_<index>.<ext>`` where ``<page>``
        is 1-based and ``<index>`` is the 0-based position on that page.

        Returns:
            The list of written image file paths.
        """
        images = []
        image_folder = self._subfolder('images')
        for page_number, page in self._iter_pages():
            for image_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of the entry is the image xref
                base_image = self.doc.extract_image(xref)
                img_path = os.path.join(
                    image_folder,
                    f"image_page_{page_number}_{image_index}.{base_image['ext']}",
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
                images.append(img_path)
        print(f"Images extracted and saved at: {image_folder}")
        return images

    def extract_tables(self):
        """Save the text of pages that mention "table" as one-cell CSV files.

        This is a heuristic, not real table detection: any page whose text
        contains the word "table" (case-insensitive) is written in full to
        ``tables/table_page_<page>.csv`` as a single cell.

        Returns:
            A list of ``{"page": <1-based page number>, "content": <str>}``
            dicts, one per matching page.
        """
        tables = []
        table_folder = self._subfolder('tables')
        for page_number, page in self._iter_pages():
            text = page.get_text("text")
            if "table" not in text.lower():
                continue
            tables.append({"page": page_number, "content": text})

            table_csv_path = os.path.join(table_folder, f"table_page_{page_number}.csv")
            # newline="" is required by the csv module (see extract_links).
            with open(table_csv_path, "w", encoding="utf-8", newline="") as f:
                csv.writer(f).writerow([text])
            print(f"Table from page {page_number} saved at: {table_csv_path}")
        return tables

    def extract_detailed_metadata(self):
        """Return font, size, color and text of every text span, per page.

        Returns:
            A list with one ``{"page": <1-based page number>, "fonts": [...]}``
            dict per page; each ``fonts`` entry has the keys ``font``,
            ``size``, ``color`` and ``text``.
        """
        metadata = []
        for page_number, page in self._iter_pages():
            page_metadata = []
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:  # Type 0 means text block
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        page_metadata.append({
                            "font": span['font'],
                            "size": span['size'],
                            "color": span['color'],
                            "text": span['text'],
                        })
            metadata.append({"page": page_number, "fonts": page_metadata})
        return metadata


class DOCXLoader:
    """Extract text, links, images, tables and run formatting from a DOCX file.

    Every ``extract_*`` method returns its result. When ``output_dir`` is
    given, text, links, images and tables are additionally written beneath it
    in the ``text``, ``links``, ``images`` and ``tables`` subfolders.
    """

    def __init__(self, file_path, output_dir=None):
        """Open the document.

        Args:
            file_path: Path of the DOCX file to read.
            output_dir: Optional root directory for extracted artifacts; when
                omitted nothing is written to disk.
        """
        self.file_path = file_path
        self.output_dir = output_dir
        self.doc = Document(file_path)

    def _subfolder(self, name):
        """Return ``<output_dir>/<name>`` (created on demand), or None if no output_dir is set."""
        if self.output_dir is None:
            return None
        folder = os.path.join(self.output_dir, name)
        os.makedirs(folder, exist_ok=True)
        return folder

    def _relationships(self, kind):
        """Yield the document relationships whose type URI ends in ``/<kind>``."""
        suffix = f"/{kind}"
        for rel in self.doc.part.rels.values():
            if rel.reltype.endswith(suffix):
                yield rel

    def extract_detailed_metadata(self):
        """Return the text and formatting of every run, grouped by paragraph.

        Returns:
            One ``{"paragraph_text": ..., "run_metadata": [...]}`` dict per
            paragraph; each run entry has the keys ``text``, ``font``,
            ``size`` (points, or None when the run sets no size), ``bold``,
            ``italic`` and ``underline``.
        """
        metadata = []
        for para in self.doc.paragraphs:
            para_metadata = [
                {
                    "text": run.text,
                    "font": run.font.name,
                    "size": run.font.size.pt if run.font.size else None,  # Get font size in points
                    "bold": run.bold,
                    "italic": run.italic,
                    "underline": run.underline,
                }
                for run in para.runs
            ]
            metadata.append({
                "paragraph_text": para.text,
                "run_metadata": para_metadata,
            })
        return metadata

    def extract_text(self):
        """Return all paragraph texts joined by newlines.

        With an ``output_dir`` the text is also written to
        ``text/extracted_text.txt``.
        """
        text = "\n".join(para.text for para in self.doc.paragraphs)
        folder = self._subfolder('text')
        if folder is not None:
            text_path = os.path.join(folder, 'extracted_text.txt')
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Text extracted and saved at: {text_path}")
        return text

    def extract_links(self):
        """Return the target URL of every hyperlink relationship.

        With an ``output_dir`` the URLs are also written to
        ``links/extracted_links.csv``.
        """
        # Select by relationship *type*. The target (target_ref) is the URL
        # itself, so looking for "hyperlink" inside it misses nearly all links.
        links = [rel.target_ref for rel in self._relationships('hyperlink')]
        folder = self._subfolder('links')
        if folder is not None:
            link_csv_path = os.path.join(folder, "extracted_links.csv")
            # newline="" is required by the csv module to avoid doubled
            # carriage returns on platforms that translate line endings.
            with open(link_csv_path, "w", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(["URL"])
                writer.writerows([link] for link in links)
            print(f"Links extracted and saved at: {link_csv_path}")
        return links

    def extract_images(self):
        """Return the embedded images of the document.

        With an ``output_dir`` each image is written to the ``images``
        subfolder and its saved path is returned; without one nothing is
        written and the bare image file names are returned. Images that are
        only linked (external relationships) carry no bytes and are skipped.
        """
        images = []
        folder = self._subfolder('images')
        for rel in self._relationships('image'):
            if rel.is_external:
                continue
            name = os.path.basename(rel.target_ref)
            if folder is None:
                images.append(name)
                continue
            img_path = os.path.join(folder, name)
            with open(img_path, "wb") as img_file:
                img_file.write(rel.target_part.blob)
            images.append(img_path)
        if folder is not None:
            print(f"Images extracted and saved at: {folder}")
        return images

    def extract_tables(self):
        """Return every table as a list of rows, each row a list of cell texts.

        With an ``output_dir`` each table is also written to
        ``tables/table_<n>.csv`` (``n`` starting at 1).
        """
        tables = [
            [[cell.text for cell in row.cells] for row in table.rows]
            for table in self.doc.tables
        ]
        folder = self._subfolder('tables')
        if folder is not None:
            for number, table_data in enumerate(tables, start=1):
                table_csv_path = os.path.join(folder, f"table_{number}.csv")
                with open(table_csv_path, "w", encoding="utf-8", newline="") as f:
                    csv.writer(f).writerows(table_data)
                print(f"Table {number} saved at: {table_csv_path}")
        return tables
    

output_base_dir = os.getcwd()  # Base directory for output: the current working directory, passed to create_output_dirs() below


def save_text_to_file(text, file_path):
    """Write *text* to *file_path* as UTF-8, replacing any existing content."""
    with open(file_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

def save_links_to_file(links, file_path):
    """Write *links* to *file_path* as a one-column CSV with a ``URL`` header.

    Args:
        links: Iterable of values; each becomes one row.
        file_path: Destination CSV path; an existing file is overwritten.
    """
    # newline="" is required by the csv module: the writer emits "\r\n"
    # itself, and newline translation would turn that into "\r\r\n" on
    # platforms whose native line ending is "\r\n".
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        writer.writerows([link] for link in links)

def save_tables_to_csv(tables, target_dir=None):
    """Write each table to ``table_<n>.csv`` (``n`` starting at 1).

    Args:
        tables: Iterable of tables, each an iterable of rows.
        target_dir: Directory that receives the CSV files. When omitted, the
            module-level ``output_dir`` is used, which must have been set
            by then.
    """
    if target_dir is None:
        # Backward-compatible fallback to the global set by the driver code.
        target_dir = output_dir
    for number, table in enumerate(tables, start=1):
        csv_path = os.path.join(target_dir, f"table_{number}.csv")
        # newline="" is required by the csv module to avoid doubled carriage
        # returns on platforms that translate line endings.
        with open(csv_path, "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(table)

def save_metadata_to_json(metadata, file_name, target_dir=None):
    """Serialize *metadata* as indented JSON into ``<target_dir>/<file_name>``.

    Args:
        metadata: JSON-serializable object to store.
        file_name: Name of the JSON file to create.
        target_dir: Directory that receives the file. When omitted, the
            module-level ``output_dir`` is used, which must have been set
            by then.
    """
    if target_dir is None:
        # Backward-compatible fallback to the global set by the driver code.
        target_dir = output_dir
    json_path = os.path.join(target_dir, file_name)
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(metadata, json_file, indent=4)

# Determine if file is PDF or DOCX and process accordingly
# if file_path.endswith('.pdf'):
#     # Processing the PDF
#     pdf_loader = PDFLoader(file_path)
#     pdf_loader.load_file()

#     # Extract text and save to a text file
#     text = pdf_loader.extract_text()
#     save_text_to_file(text, os.path.join(output_dir, "extracted_text.txt"))

#     # Extract links and save to a CSV file
#     links = pdf_loader.extract_links()
#     save_links_to_file(links, os.path.join(output_dir, "extracted_links.csv"))

#     # Extract images and save to the output directory
#     images = pdf_loader.extract_images()
#     print(f"Images extracted and saved at: {images}")

#     # Extract tables and save to CSV files
#     tables = pdf_loader.extract_tables()
#     save_tables_to_csv(tables)

#     detailed_metadata = pdf_loader.extract_detailed_metadata()
#     save_metadata_to_json(detailed_metadata, "pdf_detailed_metadata.json")
#     print(f"Detailed PDF metadata saved to pdf_detailed_metadata.json")

# elif file_path.endswith('.docx'):
#     # Processing the DOCX
#     docx_loader = DOCXLoader(file_path)

#     # Extract text and save to a text file
#     text = docx_loader.extract_text()
#     save_text_to_file(text, os.path.join(output_dir, "extracted_text.txt"))

#     # Extract links and save to a CSV file
#     links = docx_loader.extract_links()
#     save_links_to_file(links, os.path.join(output_dir, "extracted_links.csv"))

#     # Extract images and save to the output directory
#     images = docx_loader.extract_images()
#     print(f"Images extracted and saved at: {images}")

#     # Extract tables and save to CSV files
#     tables = docx_loader.extract_tables()
#     save_tables_to_csv(tables)

#     detailed_metadata = docx_loader.extract_detailed_metadata()
#     save_metadata_to_json(detailed_metadata, "docx_detailed_metadata.json")
#     print(f"Detailed DOCX metadata saved to docx_detailed_metadata.json")

# Determine whether the file is PDF or DOCX and create output directories accordingly.
# The extension is compared case-insensitively so e.g. 'REPORT.PDF' is accepted too.
file_extension = os.path.splitext(file_path)[1].lower()

if file_extension in ('.pdf', '.docx'):
    # output_dir stays a module-level name: helper functions in this file
    # fall back to it when deciding where to write.
    output_dir = create_output_dirs(output_base_dir, file_extension[1:])

    if file_extension == '.pdf':
        pdf_loader = PDFLoader(file_path, output_dir)
        pdf_loader.load_file()

        # Perform extraction operations
        pdf_loader.extract_text()
        pdf_loader.extract_images()
        pdf_loader.extract_links()
        pdf_loader.extract_tables()
    else:
        docx_loader = DOCXLoader(file_path, output_dir)

        # Perform extraction operations for DOCX
        docx_loader.extract_text()
        docx_loader.extract_images()
        docx_loader.extract_links()
        docx_loader.extract_tables()

    # Name the directory that was actually used instead of a fixed 'output'.
    print(f"Processing complete! Check '{output_dir}' for results.")
else:
    # Previously an unsupported file fell through and still reported success.
    print(f"Unsupported file type: {file_path!r} (expected .pdf or .docx)")



PDF Sample_file/sample.pdf successfully loaded.
Text extracted and saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/text/extracted_text.txt
Images extracted and saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/images
Links extracted and saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/links/extracted_links.csv
Table from page 1 saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/tables/table_page_1.csv
Table from page 7 saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/tables/table_page_7.csv
Table from page 8 saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/tables/table_page_8.csv
Table from page 9 saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/tables/table_page_9.csv
Table from page 14 saved at: /home/shtlp_0126/Desktop/Assignment_Python/Output_pdf/tables/table_page_14.csv
Processing complete! Check the 'output' folder for results.
File processing complete!
