Skip to content

Commit

Permalink
Documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
John98Zakaria committed Oct 3, 2020
1 parent b810b06 commit 1fb6274
Show file tree
Hide file tree
Showing 15 changed files with 209 additions and 203 deletions.
4 changes: 0 additions & 4 deletions PDFFile.py

This file was deleted.

33 changes: 14 additions & 19 deletions PDFMerger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,22 @@


class PDFMerger:
def __init__(self,pdf1:PDFParser,pdf2:PDFParser):
def __init__(self, pdf1: PDFParser, pdf2: PDFParser):
self.pdf1 = pdf1
self.pdf2 = pdf2






def merge(self):
self.pdf2.increment_refrences(self.pdf1.__len__())
def merge(self, out_path: str) -> None:
"""
Merges the two PDF files held by this merger (pdf1 and pdf2)
:param out_path: Path for the output file
"""
self.pdf2.increment_references(self.pdf1.__len__())
root1 = self.pdf1.get_page_root()
root2 = self.pdf2.get_page_root()
root2[b"/Parent"] = IndirectObjectRef(root1.object_number,root1.object_rev)
root1[b"/Kids"].data.append(IndirectObjectRef(root2.object_number,root2.object_rev))
root1[b"/Count"] = str(int(root1[b"/Count"])+int(root2[b"/Count"])).encode("utf-8")
self.pdf1.trailer[b"/Size"] = str(len(pdf1)+len(pdf2)).encode("utf-8")
root2[b"/Parent"] = IndirectObjectRef(root1.object_number, root1.object_rev)
root1[b"/Kids"].data.append(IndirectObjectRef(root2.object_number, root2.object_rev))
root1[b"/Count"] = str(int(root1[b"/Count"]) + int(root2[b"/Count"])).encode("utf-8")
self.pdf1.trailer[b"/Size"] = str(len(pdf1) + len(pdf2)).encode("utf-8")
newXrefTable = [XrefEntry(0, 65535, "f")]
with open("Merge3.pdf", "wb+")as f:
f.write(b"%PDF-1.5\n")
Expand All @@ -38,7 +37,6 @@ def merge(self):
newXrefTable.append(XrefEntry(pos, int(rev), str(inuse)))
f.write(object.to_bytes(self.pdf2.file) + b"\n")


xrefpos = f.tell()
newXrefTable = XRefTable(newXrefTable, True)
f.write(newXrefTable.__str__().encode("utf-8"))
Expand All @@ -48,12 +46,9 @@ def merge(self):
f.write(f"startxref\n{xrefpos}\n%%EOF\n".encode("utf-8"))





if __name__ == '__main__':
pdf1 = PDFParser("test_pdfs/LittleAspNetCoreBook.pdf")
pdf2 = PDFParser("test_pdfs/Blatt04.pdf")
merger = PDFMerger(pdf2,pdf1)
pdf1 = PDFParser("test_pdfs/FuldaFinalProjectHighLevelDescriptionWS2020.pdf")
pdf2 = PDFParser("test_pdfs/FuldaMilestone0WS2020.pdf")
merger = PDFMerger(pdf2, pdf1)

merger.merge()
35 changes: 23 additions & 12 deletions PDFObjects.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from utills import Ibytable

from utils import Ibytable


class IndirectObjectRef(Ibytable):
Expand All @@ -8,11 +9,10 @@ class IndirectObjectRef(Ibytable):
7.3.10 PDF 32000-1:2008
"""

def __init__(self, objectref ,generationNum):
def __init__(self, objectref, generationNum):
self.objectref = int(objectref)
self.generationNum = int(generationNum)


def __str__(self):
return f"{self.objectref} {self.generationNum} R"

Expand All @@ -25,12 +25,10 @@ def __int__(self):
def __eq__(self, other):
return self.objectref == other.objectref




def offset_references(self, offset: int) -> None:
"""
Increments the reference objects inside the data structure
:param offset: offset value
"""
self.add_offset(offset)
Expand All @@ -41,6 +39,7 @@ def add_offset(self, offset: int):
def to_bytes(self) -> bytes:
"""
Converts Indirect Reference to bytes
:return: bytes representation of the indirect reference
"""
return self.__str__().encode("utf-8")
Expand Down Expand Up @@ -74,17 +73,23 @@ def __getitem__(self, item):
def offset_references(self, offset: int):
"""
Increments the reference objects inside the data structure
:param offset: offset value
"""
for index, value in enumerate(self.data):
if issubclass(type(value),Ibytable):
if issubclass(type(value), Ibytable):
value.offset_references(offset)

def to_bytes(self)->bytes:
def to_bytes(self) -> bytes:
"""
Converts the object to bytes
:return: Bytes representation of the object
"""
bytes_representation = b"["
for item in self.data:
bytes_representation+=self.itemToByte(item) + b" "
bytes_representation +=b"]"
bytes_representation += self.itemToByte(item) + b" "
bytes_representation += b"]"
return bytes_representation


Expand All @@ -101,10 +106,11 @@ def __init__(self, data: dict):
def offset_references(self, offset: int) -> None:
"""
Increments the reference objects inside the data structure
:param offset: offset value
"""
for key, value in zip(self.data.keys(), self.data.values()):
if issubclass(type(value),Ibytable):
if issubclass(type(value), Ibytable):
value.offset_references(offset)

def __contains__(self, item):
Expand All @@ -126,7 +132,12 @@ def __eq__(self, other):
def __repr__(self):
return self.__str__()

def to_bytes(self)->bytes:
def to_bytes(self) -> bytes:
"""
Converts the object to bytes
:return: Bytes representation of the object
"""
out_string = b"<<\n"
for key, value in zip(self.data.keys(), self.data.values()):
if issubclass(type(value), Ibytable):
Expand Down
84 changes: 39 additions & 45 deletions PDFObjectsParser.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
from utills import ObjectIter
from PDFObjects import *
from typing import Iterable
import re

from PDFObjects import *
from utils import ObjectIter

SEPERATORS = b"\\/[]<>() \t\n"


def extract_name(stream: ObjectIter) -> str:
def extract_name(stream: ObjectIter) -> bytes:
"""
Extracts the next name from the iterator (7.3.5 PDF 32000-1:2008)
:param stream:A stream whose forward slash / was just consumed
:return: String containing the name
:param stream: A stream whose forward slash / was just consumed
:return: Bytes containing the name
"""
out_string = b"/"
for letter in stream:
Expand All @@ -22,37 +23,21 @@ def extract_name(stream: ObjectIter) -> str:
return out_string


def skip_space(stream: ObjectIter) -> str:
"""
Moves stream to the next non whitespace char
:param stream: Any iterable object
:return: First letter after the whitespace
"""
peek = stream.peek(1)
if (not peek.isspace() or peek == b""):
return ""

for i in stream:

if (not i.isspace()):
stream.prev()
return ""


def parse_string_literal(stream: ObjectIter) -> str:
def parse_literalStrings(stream: ObjectIter) -> bytes:
"""
Parses string literals (7.3.4.2) PDF 32000-1:2008
:param stream: A stream whose opening round bracket ( was just consumed
:return: The string literal including the round brackets
"""
out_string = b"("
countOpeningBraces = 1
countClosingBraces = 0
for letter in stream:
if letter == b"(":
countOpeningBraces += stream.reversePeek(1)!=b"\\"
if letter == b"(": # To support nested literals
countOpeningBraces += stream.reversePeek(1) != b"\\" # Check whether escape symbol is present
elif letter == b")":
countClosingBraces += stream.reversePeek(1)!=b"\\"
countClosingBraces += stream.reversePeek(1) != b"\\" # Check whether escape symbol is present
if countClosingBraces == countOpeningBraces:
break
out_string += letter
Expand All @@ -62,24 +47,25 @@ def parse_string_literal(stream: ObjectIter) -> str:
def parse_numeric(init: bytes, stream: ObjectIter):
"""
Parses numeric objects
:param init: The char that :meth:`PDFObjectsParser.classify_steam` already consumed
:param stream: Object Stream
:return: A number or a reference object
"""
number: str = init
for char in stream:
if (char in b"\\/[]<>()\t\n"):
if char in b"\\/[]<>()\t\n": # found a terminating character
stream.prev()
break
elif (char == b" "):
elif char == b" ": # found a space; what follows may be an indirect object reference
upcomingchars = stream.peek(3)
isRef = re.search(b"(\d+) R",upcomingchars)
if (isRef):
stream.move_pointer(len(isRef.group(1))+2)
return IndirectObjectRef(number,isRef.group(1))
isReference = re.search(br"(\d+) R", upcomingchars)
if isReference:
stream.move_pointer(len(isReference.group(1)) + 2)
return IndirectObjectRef(number, isReference.group(1))
else:
return number
elif (not char.isdigit() and char != b"."):
elif not char.isdigit() and char != b".":
number += stream.finish_number()
break
number += char
Expand All @@ -89,11 +75,12 @@ def parse_numeric(init: bytes, stream: ObjectIter):
def classify_steam(stream_iter: ObjectIter, letter=None):
"""
Classifies and parses the given stream
:param stream_iter: A stream whose 1st character indicates its type
:param letter: Passes the letter that was consumed elsewhere
:return: A PDF Object or a standard object
"""
if (letter is None):
if letter is None:
letter = next(stream_iter)

debug = letter.decode("utf-8")
Expand All @@ -118,7 +105,7 @@ def classify_steam(stream_iter: ObjectIter, letter=None):
return value

elif letter == b"(":
value = parse_string_literal(stream_iter)
value = parse_literalStrings(stream_iter)
elif letter in b"tf": # handles true/false
value = letter + stream_iter.move_to(b"e") + next(stream_iter)
elif letter == b"n": # handles null values
Expand All @@ -132,25 +119,25 @@ def classify_steam(stream_iter: ObjectIter, letter=None):
return value


def parse_dictionary(pdf_stream:ObjectIter)->PDFDict:
def parse_dictionary(pdf_stream: ObjectIter) -> PDFDict:
"""
Parses PDFDictionary objects
:param pdf_stream: Object Stream
:return: A PDFDict :class:`PDFObjects.PDFDict` object
:return: :class:`PDFObjects.PDFDict` object
"""
object_dict = dict()
streamIter = ObjectIter(pdf_stream) if type(pdf_stream) != ObjectIter else pdf_stream
streamIter._prepare_dictparse()
streamIter.move_to(b"/")
for letter in streamIter:
# Parse Key

if letter == b">":
letter = next(streamIter)
if letter == b">":
return PDFDict(object_dict)

elif letter != b"/":
raise AssertionError(f"Expected a forward slash / to build a dict key but got {letter}")

key = extract_name(streamIter)
streamIter.skip_space()
letter = next(streamIter)
Expand All @@ -165,8 +152,9 @@ def parse_dictionary(pdf_stream:ObjectIter)->PDFDict:
def extract_array(stream: ObjectIter) -> PDFArray:
"""
Extracts an array from the stream
:param stream: ObjectIter
:return:
:return: PDFArray
"""
out_string = b""
count_closingBraces = 0
Expand All @@ -177,15 +165,21 @@ def extract_array(stream: ObjectIter) -> PDFArray:
count_closingBraces += 1
elif letter == b"[":
count_openingBraces += 1
if count_closingBraces==count_openingBraces:
if count_closingBraces == count_openingBraces:
break
out_string += letter

return PDFArray(parse_arrayObjects(out_string))


def parse_arrayObjects(array_str: bytes):
stream_iter = ObjectIter(array_str)
def parse_arrayObjects(array_bytes: bytes) -> list:
"""
Parses the extracted array
:param array_bytes: Bytes of the extracted array
:return: A python list with the parsed objects
"""
stream_iter = ObjectIter(array_bytes)
array = []
for char in stream_iter:
if (char.isspace()):
Expand Down

0 comments on commit 1fb6274

Please sign in to comment.