Skip to content

Commit

Permalink
Don't forget which branch you were on :P
Browse files Browse the repository at this point in the history
  • Loading branch information
John98Zakaria committed Oct 6, 2020
1 parent d16dc25 commit 7a7eaca
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 159 deletions.
89 changes: 56 additions & 33 deletions PDFMerger.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,77 @@
from tqdm import tqdm

from PDFObjects import IndirectObjectRef
from PDFParser import PDFParser, XrefEntry, XRefTable
import time

from PDFParser import *

class PDFMerger:
    """
    Concatenates several parsed PDF files into one output file.

    The trailer and the document catalog of the first file are reused for the
    merged document; the page roots of all files are hung below one newly
    created /Pages node.
    """

    def __init__(self, pdfs):
        """
        :param pdfs: Parsed PDF files in the order their pages should appear.
                     Each one must support ``len()``
        """
        self.pdfFiles = pdfs
        # Number of objects over all inputs; the new page root gets the next number
        self.objectCount = sum(len(pdf) for pdf in pdfs)

    def new_page_root(self):
        """
        Creates the /Pages node shared by all files and points the first
        file's catalog and trailer at it.

        Increments ``self.objectCount``; the returned object carries that new
        number.
        :return: PDFObject wrapping the new page root dictionary
        """
        self.objectCount += 1
        root_ref = IndirectObjectRef(self.objectCount, 0)

        page_count = 0
        kids = []
        for pdf in self.pdfFiles:
            root = pdf.get_page_root()
            page_count += int(root[b"/Count"])
            # The children of every old root are adopted directly by the new root.
            # NOTE(review): /Parent is rewritten on the old root only, not on the
            # adopted kids themselves - confirm readers accept that.
            kids += root[b"/Kids"].data
            root[b"/Parent"] = root_ref

        root_dict = PDFDict({b"/Type": b"/Pages",
                             b"/Kids": PDFArray(kids),
                             b"/Count": str(page_count).encode("utf-8")})

        first_pdf = self.pdfFiles[0]
        # NOTE(review): PDF 32000-1:2008 7.5.5 defines /Size as highest object
        # number + 1; whether len(pdf) already counts object 0 is not visible here.
        first_pdf.trailer[b"/Size"] = str(self.objectCount).encode("utf-8")
        first_pdf.get_document_catalog()[b"/Pages"] = root_ref

        return PDFObject(root_dict, self.objectCount, 0, "n")

    def merge(self, out_path: str) -> None:
        """
        Merges the accumulated PDFFiles

        The parsers handed to the constructor are modified in the process
        (references are offset, page roots and the first trailer are rewritten).
        :param out_path: Path for the output file
        """
        # Every file after the first is offset by the object count of all files before it
        accumulated_offset = len(self.pdfFiles[0])
        for pdf in self.pdfFiles[1:]:
            pdf.increment_references(accumulated_offset)
            accumulated_offset += len(pdf)

        # new_page_root() bumps self.objectCount, so it has to run before the
        # counter is used as the key of the new object.
        new_root = self.new_page_root()
        # NOTE(review): the new root is stored with the first file's objects and is
        # therefore written (and listed in the xref) right after them, although it
        # carries the highest object number - confirm XRefTable copes with that.
        self.pdfFiles[0].pdfObjects[self.objectCount] = new_root

        # Entry 0 is the mandatory free entry with generation 65535 (7.5.4)
        xref_entries = [XrefEntry(0, 65535, "f")]
        with open(out_path, "wb+") as f:
            f.write(b"%PDF-1.5\n")

            for index, pdf in enumerate(self.pdfFiles):
                for pdf_object in tqdm(pdf.pdfObjects.values(), f"Writing Objects for {index}. pdf"):
                    # Remember where the object starts before writing it
                    pos = str(f.tell())
                    xref_entries.append(XrefEntry(pos, int(pdf_object.object_rev), str(pdf_object.inuse)))
                    f.write(bytes(pdf_object))

            xrefpos = f.tell()
            xref_table = XRefTable(xref_entries, True)
            f.write(str(xref_table).encode("utf-8"))
            f.write(b"trailer\n")
            f.write(bytes(self.pdfFiles[0].trailer))
            f.write(f"startxref\n{xrefpos}\n%%EOF\n".encode("utf-8"))


if __name__ == '__main__':
    # Manual smoke test: merge a few lecture PDFs and print the elapsed seconds.
    start = time.time()
    pdf1 = PDFParser("/media/jn98zk/318476C83114A23B/Uni-Mainz/FormaleSprachen/FSB_01_Einführung.pdf")
    pdf2 = PDFParser(
        "/media/jn98zk/318476C83114A23B/Uni-Mainz/FormaleSprachen/FSB_02_Mathematische_Grundlagen_Anmerkungen.pdf")
    pdf3 = PDFParser(
        "/media/jn98zk/318476C83114A23B/Uni-Mainz/FormaleSprachen/FSB_03_Formale_Sprachen_und_Grammatiken_Anmerkungen.pdf")
    pdf4 = PDFParser(
        "/media/jn98zk/318476C83114A23B/Uni-Mainz/FormaleSprachen/FSB_04_Reguläre_Sprachen_Endliche_Automaten_Anmerkungen.pdf")
    # pdf5 = PDFParser("/media/jn98zk/318476C83114A23B/Uni-Mainz/FormaleSprachen/FSB_05_Weitere_Charakterisierungen_Regulärer_Sprachen_Anmerkungen.pdf")

    # The merger takes one list of parsers and merge() needs the output path
    merger = PDFMerger([pdf1, pdf2, pdf3, pdf4])
    merger.merge("BlattMerger.pdf")
    print(time.time() - start)
30 changes: 13 additions & 17 deletions PDFObjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,18 @@ class IndirectObjectRef(Ibytable):
7.3.10 PDF 32000-1:2008
"""

def __init__(self, objectref, generation_number):
    """
    Stores both parts of an indirect reference as integers
    :param objectref: Object number of the referenced object (anything int() accepts)
    :param generation_number: Generation number of the referenced object (anything int() accepts)
    """
    self.objectref = int(objectref)
    # The attribute keeps its historic name; only the parameter was renamed
    self.generationNum = int(generation_number)

def __str__(self):
    """
    Human readable form for debugging. The PDF syntax ("<obj> <gen> R") is
    produced by ``__bytes__``, not here.
    :return: e.g. "IndirectObjectRef(12,0)"
    """
    return f"IndirectObjectRef({self.objectref},{self.generationNum})"

def __repr__(self):
    """
    Same text as the string conversion
    :return: The result of str() on this object
    """
    return str(self)

def __int__(self):
    """
    Integer value of the reference
    :return: The referenced object number
    """
    object_number = self.objectref
    return object_number

def __eq__(self, other):
    """
    Two references are equal when object number and generation number match
    :param other: Object to compare with
    :return: True/False, or NotImplemented when other is not reference-like
    """
    try:
        return (self.objectref, self.generationNum) == (other.objectref, other.generationNum)
    except AttributeError:
        # Lets Python fall back to its default handling instead of crashing
        # on comparisons such as ``ref == None``
        return NotImplemented

def offset_references(self, offset: int) -> None:
"""
Expand All @@ -36,13 +33,13 @@ def offset_references(self, offset: int) -> None:
def add_offset(self, offset: int):
    """
    Moves the referenced object number by the given amount
    :param offset: Value added to the object number
    """
    self.objectref = self.objectref + offset

def __bytes__(self) -> bytes:
    """
    Converts Indirect Reference to bytes
    :return: bytes representation of the indirect reference, e.g. b"12 0 R"
    """
    # Built directly from the fields: __str__ is a debug representation and
    # must not leak into the written file
    return f"{self.objectref} {self.generationNum} R".encode("utf-8")


@dataclass
Expand All @@ -56,7 +53,7 @@ def __init__(self, data: list):
self.data = data

def __str__(self):
    """
    Human readable form for debugging; not valid PDF syntax
    :return: e.g. "PDFArray([1,2])"
    """
    items = ",".join(map(str, self.data))
    return f"PDFArray([{items}])"

def __eq__(self, other):
    """
    Arrays are equal when their item lists are equal
    :param other: Object to compare with
    :return: True/False, or NotImplemented when other has no ``data``
    """
    try:
        return self.data == other.data
    except AttributeError:
        # Foreign types (e.g. None) are left to Python's default handling
        return NotImplemented
Expand All @@ -80,15 +77,15 @@ def offset_references(self, offset: int):
if issubclass(type(value), Ibytable):
value.offset_references(offset)

def __bytes__(self) -> bytes:
    """
    Converts the object to bytes
    :return: Bytes representation of the object, every item followed by a
             space, e.g. b"[1 2 ]"
    """
    # Items have to be bytes or implement __bytes__; a plain int would be
    # turned into that many zero bytes by bytes()
    parts = [bytes(item) + b" " for item in self.data]
    return b"[" + b"".join(parts) + b"]"

Expand Down Expand Up @@ -124,24 +121,23 @@ def __setitem__(self, key, value):

def __str__(self):
    """
    Human readable form for debugging; not valid PDF syntax
    :return: The wrapped dictionary enclosed in "PDFDict(...)"
    """
    return f"PDFDict({self.data})"

def __eq__(self, other):
    """
    Dictionaries are equal when their wrapped data is equal.
    A comparison never modifies either operand.
    :param other: Object to compare with
    :return: True/False, or NotImplemented when other has no ``data``
    """
    try:
        return self.data == other.data
    except AttributeError:
        # Foreign types (e.g. None) are left to Python's default handling
        return NotImplemented

def __repr__(self):
    """
    Same text as the string conversion
    :return: The result of str() on this object
    """
    return str(self)

def __bytes__(self) -> bytes:
    """
    Converts the object to bytes
    :return: Bytes representation of the object: "<<", one "key value" line
             per entry, then ">>"
    """
    lines = [b"<<\n"]
    for key, value in self.data.items():
        # Nested PDF objects serialise themselves; everything else is
        # expected to be bytes already (it is concatenated as is)
        if isinstance(value, Ibytable):
            value = bytes(value)
        lines.append(key + b" " + value + b"\n")
    lines.append(b">>")
    return b"".join(lines)
Expand Down
46 changes: 30 additions & 16 deletions PDFObjectsParser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

from PDFObjects import *
from utils import ObjectIter

Expand Down Expand Up @@ -58,12 +56,28 @@ def parse_numeric(init: bytes, stream: ObjectIter):
stream.prev()
break
elif char == b" ": # found a space item maybe an indirect object reference
upcomingchars = stream.peek(3)
isReference = re.search(br"(\d+) R", upcomingchars)
pointer = stream.stream.tell()
try:
current_char = next(stream)
except StopIteration:
return number
rev_num = b""
while current_char.isdigit():
rev_num += current_char
try:
current_char = next(stream)
except StopIteration:
stream.stream.seek(pointer)
return number

stream.prev()

isReference = stream.peek(2) == b" R"
if isReference:
stream.move_pointer(len(isReference.group(1)) + 2)
return IndirectObjectRef(number, isReference.group(1))
stream.move_pointer(2)
return IndirectObjectRef(number, rev_num)
else:
stream.stream.seek(pointer)
return number
elif not char.isdigit() and char != b".":
number += stream.finish_number()
Expand All @@ -83,7 +97,6 @@ def classify_steam(stream_iter: ObjectIter, letter=None):
if letter is None:
letter = next(stream_iter)

debug = letter.decode("utf-8")
if letter == b"/":
value = extract_name(stream_iter)

Expand All @@ -98,17 +111,15 @@ def classify_steam(stream_iter: ObjectIter, letter=None):
if letter == b"<":
value = parse_dictionary(stream_iter)
else:
value = b"<" + letter + stream_iter.move_to(b">") + b">"
try:
next(stream_iter)
except StopIteration:
return value
value = b"<" + letter + stream_iter.move_to(b">") + b">" # handles Hex Values
next(stream_iter)

elif letter == b"(":
value = parse_literalStrings(stream_iter)
elif letter in b"tf": # handels true/false
value = letter + stream_iter.move_to(b"e") + next(stream_iter)
elif letter == b"n": # handels null values
elif letter in b"tf": # handles true/false
value = letter + stream_iter.move_to(b"e") + b"e"
next(stream_iter)
elif letter == b"n": # handles null values
peek = stream_iter.peek(3)
if peek == b"ull":
value = b"null"
Expand Down Expand Up @@ -175,7 +186,7 @@ def extract_array(stream: ObjectIter) -> PDFArray:
def parse_arrayObjects(array_bytes: bytes) -> list:
"""
Parses the extracted array
:param array_bytes:
:return: A python list with the parsed objects
"""
Expand All @@ -191,6 +202,9 @@ def parse_arrayObjects(array_bytes: bytes) -> list:


if __name__ == '__main__':
w = b'<< /ID [(\xa3\xa2\x86\x93\x8f \xdc\x91\xfeZ\x9f]\xb7\x91xM) (\xa3\xa2\x86\x93\x8f \xdc\x91\xfeZ\x9f]\xb7\x91xM)] /Info 409 0 R /Prev 601302 /Root 408 0 R /Size 670 >>\r\ns'
print(classify_steam(ObjectIter(w)))

##Bad table
# t1 = b"""/Type/Annot/Border[ 0 0 0]/Dest[ 4863 0 R/XYZ 76.450073 383.27719 0]/F 4/Rect[ 167.25 565.5 447.75 582]/Subtype/Link>>"""
# t1 = parse_dictionary(t1)
Expand Down

0 comments on commit 7a7eaca

Please sign in to comment.