Skip to content

Commit

Permalink
Documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
John98Zakaria committed Oct 3, 2020
1 parent 7cc2315 commit b810b06
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 90 deletions.
66 changes: 44 additions & 22 deletions objectsParser.py → PDFObjectsParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,13 @@ def parse_string_literal(stream: ObjectIter) -> str:
return out_string + b")"


def parse_numeric(init: str, stream: ObjectIter):
def parse_numeric(init: bytes, stream: ObjectIter):
"""
Parses numeric objects
:param init: The char that :meth:`PDFObjectsParser.classify_steam` already consumed
:param stream: Object Stream
:return: A number or a reference object
"""
number: str = init
for char in stream:
if (char in b"\\/[]<>()\t\n"):
Expand All @@ -80,47 +86,58 @@ def parse_numeric(init: str, stream: ObjectIter):
return number


def parse_stream(streamIter: ObjectIter, letter=None):
def classify_steam(stream_iter: ObjectIter, letter=None):
"""
Classifies and parses the given stream
:param stream_iter: A stream whose 1st character indicates its type
:param letter: Passes the letter that was consumed elsewhere
:return: A PDF Object or a standard object
"""
if (letter is None):
letter = next(streamIter)
letter = next(stream_iter)

debug = letter.decode("utf-8")
if letter == b"/":
value = extract_name(streamIter)
value = extract_name(stream_iter)

elif letter == b"[":
value = extract_array(streamIter)
value = extract_array(stream_iter)

elif letter.isdigit() or letter == b"-":
value = parse_numeric(letter, streamIter)
value = parse_numeric(letter, stream_iter)

elif letter == b"<":
letter = next(streamIter)
letter = next(stream_iter)
if letter == b"<":
value = parse_dictionary(streamIter)
value = parse_dictionary(stream_iter)
else:
value = b"<" + letter + streamIter.move_to(b">") + b">"
value = b"<" + letter + stream_iter.move_to(b">") + b">"
try:
next(streamIter)
next(stream_iter)
except StopIteration:
return value

elif letter == b"(":
value = parse_string_literal(streamIter)
value = parse_string_literal(stream_iter)
elif letter in b"tf": # handles true/false
value = letter + streamIter.move_to(b"e") + next(streamIter)
value = letter + stream_iter.move_to(b"e") + next(stream_iter)
elif letter == b"n": # handles null values
peek = streamIter.peek(3)
if (peek == b"ull"):
peek = stream_iter.peek(3)
if peek == b"ull":
value = b"null"
streamIter.move_pointer(3)
stream_iter.move_pointer(3)

skip_space(streamIter)
stream_iter.skip_space()

return value


def parse_dictionary(pdf_stream)->PDFDict:
def parse_dictionary(pdf_stream:ObjectIter)->PDFDict:
"""
Parses PDFDictionary objects
:param pdf_stream: Object Stream
:return: A PDFDict :class:`PDFObjects.PDFDict` object
"""
object_dict = dict()
streamIter = ObjectIter(pdf_stream) if type(pdf_stream) != ObjectIter else pdf_stream
streamIter._prepare_dictparse()
Expand All @@ -129,23 +146,28 @@ def parse_dictionary(pdf_stream)->PDFDict:

if letter == b">":
letter = next(streamIter)
if (letter == b">"):
if letter == b">":
return PDFDict(object_dict)

elif letter != b"/":
raise AssertionError(f"Expected a forward slash / to build a dict key but got {letter}")
key = extract_name(streamIter)
skip_space(streamIter)
streamIter.skip_space()
letter = next(streamIter)
# parse value
value = parse_stream(streamIter, letter)
value = classify_steam(streamIter, letter)

object_dict[key] = value

return PDFDict(object_dict)


def extract_array(stream: Iterable) -> PDFArray:
def extract_array(stream: ObjectIter) -> PDFArray:
"""
Extracts an array from the stream
:param stream: ObjectIter
:return: The parsed array as a PDFArray
"""
out_string = b""
count_closingBraces = 0
count_openingBraces = 1
Expand All @@ -168,7 +190,7 @@ def parse_arrayObjects(array_str: bytes):
for char in stream_iter:
if (char.isspace()):
continue
item = parse_stream(stream_iter, char)
item = classify_steam(stream_iter, char)
array.append(item)

return array
Expand Down
27 changes: 17 additions & 10 deletions PDFParser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from PDFObjects import IndirectObjectRef
from PDFStructureObjects import *
import re
from objectsParser import parse_stream
from PDFObjectsParser import classify_steam
from utills import ObjectIter
from tqdm import tqdm
import pickle
Expand All @@ -25,15 +25,21 @@ def __init__(self, filePath):
def _extractXrefAddress(self):
self.file.seek(-3, io.SEEK_END)
count = 0
letter = self.file.read(1)
while not letter.isdigit():  # Scan backwards from the end of the file until the first digit is found
self.file.seek(-2, io.SEEK_CUR)
letter = self.file.read(1)

while count != 2:
xrefAddress = b""
while True:
if not letter.isdigit():
break
xrefAddress +=letter
self.file.seek(-2, io.SEEK_CUR)
char = self.file.read(1).decode("utf-8")
count += char in "\r\n"
letter = self.file.read(1)

self.trailer_end = self.file.tell()
line = self.file.readline()
raw = re.search(b"\d+",line).group(0)
xrefAddress = int(raw)
xrefAddress = int(xrefAddress[::-1])
return xrefAddress

def xRefExtractor(self, xrefAddress):
Expand All @@ -57,7 +63,7 @@ def trailer_parser(self):

self.trailerStart = self.file.tell()
content = self.file.read(self.trailer_end - 10 - self.trailerStart)
trailer_dict = parse_stream(ObjectIter(content))
trailer_dict = classify_steam(ObjectIter(content))
if (b"/Prev" in trailer_dict):
prevXref = int(trailer_dict[b"/Prev"])
self.xRefExtractor(prevXref)
Expand Down Expand Up @@ -113,7 +119,7 @@ def extract_object(self, number):
endIndex = is_obj if is_obj + 1 \
else current_line.find(bytes("stream", "utf-8"))
object_stream += current_line[:endIndex]
thing = parse_stream(ObjectIter(object_stream))
thing = classify_steam(ObjectIter(object_stream))
if not (is_obj + 1):
ob = PDFStream(thing, num, rev, self.file.tell(), inuse)
if (type(ob.length) == IndirectObjectRef):
Expand Down Expand Up @@ -184,7 +190,8 @@ def increment_refrences(self, n: int):


if __name__ == '__main__':
pdf = PDFParser("test_pdfs/keyboard-shortcuts-linux.pdf")
pdf = PDFParser("test_pdfs/PDF-Specifications.pdf")
pdf.clone()
# with open("test_pdfs/Python for Data Analysis, 2nd Edition.pdf","rb") as r:
# with open("Refrased.pdf","wb+") as w:
# file = r.read()
Expand Down
32 changes: 16 additions & 16 deletions _tests/objectsParser_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from objectsParser import *
from PDFObjectsParser import *


class TestArray:
Expand All @@ -11,12 +11,12 @@ def test_simple(self):
simple5 = b"[/Train (KEY) (Len(Pi))]"
simple6 = b"[null false true]"

assert parse_stream(ObjectIter(simple1)) == PDFArray([b"1", b"2", b"3", b"4", b"5"])
assert parse_stream(ObjectIter(simple2)) == PDFArray([b"1", IndirectObjectRef(2)])
assert parse_stream(ObjectIter(simple3)) == PDFArray([IndirectObjectRef(15)])
assert parse_stream(ObjectIter(simple4)) == PDFArray([b"484", b"9874", b"618", b"798"])
assert parse_stream(ObjectIter(simple5)) == PDFArray([b"/Train", b"(KEY)", b"(Len(Pi))"])
assert parse_stream(ObjectIter(simple6)) == PDFArray([b"null", b"false", b"true"])
assert classify_steam(ObjectIter(simple1)) == PDFArray([b"1", b"2", b"3", b"4", b"5"])
assert classify_steam(ObjectIter(simple2)) == PDFArray([b"1", IndirectObjectRef(2,0)])
assert classify_steam(ObjectIter(simple3)) == PDFArray([IndirectObjectRef(15,0)])
assert classify_steam(ObjectIter(simple4)) == PDFArray([b"484", b"9874", b"618", b"798"])
assert classify_steam(ObjectIter(simple5)) == PDFArray([b"/Train", b"(KEY)", b"(Len(Pi))"])
assert classify_steam(ObjectIter(simple6)) == PDFArray([b"null", b"false", b"true"])

def test_nested(self):
nested1 = b"[1 2 3 [4 5 6]]"
Expand All @@ -25,24 +25,24 @@ def test_nested(self):
nested4 = b"[1 2 3 [4 [5] 6]]"
nested5 = b"[1 20 318 [4 [-5.497] 6]]"

assert parse_stream(ObjectIter(nested1)) == PDFArray(
assert classify_steam(ObjectIter(nested1)) == PDFArray(
[b'1', b'2', b'3', PDFArray( [b'4', b'5', b'6'])])
assert parse_stream(ObjectIter(nested2)) == PDFArray([b"1", PDFArray([b"4", b"5", b"6"]),b"5", b"8"])
assert parse_stream(ObjectIter(nested3)) == PDFArray(
assert classify_steam(ObjectIter(nested2)) == PDFArray([b"1", PDFArray([b"4", b"5", b"6"]), b"5", b"8"])
assert classify_steam(ObjectIter(nested3)) == PDFArray(
[b"1", PDFArray([b"2", b"3"]), PDFArray([b"4", b"5", b"6"])])
assert parse_stream(ObjectIter(nested4)) == PDFArray([b"1", b"2", b"3" ,PDFArray([b"4", PDFArray([b"5"]),b"6"])])
assert parse_stream(ObjectIter(nested5)) == PDFArray([b'1', b'20', b'318', PDFArray([b'4', PDFArray([b'-5.497']), b'6'])])
assert classify_steam(ObjectIter(nested4)) == PDFArray([b"1", b"2", b"3" , PDFArray([b"4", PDFArray([b"5"]), b"6"])])
assert classify_steam(ObjectIter(nested5)) == PDFArray([b'1', b'20', b'318', PDFArray([b'4', PDFArray([b'-5.497']), b'6'])])

def test_empty(self):
empty1 = b"[]"
empty2 = b"[[]]"
empty3 = b"[[[]]]"
empty4 = b"[[] [] [[]]]"

assert parse_stream(ObjectIter(empty1))==PDFArray([])
assert parse_stream(ObjectIter(empty2))==PDFArray([PDFArray([])])
assert parse_stream(ObjectIter(empty3))==PDFArray([PDFArray([PDFArray([])])])
assert parse_stream(ObjectIter(empty4))==PDFArray([PDFArray([]),PDFArray([]),PDFArray([PDFArray([])])])
assert classify_steam(ObjectIter(empty1)) == PDFArray([])
assert classify_steam(ObjectIter(empty2)) == PDFArray([PDFArray([])])
assert classify_steam(ObjectIter(empty3)) == PDFArray([PDFArray([PDFArray([])])])
assert classify_steam(ObjectIter(empty4)) == PDFArray([PDFArray([]), PDFArray([]), PDFArray([PDFArray([])])])



Expand Down
1 change: 0 additions & 1 deletion docs/source/PDFObjects.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

PDF Objects
===============
.. automodule:: PDFObjects
Expand Down
4 changes: 4 additions & 0 deletions docs/source/PDFObjectsParser.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PDFObjectsParser
================
.. automodule:: PDFObjectsParser
:members:
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ print("Hello Sphinx")

license
PDFObjects
PDFObjectsParser


Indices and tables
Expand Down

0 comments on commit b810b06

Please sign in to comment.