/
jhove-verapdf-validation-run.py
240 lines (181 loc) · 6.54 KB
/
jhove-verapdf-validation-run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#! /usr/bin/env python3
import os
import sys
import glob
import argparse
import subprocess as sub
import xml.etree.ElementTree as ET
import pandas as pd
from tabulate import tabulate
"""
This script runs both JHOVE and VeraPDF on all files with a .pdf extension
in the input directory, and then extracts information that allows for a
comparison between JHOVE validation status and VeraPDF parse errors and
logged warnings. Results are summarised in a CSV file that can be further
analyzed with the script jhove-verapdf-validation-analyze.py.
Python requirements:
- Pandas (https://pypi.org/project/pandas/)
- tabulate (https://pypi.org/project/tabulate/)
Other requirements:
- JHOVE (tested with v. 1.28.0)
- veraPDF (tested with v. 1.22.3)
"""
# Locations of the JHOVE and VeraPDF launchers. The built-in defaults can be
# overridden without editing this file by setting the JHOVE_BIN and/or
# VERAPDF_BIN environment variables; unset or empty variables fall back to
# the defaults below.
jhoveBin = os.path.abspath(
    os.environ.get("JHOVE_BIN") or "/home/johan/jhove/jhove")
veraPDFBin = os.path.abspath(
    os.environ.get("VERAPDF_BIN") or "/home/johan/verapdf/verapdf")

# Module-level argument parser; parseCommandLine() registers the arguments
# on it and uses it to parse the command line
parser = argparse.ArgumentParser()
def parseCommandLine():
    """
    Register the command-line arguments on the module-level parser and
    parse the command line

    Returns the argparse namespace, which carries the attributes dirIn,
    dirOut and existingOutputFlag. The arguments are added to the shared
    parser on every call, so calling this function a second time would try
    to register them again.
    """
    # Add arguments
    parser.add_argument('dirIn',
                        action="store",
                        type=str,
                        help="input directory")
    parser.add_argument('dirOut',
                        action="store",
                        type=str,
                        help="output directory")
    parser.add_argument('--existingoutput', '-e',
                        action="store_true",
                        dest="existingOutputFlag",
                        default=False,
                        help="don't run JHOVE and VeraPDF, but use existing output")

    # Parse arguments
    args = parser.parse_args()

    return args
def errorExit(msg):
    """Write msg to stderr, prefixed with "Error: ", and exit with status 1"""
    # join() only accepts strings, so a non-string msg is rejected just as
    # plain string concatenation would reject it
    line = "".join(("Error: ", msg, "\n"))
    sys.stderr.write(line)
    raise SystemExit(1)
def runJhove(jhoveBin, fileIn, fileOut):
    """
    Run JHOVE on one PDF

    jhoveBin: path of the JHOVE executable
    fileIn: path of the PDF to process
    fileOut: output path, handed to JHOVE through its -o option

    The command line is: jhoveBin -m PDF-hul -h XML -i fileIn -o fileOut.
    JHOVE's console output and exit status are not inspected, and nothing
    is returned.
    """
    args = [
        jhoveBin,
        '-m', 'PDF-hul',
        '-h', 'XML',
        '-i', fileIn,
        '-o', fileOut,
    ]

    # stdout and stderr are captured and then discarded, which keeps the
    # tool's messages off the console
    sub.run(args, stdout=sub.PIPE, stderr=sub.PIPE, shell=False)
def runVeraPDF(veraPDFBin, fileIn, fileOut):
    """
    Run VeraPDF on one PDF and save what it prints on stdout

    veraPDFBin: path of the VeraPDF executable
    fileIn: path of the PDF to process
    fileOut: path of the file that receives VeraPDF's stdout (raw bytes)

    The command line is: veraPDFBin --off --addlogs --extract fileIn.
    VeraPDF's stderr and exit status are not inspected, and nothing is
    returned.
    """
    args = [veraPDFBin, '--off', '--addlogs', '--extract', fileIn]

    result = sub.run(args, stdout=sub.PIPE, stderr=sub.PIPE, shell=False)

    # Write output (stdout) to file; binary mode stores the bytes exactly
    # as the tool emitted them
    with open(fileOut, 'wb') as f:
        f.write(result.stdout)
def getJhoveResults(fileIn):
    """
    Return the text of the status element found in a JHOVE XML output file

    The first repInfo element (JHOVE namespace) anywhere in the document is
    located, and the text of the first status element below it is returned.
    No check is made that either element exists.
    """
    document = ET.parse(fileIn).getroot()
    report = document.find(".//{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo")
    return report.find(".//{http://schema.openpreservation.org/ois/xml/ns/jhove}status").text
def getVeraPDFResults(fileIn):
    """
    Return two Boolean flags that indicate if VeraPDF output contains any
    parse errors or logged warnings

    fileIn: path of an XML file with VeraPDF output

    Returns the tuple (parseErrors, logWarnings):
    - parseErrors is True if the first job element contains a taskResult
      with type "PARSE" and isSuccess "false"
    - logWarnings is True if the first job element contains a
      logs/logMessage with level "WARNING"

    Only WARNING-level log messages are reported; other levels (such as
    ERROR) are ignored. If the document has no job element, both flags are
    False.
    """
    root = ET.parse(fileIn).getroot()
    job = root.find(".//job")

    # Without a job element there is nothing to inspect
    if job is None:
        return False, False

    parseErrors = any(
        taskResult.get("type") == "PARSE"
        and taskResult.get("isSuccess") == "false"
        for taskResult in job.findall(".//taskResult")
    )

    logWarnings = any(
        logMessage.get("level") == "WARNING"
        for logMessage in job.findall(".//logs/logMessage")
    )

    return parseErrors, logWarnings
def main():
    """Main processing loop

    Reads the command-line arguments, runs JHOVE and VeraPDF on every *.pdf
    file directly inside the input directory (skipped when the existing
    output flag is set), extracts the results from the per-file XML output,
    and writes one row per PDF to data.csv in the output directory.
    """

    # User input
    args = parseCommandLine()
    dirIn = os.path.abspath(args.dirIn)
    dirOut = os.path.abspath(args.dirOut)
    existingOutputFlag = args.existingOutputFlag

    # Check if input directory exists
    if not os.path.isdir(dirIn):
        errorExit("input directory does not exist")

    # Create output directory if it doesn't exist already
    os.makedirs(dirOut, exist_ok=True)

    # Create dictionary that will contain extracted data
    dataDict = {
        "fileName": [],
        "jhoveStatus": [],
        "veraParseErrors": [],
        "veraLogWarnings": [],
    }

    # Create list of all files with .pdf extension in dirIn. The directory
    # part is escaped so that glob metacharacters in its name are matched
    # literally, and the list is sorted so that the row order of the CSV
    # does not depend on the order in which the file system lists the files
    pdfsIn = sorted(glob.glob(os.path.join(glob.escape(dirIn), '*.pdf')))

    # Process all files, and add results to dictionary
    for pdfIn in pdfsIn:
        # Strip path to get file name
        fileName = os.path.basename(pdfIn)
        # Strip file extension to get base name
        baseName = os.path.splitext(fileName)[0]

        # Generate JHOVE and VeraPDF output file names
        outJhove = os.path.join(dirOut, baseName + "-jhove.xml")
        outVeraPDF = os.path.join(dirOut, baseName + "-vera.xml")

        if not existingOutputFlag:
            # Run JHOVE and VeraPDF
            runJhove(jhoveBin, pdfIn, outJhove)
            runVeraPDF(veraPDFBin, pdfIn, outVeraPDF)

        # Get JHOVE validation status from output file
        try:
            jhoveStatus = getJhoveResults(outJhove)
        except FileNotFoundError:
            errorExit("JHOVE output files not found, try running without --existingoutput option")

        # Get Boolean flags that indicate parse errors or log warnings
        # in VeraPDF output file
        try:
            veraParseErrors, veraLogWarnings = getVeraPDFResults(outVeraPDF)
        except FileNotFoundError:
            errorExit("VeraPDF output files not found, try running without --existingoutput option")

        # Update data dictionary
        dataDict["fileName"].append(fileName)
        dataDict["jhoveStatus"].append(jhoveStatus)
        dataDict["veraParseErrors"].append(veraParseErrors)
        dataDict["veraLogWarnings"].append(veraLogWarnings)

    # Convert dictionary to dataframe
    df = pd.DataFrame(dataDict)

    # Write all data to a CSV file
    csvOut = os.path.join(dirOut, "data.csv")
    df.to_csv(csvOut, encoding='utf-8', index=False)
# Run main() only when this file is executed as a script, not when it is
# imported as a module
if __name__ == "__main__":
    main()