forked from LSFLK/GIG-SDK
/
pdf_parser.go
91 lines (74 loc) · 1.78 KB
/
pdf_parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package libraries
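/*
 * PDF to text: extract all text for each page of a PDF file.
 *
 * N.B. Only outputs character codes as seen in the content stream. Text encoding must be taken into account to get
 * readable text in many cases.
 *
 * Usage: call ParsePdf(path) from importing code. This file is part of package libraries and is not a standalone
 * program, so the earlier "go run pdf_extract_text.go input.pdf" instruction does not apply here.
 */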
import (
pdfcontent "github.com/unidoc/unidoc/pdf/contentstream"
pdf "github.com/unidoc/unidoc/pdf/model"
"log"
"os"
)
// NewPageMarker is the separator that listContentStreams writes before the text of each page.
const NewPageMarker = "\n*******************\n"
// ParsePdf returns the text content of the PDF file at the path given by
// source.
//
// Any error reported by listContentStreams is written to the standard logger
// and an empty string is returned instead of the text.
func ParsePdf(source string) string {
	content, parseErr := listContentStreams(source)
	if parseErr == nil {
		return content
	}
	log.Println(parseErr)
	return ""
}
// listContentStreams opens the PDF file at inputPath and returns the text
// extracted from every page, in page order. The text of each page is preceded
// by NewPageMarker.
//
// If the reader reports the document as encrypted, decryption with an empty
// password is attempted before any page is read.
//
// The first error encountered (opening the file, creating the reader,
// decrypting, fetching a page or its content streams, or extracting text) is
// returned. Errors that occur while iterating pages are returned together with
// the text gathered from the pages processed before the failure; earlier
// errors are returned with an empty string.
func listContentStreams(inputPath string) (string, error) {
	f, err := os.Open(inputPath)
	if err != nil {
		return "", err
	}
	defer f.Close()

	pdfReader, err := pdf.NewPdfReader(f)
	if err != nil {
		return "", err
	}

	isEncrypted, err := pdfReader.IsEncrypted()
	if err != nil {
		return "", err
	}
	if isEncrypted {
		// NOTE(review): the first return value of Decrypt is discarded here, as
		// in the original code; presumably it reports whether the empty
		// password was accepted - confirm against the unidoc documentation.
		if _, err = pdfReader.Decrypt([]byte("")); err != nil {
			return "", err
		}
	}

	numPages, err := pdfReader.GetNumPages()
	if err != nil {
		return "", err
	}

	// Accumulate into a byte slice so that appending each page does not copy
	// everything collected so far, as repeated string concatenation would.
	var out []byte
	for pageNum := 1; pageNum <= numPages; pageNum++ {
		page, err := pdfReader.GetPage(pageNum)
		if err != nil {
			return string(out), err
		}

		contentStreams, err := page.GetContentStreams()
		if err != nil {
			return string(out), err
		}

		// If the value is an array, the effect shall be as if all of the streams in the array were concatenated,
		// in order, to form a single stream.
		var pageContent []byte
		for _, cstream := range contentStreams {
			pageContent = append(pageContent, cstream...)
		}

		cstreamParser := pdfcontent.NewContentStreamParser(string(pageContent))
		txt, err := cstreamParser.ExtractText()
		if err != nil {
			// Previously this error was assigned to a loop-scoped variable and
			// never inspected, so extraction failures were silently ignored.
			return string(out), err
		}

		out = append(out, NewPageMarker...)
		out = append(out, txt...)
	}
	return string(out), nil
}