Skip to content

Commit

Permalink
Changes to upgrade to using org.apache.pdfbox 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
JamesSullivan committed May 31, 2017
1 parent d7c84c6 commit cbe6ded
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 51 deletions.
38 changes: 20 additions & 18 deletions PDFLayoutTextStripper.java
Expand Up @@ -2,9 +2,11 @@
* Author: Jonathan Link
* Email: jonathanlink[d o t]email[a t]gmail[d o t]com
* Date of creation: 13.11.2014
* Version: 0.1
* Version: 0.2
* Description:
*
* Version 0.2 uses PDFBox 2.x. Version 0.1 used PDFBox 1.8.x
*
* What does it DO:
* This object converts the content of a PDF file into a String.
* The layout of the text is reproduced as closely as possible to that of the input PDF.
Expand Down Expand Up @@ -50,9 +52,9 @@
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.pdfbox.util.TextPositionComparator;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;

public class PDFLayoutTextStripper extends PDFTextStripper {

Expand All @@ -70,19 +72,19 @@ public PDFLayoutTextStripper() throws IOException {
}

@Override
protected void processPage(PDPage page, COSStream content ) throws IOException {
PDRectangle pageRectangle = page.findMediaBox();
public void processPage(PDPage page) throws IOException {
PDRectangle pageRectangle = page.getMediaBox();
if (pageRectangle!= null) {
this.setCurrentPageWidth(pageRectangle.getWidth());
super.processPage(page, content);
super.processPage(page);
this.previousTextPosition = null;
this.textLineList = new ArrayList<TextLine>();
}
}

@Override
protected void writePage() throws IOException {
Vector<List<TextPosition>> charactersByArticle = super.getCharactersByArticle();
List<List<TextPosition>> charactersByArticle = super.getCharactersByArticle();
for( int i = 0; i < charactersByArticle.size(); i++) {
List<TextPosition> textList = charactersByArticle.get(i);
this.sortTextPositionList(textList);
Expand Down Expand Up @@ -172,10 +174,10 @@ private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textP
return 1;
}

double textYPosition = Math.round( textPosition.getTextPos().getYPosition() );
double previousTextYPosition = Math.round( previousTextPosition.getTextPos().getYPosition() );
float textYPosition = Math.round( textPosition.getY() );
float previousTextYPosition = Math.round( previousTextPosition.getY() );

if ( textYPosition < previousTextYPosition ) {
if ( textYPosition > previousTextYPosition ) {
double height = textPosition.getHeight();
int numberOfLines = (int) (Math.floor( previousTextYPosition - textYPosition) / height );
numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line
Expand Down Expand Up @@ -400,7 +402,7 @@ public Character createCharacterFromTextPosition(final TextPosition textPosition
this.isCharacterAtTheBeginningOfNewLine = this.isCharacterAtTheBeginningOfNewLine(textPosition);
this.isCharacterCloseToPreviousWord = this.isCharacterCloseToPreviousWord(textPosition);
char character = this.getCharacterFromTextPosition(textPosition);
int index = (int)textPosition.getTextPos().getXPosition() / PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
int index = (int)textPosition.getX() / PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
return new Character(character,
index,
isCharacterPartOfPreviousWord,
Expand All @@ -414,8 +416,8 @@ private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPositi
return true;
}
TextPosition previousTextPosition = this.getPreviousTextPosition();
double previousTextYPosition = previousTextPosition.getTextPos().getYPosition();
return ( Math.round( textPosition.getTextPos().getYPosition() ) < Math.round(previousTextYPosition) );
float previousTextYPosition = previousTextPosition.getY();
return ( Math.round( textPosition.getY() ) < Math.round(previousTextYPosition) );
}

private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
Expand All @@ -437,25 +439,25 @@ private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition)

private boolean isCharacterPartOfPreviousWord(final TextPosition textPosition) {
TextPosition previousTextPosition = this.getPreviousTextPosition();
if ( previousTextPosition.getCharacter().equals(" ") ) {
if ( previousTextPosition.getUnicode().equals(" ") ) {
return false;
}
double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
return (numberOfSpaces <= 1);
}

private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPosition1, final TextPosition textPosition2) {
double previousTextXPosition = textPosition1.getTextPos().getXPosition();
double previousTextXPosition = textPosition1.getX();
double previousTextWidth = textPosition1.getWidth();
double previousTextEndXPosition = (previousTextXPosition + previousTextWidth);
double numberOfSpaces = Math.abs(Math.round(textPosition2.getTextPos().getXPosition() - previousTextEndXPosition));
double numberOfSpaces = Math.abs(Math.round(textPosition2.getX() - previousTextEndXPosition));
return numberOfSpaces;
}



private char getCharacterFromTextPosition(final TextPosition textPosition) {
String string = textPosition.getCharacter();
String string = textPosition.getUnicode();
char character = string.charAt(0);
return character;
}
Expand Down
Binary file modified PDFLayoutTextStripper.tar.gz
Binary file not shown.
43 changes: 10 additions & 33 deletions README.md
Expand Up @@ -15,42 +15,19 @@ Data extraction from a form in a PDF file

## How to install

1) Install **apache pdfbox** through Maven ([to get the v1.8.13 click here](https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox/1.8.13) )
1) Install **Apache PDFBox** manually ([get v2.0.6 here](https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox/2.0.6)), together with its two dependencies,
`commons-logging` and `fontbox`

>**warning**: currently only pdfbox versions **strictly inferior to version 2.0.0** are compatible with PDFLayoutTextStripper.java
>**Warning**: only PDFBox versions **2.0.0 and later** are compatible with this version of PDFLayoutTextStripper.java

2) Copy **PDFLayoutTextStripper.java** inside your main java package

## How to use
### How to use on Linux
```
package pdftest.pt;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class test {
public static void main(String[] args) {
String string = null;
try {
PDFParser pdfParser = new PDFParser(new FileInputStream("sample.pdf"));
pdfParser.parse();
PDDocument pdDocument = new PDDocument(pdfParser.getDocument());
PDFTextStripper pdfTextStripper = new PDFLayoutTextStripper();
string = pdfTextStripper.getText(pdDocument);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
};
System.out.println(string);
}
}
cd PDFLayoutTextStripper
javac -cp .:/pathto/pdfbox-2.0.6.jar:/pathto/commons-logging-1.2.jar:/pathto/PDFLayoutTextStripper/fontbox-2.0.6.jar *.java
java -cp .:/pathto/pdfbox-2.0.6.jar:/pathto/commons-logging-1.2.jar:/pathto/PDFLayoutTextStripper/fontbox-2.0.6.jar test
```

### How to use on Windows

Same as for Linux (see above), but replace the classpath separator `:` with `;`.
30 changes: 30 additions & 0 deletions test.java
@@ -0,0 +1,30 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;



/**
 * Minimal command-line demo: extracts the text of {@code ./samples/bus.pdf}
 * with {@link PDFLayoutTextStripper} and prints it to standard output.
 */
public class test {

    /**
     * Loads the sample PDF, runs the layout-preserving text stripper over it
     * and prints the extracted text.
     *
     * <p>If the file cannot be read or parsed, the stack trace is printed and
     * {@code null} is written to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String string = null;
        // try-with-resources guarantees the document (and the file handle
        // behind it) is released on both the success and the failure path.
        try (PDDocument pdDocument = PDDocument.load(new File("./samples/bus.pdf"))) {
            PDFTextStripper pdfTextStripper = new PDFLayoutTextStripper();
            string = pdfTextStripper.getText(pdDocument);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single handler
            // covers both the missing-file and the parse-error case.
            e.printStackTrace();
        }
        System.out.println(string);
    }

}

0 comments on commit cbe6ded

Please sign in to comment.