Skip to content

Commit

Permalink
Merge pull request #31 from JonathanLink/issue-#26
Browse files Browse the repository at this point in the history
Issue #26
  • Loading branch information
JonathanLink committed Apr 22, 2019
2 parents ec238cd + 5404e24 commit 6c4f407
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.DS_Store
pom.xml.releaseBackup
pom.xml.versionsBackup
release.properties
target/
target/
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ Data extraction from a form in a PDF file

## How to install

### Maven
```
<dependency>
<groupId>io.github.jonathanlink</groupId>
<artifactId>PDFLayoutTextStripper</artifactId>
<version>2.2.3</version>
</dependency>
```

### Manual
1) Install **Apache PDFBox** manually ([click here to get v2.0.6](https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox/2.0.6)) together with its two dependencies,
commons-logging.jar and fontbox

Expand Down
20 changes: 18 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
<modelVersion>4.0.0</modelVersion>
<groupId>io.github.jonathanlink</groupId>
<artifactId>PDFLayoutTextStripper</artifactId>
<version>2.3.0-SNAPSHOT</version>
<version>2.2.3</version>



<distributionManagement>
<snapshotRepository>
Expand Down Expand Up @@ -95,15 +97,29 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.7</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<serverId>sonatype-nexus-snapshots</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
<arguments>-Dmaven.javadoc.skip=true</arguments>
</configuration>
</plugin>
</plugins>
Expand Down
16 changes: 16 additions & 0 deletions src/main/java/io/github/jonathanlink/PDFLayoutTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
*
*/


package io.github.jonathanlink;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
Expand All @@ -55,6 +58,12 @@
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;

/**
* Javadoc to be completed.
*
* @author Jonathan Link
*
*/
public class PDFLayoutTextStripper extends PDFTextStripper {

public static final boolean DEBUG = false;
Expand All @@ -64,12 +73,19 @@ public class PDFLayoutTextStripper extends PDFTextStripper {
private TextPosition previousTextPosition;
private List<TextLine> textLineList;

/**
 * Creates a stripper whose list of text lines starts out empty and whose
 * previous text position is unset ({@code null}).
 *
 * @throws IOException if the superclass constructor fails
 */
public PDFLayoutTextStripper() throws IOException {
    super();
    // Fresh per-instance state: no lines collected yet, no position seen yet.
    textLineList = new ArrayList<TextLine>();
    previousTextPosition = null;
}

/**
*
* @param page page to parse
*/
@Override
public void processPage(PDPage page) throws IOException {
PDRectangle pageRectangle = page.getMediaBox();
Expand Down

0 comments on commit 6c4f407

Please sign in to comment.