Skip to content

Commit

Permalink
Title indexing fix
Browse files: browse the repository at this point in the history
  • Loading branch information
JulianEberius committed Mar 24, 2015
1 parent a889e26 commit b6367c9
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/webreduce/data/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ public class Dataset implements Serializable {
public HeaderPosition headerPosition = null; // position of those th tags
public TableType tableType = null; // table classification (entity,
// relational, matrix ...)

public String[] termSet = null; // top-terms extracted from the source page

// metadata used to identify and locate a table in the CC corpus
public int tableNum = -1; // index of the table in the list of tables on the
// original page
public String s3Link = ""; // link into S3
public long recordEndOffset = -1; // offsets into the CC file
public long recordOffset = -1;
public String[] termSet = null; // top-terms extracted from the source page


/*
* the following attributes are not set in the raw data, but are set by the
Expand Down
2 changes: 2 additions & 0 deletions src/webreduce/indexing/Indexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ protected void processDataset(Dataset er) throws IOException {
}

String title = er.getTitle();
if (title == null)
title = er.getPageTitle();
if (title == null)
title = "";

Expand Down

0 comments on commit b6367c9

Please sign in to comment.