Switch Travis to Ubuntu 16.04
This also pins mecab for Japanese test consistency and simplifies the
Japanese test files. We also now use Poppler v0.62.0 (the default for
Ubuntu 18.04).
lukehsiao committed Nov 15, 2018
1 parent 78651b0 commit 14a3f48
Showing 6 changed files with 344 additions and 260 deletions.
23 changes: 13 additions & 10 deletions .travis.yml
@@ -1,5 +1,5 @@
 language: generic
-dist: trusty
+dist: xenial
 cache: pip

 matrix:
@@ -31,7 +31,7 @@ before_install:
   - if [ "$TRAVIS_OS_NAME" = "linux" ]; then OS=Linux-x86_64; else OS=MacOSX-x86_64; fi
   - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y libmecab-dev swig mecab mecab-ipadic-utf8; fi
   - if [ "$TRAVIS_OS_NAME" = "osx" ]; then brew update;fi
-  - if [ "$TRAVIS_OS_NAME" = "osx" ]; then brew install mecab mecab-ipadic freetype; fi
+  - if [ "$TRAVIS_OS_NAME" = "osx" ]; then brew install swig mecab mecab-ipadic freetype; fi
   - wget -O miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-$OS.sh
   - bash miniconda.sh -b -p $HOME/miniconda
   - export PATH="$HOME/miniconda/bin:$PATH"
@@ -71,19 +71,22 @@ install:
   - pip install -q coveralls
   - |
     if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-      wget poppler.freedesktop.org/poppler-0.53.0.tar.xz
-      tar -xf ./poppler-0.53.0.tar.xz
-      cd poppler-0.53.0
-      "./configure"
-      make
-      sudo make install > /dev/null 2>&1
-      cd ..
-      rm -rf poppler-0.53.0
+      sudo apt install cmake checkinstall libopenjp2-7-dev
+      wget https://poppler.freedesktop.org/poppler-0.62.0.tar.xz
+      tar -xf poppler-0.62.0.tar.xz
+      cd poppler-0.62.0
+      mkdir build
+      cd build
+      cmake ..
+      sudo checkinstall -y make install > /dev/null 2>&1
+      cd ../..
+      rm -rf poppler-0.62.0
       export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
       echo "Using LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
     else
       brew install poppler
     fi
+  - pdfinfo -v
   - python -m spacy download en

 before_script:
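As a side note, the new ``install`` step ends with ``pdfinfo -v`` to confirm which Poppler the build actually picked up. A minimal Python sketch of the same check, useful when debugging a local build (an illustration, not part of this diff):

    import re
    import subprocess

    # pdfinfo prints its version banner, e.g. "pdfinfo version 0.62.0",
    # on stderr rather than stdout.
    result = subprocess.run(
        ["pdfinfo", "-v"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    banner = result.stderr or result.stdout
    match = re.search(r"pdfinfo version (\d+\.\d+\.\d+)", banner)
    print("Poppler version:", match.group(1) if match else "unknown")  # expect 0.62.0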
16 changes: 8 additions & 8 deletions docs/user/faqs.rst
@@ -66,27 +66,27 @@ How can I use Fonduer for documents in languages other than English?
 ------------------------------------------------------------------------

 If available, Fonduer uses languages supported by spaCy for tokenization and
-its NLP pipeline (`see spacy language support`_).
-We also started adding languages with spaCy alpha support for tokenization
-(`see spacy alpha languages`_). Currently, only Japanese is supported.
+its NLP pipeline (`see spacy language support`_). We also started adding
+languages with spaCy alpha support for tokenization (`see spacy alpha
+languages`_). Currently, only Japanese is supported.

 If you would like to use Fonduer for Japanese documents, you will first have
 to install some additional packages (`see mecab on PyPI`_).

 For Linux::

-    $ sudo apt-get install libmecab-dev
+    $ sudo apt-get install swig libmecab-dev
     $ sudo apt-get install mecab mecab-ipadic-utf8

 For OS X::

-    $ brew install mecab
+    $ brew install swig mecab
     $ brew install mecab-ipadic

 Afterwards, you can use ``pip install fonduer[spacy_ja]`` to install Fonduer
-with Japanese language support.
-If you would like to use other languages with spaCy alpha support, which are
-not yet integrated in Fonduer, feel free to open an Issue_.
+with Japanese language support. If you would like to use other languages with
+spaCy alpha support, which are not yet integrated in Fonduer, feel free to open
+an Issue_.

 .. _see spacy language support: https://spacy.io/usage/models#languages
 .. _see spacy alpha languages: https://spacy.io/usage/models#alpha-support
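As a quick smoke test that the MeCab toolchain above is wired up correctly, the bindings can be exercised directly. A minimal sketch, assuming ``mecab``, ``mecab-ipadic-utf8``, and the ``mecab-python3`` bindings are installed as described:

    import MeCab  # provided by the mecab-python3 package

    # -Owakati requests wakati-gaki output: the sentence as space-separated tokens.
    tagger = MeCab.Tagger("-Owakati")
    print(tagger.parse("日本語の文を単語に分割します。").strip())

If this prints the sentence split into space-separated tokens, the Japanese tokenizer has everything it needs.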
2 changes: 1 addition & 1 deletion setup.py
@@ -30,7 +30,7 @@
         "treedlib",
         "wand",
     ],
-    extras_require={"spacy_ja": ["mecab-python3"]},
+    extras_require={"spacy_ja": ["mecab-python3==0.7"]},
     keywords=["fonduer", "knowledge base construction", "richly formatted data"],
     include_package_data=True,
     url="https://github.com/HazyResearch/fonduer",
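Pinning ``mecab-python3`` to 0.7 keeps Japanese tokenization output stable across CI runs. A one-liner sketch (not part of the diff) to confirm the pin resolved in a given environment:

    import pkg_resources

    # Raises DistributionNotFound if the spacy_ja extra was never installed.
    print(pkg_resources.get_distribution("mecab-python3").version)  # expect 0.7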
44 changes: 21 additions & 23 deletions src/fonduer/parser/spacy_parser.py
Expand Up @@ -60,25 +60,21 @@ def __init__(self, lang):
# self.model = self.load_lang_model()

def has_tokenizer_support(self):
if self.lang and (self.has_NLP_support() or self.lang in self.alpha_languages):
return True
else:
return False
return self.lang and (
self.has_NLP_support() or self.lang in self.alpha_languages
)

def has_NLP_support(self):
if self.lang and (self.lang in self.languages):
return True
else:
return False
return self.lang and (self.lang in self.languages)

@staticmethod
def is_package(name):
"""Check if string maps to a package installed via pip.
name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not.
From https://github.com/explosion/spaCy/blob/master/spacy/util.py
"""
name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
@@ -89,8 +85,7 @@ def is_package(name):

     @staticmethod
     def model_installed(name):
-        """
-        Check if spaCy language model is installed
+        """Check if spaCy language model is installed.

         From https://github.com/explosion/spaCy/blob/master/spacy/util.py
@@ -110,8 +105,8 @@ def model_installed(name):

     def load_lang_model(self):
         """
-        Load spaCy language model or download if
-        model is available and not installed
+        Load spaCy language model or download if model is available and not
+        installed.

         Currently supported spaCy languages
@@ -172,8 +167,9 @@ def set_custom_boundary(doc):

     def enrich_sentences_with_NLP(self, all_sentences):
         """
-        Enrich a list of fonduer Sentence objects with NLP features.
-        We merge and process the text of all Sentences for higher efficiency
+        Enrich a list of fonduer Sentence objects with NLP features. We merge
+        and process the text of all Sentences for higher efficiency.
+
         :param all_sentences: List of fonduer Sentence objects for one document
         :return:
         """
@@ -276,8 +272,9 @@ def enrich_sentences_with_NLP(self, all_sentences):

     def split_sentences(self, document, text):
         """
-        Split input text into sentences that match CoreNLP's
-        default format, but are not yet processed
+        Split input text into sentences that match CoreNLP's default format,
+        but are not yet processed.
+
         :param document: The Document context
         :param text: The text of the parent paragraph of the sentences
         :return:
@@ -339,13 +336,14 @@ def split_sentences(self, document, text):

 class TokenPreservingTokenizer(object):
     """
-    This custom tokenizer simply preserves the
-    tokenization that was already performed during sentence splitting.
-    It will output a list of space separated tokens, whereas each token
-    is a single word from the list of sentences.
+    This custom tokenizer simply preserves the tokenization that was already
+    performed during sentence splitting. It will output a list of space
+    separated tokens, whereas each token is a single word from the list of
+    sentences.
+
     :param vocab: The vocab attribute of the respective spacy language object
-    :param tokenized_sentences: A list of sentences that
-    was previously tokenized/split by spacy
+    :param tokenized_sentences: A list of sentences that was previously
+    tokenized/split by spacy
     :return:
     """

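One subtlety of the boolean simplifications above: in Python, ``x and y`` evaluates to one of its operands, so the refactored helpers return ``self.lang`` itself (e.g. ``None`` or ``""``) rather than a strict ``False`` when the language is unset. A standalone sketch of the behavior, with hypothetical names and ``bool()`` shown as one way to normalize if a caller ever needs a real boolean:

    def has_support(lang, languages):
        # Mirrors the refactored pattern: `lang and (...)` yields `lang`
        # itself whenever `lang` is falsy.
        return lang and (lang in languages)

    def has_support_strict(lang, languages):
        # bool() collapses the result to exactly True or False.
        return bool(lang and (lang in languages))

    assert has_support("ja", ["ja", "en"]) is True
    assert has_support(None, ["ja", "en"]) is None  # falsy, but not False
    assert has_support_strict(None, ["ja", "en"]) is False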
