diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000000000..583cff72a2a08d --- /dev/null +++ b/.gitignore @@ -0,0 +1,305 @@ + +# Created by https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all + +### Eclipse ### + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +### Eclipse Patch ### +# Eclipse Core +.project + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Intellij+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: + +# Sensitive or high-churn files: + +# Gradle: + +# CMake + +# Mongo Explorer plugin: + +## File-based project format: + +## Plugin-specific files: + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Cursive Clojure plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +### Intellij+all Patch ### +# Ignores the whole idea folder +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +### Java ### +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +### SBT ### +# Simple Build Tool +# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control + +dist/* +lib_managed/ +src_managed/ +project/boot/ +project/plugins/project/ +.history +.lib/ + +### Scala ### + +# End of https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all + +### Local ### +tmp_pipeline/ +test-output-tmp/ +spark-warehouse/ +/python/python.iml diff --git a/.sbtrc b/.sbtrc new file mode 100644 index 00000000000000..2d9a8d79a590aa --- /dev/null +++ b/.sbtrc @@ -0,0 +1 @@ +alias assemblyAndCopy=;assembly;copyAssembledJar diff --git a/build.sbt b/build.sbt index cfa5631cb158c1..e59884ef7fa47c 100644 --- a/build.sbt +++ b/build.sbt @@ -1,11 +1,32 @@ - -val scalaLangVersion = "2.11.11" -val sparkVersion = "2.1.1" +val sparkVer = "2.1.1" +val scalaVer = "2.11.11" val scalaTestVersion = "3.0.0" +/** Package attributes */ +name := "spark-nlp" +organization := "johnsnowlabs" +version := "1.1.0" +scalaVersion := scalaVer +sparkVersion := sparkVer + + +/** Spark-Package attributes */ +spName := "johnsnowlabs/spark-nlp" +sparkComponents ++= Seq("mllib") +licenses += "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0") +spIncludeMaven := false +spAppendScalaVersion := false +assemblyOption in assembly := (assemblyOption in assembly).value.copy( + includeScala = false +) +credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") +ivyScala := ivyScala.value map { + _.copy(overrideScalaVersion = true) +} + lazy val analyticsDependencies = Seq( - "org.apache.spark" %% "spark-core" % sparkVersion % "provided", - "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided" + "org.apache.spark" %% "spark-core" % sparkVer % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVer % "provided" ) lazy val testDependencies = Seq( @@ -16,6 +37,14 @@ lazy val utilDependencies = Seq( "com.typesafe" % "config" % "1.3.0" ) +lazy val root = (project in file(".")) + .settings( + libraryDependencies ++= + analyticsDependencies ++ + testDependencies ++ + utilDependencies + ) + parallelExecution in Test := false logBuffered in Test := false @@ -30,24 +59,12 @@ testOptions in Test += Tests.Argument("-oF") /** Disables tests in assembly */ test in assembly := {} -lazy val root = (project in file(".")) - .settings( - name := "spark-nlp", - version := "1.0.0", - organization := "com.jsl.nlp", - scalaVersion := scalaLangVersion, - libraryDependencies ++= - analyticsDependencies ++ - testDependencies ++ - utilDependencies - ) - /** Copies the assembled jar to the pyspark/lib dir **/ lazy val copyAssembledJar = taskKey[Unit]("Copy assembled jar to pyspark/lib") copyAssembledJar := { val jarFilePath = (assemblyOutputPath in assembly).value - val newJarFilePath = baseDirectory( _ / "pysparknlp" / "lib" / "sparknlp.jar").value + val newJarFilePath = baseDirectory( _ / "python" / "lib" / "sparknlp.jar").value IO.copyFile(jarFilePath, newJarFilePath) println(s"[info] $jarFilePath copied to $newJarFilePath ") -} \ No newline at end of file +} diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 00000000000000..c569d895bf312d --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,3 @@ +resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/" + +addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6") diff --git a/pysparknlp/com/__init__.py b/python/com/__init__.py similarity index 100% rename from pysparknlp/com/__init__.py rename to python/com/__init__.py diff --git a/pysparknlp/com/jsl/__init__.py b/python/com/jsl/__init__.py similarity index 100% rename from pysparknlp/com/jsl/__init__.py rename to python/com/jsl/__init__.py diff --git a/pysparknlp/com/jsl/nlp/__init__.py b/python/com/jsl/nlp/__init__.py similarity index 100% rename from pysparknlp/com/jsl/nlp/__init__.py rename to python/com/jsl/nlp/__init__.py diff --git a/pysparknlp/example/dictionary-sentiment/sentiment.ipynb b/python/example/dictionary-sentiment/sentiment.ipynb similarity index 100% rename from pysparknlp/example/dictionary-sentiment/sentiment.ipynb rename to python/example/dictionary-sentiment/sentiment.ipynb diff --git a/pysparknlp/example/vivekn-sentiment/sentiment.ipynb b/python/example/vivekn-sentiment/sentiment.ipynb similarity index 100% rename from pysparknlp/example/vivekn-sentiment/sentiment.ipynb rename to python/example/vivekn-sentiment/sentiment.ipynb diff --git a/pysparknlp/run-tests.py b/python/run-tests.py similarity index 100% rename from pysparknlp/run-tests.py rename to python/run-tests.py diff --git a/pysparknlp/setup.py b/python/setup.py similarity index 100% rename from pysparknlp/setup.py rename to python/setup.py diff --git a/pysparknlp/sparknlp/__init__.py b/python/sparknlp/__init__.py similarity index 100% rename from pysparknlp/sparknlp/__init__.py rename to python/sparknlp/__init__.py diff --git a/pysparknlp/sparknlp/annotator.py b/python/sparknlp/annotator.py similarity index 100% rename from pysparknlp/sparknlp/annotator.py rename to python/sparknlp/annotator.py diff --git a/pysparknlp/sparknlp/base.py b/python/sparknlp/base.py similarity index 100% rename from pysparknlp/sparknlp/base.py rename to python/sparknlp/base.py diff --git a/pysparknlp/sparknlp/common.py b/python/sparknlp/common.py similarity index 100% rename from pysparknlp/sparknlp/common.py rename to python/sparknlp/common.py diff --git a/pysparknlp/sparknlp/util.py b/python/sparknlp/util.py similarity index 100% rename from pysparknlp/sparknlp/util.py rename to python/sparknlp/util.py diff --git a/pysparknlp/test/__init__.py b/python/test/__init__.py similarity index 100% rename from pysparknlp/test/__init__.py rename to python/test/__init__.py diff --git a/pysparknlp/test/annotators.py b/python/test/annotators.py similarity index 100% rename from pysparknlp/test/annotators.py rename to python/test/annotators.py diff --git a/pysparknlp/test/misc.py b/python/test/misc.py similarity index 100% rename from pysparknlp/test/misc.py rename to python/test/misc.py diff --git a/pysparknlp/test/util.py b/python/test/util.py similarity index 100% rename from pysparknlp/test/util.py rename to python/test/util.py