Merged
Commits
58 commits
6b72c4e
added unit test for windowing - work in progress
albertoandreottiATgmail Nov 24, 2017
ddd5a32
added script for converting negex dataset to CSV
albertoandreottiATgmail Nov 29, 2017
e7f2b96
work in progress - fixed problem with embeddings
albertoandreottiATgmail Nov 29, 2017
8754a95
some cleanup
albertoandreottiATgmail Nov 30, 2017
2601e8b
cleanup
albertoandreottiATgmail Nov 30, 2017
dc3f4c4
i2b2 reader
albertoandreottiATgmail Nov 30, 2017
fdca19b
fixed problem with windows
albertoandreottiATgmail Dec 4, 2017
9a06fbc
enhancements in i2b2 reader
albertoandreottiATgmail Dec 4, 2017
4dd89f6
added case for windowing unit test
albertoandreottiATgmail Dec 4, 2017
4b60b1b
work in progress assertion model in the pipeline
albertoandreottiATgmail Dec 8, 2017
427fdbd
work in progress
albertoandreottiATgmail Dec 8, 2017
29319d2
modified tokenizer to match the one using in embeddings
albertoandreottiATgmail Dec 12, 2017
01973f9
work in progress
albertoandreottiATgmail Dec 15, 2017
d7c29d8
work in progress
albertoandreottiATgmail Dec 15, 2017
1e3b516
deleted negex dataset reader
albertoandreottiATgmail Dec 18, 2017
e28b52a
deleted negex dataset reader
albertoandreottiATgmail Dec 18, 2017
f807856
little cleanup in test
albertoandreottiATgmail Dec 18, 2017
a274039
added tokenizers
albertoandreottiATgmail Dec 18, 2017
bd8e614
behavior for some tokenizers
albertoandreottiATgmail Dec 18, 2017
fd99273
refactor in windowing code
albertoandreottiATgmail Dec 18, 2017
9cf8c4e
some cleanup in test and comments
albertoandreottiATgmail Dec 18, 2017
b72db86
work in progress
albertoandreottiATgmail Dec 19, 2017
9f53eab
added commons evaluation metrics class
albertoandreottiATgmail Dec 20, 2017
8a035dd
some cleanup
albertoandreottiATgmail Dec 22, 2017
5f1eea7
Merge branch 'word_embeddings' into assertion_status
albertoandreottiATgmail Dec 22, 2017
935b49b
corrections in tests, annotations, and labels
albertoandreottiATgmail Dec 23, 2017
4a7fd43
restored complete i2b2 dataset
albertoandreottiATgmail Dec 23, 2017
d6049a9
code for before & after parameters in model
albertoandreottiATgmail Dec 23, 2017
f319eca
cosmetic
albertoandreottiATgmail Dec 23, 2017
26f4366
cosmetic
albertoandreottiATgmail Dec 23, 2017
c092b2c
cleanup
albertoandreottiATgmail Dec 23, 2017
fbb8106
cosmetic
albertoandreottiATgmail Dec 23, 2017
d2b8dc8
minor changes
albertoandreottiATgmail Jan 11, 2018
35c57e5
implemented simple version of the regex tokenizer
albertoandreottiATgmail Jan 11, 2018
abc4d43
added html documentation for assertion status
albertoandreottiATgmail Jan 18, 2018
705ae69
work in progress for notepad
albertoandreottiATgmail Jan 19, 2018
fdc59b8
refactor to include Dataset interface in models
albertoandreottiATgmail Jan 22, 2018
e6dfcdb
added reader for negex dataset
albertoandreottiATgmail Jan 24, 2018
aa195e6
added jupyter notebook for assertion status
albertoandreottiATgmail Jan 24, 2018
51cdf09
some changes to make parameter names match in notebook
albertoandreottiATgmail Jan 24, 2018
6e640c4
added parquet version of negex dataset for notebook
albertoandreottiATgmail Jan 24, 2018
b2baa0e
added test cases for negex dataset
albertoandreottiATgmail Jan 24, 2018
fc617de
fixed problem with parameters
albertoandreottiATgmail Jan 24, 2018
748b1e6
removed some hard-coded params
albertoandreottiATgmail Jan 24, 2018
9512e45
minor changes in parameters
albertoandreottiATgmail Jan 24, 2018
001a842
Merge remote-tracking branch 'origin/flexible-word-embeddings' into a…
albertoandreottiATgmail Jan 25, 2018
c207608
refactor to avoid embeddings serialization
albertoandreottiATgmail Jan 25, 2018
7ce496c
missing file
albertoandreottiATgmail Jan 25, 2018
35eb023
fixes for serialization
albertoandreottiATgmail Jan 25, 2018
a5132af
transient lazy pattern
albertoandreottiATgmail Jan 26, 2018
a447606
cleanup
albertoandreottiATgmail Jan 26, 2018
bf944d0
removed embeddings logic from RawAnnotator
albertoandreottiATgmail Jan 26, 2018
8e364e0
- New tokenizer wrap up
saif-ellafi Jan 27, 2018
214873a
unit test work in progress
albertoandreottiATgmail Jan 27, 2018
4d7d973
- Fixed bug in word embeddings write process
saif-ellafi Jan 27, 2018
3b3cad0
unit test
albertoandreottiATgmail Jan 27, 2018
13cc10c
removed hard-coded path
albertoandreottiATgmail Jan 27, 2018
f29a4ca
Merge remote-tracking branch 'origin/assertion-serialization-improvem…
albertoandreottiATgmail Jan 27, 2018
103 changes: 98 additions & 5 deletions docs/components.html
@@ -1055,7 +1055,8 @@ <h4 id="ViveknSentimentDetector" class="section-block"> 14. ViveknSentimentDetec
</div>
</div>


<h4 id="AssertionStatus" class="section-block"> 15. AssertionStatus: Assertion Status Classifier</h4>
<ul class="nav nav-tabs" role="tablist">
<li role="presentation" class="active"><a href="#python" aria-controls="home"
role="tab" data-toggle="tab">Python</a>
@@ -1067,8 +1068,100 @@ <h4 id="Finisher" class="section-block"> 15. Finisher: Getting data out </h4>
<div role="tabpanel" class="tab-pane active" id="python">
<div class="code-block">
<p>
Assigns an assertion status to a target within a sentence. For example, in the sentence "there's no intention to evacuate the area", considering "intention to evacuate the area" as a target, a possible status could be "Negated". This annotator allows you to specify a text, a target, and a set of possible labels describing the assertion status.<br>
<b>Type:</b> assertion<br>
<b>Requires:</b> Document, Token<br>
<b>Functions:</b>
<ul>
<li>
setLabelCol(name): sets the name of the column that contains the label for the assertion. The set of labels is inferred from the values present in this column.
You don't need to specify them explicitly.
</li>
<li>
setInputCols(column names): sets the input column(s) that contain the text to be analyzed.
</li>
<li>
setOutputCol(name): sets the output column that will hold the labeled assertion annotations once the algorithm runs.
</li>
<li>
setBefore(n): specifies the number of context tokens before the target term(s) that will be used in the algorithm.
</li>
<li>
setAfter(m): specifies the number of context tokens after the first token of the target term(s) that will be used in the algorithm.
</li>
<li>
setEmbeddingsSource(path, size, format): specifies the path to the embeddings file (string), the size of the vectors (integer), and the format of the file (one of the constants Text, Binary, or SparkNlp).
</li>
</ul>
<br>
<b>Input:</b> a document as output by the Document Assembler.<br>
<b>Example:</b><br>
</p>
<pre><code class="language-python">
assertion_status = AssertionStatusApproach() \
.setLabelCol("label") \
.setInputCols("document") \
.setOutputCol("assertion") \
.setBefore(11) \
.setAfter(13) \
.setEmbeddingsSource(embeddingsFile, 200, 3)</code></pre>
</div><!--//code-block-->
</div>
<div role="tabpanel" class="tab-pane" id="scala">
<div class="code-block">
<p>
Assigns an assertion status to a target within a sentence. For example, in the sentence "there's no intention to evacuate the area", considering "intention to evacuate the area" as a target, a possible status could be "Negated". This annotator allows you to specify a text, a target, and a set of possible labels describing the assertion status.<br>
<b>Type:</b> assertion<br>
<b>Requires:</b> Document, Token<br>
<b>Functions:</b>
<ul>
<li>
setLabelCol(name): sets the name of the column that contains the label for the assertion. The set of labels is inferred from the values present in this column.
You don't need to specify them explicitly.
</li>
<li>
setInputCols(column names): sets the input column(s) that contain the text to be analyzed.
</li>
<li>
setOutputCol(name): sets the output column that will hold the labeled assertion annotations once the algorithm runs.
</li>
<li>
setBefore(n): specifies the number of context tokens before the target term(s) that will be used in the algorithm.
</li>
<li>
setAfter(m): specifies the number of context tokens after the first token of the target term(s) that will be used in the algorithm.
</li>
<li>
setEmbeddingsSource(path, size, format): specifies the path to the embeddings file (string), the size of the vectors (integer), and the format of the file (one of the constants Text, Binary, or SparkNlp).
</li>
</ul>
<br>
<b>Input:</b> a document as output by the Document Assembler.<br>
<b>Example:</b><br>
</p>
<pre><code class="language-scala">
val assertionStatus = new AssertionStatusApproach()
.setLabelCol("label")
.setInputCols("document")
.setOutputCol("assertion")
.setBefore(11)
.setAfter(13)
.setEmbeddingsSource(embeddingsFile, 200, WordEmbeddingsFormat.Binary)</code></pre>
</div><!--//code-block-->
</div>
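The setBefore and setAfter parameters above describe a fixed-size token window around the target term. A rough, stdlib-only sketch of that windowing idea (illustrative only; `context_window` is a hypothetical name, not the library's implementation):

```python
def context_window(tokens, target_start, before=11, after=13):
    """Take up to `before` tokens preceding the target's first token,
    plus up to `after` tokens starting at the target's first token."""
    left = tokens[max(0, target_start - before):target_start]
    right = tokens[target_start:target_start + after]
    return left + right

tokens = "02) no valvular abnormalities seen".split()
# target "valvular abnormalities" begins at token index 2
print(context_window(tokens, 2, before=2, after=3))
# ['02)', 'no', 'valvular', 'abnormalities', 'seen']
```

The classifier then builds its features from this bounded window rather than from the whole sentence, so very long sentences do not blow up the input size.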


<h4 id="Finisher" class="section-block"> 16. Finisher: Getting data out </h4>
<ul class="nav nav-tabs" role="tablist">
<li role="presentation" class="active"><a href="#python" aria-controls="home"
role="tab" data-toggle="tab">Python</a>
</li>
<li role="presentation"><a href="#scala" aria-controls="profile" role="tab"
data-toggle="tab">Scala</a></li>
</ul>
<div class="tab-content">
<div role="tabpanel" class="tab-pane active" id="python">
<div class="code-block">
<p>
Once we have our NLP pipeline ready to go, we may want to use our
annotation results somewhere else where they are easy to consume. The Finisher
outputs annotation values as plain strings.
@@ -1153,7 +1246,7 @@ <h4 id="Finisher" class="section-block"> 15. Finisher: Getting data out </h4>
</ul>
</div><!--//code-block--></div>
</div>
<h4 id="TokenAssembler" class="section-block"> 17. TokenAssembler: Getting data reshaped </h4>
<ul class="nav nav-tabs" role="tablist">
<li role="presentation" class="active"><a href="#python" aria-controls="home"
role="tab" data-toggle="tab">Python</a>
@@ -1237,8 +1330,8 @@ <h4 id="TokenAssembler" class="section-block"> 16. TokenAssembler: Getting data
<li><a class="scrollto" href="#SentimentDetector">Sentiment Detector</a></li>
<li><a class="scrollto" href="#NERTagger">NERTagger</a></li>
<li><a class="scrollto" href="#SpellChecker">Spell Checker</a></li>
<li><a class="scrollto" href="#ViveknSentimentDetector">Vivekn Sentiment Detector</a></li>
<li><a class="scrollto" href="#AssertionStatus">Assertion Status</a></li>
<li><a class="scrollto" href="#Finisher">Finisher</a></li>

</ul><!--//nav-->
256 changes: 256 additions & 0 deletions python/example/logreg-assertion/assertion.ipynb
@@ -0,0 +1,256 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../../')\n",
"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.ml import Pipeline\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *\n",
"\n",
"if sys.version_info[0] < 3:\n",
" from urllib import urlretrieve\n",
"else:\n",
" from urllib.request import urlretrieve\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"spark = SparkSession.builder \\\n",
" .appName(\"assertion-status\")\\\n",
" .master(\"local[2]\")\\\n",
" .config(\"spark.driver.memory\",\"4G\")\\\n",
" .config(\"spark.driver.maxResultSize\", \"2G\")\\\n",
" .config(\"spark.jar\", \"lib/sparknlp.jar\")\\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\")\\\n",
" .getOrCreate()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"1. Required imports.\n",
"2. Create the Spark session."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"\n",
"embeddingsFile = 'PubMed-shuffle-win-2.bin'\n",
"embeddingsUrl = 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/PubMed-shuffle-win-2.bin'\n",
"# this may take a couple of minutes\n",
"urlretrieve(embeddingsUrl, embeddingsFile)\n",
"\n",
"documentAssembler = DocumentAssembler()\\\n",
" .setInputCol(\"sentence\")\\\n",
" .setOutputCol(\"document\")\n",
"\n",
"assertion = AssertionLogRegApproach()\\\n",
" .setLabelCol(\"label\")\\\n",
" .setInputCols([\"document\"])\\\n",
" .setOutputCol(\"assertion\")\\\n",
" .setBefore(11)\\\n",
" .setAfter(13)\\\n",
" .setEmbeddingsSource(embeddingsFile,200,3)\n",
"\n",
"\n",
"finisher = Finisher() \\\n",
" .setInputCols([\"assertion\"]) \\\n",
" .setIncludeKeys(True)\n",
"\n",
"pipeline = Pipeline(\n",
" stages = [\n",
" documentAssembler,\n",
" assertion,\n",
" finisher\n",
" ])\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+--------+-----+---+\n",
"| sentence| target| label|start|end|\n",
"+--------------------+--------------------+--------+-----+---+\n",
"|**initials ______...|multinodular goit...|Affirmed| 21| 25|\n",
"|02) mild aortic r...|mild aortic regur...|Affirmed| 1| 3|\n",
"|02) mild left atr...|mild left atrial ...|Affirmed| 1| 4|\n",
"|02) mild left atr...|mild left atrial ...|Affirmed| 1| 4|\n",
"|02) mild to moder...|mild to moderate ...|Affirmed| 1| 5|\n",
"|02) mild to moder...|mild to moderate ...|Affirmed| 1| 5|\n",
"|02) no valvular a...|valvular abnormal...| Negated| 2| 3|\n",
"|02) nondilated ri...|nondilated right ...|Affirmed| 1| 9|\n",
"|02) normal left v...|normal left ventr...|Affirmed| 1| 4|\n",
"|02) normal left v...|normal left ventr...|Affirmed| 1| 6|\n",
"|02) paradoxical s...|post-operative se...|Affirmed| 6| 8|\n",
"|02) small left ve...|small left ventri...|Affirmed| 1| 8|\n",
"|03) mild mitral r...|mild mitral regur...|Affirmed| 1| 3|\n",
"|03) mitral annula...|mitral annular ca...|Affirmed| 1| 3|\n",
"|03) moderate left...|moderate left atr...|Affirmed| 1| 4|\n",
"|03) normal pulmon...|normal pulmonary ...|Affirmed| 1| 5|\n",
"|03) thickened aor...|thickened aortic ...|Affirmed| 1| 3|\n",
"|03) thickened aor...|thickened aortic ...|Affirmed| 1| 6|\n",
"|03) thickened aor...|thickened aortic ...|Affirmed| 1| 8|\n",
"|03) thickened mit...|thickened mitral ...|Affirmed| 1| 6|\n",
"+--------------------+--------------------+--------+-----+---+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"#Load the input data to be annotated\n",
"data = spark. \\\n",
" read. \\\n",
" parquet(\"../../../src/test/resources/negex.parquet\"). \\\n",
" limit(3000)\n",
"data.cache()\n",
"data.count()\n",
"data.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start fitting\n",
"Fitting is ended\n"
]
}
],
"source": [
"print(\"Start fitting\")\n",
"model = pipeline.fit(data)\n",
"print(\"Fitting is ended\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+------------------+\n",
"| sentence| target|finished_assertion|\n",
"+--------------------+--------------------+------------------+\n",
"|**initials ______...|multinodular goit...| result->Affirmed|\n",
"|02) mild aortic r...|mild aortic regur...| result->Affirmed|\n",
"|02) mild left atr...|mild left atrial ...| result->Affirmed|\n",
"|02) mild left atr...|mild left atrial ...| result->Affirmed|\n",
"|02) mild to moder...|mild to moderate ...| result->Affirmed|\n",
"|02) mild to moder...|mild to moderate ...| result->Affirmed|\n",
"|02) no valvular a...|valvular abnormal...| result->Negated|\n",
"|02) nondilated ri...|nondilated right ...| result->Affirmed|\n",
"|02) normal left v...|normal left ventr...| result->Affirmed|\n",
"|02) normal left v...|normal left ventr...| result->Affirmed|\n",
"|02) paradoxical s...|post-operative se...| result->Affirmed|\n",
"|02) small left ve...|small left ventri...| result->Affirmed|\n",
"|03) mild mitral r...|mild mitral regur...| result->Affirmed|\n",
"|03) mitral annula...|mitral annular ca...| result->Affirmed|\n",
"|03) moderate left...|moderate left atr...| result->Affirmed|\n",
"|03) normal pulmon...|normal pulmonary ...| result->Affirmed|\n",
"|03) thickened aor...|thickened aortic ...| result->Affirmed|\n",
"|03) thickened aor...|thickened aortic ...| result->Affirmed|\n",
"|03) thickened aor...|thickened aortic ...| result->Affirmed|\n",
"|03) thickened mit...|thickened mitral ...| result->Affirmed|\n",
"+--------------------+--------------------+------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"result = model.transform(data)\n",
"result.select(\"sentence\", \"target\", \"finished_assertion\").show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"pipeline.write().overwrite().save(\"./assertion_pipeline\")\n",
"model.write().overwrite().save(\"./assertion_model\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel, Pipeline\n",
"\n",
"Pipeline.read().load(\"./assertion_pipeline\")\n",
"sameModel = PipelineModel.read().load(\"./assertion_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
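The notebook above prints predicted assertions next to gold labels, and one commit in this PR adds a "commons evaluation metrics class". As a purely illustrative stdlib sketch (hypothetical names, not the actual class), overall accuracy and per-label recall over (gold, predicted) pairs could be computed like this:

```python
from collections import Counter

def evaluate(pairs):
    """pairs: iterable of (gold_label, predicted_label) tuples.
    Returns overall accuracy and per-label recall."""
    total = correct = 0
    gold_counts = Counter()   # gold occurrences per label
    hit_counts = Counter()    # correct predictions per label
    for gold, pred in pairs:
        total += 1
        gold_counts[gold] += 1
        if gold == pred:
            correct += 1
            hit_counts[gold] += 1
    accuracy = correct / total if total else 0.0
    recall = {label: hit_counts[label] / n for label, n in gold_counts.items()}
    return accuracy, recall

pairs = [("Affirmed", "Affirmed"), ("Negated", "Negated"),
         ("Negated", "Affirmed"), ("Affirmed", "Affirmed")]
accuracy, recall = evaluate(pairs)
print(accuracy)           # 0.75
print(recall["Negated"])  # 0.5
```

In the notebook, such pairs could be collected by selecting the `label` and `finished_assertion` columns from the transformed DataFrame.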
2 changes: 2 additions & 0 deletions python/sparknlp/__init__.py
@@ -6,6 +6,8 @@
sys.modules['com.johnsnowlabs.nlp.annotators.ner'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.ner.regex'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.ner.crf'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.assertion'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.assertion.logreg'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.pos'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.pos.perceptron'] = annotator
sys.modules['com.johnsnowlabs.nlp.annotators.sbd'] = annotator
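The `sys.modules` assignments above alias the Python `annotator` module under JVM-style package paths, presumably so that saved pipelines whose metadata records the Scala class path can be resolved from Python. A minimal stand-alone sketch of the pattern, with hypothetical module and class names (`jvm.nlp.logreg` is illustrative, not a real path):

```python
import sys
import types

# Hypothetical stand-in for sparknlp's `annotator` module, holding one class.
annotator = types.ModuleType('annotator')
annotator.AssertionLogRegApproach = type('AssertionLogRegApproach', (), {})

# Register the same module object under every level of a JVM-style dotted
# path, so ordinary `from ... import ...` statements resolve against it.
for name in ('jvm', 'jvm.nlp', 'jvm.nlp.logreg'):
    sys.modules[name] = annotator

from jvm.nlp.logreg import AssertionLogRegApproach
print(AssertionLogRegApproach.__name__)  # AssertionLogRegApproach
```

Registering each intermediate level (as the diff does for `...assertion` and `...assertion.logreg`) keeps imports at any depth of the dotted path working.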