Skip to content

Commit

Permalink
Merge da1070f into ddd0ef8
Browse files Browse the repository at this point in the history
  • Loading branch information
maziyarpanahi authored Sep 25, 2023
2 parents ddd0ef8 + da1070f commit 3cf2b43
Show file tree
Hide file tree
Showing 35 changed files with 3,484 additions and 194 deletions.
122 changes: 122 additions & 0 deletions docs/en/annotator_entries/Date2Chunk.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
{%- capture title -%}
Date2Chunk
{%- endcapture -%}

{%- capture description -%}
Converts `DATE` type Annotations to `CHUNK` type.

This can be useful if the annotators that follow DateMatcher or MultiDateMatcher require
`CHUNK` type input. The entity name in the metadata can be changed with `setEntityName`.
{%- endcapture -%}

{%- capture input_anno -%}
DATE
{%- endcapture -%}

{%- capture output_anno -%}
CHUNK
{%- endcapture -%}

{%- capture python_example -%}
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

date = DateMatcher() \
.setInputCols(["document"]) \
.setOutputCol("date")

date2Chunk = Date2Chunk() \
.setInputCols(["date"]) \
.setOutputCol("date_chunk")

pipeline = Pipeline().setStages([
documentAssembler,
date,
date2Chunk
])

data = spark.createDataFrame([["Omicron is a new variant of COVID-19, which the World Health Organization designated a variant of concern on Nov. 26, 2021/26/11."]]).toDF("text")

result = pipeline.fit(data).transform(data)

result.select("date_chunk").show(1, truncate=False)
+----------------------------------------------------+
|date_chunk |
+----------------------------------------------------+
|[{chunk, 118, 121, 2021/01/01, {sentence -> 0}, []}]|
+----------------------------------------------------+

{%- endcapture -%}

{%- capture scala_example -%}
import spark.implicits._
import com.johnsnowlabs.nlp.annotator._

import org.apache.spark.ml.Pipeline

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val inputFormats = Array("yyyy", "yyyy/dd/MM", "MM/yyyy", "yyyy")
val outputFormat = "yyyy/MM/dd"

val date = new DateMatcher()
.setInputCols("document")
.setOutputCol("date")
.setInputFormats(inputFormats)
.setOutputFormat(outputFormat)

val date2Chunk = new Date2Chunk()
.setInputCols("date")
.setOutputCol("date_chunk")

val pipeline = new Pipeline().setStages(Array(
documentAssembler,
date,
date2Chunk
))

val data = Seq(
"""Omicron is a new variant of COVID-19, which the World Health Organization designated a variant of concern on Nov. 26, 2021/26/11.""",
"""Neighbouring Austria has already locked down its population this week for at until 2021/10/12, becoming the first to reimpose such restrictions."""
).toDF("text")
val result = pipeline.fit(data).transform(data)

result.select("date_chunk").show(false)
+----------------------------------------------------+
|date_chunk                                          |
+----------------------------------------------------+
|[{chunk, 118, 121, 2021/01/01, {sentence -> 0}, []}]|
|[{chunk, 83, 86, 2021/01/01, {sentence -> 0}, []}]  |
+----------------------------------------------------+

{%- endcapture -%}

{%- capture api_link -%}
[Date2Chunk](/api/com/johnsnowlabs/nlp/annotators/Date2Chunk)
{%- endcapture -%}

{%- capture python_api_link -%}
[Date2Chunk](/api/python/reference/autosummary/sparknlp/annotator/date2_chunk/index.html#sparknlp.annotator.date2_chunk.Date2Chunk)
{%- endcapture -%}

{%- capture source_link -%}
[Date2Chunk](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/Date2Chunk.scala)
{%- endcapture -%}

{% include templates/anno_template.md
title=title
description=description
input_anno=input_anno
output_anno=output_anno
python_example=python_example
scala_example=scala_example
api_link=api_link
python_api_link=python_api_link
source_link=source_link
%}
213 changes: 213 additions & 0 deletions docs/en/annotator_entries/DocumentSimilarityRanker.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
{%- capture title -%}
DocumentSimilarityRanker
{%- endcapture -%}

{%- capture model_description -%}
Instantiated model of the DocumentSimilarityRankerApproach. For usage and examples see the
documentation of the main class.
{%- endcapture -%}

{%- capture model_input_anno -%}
SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture model_output_anno -%}
DOC_SIMILARITY_RANKINGS
{%- endcapture -%}

{%- capture model_api_link -%}
[DocumentSimilarityRankerModel](/api/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel)
{%- endcapture -%}

{%- capture model_python_api_link -%}
[DocumentSimilarityRankerModel](/api/python/reference/autosummary/sparknlp/annotator/similarity/document_similarity_ranker/index.html#sparknlp.annotator.similarity.document_similarity_ranker.DocumentSimilarityRankerModel)
{%- endcapture -%}

{%- capture model_source_link -%}
[DocumentSimilarityRankerModel](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel.scala)
{%- endcapture -%}

{%- capture approach_description -%}
Annotator that uses LSH techniques present in Spark ML lib to execute approximate nearest
neighbors search on top of sentence embeddings.

It aims to capture the semantic meaning of a document in a dense, continuous vector space and
return it to the ranker search.

For instantiated/pretrained models, see DocumentSimilarityRankerModel.

For extended examples of usage, see the jupyter notebook
[Document Similarity Ranker for Spark NLP](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb).
{%- endcapture -%}

{%- capture approach_input_anno -%}
SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture approach_output_anno -%}
DOC_SIMILARITY_RANKINGS
{%- endcapture -%}

{%- capture approach_python_example -%}
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.annotator.similarity.document_similarity_ranker import *

document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
sentence_embeddings = E5Embeddings.pretrained() \
.setInputCols(["document"]) \
.setOutputCol("sentence_embeddings")
document_similarity_ranker = DocumentSimilarityRankerApproach() \
.setInputCols("sentence_embeddings") \
.setOutputCol("doc_similarity_rankings") \
.setSimilarityMethod("brp") \
.setNumberOfNeighbours(1) \
.setBucketLength(2.0) \
.setNumHashTables(3) \
.setVisibleDistances(True) \
.setIdentityRanking(False)
document_similarity_ranker_finisher = DocumentSimilarityRankerFinisher() \
.setInputCols("doc_similarity_rankings") \
.setOutputCols(
"finished_doc_similarity_rankings_id",
"finished_doc_similarity_rankings_neighbors") \
.setExtractNearestNeighbor(True)
pipeline = Pipeline(stages=[
document_assembler,
sentence_embeddings,
document_similarity_ranker,
document_similarity_ranker_finisher
])
data = spark.createDataFrame([
    ["First document, this is my first sentence. This is my second sentence."],
    ["Second document, this is my second sentence. This is my second sentence."],
    ["Third document, climate change is arguably one of the most pressing problems of our time."],
    ["Fourth document, climate change is definitely one of the most pressing problems of our time."],
    ["Fifth document, Florence in Italy, is among the most beautiful cities in Europe."],
    ["Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France."],
    ["Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France."],
    ["Eighth document, the warmest place in France is the French Riviera coast in Southern France."]
]).toDF("text")
docSimRankerPipeline = pipeline.fit(data).transform(data)

(
docSimRankerPipeline
.select(
"finished_doc_similarity_rankings_id",
"finished_doc_similarity_rankings_neighbors"
).show(10, False)
)
+-----------------------------------+------------------------------------------+
|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|
+-----------------------------------+------------------------------------------+
|1510101612 |[(1634839239,0.12448559591306324)] |
|1634839239 |[(1510101612,0.12448559591306324)] |
|-612640902 |[(1274183715,0.1220122862046063)] |
|1274183715 |[(-612640902,0.1220122862046063)] |
|-1320876223 |[(1293373212,0.17848855164122393)] |
|1293373212 |[(-1320876223,0.17848855164122393)] |
|-1548374770 |[(-1719102856,0.23297156732534166)] |
|-1719102856 |[(-1548374770,0.23297156732534166)] |
+-----------------------------------+------------------------------------------+

{%- endcapture -%}

{%- capture approach_scala_example -%}
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators.similarity.DocumentSimilarityRankerApproach
import com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher
import org.apache.spark.ml.Pipeline

import spark.implicits._

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val sentenceEmbeddings = RoBertaSentenceEmbeddings
.pretrained()
.setInputCols("document")
.setOutputCol("sentence_embeddings")

val documentSimilarityRanker = new DocumentSimilarityRankerApproach()
.setInputCols("sentence_embeddings")
.setOutputCol("doc_similarity_rankings")
.setSimilarityMethod("brp")
.setNumberOfNeighbours(1)
.setBucketLength(2.0)
.setNumHashTables(3)
.setVisibleDistances(true)
.setIdentityRanking(false)

val documentSimilarityRankerFinisher = new DocumentSimilarityRankerFinisher()
.setInputCols("doc_similarity_rankings")
.setOutputCols(
"finished_doc_similarity_rankings_id",
"finished_doc_similarity_rankings_neighbors")
.setExtractNearestNeighbor(true)

// Let's use a dataset where we can visually control similarity
// Documents are coupled, as 1-2, 3-4, 5-6, 7-8, and they were created to be similar on purpose
val data = Seq(
"First document, this is my first sentence. This is my second sentence.",
"Second document, this is my second sentence. This is my second sentence.",
"Third document, climate change is arguably one of the most pressing problems of our time.",
"Fourth document, climate change is definitely one of the most pressing problems of our time.",
"Fifth document, Florence in Italy, is among the most beautiful cities in Europe.",
"Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France.",
"Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.",
"Eighth document, the warmest place in France is the French Riviera coast in Southern France.")
.toDF("text")

val pipeline = new Pipeline().setStages(
Array(
documentAssembler,
sentenceEmbeddings,
documentSimilarityRanker,
documentSimilarityRankerFinisher))

val result = pipeline.fit(data).transform(data)

result
.select("finished_doc_similarity_rankings_id", "finished_doc_similarity_rankings_neighbors")
.show(10, truncate = false)
+-----------------------------------+------------------------------------------+
|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|
+-----------------------------------+------------------------------------------+
|1510101612 |[(1634839239,0.12448559591306324)] |
|1634839239 |[(1510101612,0.12448559591306324)] |
|-612640902 |[(1274183715,0.1220122862046063)] |
|1274183715 |[(-612640902,0.1220122862046063)] |
|-1320876223 |[(1293373212,0.17848855164122393)] |
|1293373212 |[(-1320876223,0.17848855164122393)] |
|-1548374770 |[(-1719102856,0.23297156732534166)] |
|-1719102856 |[(-1548374770,0.23297156732534166)] |
+-----------------------------------+------------------------------------------+

{%- endcapture -%}

{%- capture approach_api_link -%}
[DocumentSimilarityRankerApproach](/api/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach)
{%- endcapture -%}

{%- capture approach_python_api_link -%}
[DocumentSimilarityRankerApproach](/api/python/reference/autosummary/sparknlp/annotator/similarity/document_similarity_ranker/index.html#sparknlp.annotator.similarity.document_similarity_ranker.DocumentSimilarityRankerApproach)
{%- endcapture -%}

{%- capture approach_source_link -%}
[DocumentSimilarityRankerApproach](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach.scala)
{%- endcapture -%}


{% include templates/approach_model_template.md
title=title
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_api_link=model_api_link
model_python_api_link=model_python_api_link
model_source_link=model_source_link
approach_description=approach_description
approach_input_anno=approach_input_anno
approach_output_anno=approach_output_anno
approach_python_example=approach_python_example
approach_scala_example=approach_scala_example
approach_api_link=approach_api_link
approach_python_api_link=approach_python_api_link
approach_source_link=approach_source_link
%}
Loading

0 comments on commit 3cf2b43

Please sign in to comment.