In [1]:
# This is only to setup PySpark and Spark NLP on Colab
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2022-12-23 12:16:14--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-12-23 12:16:14--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-12-23 12:16:14--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [2]:
from pyspark import *
import sparknlp

In [8]:
# Obtain the session object from Spark NLP's start() helper; the cells below
# call spark.createDataFrame(...) on it to build the example inputs.
spark = sparknlp.start()

In [9]:
from sparknlp.annotator import *
from sparknlp.base import *

In [10]:
# Display the Spark NLP version in use, so readers can tell which release
# produced the outputs recorded in this notebook.
sparknlp.version()

'4.2.6'

In [11]:
from pyspark.sql.types import StringType

## German formatted date matching examples

In [12]:
# One-row DataFrame whose single string column is named "text"; the sentence
# contains two numeric dates written day-first (dd/MM/yyyy).
df = (
    spark
    .createDataFrame(
        ["Wir trafen uns am 13/05/2018 und dann am 18/05/2020."],
        StringType(),
    )
    .toDF("text")
)
# show() abbreviates long strings by default, hence the "..." in the table.
df.show()

+--------------------+
|                text|
+--------------------+
|Wir trafen uns am...|
+--------------------+



In [13]:
# Stage 1: turn the raw "text" column into a "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: find every date in the document, reading the text as German ("de")
# and rendering each match in MM/dd/yyyy form.
date_matcher = (
    MultiDateMatcher()
    .setInputCols(["document"])
    .setOutputCol("date")
    .setOutputFormat("MM/dd/yyyy")
    .setSourceLanguage("de")
)

# Both stages are applied directly with transform(); only the resulting
# "date" column is printed, untruncated, for up to 10 rows.
assembled = document_assembler.transform(df)
(
    date_matcher.transform(assembled)
    .select("date")
    .show(n=10, truncate=False)
)

+--------------------------------------------------------------------------------------------------+
|date                                                                                              |
+--------------------------------------------------------------------------------------------------+
|[{date, 18, 27, 05/13/2018, {sentence -> 0}, []}, {date, 41, 50, 05/18/2020, {sentence -> 0}, []}]|
+--------------------------------------------------------------------------------------------------+



## German unformatted (relative) date matching examples

In [14]:
# One-row DataFrame whose single string column is named "text"; this sentence
# has no numeric dates, only the relative expressions "vor 2 tagen" and
# "nächste woche".
df = (
    spark
    .createDataFrame(
        ["Wir haben uns vor 2 tagen kennengelernt und sie sagten mir, dass sie uns nächste woche besuchen würden."],
        StringType(),
    )
    .toDF("text")
)
# show() abbreviates long strings by default, hence the "..." in the table.
df.show()

+--------------------+
|                text|
+--------------------+
|Wir haben uns vor...|
+--------------------+



In [15]:
# Stage 1: turn the raw "text" column into a "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: German ("de") date matcher, with matches rendered as MM/dd/yyyy.
date_matcher = (
    MultiDateMatcher()
    .setInputCols(["document"])
    .setOutputCol("date")
    .setOutputFormat("MM/dd/yyyy")
    .setSourceLanguage("de")
)

# NOTE: the relative expressions are resolved to concrete dates. The recorded
# run (executed on 2022-12-23) produced 12/21/2022 and 12/30/2022, so expect
# different dates when this cell is re-run on another day.
assembled = document_assembler.transform(df)
(
    date_matcher.transform(assembled)
    .select("date")
    .show(n=10, truncate=False)
)

+--------------------------------------------------------------------------------------------------+
|date                                                                                              |
+--------------------------------------------------------------------------------------------------+
|[{date, 14, 23, 12/21/2022, {sentence -> 0}, []}, {date, 85, 93, 12/30/2022, {sentence -> 0}, []}]|
+--------------------------------------------------------------------------------------------------+



# A short guide to language support extension

## To extend the date matchers' language support to a new language, please follow the steps below:

1. Add the new dictionary to the `src/main/resources/date-matcher/translation-dictionaries/dynamic` folder of the spark-nlp project
2. Give it the same base set of entries as the dictionaries of the other languages
   * Add tests for the dictionary
3. Add any additional language-specific expressions to the base
   * Add tests for those specific expressions to avoid syntactic conflicts in parsing
4. Add a notebook like this one to show how to use the language extension

Thank you for contributing! :)