From 8d07f849f40db7271d19113edb2afc5d6e42680d Mon Sep 17 00:00:00 2001 From: bunyamin-polat Date: Thu, 27 Apr 2023 17:15:41 +0700 Subject: [PATCH 1/4] Add model 2023-04-27-legner_mapa_de --- .../2023-04-27-legner_mapa_de.md | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md diff --git a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md new file mode 100644 index 0000000000..24f7daab56 --- /dev/null +++ b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md @@ -0,0 +1,131 @@ +--- +layout: model +title: Legal NER for MAPA(Multilingual Anonymisation for Public Administrations) +author: John Snow Labs +name: legner_mapa +date: 2023-04-27 +tags: [de, ner, legal, licensed, mapa] +task: Named Entity Recognition +language: de +edition: Legal NLP 1.0.0 +spark_version: 3.0 +supported: true +annotator: LegalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The dataset consists of 12 documents taken from EUR-Lex, a multilingual corpus of court decisions and legal dispositions in the 24 official languages of the European Union. + +This model extracts `ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, and `PERSON` entities from `German` documents. + +## Predicted Entities + +`ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, `PERSON` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/legal/models/legner_mapa_de_1.0.0_3.0_1682589773968.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/legal/models/legner_mapa_de_1.0.0_3.0_1682589773968.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = nlp.Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_base_de_cased", "de")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings")\ + .setMaxSentenceLength(512)\ + .setCaseSensitive(True) + +ner_model = legal.NerModel.pretrained("legner_mapa", "de", "legal/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = nlp.NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = nlp.Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +empty_data = spark.createDataFrame([[""]]).toDF("text") + +model = nlpPipeline.fit(empty_data) + +text = ["""Herr Liberato und Frau Grigorescu heirateten am 22 Oktober 2005 in Rom (Italien) und lebten in diesem Mitgliedstaat bis zur Geburt ihres Kindes am 20 Februar 2006 zusammen."""] + +result = model.transform(spark.createDataFrame([text]).toDF("text")) +``` + +
+ +## Results + +```bash ++----------------+---------+ +|chunk |ner_label| ++----------------+---------+ +|Herr Liberato |PERSON | +|Frau Grigorescu |PERSON | +|22 Oktober 2005|DATE | +|Rom (Italien) |ADDRESS | +|20 Februar 2006 |DATE | ++----------------+---------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legner_mapa| +|Compatibility:|Legal NLP 1.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|de| +|Size:|1.4 MB| + +## References + +The dataset is available [here](https://huggingface.co/datasets/joelito/mapa). + +## Benchmarking + +```bash +label precision recall f1-score support +ADDRESS 0.69 0.85 0.76 13 +AMOUNT 1.0 0.75 0.86 4 +DATE 0.92 0.93 0.93 61 +ORGANISATION 0.64 0.77 0.7 30 +PERSON 0.85 0.87 0.86 46 +macro-avg 0.82 0.87 0.84 154 +macro-avg 0.82 0.83 0.82 154 +weighted-avg 0.83 0.87 0.85 154 +``` \ No newline at end of file From 0028093c3bdaceee40e396823538e8dd676378c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=BCnyamin=20Polat?= <78386903+bunyamin-polat@users.noreply.github.com> Date: Thu, 27 Apr 2023 13:16:36 +0300 Subject: [PATCH 2/4] Update 2023-04-27-legner_mapa_de.md --- docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md index 24f7daab56..07f89c1c1e 100644 --- a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md +++ b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_de.md @@ -38,6 +38,7 @@ This model extracts `ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, and `PERSON` en
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -121,11 +122,11 @@ The dataset is available [here](https://huggingface.co/datasets/joelito/mapa). ```bash label precision recall f1-score support ADDRESS 0.69 0.85 0.76 13 -AMOUNT 1.0 0.75 0.86 4 +AMOUNT 1.00 0.75 0.86 4 DATE 0.92 0.93 0.93 61 -ORGANISATION 0.64 0.77 0.7 30 +ORGANISATION 0.64 0.77 0.70 30 PERSON 0.85 0.87 0.86 46 -macro-avg 0.82 0.87 0.84 154 +micro-avg 0.82 0.87 0.84 154 macro-avg 0.82 0.83 0.82 154 weighted-avg 0.83 0.87 0.85 154 -``` \ No newline at end of file +``` From 481ba5b207fbfb7e3f35b3bac3e5e7ef68cfbc60 Mon Sep 17 00:00:00 2001 From: bunyamin-polat Date: Thu, 27 Apr 2023 17:24:42 +0700 Subject: [PATCH 3/4] Add model 2023-04-27-legner_mapa_el --- .../2023-04-27-legner_mapa_el.md | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md diff --git a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md new file mode 100644 index 0000000000..2fc8d9d9db --- /dev/null +++ b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md @@ -0,0 +1,131 @@ +--- +layout: model +title: Legal NER for MAPA(Multilingual Anonymisation for Public Administrations) +author: John Snow Labs +name: legner_mapa +date: 2023-04-27 +tags: [el, ner, legal, mapa, licensed] +task: Named Entity Recognition +language: el +edition: Legal NLP 1.0.0 +spark_version: 3.0 +supported: true +annotator: LegalNerModel +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The dataset consists of 12 documents taken from EUR-Lex, a multilingual corpus of court decisions and legal dispositions in the 24 official languages of the European Union. + +This model extracts `ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, and `PERSON` entities from `Greek` documents. 
+ +## Predicted Entities + +`ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, `PERSON` + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/legal/models/legner_mapa_el_1.0.0_3.0_1682590655353.zip){:.button.button-orange} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/legal/models/legner_mapa_el_1.0.0_3.0_1682590655353.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +document_assembler = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = nlp.Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_base_el_cased", "el")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings")\ + .setMaxSentenceLength(512)\ + .setCaseSensitive(True) + +ner_model = legal.NerModel.pretrained("legner_mapa", "el", "legal/models")\ + .setInputCols(["sentence", "token", "embeddings"])\ + .setOutputCol("ner") + +ner_converter = nlp.NerConverter()\ + .setInputCols(["sentence", "token", "ner"])\ + .setOutputCol("ner_chunk") + +nlpPipeline = nlp.Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + embeddings, + ner_model, + ner_converter]) + +empty_data = spark.createDataFrame([[""]]).toDF("text") + +model = nlpPipeline.fit(empty_data) + +text = ["""86 Στην υπόθεση της κύριας δίκης, προκύπτει ότι ορισμένοι εργαζόμενοι της Martin‑Meat αποσπάσθηκαν στην Αυστρία κατά την περίοδο μεταξύ του έτους 2007 και του έτους 2012, για την εκτέλεση εργασιών τεμαχισμού κρέατος σε εγκαταστάσεις της Alpenrind."""] + +result = model.transform(spark.createDataFrame([text]).toDF("text")) +``` + +
+ +## Results + +```bash ++-----------+------------+ +|chunk |ner_label | ++-----------+------------+ +|Martin‑Meat|ORGANISATION| +|Αυστρία |ADDRESS | +|2007 |DATE | +|2012 |DATE | +|Alpenrind |ORGANISATION| ++-----------+------------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legner_mapa| +|Compatibility:|Legal NLP 1.0.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|el| +|Size:|16.4 MB| + +## References + +The dataset is available [here](https://huggingface.co/datasets/joelito/mapa). + +## Benchmarking + +```bash +label precision recall f1-score support +ADDRESS 0.89 1.0 0.94 16 +AMOUNT 0.82 0.75 0.78 12 +DATE 0.98 0.98 0.98 65 +ORGANISATION 0.85 0.85 0.85 40 +PERSON 0.9 0.95 0.92 38 +macro-avg 0.91 0.93 0.92 171 +macro-avg 0.89 0.91 0.9 171 +weighted-avg 0.91 0.93 0.92 171 +``` \ No newline at end of file From 780d5265bece0aa62492968a9f4d3de78c772f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=BCnyamin=20Polat?= <78386903+bunyamin-polat@users.noreply.github.com> Date: Thu, 27 Apr 2023 13:25:29 +0300 Subject: [PATCH 4/4] Update 2023-04-27-legner_mapa_el.md --- docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md index 2fc8d9d9db..6f10765afe 100644 --- a/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md +++ b/docs/_posts/bunyamin-polat/2023-04-27-legner_mapa_el.md @@ -38,6 +38,7 @@ This model extracts `ADDRESS`, `AMOUNT`, `DATE`, `ORGANISATION`, and `PERSON` en
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = nlp.DocumentAssembler()\ .setInputCol("text")\ @@ -120,12 +121,12 @@ The dataset is available [here](https://huggingface.co/datasets/joelito/mapa). ```bash label precision recall f1-score support -ADDRESS 0.89 1.0 0.94 16 +ADDRESS 0.89 1.00 0.94 16 AMOUNT 0.82 0.75 0.78 12 DATE 0.98 0.98 0.98 65 ORGANISATION 0.85 0.85 0.85 40 -PERSON 0.9 0.95 0.92 38 +PERSON 0.90 0.95 0.92 38 -macro-avg 0.91 0.93 0.92 171 +micro-avg 0.91 0.93 0.92 171 -macro-avg 0.89 0.91 0.9 171 +macro-avg 0.89 0.91 0.90 171 weighted-avg 0.91 0.93 0.92 171 -``` \ No newline at end of file +```