Merge c37e2b5 into 99866b3

weni-ai · Jan 20, 2022 · 41e0d56 · 41e0d56
2 parents 99866b3 + c37e2b5
commit 41e0d56
Show file tree

Hide file tree

Showing 8 changed files with 308 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -25,6 +25,13 @@
 [![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/)
 [![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE)
 
+## Requirements
+
+* Python (3.6)
+* Pipenv
+* Docker
+* Docker-compose
+
 ## Development
 
 Use ```make``` commands to ```check_environment```, ```install_requirements```, ```lint```, ```test```, ```migrate```, ```start```, ```migrations``` and ```collectstatic```.
@@ -163,9 +170,12 @@ You can set environment variables in your OS, write on ```.env``` file or pass v
 | ELASTICSEARCH_DSL | ```string``` | ```es:9200``` | URL Elasticsearch.
 | ELASTICSEARCH_NUMBER_OF_SHARDS | ```int``` | ```1``` | Specify the number of shards for the indexes.
 | ELASTICSEARCH_NUMBER_OF_REPLICAS | ```int``` | ```1``` | Specify the number of replicas for the indexes.
-| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document.
+| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```ai_repositorynlplog``` | Specify the index title for the RepositoryNLPLog document.
+| ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX | ```string``` | ```ai_repositoryqanlplog``` | Specify the index title for the RepositoryQANLPLog document.
+| ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX | ```string``` | ```ai_repositorybasicexample``` | Specify the index title for the RepositoryBasicExample document.
 | ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data.
 | GUNICORN_WORKERS | ``` int ``` | ``` multiprocessing.cpu_count() * 2 + 1 ``` | Gunicorn number of workers
+| USE_ELASTICSEARCH | ```boolean``` | ```true``` | Selects the backend used by ```requirements_to_train```: Elasticsearch when ```true```, the relational database (PostgreSQL) when ```false```.
 
 
 ## Roadmap

diff --git a/bothub/common/documents/__init__.py b/bothub/common/documents/__init__.py
@@ -1,7 +1,9 @@
 from bothub.common.documents.repositorynlplog import RepositoryNLPLogDocument
 from bothub.common.documents.repositoryqanlplog import RepositoryQANLPLogDocument
+from bothub.common.documents.repositorybasicexample import RepositoryExampleDocument
 
 __all__ = (
     "RepositoryNLPLogDocument",
     "RepositoryQANLPLogDocument",
+    "RepositoryExampleDocument"
 )
diff --git a/bothub/common/documents/repositorybasicexample.py b/bothub/common/documents/repositorybasicexample.py
@@ -0,0 +1,58 @@
+from django.conf import settings
+from django_elasticsearch_dsl import Document, Index, fields
+
+from bothub.common.models import RepositoryExample, RepositoryExampleEntity, RepositoryIntent, RepositoryVersionLanguage
+
+REPOSITORYBASICEXAMPLE_INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__])
+
+
+@REPOSITORYBASICEXAMPLE_INDEX.doc_type
+class RepositoryExampleDocument(Document):
+    """Elasticsearch document built from ``RepositoryExample`` rows.
+
+    On top of the model's ``id`` and ``text`` columns, the mapping declares
+    the owning version language (``pk`` and ``language``), the intent text
+    and the value of every attached entity. Each text field carries a
+    ``raw`` keyword sub-field.
+    """
+
+    repository_version_language = fields.ObjectField(
+        properties={
+            "pk": fields.IntegerField(),
+            "language": fields.TextField(fields={"raw": fields.KeywordField()}),
+        }
+    )
+    intent = fields.ObjectField(
+        properties={
+            "text": fields.TextField(fields={"raw": fields.KeywordField()}),
+        }
+    )
+    # Nested (rather than plain object) mapping, one nested document per entity.
+    entities = fields.NestedField(
+        properties={
+            "entity": fields.ObjectField(
+                properties={
+                    "value": fields.TextField(
+                        fields={"raw": fields.KeywordField()}
+                    ),
+                }
+            ),
+        }
+    )
+    pk = fields.IntegerField()
+
+    class Django:
+        model = RepositoryExample
+        fields = ["id", "text"]
+        related_models = [
+            RepositoryVersionLanguage,
+            RepositoryIntent,
+            RepositoryExampleEntity,
+        ]
+
+    def get_queryset(self):
+        """Return the base queryset with the related rows loaded eagerly."""
+        queryset = super().get_queryset()
+        queryset = queryset.select_related("repository_version_language", "intent")
+        return queryset.prefetch_related("entities", "translations")
+
+    def get_instances_from_related(self, related_instance):
+        """Map an instance of a related model to the example(s) it belongs to.
+
+        Returns a queryset for a version language or an intent, a single
+        example for an example entity, and ``None`` for any other type.
+        """
+        if isinstance(related_instance, RepositoryVersionLanguage):
+            return related_instance.added.all()
+        if isinstance(related_instance, RepositoryIntent):
+            return related_instance.repositoryexample_set.all()
+        if isinstance(related_instance, RepositoryExampleEntity):
+            return related_instance.repository_example
+        return None
diff --git a/bothub/common/models.py b/bothub/common/models.py
@@ -14,6 +14,8 @@
 from django.utils import timezone
 from django.utils.translation import ugettext_lazy as _
 from django_elasticsearch_dsl_drf.wrappers import dict_to_obj
+from elasticsearch_dsl import A
+from elasticsearch_dsl import Q as elasticQ
 from rest_framework import status
 from rest_framework.exceptions import APIException
 
@@ -1064,20 +1066,125 @@ def examples(self):
         return examples.distinct()
 
     @property
-    def requirements_to_train(self):
+    def _search_weak_intents_and_entities(self):
+        """Run the aggregation search for under-populated intents and entities.
+
+        Searches ``RepositoryExampleDocument`` for the examples whose
+        ``repository_version_language.pk`` matches ``self.pk``, requesting no
+        hits, and keeps only the term buckets whose document count is below
+        ``self.MIN_EXAMPLES_PER_INTENT``.
+
+        Returns the executed search response. Its aggregations are
+        ``duplicated_intents`` and ``duplicated_intents_stats`` at the top
+        level, plus ``duplicated_entities`` and ``duplicated_entities_stats``
+        under ``nested_entities``.
+        """
+        from bothub.common.documents import RepositoryExampleDocument
+
+        # A terms aggregation returns only its 10 largest buckets unless a
+        # size is given. The buckets wanted here are the smallest ones, so
+        # the default would drop them whenever there are more than 10 terms.
+        # NOTE(review): 10000 is assumed to exceed the number of distinct
+        # intents/entities of a single version - confirm.
+        max_term_buckets = 10000
+
+        search = RepositoryExampleDocument.search().query(
+            "match", repository_version_language__pk=self.pk
+        )
+        # Only the aggregations are needed, not the matching documents.
+        search.update_from_dict({"size": 0})
+
+        # Shared by both aggregations: drop buckets that already have enough
+        # examples.
+        duplicated_limit_bucket = A(
+            "bucket_selector",
+            buckets_path={"doc_count": "_count"},
+            script=f"params.doc_count < {self.MIN_EXAMPLES_PER_INTENT}",
+        )
+
+        search.aggs.bucket(
+            "duplicated_intents",
+            "terms",
+            field="intent.text.raw",
+            size=max_term_buckets,
+        )
+        search.aggs["duplicated_intents"].bucket(
+            "filter_duplicated_intent_limit", duplicated_limit_bucket
+        )
+        search.aggs.bucket(
+            "duplicated_intents_stats",
+            "stats_bucket",
+            buckets_path="duplicated_intents._count",
+        )
+
+        search.aggs.bucket("nested_entities", "nested", path="entities")
+        search.aggs["nested_entities"].bucket(
+            "duplicated_entities",
+            "terms",
+            field="entities.entity.value.raw",
+            size=max_term_buckets,
+        )
+        search.aggs["nested_entities"]["duplicated_entities"].bucket(
+            "filter_duplicated_entity_limit", duplicated_limit_bucket
+        )
+        search.aggs["nested_entities"].bucket(
+            "duplicated_entities_stats",
+            "stats_bucket",
+            buckets_path="duplicated_entities._count",
+        )
+
+        return search.execute()
+
+    @property
+    def _does_all_examples_have_intents(self):
+        """Tell whether every indexed example of this version has an intent.
+
+        Searches ``RepositoryExampleDocument`` for examples that match both
+        an empty ``intent.text.raw`` and this object's ``pk`` as
+        ``repository_version_language.pk``. Returns ``True`` when the search
+        reports zero hits, ``False`` otherwise.
+        """
+        from bothub.common.documents import RepositoryExampleDocument
+
+        search = RepositoryExampleDocument.search().query(
+            "bool",
+            must=[
+                elasticQ("match", intent__text__raw=""),
+                elasticQ("match", repository_version_language__pk=self.pk),
+            ],
+        )
+        return search.execute().hits.total.value == 0
+
+    @property
+    def _elasticsearch_requirements_to_train(self):
+        """Collect the training warnings using the Elasticsearch index.
+
+        Returns a list of translated messages. The list holds a single
+        message when ``validate_init_train`` reports that the version was
+        already trained or is being trained. Otherwise it holds one message
+        if some example has no intent, plus one message per intent bucket and
+        per entity bucket returned by the weak intents/entities search.
+        """
         try:
             self.validate_init_train()
         except RepositoryUpdateAlreadyTrained:  # pragma: no cover
             return [_("This bot version has already been trained.")]
         except RepositoryUpdateAlreadyStartedTraining:  # pragma: no cover
             return [_("This bot version is being trained.")]
 
-        r = []
+        warnings = []
+
+        if not self._does_all_examples_have_intents:
+            warnings.append(_("All examples need to have a intent."))
+
+        # Read the property once so both aggregations come from one search.
+        search_result = self._search_weak_intents_and_entities
+
+        weak_intents_count = search_result.aggregations.duplicated_intents_stats.count
+        weak_intents = search_result.aggregations.duplicated_intents.buckets
+
+        if weak_intents_count > 0:
+            for intent in weak_intents:
+                warnings.append(
+                    _(
+                        'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
+                    ).format(
+                        intent["key"],
+                        intent["doc_count"],
+                        self.MIN_EXAMPLES_PER_INTENT,
+                    )
+                )
+
+        weak_entities_count = (
+            search_result.aggregations.nested_entities.duplicated_entities_stats.count
+        )
+        weak_entities = (
+            search_result.aggregations.nested_entities.duplicated_entities.buckets
+        )
+
+        if weak_entities_count > 0:
+            # NOTE: despite its name, ``intent`` is an entity bucket here.
+            for intent in weak_entities:
+                warnings.append(
+                    _(
+                        'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
+                    ).format(
+                        intent["key"],
+                        intent["doc_count"],
+                        self.MIN_EXAMPLES_PER_INTENT,
+                    )
+                )
+
+        return warnings
+
+    @property
+    def _relational_requirements_to_train(self):
+        try:
+            self.validate_init_train()
+        except RepositoryUpdateAlreadyTrained:  # pragma: no cover
+            return [_("This bot version has already been trained.")]
+        except RepositoryUpdateAlreadyStartedTraining:  # pragma: no cover
+            return [_("This bot version is being trained.")]
+
+        warnings = []
 
         intents = self.examples.values_list("intent__text", flat=True)
 
         if "" in intents:
-            r.append(_("All examples need have a intent."))
+            warnings.append(_("All examples need to have a intent."))
 
         weak_intents = (
             self.examples.values("intent__text")
@@ -1088,7 +1195,7 @@ def requirements_to_train(self):
 
         if weak_intents.exists():
             for i in weak_intents:
-                r.append(
+                warnings.append(
                     _(
                         'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
                     ).format(
@@ -1108,7 +1215,7 @@ def requirements_to_train(self):
 
         if weak_entities.exists():
             for e in weak_entities:
-                r.append(
+                warnings.append(
                     _(
                         'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
                     ).format(
@@ -1118,7 +1225,14 @@ def requirements_to_train(self):
                     )
                 )
 
-        return r
+        return warnings
+
+    @property
+    def requirements_to_train(self):
+        """Return the list of warnings that currently prevent training.
+
+        The Elasticsearch-backed check is used when
+        ``settings.USE_ELASTICSEARCH`` is truthy, the relational one
+        otherwise. Only the selected property is evaluated.
+        """
+        return (
+            self._elasticsearch_requirements_to_train
+            if settings.USE_ELASTICSEARCH
+            else self._relational_requirements_to_train
+        )
 
     @property
     def ready_for_train(self):
@@ -1462,7 +1576,7 @@ def get_text(self, language=None):  # pragma: no cover
             return self.text
         return self.get_translation(language).text
 
-    def get_entities(self, language):  # pragma: no cover
+    def get_entities(self, language=None):  # pragma: no cover
+        """Return the entities of this example for ``language``.
+
+        When ``language`` is falsy or equals the language of
+        ``repository_version_language``, the example's own entities are
+        returned. Otherwise the entities of the translation obtained from
+        ``get_translation(language)`` are returned.
+        """
         if not language or language == self.repository_version_language.language:
             return self.entities.all()
         return self.get_translation(language).entities.all()
@@ -1483,6 +1597,22 @@ def delete(self, using=None, keep_parents=False):
 
         return instance
 
+    @property
+    def entities_field_indexing(self):
+        """Return one reduced object per entity attached to this example.
+
+        Each element is created with ``dict_to_obj`` from a dict carrying
+        only ``entity.value``. The list is empty when the example has no
+        entities.
+        """
+        return [
+            dict_to_obj({"entity": {"value": item.entity.value}})
+            for item in self.entities.all()
+        ]
+
 
 class RepositoryTranslatedExampleManager(models.Manager):
     def create(

diff --git a/bothub/common/tests.py b/bothub/common/tests.py
@@ -959,6 +959,7 @@ def test_empty_intent(self):
         RepositoryExampleEntity.objects.create(
             repository_example=example, start=0, end=7, entity="name"
         )
+
         self.assertFalse(self.repository.current_version().ready_for_train)
 
     def test_intent_dont_have_min_examples(self):

diff --git a/bothub/settings.py b/bothub/settings.py
@@ -80,7 +80,9 @@
     REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT=(int, 200),
     ELASTICSEARCH_DSL=(str, "localhost:9200"),
     ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=(str, "ai_repositorynlplog"),
+    USE_ELASTICSEARCH=(bool, True),
     ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=(str, "ai_repositoryqanlplog"),
+    ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=(str, "ai_repositorybasicexample"),
     ELASTICSEARCH_NUMBER_OF_SHARDS=(int, 1),
     ELASTICSEARCH_NUMBER_OF_REPLICAS=(int, 0),
     ELASTICSEARCH_SIGNAL_PROCESSOR=(str, "realtime"),
@@ -442,10 +444,14 @@
 REPOSITORY_RESTRICT_ACCESS_NLP_LOGS = env.list("REPOSITORY_RESTRICT_ACCESS_NLP_LOGS")
 
 # Limit of characters for the knowledge base description
-REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT = env.list("REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT", default=450)
+# Read as an int: env.list would turn a configured value into a list of
+# strings instead of a numeric limit.
+REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT = env.int(
+    "REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT", default=450
+)
 
 # Limit of words for the example sentence
-REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT = env.list("REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT", default=200)
+# Read as an int: env.list would turn a configured value into a list of
+# strings instead of a numeric limit.
+REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT = env.int(
+    "REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT", default=200
+)
 
 
 # django_redis
@@ -522,6 +528,8 @@
     "default": {"hosts": env.str("ELASTICSEARCH_DSL", default="es:9200")}
 }
 
+USE_ELASTICSEARCH = env.bool("USE_ELASTICSEARCH", default=True)
+
 ELASTICSEARCH_DSL_INDEX_SETTINGS = {
     "number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1),
     "number_of_replicas": env.int("ELASTICSEARCH_NUMBER_OF_REPLICAS", default=0),
@@ -534,6 +542,10 @@
     "bothub.common.documents.repositoryqanlplog": env.str(
         "ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX", default="ai_repositoryqanlplog"
     ),
+    "bothub.common.documents.repositorybasicexample": env.str(
+        "ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX",
+        default="ai_repositorybasicexample",
+    ),
 }
 
 ELASTICSEARCH_SIGNAL_PROCESSOR_CLASSES = {

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -106,8 +106,10 @@ services:
       - ELASTICSEARCH_NUMBER_OF_SHARDS=${ELASTICSEARCH_NUMBER_OF_SHARDS:-1}
       - ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0}
       - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog}
-      - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
+      - ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
+      - ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample}
       - ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery}
+      - USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true}
 
   bothub-engine-celery-redis:
     image: redis