Merge 7ecd869 into 3639b77
helllllllder authored Jan 20, 2022
2 parents 3639b77 + 7ecd869 commit daee949
Showing 8 changed files with 318 additions and 11 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -25,6 +25,13 @@
[![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/)
[![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE)

# Requirements

* Python (3.6)
* Pipenv
* Docker
* Docker-compose

## Development

Use ```make``` commands to ```check_environment```, ```install_requirements```, ```lint```, ```test```, ```migrate```, ```start```, ```migrations``` and ```collectstatic```.
@@ -163,9 +170,12 @@ You can set environment variables in your OS, write on ```.env``` file or pass v
| ELASTICSEARCH_DSL | ```string``` | ```es:9200``` | Elasticsearch URL.
| ELASTICSEARCH_NUMBER_OF_SHARDS | ```int``` | ```1``` | Specify the number of shards for the indexes.
| ELASTICSEARCH_NUMBER_OF_REPLICAS | ```int``` | ```1``` | Specify the number of replicas for the indexes.
| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document.
| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```ai_repositorynlplog``` | Specify the index title for the RepositoryNLPLog document.
| ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX | ```string``` | ```ai_repositoryqanlplog``` | Specify the index title for the RepositoryQANLPLog document.
| ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX | ```string``` | ```ai_repositorybasicexample``` | Specify the index title for the RepositoryBasicExample document.
| ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data.
| GUNICORN_WORKERS | ```int``` | ```multiprocessing.cpu_count() * 2 + 1``` | Number of Gunicorn workers.
| USE_ELASTICSEARCH | ```boolean``` | ```true``` | Whether ```requirements_to_train``` uses Elasticsearch (```true```) or PostgreSQL (```false```) to compute training requirements; see the sketch below the table.
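
For reference, here is a minimal sketch of reading this flag with django-environ (the ```env``` helper used in ```bothub/settings.py```); the variable name and default are real, the surrounding script is illustrative:

```python
# Minimal sketch: reading USE_ELASTICSEARCH with django-environ.
# The scheme tuple mirrors bothub/settings.py; the rest is illustrative.
import environ

env = environ.Env(USE_ELASTICSEARCH=(bool, True))  # type and default for the flag

USE_ELASTICSEARCH = env.bool("USE_ELASTICSEARCH", default=True)
print(USE_ELASTICSEARCH)  # True unless overridden in the environment
```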


## Roadmap
2 changes: 2 additions & 0 deletions bothub/common/documents/__init__.py
@@ -1,7 +1,9 @@
from bothub.common.documents.repositorynlplog import RepositoryNLPLogDocument
from bothub.common.documents.repositoryqanlplog import RepositoryQANLPLogDocument
from bothub.common.documents.repositorybasicexample import RepositoryExampleDocument

__all__ = (
"RepositoryNLPLogDocument",
"RepositoryQANLPLogDocument",
"RepositoryExampleDocument"
)
68 changes: 68 additions & 0 deletions bothub/common/documents/repositorybasicexample.py
@@ -0,0 +1,68 @@
from django.conf import settings
from django_elasticsearch_dsl import Document, Index, fields

from bothub.common.models import (
RepositoryExample,
RepositoryExampleEntity,
RepositoryIntent,
RepositoryVersionLanguage,
)

REPOSITORYBASICEXAMPLE_INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__])


@REPOSITORYBASICEXAMPLE_INDEX.doc_type
class RepositoryExampleDocument(Document):
repository_version_language = fields.ObjectField(
properties={
"pk": fields.IntegerField(),
"language": fields.TextField(fields={"raw": fields.KeywordField()}),
}
)
intent = fields.ObjectField(
properties={"text": fields.TextField(fields={"raw": fields.KeywordField()})}
)
entities = fields.NestedField(
properties={
"entity": fields.ObjectField(
properties={
"value": fields.TextField(fields={"raw": fields.KeywordField()}),
}
),
}
)
pk = fields.IntegerField()

class Django:
model = RepositoryExample
fields = [
"id",
"text",
]
related_models = [
RepositoryVersionLanguage,
RepositoryIntent,
RepositoryExampleEntity,
]

def get_queryset(self):
return (
super(RepositoryExampleDocument, self)
.get_queryset()
.select_related(
"repository_version_language",
"intent",
)
.prefetch_related(
"entities",
"translations",
)
)

def get_instances_from_related(self, related_instance):
if isinstance(related_instance, RepositoryVersionLanguage):
return related_instance.added.all()
elif isinstance(related_instance, RepositoryIntent):
return related_instance.repositoryexample_set.all()
elif isinstance(related_instance, RepositoryExampleEntity):
return related_instance.repository_example
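
Taken together, the document indexes each ```RepositoryExample``` with its version language, intent, and entity values. Below is a minimal usage sketch, assuming a reachable Elasticsearch cluster with the index already built (the pk value is illustrative); it mirrors how ```models.py``` further down queries this document:

```python
# Minimal sketch of querying the new document; assumes Django settings are
# configured and the ai_repositorybasicexample index has been built.
from bothub.common.documents import RepositoryExampleDocument

version_language_pk = 42  # illustrative pk of a RepositoryVersionLanguage

search = RepositoryExampleDocument.search().query(
    "match", repository_version_language__pk=version_language_pk
)

for hit in search:  # iterating the Search object executes it
    print(hit.pk, hit.text, hit.intent.text)
```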
144 changes: 137 additions & 7 deletions bothub/common/models.py
@@ -14,6 +14,8 @@
from django.utils import timezone
from django.utils.translation import ugettext_lazy as _
from django_elasticsearch_dsl_drf.wrappers import dict_to_obj
from elasticsearch_dsl import A
from elasticsearch_dsl import Q as elasticQ
from rest_framework import status
from rest_framework.exceptions import APIException

@@ -1064,20 +1066,125 @@ def examples(self):
return examples.distinct()

@property
def requirements_to_train(self):
def _search_weak_intents_and_entities(self):
from bothub.common.documents import RepositoryExampleDocument

search = RepositoryExampleDocument.search().query(
"match", repository_version_language__pk=self.pk
)
search.update_from_dict({"size": 0})

duplicated_limit_bucket = A(
"bucket_selector",
buckets_path={"doc_count": "_count"},
script=f"params.doc_count < {self.MIN_EXAMPLES_PER_INTENT}",
)

search.aggs.bucket("duplicated_intents", "terms", field="intent.text.raw")
search.aggs["duplicated_intents"].bucket(
"filter_duplicated_intent_limit", duplicated_limit_bucket
)
search.aggs.bucket(
"duplicated_intents_stats",
"stats_bucket",
buckets_path="duplicated_intents._count",
)

search.aggs.bucket("nested_entities", "nested", path="entities")
search.aggs["nested_entities"].bucket(
"duplicated_entities", "terms", field="entities.entity.value.raw"
)
search.aggs["nested_entities"]["duplicated_entities"].bucket(
"filter_duplicated_entity_limit", duplicated_limit_bucket
)
search.aggs["nested_entities"].bucket(
"duplicated_entities_stats",
"stats_bucket",
buckets_path="duplicated_entities._count",
)

return search.execute()

@property
def _does_all_examples_have_intents(self):
from bothub.common.documents import RepositoryExampleDocument

search = RepositoryExampleDocument.search().query(
"bool",
must=[
elasticQ("match", intent__text__raw=""),
elasticQ("match", repository_version_language__pk=self.pk),
],
)
        return search.execute().hits.total.value == 0

@property
def _elasticsearch_requirements_to_train(self):
try:
self.validate_init_train()
except RepositoryUpdateAlreadyTrained: # pragma: no cover
return [_("This bot version has already been trained.")]
except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover
return [_("This bot version is being trained.")]

r = []
warnings = []

if not self._does_all_examples_have_intents:
warnings.append(_("All examples need to have a intent."))

search_result = self._search_weak_intents_and_entities

weak_intents_count = search_result.aggregations.duplicated_intents_stats.count
weak_intents = search_result.aggregations.duplicated_intents.buckets

if weak_intents_count > 0:
for intent in weak_intents:
warnings.append(
_(
'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
).format(
intent["key"],
intent["doc_count"],
self.MIN_EXAMPLES_PER_INTENT,
)
)

weak_entities_count = (
search_result.aggregations.nested_entities.duplicated_entities_stats.count
)
weak_entities = (
search_result.aggregations.nested_entities.duplicated_entities.buckets
)

if weak_entities_count > 0:
for intent in weak_entities:
warnings.append(
_(
'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
).format(
intent["key"],
intent["doc_count"],
self.MIN_EXAMPLES_PER_INTENT,
)
)

return warnings

@property
def _relational_requirements_to_train(self):
try:
self.validate_init_train()
except RepositoryUpdateAlreadyTrained: # pragma: no cover
return [_("This bot version has already been trained.")]
except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover
return [_("This bot version is being trained.")]

warnings = []

intents = self.examples.values_list("intent__text", flat=True)

if "" in intents:
r.append(_("All examples need have a intent."))
warnings.append(_("All examples need to have a intent."))

weak_intents = (
self.examples.values("intent__text")
@@ -1088,7 +1195,7 @@ def requirements_to_train(self):

if weak_intents.exists():
for i in weak_intents:
r.append(
warnings.append(
_(
'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
).format(
@@ -1108,7 +1215,7 @@ def requirements_to_train(self):

if weak_entities.exists():
for e in weak_entities:
r.append(
warnings.append(
_(
'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
).format(
@@ -1118,7 +1225,14 @@
)
)

return r
return warnings

@property
def requirements_to_train(self):
if settings.USE_ELASTICSEARCH:
return self._elasticsearch_requirements_to_train
else:
return self._relational_requirements_to_train

@property
def ready_for_train(self):
@@ -1462,7 +1576,7 @@ def get_text(self, language=None): # pragma: no cover
return self.text
return self.get_translation(language).text

def get_entities(self, language): # pragma: no cover
def get_entities(self, language=None): # pragma: no cover
if not language or language == self.repository_version_language.language:
return self.entities.all()
return self.get_translation(language).entities.all()
@@ -1483,6 +1597,22 @@ def delete(self, using=None, keep_parents=False):

return instance

@property
def entities_field_indexing(self):
entities = self.entities.all()
entity_reduced_list = []
for entity in entities:
reduced_entity_obj = dict_to_obj(
{
"entity": {
"value": entity.entity.value,
},
}
)
entity_reduced_list.append(reduced_entity_obj)

return entity_reduced_list


class RepositoryTranslatedExampleManager(models.Manager):
def create(
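
The heart of the new Elasticsearch path above is the ```bucket_selector``` pipeline aggregation: a ```terms``` aggregation buckets examples by ```intent.text.raw``` (and, inside a ```nested``` aggregation, by ```entities.entity.value.raw```), the selector keeps only buckets whose ```doc_count``` is below ```MIN_EXAMPLES_PER_INTENT```, and a ```stats_bucket``` reports how many weak buckets remain. Here is a standalone sketch that builds the same request body for the intent side; no cluster is needed to inspect it, and the index name and minimum are illustrative:

```python
# Standalone sketch of the weak-intent aggregation; to_dict() only builds
# the request body, so no Elasticsearch cluster is required to inspect it.
from elasticsearch_dsl import A, Search

MIN_EXAMPLES_PER_INTENT = 2  # illustrative value

search = Search(index="ai_repositorybasicexample").extra(size=0)

# Keep only intent buckets with fewer documents than the minimum.
weak_bucket = A(
    "bucket_selector",
    buckets_path={"doc_count": "_count"},
    script=f"params.doc_count < {MIN_EXAMPLES_PER_INTENT}",
)

search.aggs.bucket("duplicated_intents", "terms", field="intent.text.raw")
search.aggs["duplicated_intents"].bucket("filter_duplicated_intent_limit", weak_bucket)
search.aggs.bucket(
    "duplicated_intents_stats", "stats_bucket", buckets_path="duplicated_intents._count"
)

print(search.to_dict())  # the aggregation body that would be sent to Elasticsearch
```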
1 change: 1 addition & 0 deletions bothub/common/tests.py
@@ -959,6 +959,7 @@ def test_empty_intent(self):
RepositoryExampleEntity.objects.create(
repository_example=example, start=0, end=7, entity="name"
)

self.assertFalse(self.repository.current_version().ready_for_train)

def test_intent_dont_have_min_examples(self):
16 changes: 14 additions & 2 deletions bothub/settings.py
@@ -80,7 +80,9 @@
REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT=(int, 200),
ELASTICSEARCH_DSL=(str, "localhost:9200"),
ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=(str, "ai_repositorynlplog"),
USE_ELASTICSEARCH=(bool, True),
ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=(str, "ai_repositoryqanlplog"),
ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=(str, "ai_repositorybasicexample"),
ELASTICSEARCH_NUMBER_OF_SHARDS=(int, 1),
ELASTICSEARCH_NUMBER_OF_REPLICAS=(int, 0),
ELASTICSEARCH_SIGNAL_PROCESSOR=(str, "realtime"),
@@ -442,10 +444,14 @@
REPOSITORY_RESTRICT_ACCESS_NLP_LOGS = env.list("REPOSITORY_RESTRICT_ACCESS_NLP_LOGS")

# Limit of characters for the knowledge base description
REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT = env.list("REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT", default=450)
REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT = env.list(
"REPOSITORY_KNOWLEDGE_BASE_DESCRIPTION_LIMIT", default=450
)

# Limit of words for the example sentence
REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT = env.list("REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT", default=200)
REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT = env.list(
"REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT", default=200
)


# django_redis
@@ -522,6 +528,8 @@
"default": {"hosts": env.str("ELASTICSEARCH_DSL", default="es:9200")}
}

USE_ELASTICSEARCH = env.bool("USE_ELASTICSEARCH", default=True)

ELASTICSEARCH_DSL_INDEX_SETTINGS = {
"number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1),
"number_of_replicas": env.int("ELASTICSEARCH_NUMBER_OF_REPLICAS", default=0),
@@ -534,6 +542,10 @@
"bothub.common.documents.repositoryqanlplog": env.str(
"ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX", default="ai_repositoryqanlplog"
),
"bothub.common.documents.repositorybasicexample": env.str(
"ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX",
default="ai_repositorybasicexample",
),
}

ELASTICSEARCH_SIGNAL_PROCESSOR_CLASSES = {
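
Note that ```ELASTICSEARCH_INDEX_NAMES``` is keyed by dotted module path, which is why ```repositorybasicexample.py``` above can resolve its own index with ```settings.ELASTICSEARCH_INDEX_NAMES[__name__]```. A minimal sketch of that lookup (plain dict, no Django required):

```python
# Minimal sketch of the __name__-based index lookup used by the documents
# modules; the mapping mirrors the settings entry added above.
ELASTICSEARCH_INDEX_NAMES = {
    "bothub.common.documents.repositorybasicexample": "ai_repositorybasicexample",
}

# Inside bothub/common/documents/repositorybasicexample.py, __name__ is that
# dotted path, so the module picks up its configured index name.
print(ELASTICSEARCH_INDEX_NAMES["bothub.common.documents.repositorybasicexample"])
```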
4 changes: 3 additions & 1 deletion docker-compose.yml
@@ -106,8 +106,10 @@ services:
- ELASTICSEARCH_NUMBER_OF_SHARDS=${ELASTICSEARCH_NUMBER_OF_SHARDS:-1}
- ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0}
- ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog}
- ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
- ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
- ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample}
- ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery}
- USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true}

bothub-engine-celery-redis:
image: redis