Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Structural Annotations and Vector Embeddings #116

Merged
merged 16 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .envs/.test/.django
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ CELERY_FLOWER_PASSWORD=U3Md7XEGNZ67HlNwHon8fbwiT0GemPeCrwDubZ6BRvX3dwxMEsLhuLoiU
# AUTH0
# ------------------------------------------------------------------------------
USE_AUTH0=false

# Turn on Embeddings Microservice
# ------------------------------------------------------------------------------
6 changes: 3 additions & 3 deletions .envs/.test/.postgres
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
POSTGRES_HOST=postgres
POSTGRES_PORT=5432
POSTGRES_DB=opencontractserver
POSTGRES_GREMLIN_DB=gremlin_engine_db
POSTGRES_USER=layRHIreDzsPFYERJCLpSCkivldgXLIZ
POSTGRES_PASSWORD=m9gtttr2FO2pMxmvIYS5j6ky9CkoS4k8mwFqSqja1QsWY9kMd0TdCEyvxRMdEcSG
POSTGRES_GREMLIN_DB=ocdb
POSTGRES_USER=oc_user
POSTGRES_PASSWORD=dgj902j43rtrgkmn
6 changes: 6 additions & 0 deletions .idea/other.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion compose/production/postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
FROM postgres:15.2
FROM pgvector/pgvector:pg15

RUN apt-get update && apt-get install -y git build-essential postgresql-server-dev-15
RUN cd /tmp && git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git && cd pgvector && make && make install # may need sudo

COPY ./compose/production/postgres/maintenance /usr/local/bin/maintenance
RUN chmod +x /usr/local/bin/maintenance/*
Expand Down
1 change: 1 addition & 0 deletions compose/production/postgres/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION vector;
9 changes: 3 additions & 6 deletions config/graphql/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import django_filters
from django.contrib.auth import get_user_model
from django.db.models import Q
from django_filters import OrderingFilter
from django_filters import rest_framework as filters
from graphql_relay import from_global_id
from django_filters import FilterSet, OrderingFilter

from opencontractserver.analyzer.models import Analysis, Analyzer, GremlinEngine
from opencontractserver.annotations.models import (
Expand Down Expand Up @@ -136,6 +136,7 @@ class AnnotationFilter(django_filters.FilterSet):
uses_label_from_labelset_id = django_filters.CharFilter(
method="filter_by_label_from_labelset_id"
)

def filter_by_label_from_labelset_id(self, queryset, info, value):
django_pk = from_global_id(value)[1]
return queryset.filter(annotation_label__included_in_labelset=django_pk)
Expand Down Expand Up @@ -177,11 +178,7 @@ def filter_by_created_with_analyzer_id(self, queryset, info, value):
else:
return queryset

order_by = OrderingFilter(
fields=(
('modified', 'modified'),
)
)
order_by = OrderingFilter(fields=(("modified", "modified"),))

class Meta:
model = Annotation
Expand Down
2 changes: 2 additions & 0 deletions config/graphql/graphene_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class AnnotationType(AnnotatePermissionsForReadMixin, ModelType):
class Meta:
model = Annotation
interfaces = [relay.Node]
exclude = ("embedding",)
connection_class = CountableConnection

# In order for filter options to show up in nested resolvers, you need to specify them
Expand Down Expand Up @@ -166,6 +167,7 @@ def resolve_pawls_parse_file(self, info):
class Meta:
model = Document
interfaces = [relay.Node]
exclude = ("embedding",)
connection_class = CountableConnection


Expand Down
8 changes: 4 additions & 4 deletions config/graphql/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ class Query(graphene.ObjectType):

# ANNOTATION RESOLVERS #####################################
annotations = DjangoFilterConnectionField(
AnnotationType,
filterset_class=AnnotationFilter
AnnotationType, filterset_class=AnnotationFilter
)

def resolve_annotations(self, info, **kwargs):
Expand Down Expand Up @@ -132,8 +131,9 @@ def resolve_bulk_doc_annotations_in_corpus(self, info, corpus_id, **kwargs):

print(f"Base queryset: {queryset}")

# Now build query to stuff they want to see
q_objects = Q(corpus_id=corpus_django_pk)
# Now build the query for what they want to see (filter to annotations in this corpus or with NO corpus FK,
# which travel with the document).
q_objects = Q(corpus_id=corpus_django_pk) | Q(corpus_id__isnull=True)

# If for_analysis_ids is passed in, only show annotations from those analyses, otherwise only show human
# annotations.
Expand Down
12 changes: 10 additions & 2 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,19 @@

# Nlm-ingestor settings
# -----------------------------------------------------------------------------
NLM_INGESTOR_ACTIVE = env.bool('NLM_INGESTOR_ACTIVE', False) # Use nlm-ingestor where this is True... otherwise PAWLs
NLM_INGESTOR_ACTIVE = env.bool(
"NLM_INGESTOR_ACTIVE", False
) # Use nlm-ingestor where this is True... otherwise PAWLs
NLM_INGEST_USE_OCR = False # IF True, always tell nlm-ingestor to use OCR (Tesseract)
NLM_INGEST_HOSTNAME = "http://nlm-ingestor:5001" # Hostname to send nlm-ingestor REST requests to
NLM_INGEST_HOSTNAME = (
"http://nlm-ingestor:5001" # Hostname to send nlm-ingestor REST requests to
)
NLM_INGEST_API_KEY = None # If the endpoint is secured with an API_KEY, specify it here, otherwise use None

# Embeddings / Semantic Search
EMBEDDINGS_MICROSERVICE_URL = "http://vector-embedder:8000"
VECTOR_EMBEDDER_API_KEY = "abc123"

# CORS
# ------------------------------------------------------------------------------
CORS_ORIGIN_WHITELIST = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,3 @@ graph TD
Q -->|Error| S[Display error message]
R --> T[Re-render components with updated annotations]
```


4 changes: 4 additions & 0 deletions docs/sample_env_files/backend/local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,7 @@ CELERY_FLOWER_PASSWORD=
# AUTH0
# ------------------------------------------------------------------------------
USE_AUTH0=False

# NLM Parser
# ------------------------------------------------------------------------------
NLM_INGESTOR_ACTIVE=True
36 changes: 18 additions & 18 deletions docs/walkthrough/advanced/export-import-corpuses.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ If you've enabled corpus imports (see the **frontend** env file for the boolean
## OpenContracts Export Format Specification

The OpenContracts export is a zip archive containing:
1. A `data.json` file with metadata about the export
1. A `data.json` file with metadata about the export
2. The original PDF documents
3. Exported annotations "burned in" to the PDF documents

Expand All @@ -38,19 +38,19 @@ The `data.json` file contains a JSON object with the following fields:

* `corpus` (OpenContractCorpusType): Metadata about the exported corpus, with fields:
- `id` (int): ID of the corpus
- `title` (string)
- `title` (string)
- `description` (string)
- `icon_name` (string): Filename of the corpus icon image
- `icon_data` (string): Base64 encoded icon image data
- `creator` (string): Email of the corpus creator
- `label_set` (string): ID of the labelset used by this corpus

* `label_set` (OpenContractsLabelSetType): Metadata about the label set, with fields:
- `id` (int)
- `title` (string)
- `title` (string)
- `description` (string)
- `icon_name` (string): Filename of the labelset icon
- `icon_data` (string): Base64 encoded labelset icon data
- `icon_data` (string): Base64 encoded labelset icon data
- `creator` (string): Email of the labelset creator


Expand All @@ -61,7 +61,7 @@ Each document in `annotated_docs` is represented by an OpenContractDocExport obj
* `doc_labels` (list[string]): List of document label names applied to this doc
* `labelled_text` (list[OpenContractsAnnotationPythonType]): List of text annotations
* `title` (string): Document title
* `content` (string): Full text content of the document
* `content` (string): Full text content of the document
* `description` (string): Description of the document
* `pawls_file_content` (list[PawlsPagePythonType]): PAWLS parse data for each page
* `page_count` (int): Number of pages in the document
Expand All @@ -70,10 +70,10 @@ Each document in `annotated_docs` is represented by an OpenContractDocExport obj

Represents an individual text annotation, with fields:

* `id` (string): Optional ID
* `id` (string): Optional ID
* `annotationLabel` (string): Name of the label for this annotation
* `rawText` (string): Raw text content of the annotation
* `page` (int): 0-based page number the annotation is on
* `page` (int): 0-based page number the annotation is on
* `annotation_json` (dict): Maps page numbers to OpenContractsSinglePageAnnotationType

### OpenContractsSinglePageAnnotationType Format
Expand All @@ -89,11 +89,11 @@ Represents the annotation data for a single page:
Represents a bounding box with fields:

* `top` (int)
* `bottom` (int)
* `bottom` (int)
* `left` (int)
* `right` (int)

### TokenIdPythonType Format
### TokenIdPythonType Format

References a PAWLS token by page and token index:

Expand All @@ -109,7 +109,7 @@ Represents PAWLS parse data for a single page:

### PawlsPageBoundaryPythonType Format

Represents the page boundary with fields:
Represents the page boundary with fields:

* `width` (float)
* `height` (float)
Expand All @@ -119,24 +119,24 @@ Represents the page boundary with fields:

Represents a single PAWLS token with fields:

* `x` (float): X-coordinate of token box
* `x` (float): X-coordinate of token box
* `y` (float): Y-coordinate of token box
* `width` (float): Width of token box
* `height` (float): Height of token box
* `height` (float): Height of token box
* `text` (string): Text content of the token

### AnnotationLabelPythonType Format

Defines an annotation label with fields:

* `id` (string)
* `color` (string): Hex color for the label
* `description` (string)
* `color` (string): Hex color for the label
* `description` (string)
* `icon` (string): Icon name
* `text` (string): Label text
* `label_type` (LabelType): One of DOC_TYPE_LABEL, TOKEN_LABEL, RELATIONSHIP_LABEL, METADATA_LABEL

### Example data.json
### Example data.json
```json
{
"annotated_docs": {
Expand Down Expand Up @@ -212,7 +212,7 @@ Defines an annotation label with fields:
"label_type": "DOC_TYPE_LABEL"
},
"NDA": {
"id": "2",
"id": "2",
"color": "#00FF00",
"description": "Indicates a non-disclosure agreement",
"icon": "nda",
Expand Down Expand Up @@ -241,7 +241,7 @@ Defines an annotation label with fields:
},
"label_set": {
"id": "4",
"title": "Example Label Set",
"title": "Example Label Set",
"description": "A sample label set",
"icon_name": "label_icon.png",
"icon_data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAACklEQVR4nGMAAQAABQABDQottAAAAABJRU5ErkJggg==",
Expand Down
4 changes: 2 additions & 2 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ EXPOSE 3000

# COPY .env file and shell script to container
WORKDIR /usr/share/nginx/html
COPY ./env.sh .
COPY ./env.sh env.sh
COPY .env .

# Make shell script executable
RUN chmod +x env.sh

# Start Nginx server
CMD ["/bin/sh", "-c", "/usr/share/nginx/html/env.sh && nginx -g \"daemon off;\""]
CMD ["/bin/sh", "-c", "nginx -g \"daemon off;\""]
2 changes: 2 additions & 0 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"fuse.js": "^6.5.3",
"graphql": "^16.2.0",
"lodash": "^4.17.21",
"lodash.uniqueid": "^4.0.1",
"pdfjs-dist": "^2.13.216",
"react": "16.14.0",
"react-beautiful-dnd": "^13.1.0",
Expand Down Expand Up @@ -77,6 +78,7 @@
"**/*": "prettier --write --ignore-unknown"
},
"devDependencies": {
"@types/lodash.uniqueid": "^4.0.9",
"@types/uuid": "^8.3.4",
"husky": "^8.0.1",
"prettier": "^2.7.1"
Expand Down
9 changes: 8 additions & 1 deletion frontend/src/components/annotator/Annotator.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,11 @@ export const Annotator = ({
}
setDocTypeLabels(Object.values(document_label_lookup));

// This is where the annotations start loading.
// Turn existing annotation data into ServerAnnotation objs and inject into state:
let annotation_objs: ServerAnnotation[] = [];

// Case 1 is where an "Analysis" is not selected
if (
annotator_data?.existingTextAnnotations &&
selected_analysis_ids?.length === 0
Expand All @@ -619,7 +622,9 @@ export const Annotator = ({
)
);
// console.log("Got manual annotation objs: ", annotation_objs);
} else if (
}
// If an analysis is selected... load THOSE annotations
else if (
selected_analysis_ids &&
selected_analysis_ids.length > 0 &&
annotator_data?.existingTextAnnotations
Expand All @@ -639,6 +644,7 @@ export const Annotator = ({
);
}

// Load doc-level labels
let doc_type_annotations: DocTypeAnnotation[] = [];

if (annotator_data?.existingDocLabelAnnotations) {
Expand All @@ -659,6 +665,7 @@ export const Annotator = ({
);
}

// Load relationship level labels
let relationship_annotations: RelationGroup[] = [];

if (annotator_data?.existingRelationships) {
Expand Down
Loading
Loading