Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Structural Annotations and Vector Embeddings #116

Merged
merged 16 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .envs/.test/.django
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ CELERY_FLOWER_PASSWORD=U3Md7XEGNZ67HlNwHon8fbwiT0GemPeCrwDubZ6BRvX3dwxMEsLhuLoiU
# AUTH0
# ------------------------------------------------------------------------------
USE_AUTH0=false

# Turn on Embeddings Microservice
# ------------------------------------------------------------------------------
6 changes: 3 additions & 3 deletions .envs/.test/.postgres
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
POSTGRES_HOST=postgres
POSTGRES_PORT=5432
POSTGRES_DB=opencontractserver
POSTGRES_GREMLIN_DB=gremlin_engine_db
POSTGRES_USER=layRHIreDzsPFYERJCLpSCkivldgXLIZ
POSTGRES_PASSWORD=m9gtttr2FO2pMxmvIYS5j6ky9CkoS4k8mwFqSqja1QsWY9kMd0TdCEyvxRMdEcSG
POSTGRES_GREMLIN_DB=ocdb
POSTGRES_USER=oc_user
POSTGRES_PASSWORD=dgj902j43rtrgkmn
6 changes: 6 additions & 0 deletions .idea/other.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion compose/production/postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
FROM postgres:15.2
FROM pgvector/pgvector:pg15

RUN apt-get update && apt-get install -y git build-essential postgresql-server-dev-15
RUN cd /tmp && git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git && cd pgvector && make && make install # may need sudo

COPY ./compose/production/postgres/maintenance /usr/local/bin/maintenance
RUN chmod +x /usr/local/bin/maintenance/*
Expand Down
1 change: 1 addition & 0 deletions compose/production/postgres/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION vector;
9 changes: 3 additions & 6 deletions config/graphql/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import django_filters
from django.contrib.auth import get_user_model
from django.db.models import Q
from django_filters import OrderingFilter
from django_filters import rest_framework as filters
from graphql_relay import from_global_id
from django_filters import FilterSet, OrderingFilter

from opencontractserver.analyzer.models import Analysis, Analyzer, GremlinEngine
from opencontractserver.annotations.models import (
Expand Down Expand Up @@ -136,6 +136,7 @@ class AnnotationFilter(django_filters.FilterSet):
uses_label_from_labelset_id = django_filters.CharFilter(
method="filter_by_label_from_labelset_id"
)

def filter_by_label_from_labelset_id(self, queryset, info, value):
django_pk = from_global_id(value)[1]
return queryset.filter(annotation_label__included_in_labelset=django_pk)
Expand Down Expand Up @@ -177,11 +178,7 @@ def filter_by_created_with_analyzer_id(self, queryset, info, value):
else:
return queryset

order_by = OrderingFilter(
fields=(
('modified', 'modified'),
)
)
order_by = OrderingFilter(fields=(("modified", "modified"),))

class Meta:
model = Annotation
Expand Down
2 changes: 2 additions & 0 deletions config/graphql/graphene_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class AnnotationType(AnnotatePermissionsForReadMixin, ModelType):
class Meta:
model = Annotation
interfaces = [relay.Node]
exclude = ("embedding",)
connection_class = CountableConnection

# In order for filter options to show up in nested resolvers, you need to specify them
Expand Down Expand Up @@ -166,6 +167,7 @@ def resolve_pawls_parse_file(self, info):
class Meta:
model = Document
interfaces = [relay.Node]
exclude = ("embedding",)
connection_class = CountableConnection


Expand Down
8 changes: 4 additions & 4 deletions config/graphql/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ class Query(graphene.ObjectType):

# ANNOTATION RESOLVERS #####################################
annotations = DjangoFilterConnectionField(
AnnotationType,
filterset_class=AnnotationFilter
AnnotationType, filterset_class=AnnotationFilter
)

def resolve_annotations(self, info, **kwargs):
Expand Down Expand Up @@ -132,8 +131,9 @@ def resolve_bulk_doc_annotations_in_corpus(self, info, corpus_id, **kwargs):

print(f"Base queryset: {queryset}")

# Now build query to stuff they want to see
q_objects = Q(corpus_id=corpus_django_pk)
# Now build the query for what they want to see (filter to annotations in this corpus or with NO corpus FK,
# which travel with the document).
q_objects = Q(corpus_id=corpus_django_pk) | Q(corpus_id__isnull=True)

# If for_analysis_ids is passed in, only show annotations from those analyses, otherwise only show human
# annotations.
Expand Down
12 changes: 10 additions & 2 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,19 @@

# Nlm-ingestor settings
# -----------------------------------------------------------------------------
NLM_INGESTOR_ACTIVE = env.bool('NLM_INGESTOR_ACTIVE', False) # Use nlm-ingestor where this is True... otherwise PAWLs
NLM_INGESTOR_ACTIVE = env.bool(
"NLM_INGESTOR_ACTIVE", False
) # Use nlm-ingestor where this is True... otherwise PAWLs
NLM_INGEST_USE_OCR = False # IF True, always tell nlm-ingestor to use OCR (Tesseract)
NLM_INGEST_HOSTNAME = "http://nlm-ingestor:5001" # Hostname to send nlm-ingestor REST requests to
NLM_INGEST_HOSTNAME = (
"http://nlm-ingestor:5001" # Hostname to send nlm-ingestor REST requests to
)
NLM_INGEST_API_KEY = None # If the endpoint is secured with an API_KEY, specify it here, otherwise use None

# Embeddings / Semantic Search
EMBEDDINGS_MICROSERVICE_URL = "http://vector-embedder:8000"
VECTOR_EMBEDDER_API_KEY = "abc123"

# CORS
# ------------------------------------------------------------------------------
CORS_ORIGIN_WHITELIST = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,3 @@ graph TD
Q -->|Error| S[Display error message]
R --> T[Re-render components with updated annotations]
```


4 changes: 4 additions & 0 deletions docs/sample_env_files/backend/local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,7 @@ CELERY_FLOWER_PASSWORD=
# AUTH0
# ------------------------------------------------------------------------------
USE_AUTH0=False

# NLM Parser
# ------------------------------------------------------------------------------
NLM_INGESTOR_ACTIVE=True
36 changes: 18 additions & 18 deletions docs/walkthrough/advanced/export-import-corpuses.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ If you've enabled corpus imports (see the **frontend** env file for the boolean
## OpenContracts Export Format Specification

The OpenContracts export is a zip archive containing:
1. A `data.json` file with metadata about the export
1. A `data.json` file with metadata about the export
2. The original PDF documents
3. Exported annotations "burned in" to the PDF documents

Expand All @@ -38,19 +38,19 @@ The `data.json` file contains a JSON object with the following fields:

* `corpus` (OpenContractCorpusType): Metadata about the exported corpus, with fields:
- `id` (int): ID of the corpus
- `title` (string)
- `title` (string)
- `description` (string)
- `icon_name` (string): Filename of the corpus icon image
- `icon_data` (string): Base64 encoded icon image data
- `creator` (string): Email of the corpus creator
- `label_set` (string): ID of the labelset used by this corpus

* `label_set` (OpenContractsLabelSetType): Metadata about the label set, with fields:
- `id` (int)
- `title` (string)
- `title` (string)
- `description` (string)
- `icon_name` (string): Filename of the labelset icon
- `icon_data` (string): Base64 encoded labelset icon data
- `icon_data` (string): Base64 encoded labelset icon data
- `creator` (string): Email of the labelset creator


Expand All @@ -61,7 +61,7 @@ Each document in `annotated_docs` is represented by an OpenContractDocExport obj
* `doc_labels` (list[string]): List of document label names applied to this doc
* `labelled_text` (list[OpenContractsAnnotationPythonType]): List of text annotations
* `title` (string): Document title
* `content` (string): Full text content of the document
* `content` (string): Full text content of the document
* `description` (string): Description of the document
* `pawls_file_content` (list[PawlsPagePythonType]): PAWLS parse data for each page
* `page_count` (int): Number of pages in the document
Expand All @@ -70,10 +70,10 @@ Each document in `annotated_docs` is represented by an OpenContractDocExport obj

Represents an individual text annotation, with fields:

* `id` (string): Optional ID
* `id` (string): Optional ID
* `annotationLabel` (string): Name of the label for this annotation
* `rawText` (string): Raw text content of the annotation
* `page` (int): 0-based page number the annotation is on
* `page` (int): 0-based page number the annotation is on
* `annotation_json` (dict): Maps page numbers to OpenContractsSinglePageAnnotationType

### OpenContractsSinglePageAnnotationType Format
Expand All @@ -89,11 +89,11 @@ Represents the annotation data for a single page:
Represents a bounding box with fields:

* `top` (int)
* `bottom` (int)
* `bottom` (int)
* `left` (int)
* `right` (int)

### TokenIdPythonType Format
### TokenIdPythonType Format

References a PAWLS token by page and token index:

Expand All @@ -109,7 +109,7 @@ Represents PAWLS parse data for a single page:

### PawlsPageBoundaryPythonType Format

Represents the page boundary with fields:
Represents the page boundary with fields:

* `width` (float)
* `height` (float)
Expand All @@ -119,24 +119,24 @@ Represents the page boundary with fields:

Represents a single PAWLS token with fields:

* `x` (float): X-coordinate of token box
* `x` (float): X-coordinate of token box
* `y` (float): Y-coordinate of token box
* `width` (float): Width of token box
* `height` (float): Height of token box
* `height` (float): Height of token box
* `text` (string): Text content of the token

### AnnotationLabelPythonType Format

Defines an annotation label with fields:

* `id` (string)
* `color` (string): Hex color for the label
* `description` (string)
* `color` (string): Hex color for the label
* `description` (string)
* `icon` (string): Icon name
* `text` (string): Label text
* `label_type` (LabelType): One of DOC_TYPE_LABEL, TOKEN_LABEL, RELATIONSHIP_LABEL, METADATA_LABEL

### Example data.json
### Example data.json
```json
{
"annotated_docs": {
Expand Down Expand Up @@ -212,7 +212,7 @@ Defines an annotation label with fields:
"label_type": "DOC_TYPE_LABEL"
},
"NDA": {
"id": "2",
"id": "2",
"color": "#00FF00",
"description": "Indicates a non-disclosure agreement",
"icon": "nda",
Expand Down Expand Up @@ -241,7 +241,7 @@ Defines an annotation label with fields:
},
"label_set": {
"id": "4",
"title": "Example Label Set",
"title": "Example Label Set",
"description": "A sample label set",
"icon_name": "label_icon.png",
"icon_data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAACklEQVR4nGMAAQAABQABDQottAAAAABJRU5ErkJggg==",
Expand Down
4 changes: 2 additions & 2 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ EXPOSE 3000

# COPY .env file and shell script to container
WORKDIR /usr/share/nginx/html
COPY ./env.sh .
COPY ./env.sh env.sh
COPY .env .

# Make shell script executable
RUN chmod +x env.sh

# Start Nginx server
CMD ["/bin/sh", "-c", "/usr/share/nginx/html/env.sh && nginx -g \"daemon off;\""]
CMD ["/bin/sh", "-c", "nginx -g \"daemon off;\""]
2 changes: 2 additions & 0 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"fuse.js": "^6.5.3",
"graphql": "^16.2.0",
"lodash": "^4.17.21",
"lodash.uniqueid": "^4.0.1",
"pdfjs-dist": "^2.13.216",
"react": "16.14.0",
"react-beautiful-dnd": "^13.1.0",
Expand Down Expand Up @@ -77,6 +78,7 @@
"**/*": "prettier --write --ignore-unknown"
},
"devDependencies": {
"@types/lodash.uniqueid": "^4.0.9",
"@types/uuid": "^8.3.4",
"husky": "^8.0.1",
"prettier": "^2.7.1"
Expand Down
9 changes: 8 additions & 1 deletion frontend/src/components/annotator/Annotator.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,11 @@ export const Annotator = ({
}
setDocTypeLabels(Object.values(document_label_lookup));

// This is where the annotations start loading.
// Turn existing annotation data into ServerAnnotation objs and inject into state:
let annotation_objs: ServerAnnotation[] = [];

// Case 1 is where an "Analysis" is not selected
if (
annotator_data?.existingTextAnnotations &&
selected_analysis_ids?.length === 0
Expand All @@ -619,7 +622,9 @@ export const Annotator = ({
)
);
// console.log("Got manual annotation objs: ", annotation_objs);
} else if (
}
// If an analysis is selected... load THOSE annotations
else if (
selected_analysis_ids &&
selected_analysis_ids.length > 0 &&
annotator_data?.existingTextAnnotations
Expand All @@ -639,6 +644,7 @@ export const Annotator = ({
);
}

// Load doc-level labels
let doc_type_annotations: DocTypeAnnotation[] = [];

if (annotator_data?.existingDocLabelAnnotations) {
Expand All @@ -659,6 +665,7 @@ export const Annotator = ({
);
}

// Load relationship level labels
let relationship_annotations: RelationGroup[] = [];

if (annotator_data?.existingRelationships) {
Expand Down
Loading
Loading