Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,7 @@ You must first have these set up and ready to go:
```
12. Go to [Django admin for DocumentCloud](https://api.dev.documentcloud.org/admin) and add the required static [flat page](https://api.dev.documentcloud.org/admin/flatpages/flatpage/) called `/tipofday/`. It can be blank. Do not prefix the URL with `/pages/`. Specifying the `Site` as `example.com` is alright.
13. Create an initial Minio bucket to simulate AWS S3 locally:
- Reference your DocumentCloud `.django` file for these variables:
- Visit the `MINIO_URL` with a browser, likely at [this address](http://minio.documentcloud.org:9000), and login with the minio `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY`
- At the bottom right corner click the round plus button and then click the first circle that appears above it to "create bucket".
- Create a bucket called `documents`
- Run `inv initialize-minio`
14. Upload a document:
- **Check that Docker's memory allocation is at least 7 GB.** A sign that you have not allocated enough memory is containers failing at random or your system swapping heavily, especially when uploading documents.
- The "upload" button should not be grayed out (if it is, check your user organization Verified Journalist status above)
Expand Down
2 changes: 1 addition & 1 deletion config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@
BASE_URL = DOCCLOUD_URL

PUBLIC_ASSET_URL = env(
"PUBLIC_ASSET_URL", default="http://minio.documentcloud.org:9000/documents/"
"PUBLIC_ASSET_URL", default="https://minio.documentcloud.org/documents/"
)
PRIVATE_ASSET_URL = env("PRIVATE_ASSET_URL", default=f"{DOCCLOUD_API_URL}/files/")

Expand Down
35 changes: 20 additions & 15 deletions documentcloud/common/environment/aws/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,28 @@ def size(self, file_name):
return bucket.Object(key).content_length

def open(self, file_name, mode="rb", content_type=None, access=None):

# This logic changed with smart_open 5.0
# https://github.com/piskvorky/smart_open/blob/develop/CHANGELOG.md#500-30-mar-2021
# See migration guide here:
# https://github.com/piskvorky/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst
transport_params = {
"resource_kwargs": self.resource_kwargs,
"multipart_upload_kwargs": {},
"client": self.s3_client,
}

if content_type is None:
# attempt to guess content type if not specified
content_type = mimetypes.guess_type(file_name)[0]

if content_type is not None:
# set content type if we have one
transport_params["multipart_upload_kwargs"]["ContentType"] = content_type

if access is not None:
transport_params["multipart_upload_kwargs"]["ACL"] = ACLS[access]

if "w" in mode: # Setting these kwargs only make sense in a write context
writeable_kwargs = {}
if content_type is None:
# attempt to guess content type if not specified
content_type = mimetypes.guess_type(file_name)[0]
if content_type is not None:
# set content type if we have one
writeable_kwargs["ContentType"] = content_type
if access is not None:
writeable_kwargs["ACL"] = ACLS[access]
if writeable_kwargs:
# Guard against no writeable kwargs provided
transport_params["client_kwargs"] = {
"S3.Client.create_multipart_upload": writeable_kwargs
}
return smart_open.open(
f"s3://{file_name}", mode, transport_params=transport_params
)
Expand Down
4 changes: 2 additions & 2 deletions documentcloud/common/environment/minio/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def __init__(self, resource_kwargs=None, minio=True):
if resource_kwargs is None:
resource_kwargs = {
"endpoint_url": env.str("MINIO_URL"),
"aws_access_key_id": env.str("MINIO_ACCESS_KEY"),
"aws_secret_access_key": env.str("MINIO_SECRET_KEY"),
"aws_access_key_id": env.str("MINIO_ROOT_USER"),
"aws_secret_access_key": env.str("MINIO_ROOT_PASSWORD"),
"config": Config(signature_version="s3v4"),
"region_name": "us-east-1",
}
Expand Down
57 changes: 57 additions & 0 deletions documentcloud/core/management/commands/initialize_minio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Django
from django.core.management.base import BaseCommand

# Standard Library
import json

# Third Party
import boto3
import environ
from botocore.client import Config
from botocore.exceptions import ClientError

env = environ.Env()


class Command(BaseCommand):
    """Prepare the local Minio server for development.

    Makes sure the ``documents`` bucket exists and attaches a bucket policy
    that allows anonymous ``s3:GetObject`` on every object in that bucket.
    The command only acts when the ``ENVIRONMENT`` variable is
    ``local-minio``; in any other environment it reports that it is skipping
    and exits without contacting a server.
    """

    help = "Initialize Minio bucket and policies for local development"

    def handle(self, *args, **options):
        """Create the bucket if it is missing and apply the public read policy.

        Raises:
            botocore.exceptions.ClientError: if checking for the bucket fails
                with anything other than a 404, or if a later S3 call fails.
        """
        if env.str("ENVIRONMENT") != "local-minio":
            # Tell the caller nothing happened instead of exiting silently,
            # so a skipped run is not mistaken for a successful one.
            self.stdout.write(
                "Skipping Minio initialization: ENVIRONMENT is not local-minio"
            )
            return

        bucket = "documents"

        client = boto3.client(
            "s3",
            endpoint_url=env.str("MINIO_URL"),
            aws_access_key_id=env.str("MINIO_ROOT_USER"),
            aws_secret_access_key=env.str("MINIO_ROOT_PASSWORD"),
            config=Config(signature_version="s3v4"),
            region_name="us-east-1",
        )

        # Create bucket if it doesn't exist
        try:
            client.head_bucket(Bucket=bucket)
        except ClientError as exc:
            if exc.response["Error"]["Code"] != "404":
                # Anything other than "not found" is unexpected; surface it
                raise
            # Bucket doesn't exist, create it
            client.create_bucket(Bucket=bucket)
            self.stdout.write("Created documents bucket")
        else:
            self.stdout.write("Bucket already exists")

        # Set public read policy
        policy = {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": "*",
                    "Action": "s3:GetObject",
                    "Resource": f"arn:aws:s3:::{bucket}/*",
                }
            ],
        }
        client.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))
        self.stdout.write("Minio initialized successfully")
2 changes: 1 addition & 1 deletion documentcloud/documents/processing/info_and_image/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,7 +768,7 @@ def extract_single_page(doc_id, slug, access, page, page_number, large_image_pat
image_width,
max(round(img_buffer.height * (image_width / img_buffer.width)), 1),
),
Image.ANTIALIAS,
Image.LANCZOS,
)

mem_file = io.BytesIO()
Expand Down
7 changes: 4 additions & 3 deletions initialize_dotenvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ def random_string(n):
{
"name": "MinIO",
"envvars": [
("MINIO_ACCESS_KEY", lambda: random_string(64)),
("MINIO_SECRET_KEY", lambda: random_string(64)),
("MINIO_URL", "http://minio.documentcloud.org:9000"),
("MINIO_ROOT_USER", lambda: random_string(64)),
("MINIO_ROOT_PASSWORD", lambda: random_string(64)),
("MINIO_URL", "https://minio.documentcloud.org"),
("AWS_CA_BUNDLE", "/etc/ssl/certs/ca-certificates.crt"),
],
},
],
Expand Down
6 changes: 4 additions & 2 deletions local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,18 @@ services:
image: redis:5.0

documentcloud_minio:
image: minio/minio:RELEASE.2019-10-12T01-39-57Z
image: minio/minio:RELEASE.2024-12-18T13-15-44Z
volumes:
- local_minio_data:/data
ports:
- "9000:9000"
command: server /data
- "9001:9001"
command: server /data --console-address ":9001"
env_file:
- ./.envs/.local/.django
networks:
default:
squarelet_default:
aliases:
- minio.documentcloud.org

Expand Down
4 changes: 4 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ def download_tesseract_data(c):
"""Download Tesseract data files. Needed to be able to do OCR locally."""
c.run("cd config/aws/lambda; ./build.sh")

@task
def initialize_minio(c):
    """Initialize Minio bucket and policies for local development

    Runs the ``initialize_minio`` Django management command through the
    ``DJANGO_RUN`` command template.
    """
    manage_cmd = "python manage.py initialize_minio"
    c.run(DJANGO_RUN.format(cmd=manage_cmd))

@task
def deploy_lambdas(c, staging=False):
Expand Down
Loading