Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions .actions/base/lib.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

# Flag the run as failed, notify the triggering user and signal an error.
# Globals:   failure (written and exported), GITHUB_ACTOR (read)
# Arguments: $* - words describing what went wrong
# Returns:   always 1, so callers can write `cmd || error ...`
error() {
  failure=true
  export failure

  local recipient="@$GITHUB_ACTOR"
  notify bangbang "$recipient" "Error $*"

  return 1
}

# Print the name of the current project on stdout.
# Globals:   GITHUB_REPOSITORY (read) - used as the project name when set
# Outputs:   $GITHUB_REPOSITORY, or the basename of the working directory
#            when that variable is unset or empty
# Returns:   0 on success
project() {
  # `return` only accepts a numeric exit status, so the name has to be
  # delivered on stdout for callers to capture with $(project).
  if [ -n "${GITHUB_REPOSITORY:-}" ]; then
    printf '%s\n' "$GITHUB_REPOSITORY"
  else
    basename "$(pwd)"
  fi
}

# Escape a string so it can be embedded inside a JSON string literal
# (backslashes, double quotes and newlines).
_notify_json_escape() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  printf '%s' "$text"
}

# Send a notification about the current run.
#
# When GITHUB_ACTOR is unset or empty (local mode) the message is printed on
# stdout. Otherwise a JSON payload is POSTed to the CD_WEBHOOK URL, with the
# message rendered as a markdown link to the run page.
#
# Globals:   GITHUB_ACTOR, CD_WEBHOOK, GIT_HTTP_SERVER, GITHUB_REPOSITORY,
#            GITHUB_RUN_NUMBER (all read)
# Arguments: $1 - emoji name, without the surrounding colons
#            $2 - channel to notify
#            $3... - message words
# Exits:     1 when a webhook call is needed but CD_WEBHOOK is not set
notify() {
  # Initialization
  local icon channel message text payload

  icon=":$1:"
  shift
  channel="$1"
  shift
  message="$*"

  # Body
  # The ${VAR:-} form keeps these checks usable under `set -u`.
  if [ -z "${GITHUB_ACTOR:-}" ]; then
    # Running in local mode
    printf '%s\n' "$message"
  else
    if [ -z "${CD_WEBHOOK:-}" ]; then
      echo "Please set the CD_WEBHOOK environment variable (you have it in pass)" >&2
      exit 1
    fi

    text="[$message]($GIT_HTTP_SERVER/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_NUMBER)"
    # Escape every interpolated value so quotes or backslashes in the
    # message cannot produce an invalid JSON document.
    payload=$(printf '{"icon_emoji":"%s", "channel":"%s", "text":"%s"}' \
      "$(_notify_json_escape "$icon")" \
      "$(_notify_json_escape "$channel")" \
      "$(_notify_json_escape "$text")")

    curl -v -X POST -H "Content-Type: application/json" \
      --data "$payload" "$CD_WEBHOOK"
  fi
}

# Bump the program version with commitizen and publish the result.
# Globals:   GIT_SERVER, GITHUB_REPOSITORY (read) - build the ssh remote URL
# Outputs:   a banner, followed by whatever cz and git print
# Returns:   status of the last step (1 when that step reported an error)
bump() {
  printf '%s\n' \
    "---------------------------------" \
    "--- Bumping program version ---" \
    "---------------------------------"

  # Create the version bump and changelog
  if ! cz --no-raise 21 bump --changelog --no-verify; then
    error creating the bump with commitizen
  fi

  # Publish through a dedicated ssh remote
  local remote_url="git@$GIT_SERVER:$GITHUB_REPOSITORY.git"
  git remote add ssh "$remote_url"

  if ! git pull ssh main; then
    error pulling the main branch in the bump job
  fi
  if ! git push ssh main; then
    error pushing the main branch in the bump job
  fi
  if ! git push ssh --tags; then
    error pushing the tags in the bump job
  fi
}

# Update the git submodules to the latest commit of their remotes.
# Outputs:   a banner, followed by git's own output
# Returns:   status of `git submodule update`
update_actions() {
  printf '%s\n' \
    "------------------------------" \
    "--- Updating the actions ---" \
    "------------------------------"

  git submodule update --recursive --remote
}
8 changes: 8 additions & 0 deletions .actions/base/notify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
#
# Command-line entry point for notify(): loads lib.sh from the directory this
# script lives in and forwards every argument to notify unchanged.

set -o errexit -o nounset

# shellcheck source=lib.sh
. "$(dirname "$(realpath "$0")")/lib.sh"

notify "$@"
6 changes: 6 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Exclude everything from the build context by default...
*
# ...then re-include only the paths the Dockerfile copies into the images.
!extract_python
!uv.lock
!pyproject.toml
!README.md

84 changes: 84 additions & 0 deletions .gitea/workflows/build.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# Builds the extract-worker-cpu image and pushes it to the registry whenever
# a tag is pushed, then reports the outcome through .actions/base/notify.sh.
name: Publish Docker images

on:
  push:
    tags:
      - '*'

# Read by .actions/base/lib.sh when sending notifications.
env:
  CD_WEBHOOK: "${{ secrets.CD_WEBHOOK }}"
  GIT_SERVER: "${{ secrets.GIT_SERVER }}"
  GIT_HTTP_SERVER: "${{ secrets.GIT_HTTP_SERVER }}"

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Configure SSH to be able to log in the target instances
        # The key is handed over through the environment instead of being
        # interpolated into the script text, so its content is never parsed
        # by the shell.
        env:
          DEPLOY_SSH_KEY: "${{ secrets.DEPLOY_SSH_KEY }}"
        run: |
          # ~/.ssh is not guaranteed to exist on a fresh runner.
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          # Create the key file with restrictive permissions from the start.
          (umask 077 && printf '%s\n' "$DEPLOY_SSH_KEY" > ~/.ssh/deploy_key)
          chmod 600 ~/.ssh/deploy_key
          dos2unix ~/.ssh/deploy_key
          # NOTE(review): SSH_AUTH_SOCK is not set anywhere in this workflow;
          # it is assumed to come from the runner environment - confirm.
          ssh-agent -a "$SSH_AUTH_SOCK" > /dev/null
          ssh-add ~/.ssh/deploy_key

      - name: Checkout code
        uses: https://github.com/actions/checkout@v3
        with:
          fetch-depth: 0
          submodules: true

      - name: Login to Docker Registry
        uses: https://github.com/docker/login-action@v2
        with:
          registry: ${{ secrets.REGISTRY_NAME }}
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_PASSWORD }}

      - name: Set up Docker Buildx
        uses: https://github.com/docker/setup-buildx-action@v2

      - name: Determine DBMate architecture
        id: arch
        # Maps the runner architecture to the suffix used by the dbmate
        # release binaries and exposes it to later steps as DBMATE_ARCH.
        run: |
          DBMATE_ARCH=$(arch || uname -m)
          case "$DBMATE_ARCH" in
            x86_64 | amd64) DBMATE_ARCH="amd64" ;;
            aarch64 | arm64) DBMATE_ARCH="arm64" ;;
            i386) DBMATE_ARCH="386" ;;
            *)
              echo "Unsupported architecture: $DBMATE_ARCH" >&2
              exit 1
              ;;
          esac
          echo "DBMATE_ARCH=$DBMATE_ARCH" >> "$GITHUB_ENV"

      - name: Extract metadata
        id: meta-extract-worker-cpu
        # Full URL, consistent with the other `uses:` entries of this file.
        uses: https://github.com/docker/metadata-action@v4
        with:
          images: ${{ secrets.REGISTRY_NAME }}/extract-worker-cpu

      - name: Build and push extract-worker-cpu
        uses: https://github.com/docker/build-push-action@v2
        with:
          context: .
          target: worker-cpu
          push: true
          cache-from: type=registry,ref=${{ secrets.REGISTRY_NAME }}/extract-worker-cpu:buildcache
          cache-to: type=registry,ref=${{ secrets.REGISTRY_NAME }}/extract-worker-cpu:buildcache,mode=max
          tags: ${{ steps.meta-extract-worker-cpu.outputs.tags }}
          labels: ${{ steps.meta-extract-worker-cpu.outputs.labels }}
          build-args: |
            dbmate_arch=${{ env.DBMATE_ARCH }}

      - name: Notify failures
        if: failure()
        run: |-
          ./.actions/base/notify.sh bangbang "@$GITHUB_ACTOR" "[Error running the build on the Docker images]"

      # Without an `if:` this step only runs when every previous step passed.
      - name: Notify success
        run: |-
          ./.actions/base/notify.sh white_check_mark "@$GITHUB_ACTOR" "[Success running the build on the Docker images]"
13 changes: 13 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: "3.11"
# Installs the tesseract engine, its language data and build headers, then
# exports the location of the installed tessdata directory to later steps.
- name: Install tesseract and its language data
  run: |
    # Refresh the package index first so the install does not hit stale URLs.
    sudo apt-get update
    sudo apt-get install -y tesseract-ocr \
      tesseract-ocr-eng \
      tesseract-ocr-fra \
      tesseract-ocr-deu \
      tesseract-ocr-spa \
      tesseract-ocr-lat \
      tesseract-ocr-jpn \
      libtesseract-dev \
      libleptonica-dev \
      pkg-config
    # The tessdata directory is taken from the file list of the English
    # language package; listing package files needs no root privileges.
    echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep 'tessdata$')" >> "$GITHUB_ENV"
# Runs the pytest suite in `tests` through uv, using the frozen lockfile, the
# dev dependencies and the `docling` extra.
- name: Run tests
run: uv run --dev --extra docling --frozen pytest -vvv --cache-clear --show-capture=all -r A tests

Expand Down
51 changes: 51 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# syntax=docker/dockerfile:1.14.0
# Base stage used by the worker image: Python 3.11 on Debian bullseye (slim)
# with curl and a pinned uv release installed.
FROM python:3.11-slim-bullseye AS python-base

ENV HOME=/home/user
WORKDIR $HOME
# curl fetches the uv installer below and the dbmate binary in the worker-cpu stage.
RUN apt-get update && apt-get install -y curl

# Install uv 0.6.7 through its installer script.
RUN curl -LsSf https://astral.sh/uv/0.6.7/install.sh | sh
# NOTE(review): presumably the installer places uv in $HOME/.local/bin - confirm against the installer.
ENV PATH="$HOME/.local/bin:$PATH"
# uv settings; see the uv environment variable reference for their exact meaning.
ENV UV_LINK_MODE=copy
ENV UV_COMPILE_BYTECODE=1

# Worker stage: dbmate, tesseract with its language data, the Python
# dependencies (docling extra), the docling models and the service code.
FROM python-base AS worker-cpu

# Architecture suffix of the dbmate release binary to download
# (the callers in this repository pass amd64, arm64 or 386).
ARG dbmate_arch
WORKDIR $HOME/src/app
RUN curl -fsSL -o /usr/local/bin/dbmate https://github.com/amacneil/dbmate/releases/download/v2.19.0/dbmate-linux-${dbmate_arch} \
    && chmod +x /usr/local/bin/dbmate
# TODO: add more languages here
# `apt-get update` runs in the same layer as the install so the packages are
# never resolved against a stale index left in a cached earlier layer; the
# index is dropped afterwards since nothing later in this stage uses apt.
RUN apt-get update && apt-get install -y tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-fra \
    tesseract-ocr-deu \
    tesseract-ocr-spa \
    tesseract-ocr-lat \
    tesseract-ocr-jpn \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
# We skip opencv since we already depend on opencv-python-headless which is the lib we need to use
# Install deps first to optimize layer cache
# The cache mount target must be an absolute path: Docker does not expand `~`.
# Keep it in sync with HOME (/home/user) set in the python-base stage.
RUN --mount=type=cache,target=/home/user/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-sources --no-install-project --no-install-package opencv-python --extra docling
RUN uv run --no-sync docling-tools models download -o ~/.cache/docling/models
# Then copy code (COPY rather than ADD: these are plain local files)
COPY uv.lock pyproject.toml README.md ./
COPY extract_python ./extract_python/
# Then install service
RUN uv sync -v --frozen --no-editable --no-sources --no-install-package opencv-python --extra docling

RUN rm -rf ~/.cache/pip $(uv cache dir)

ENTRYPOINT ["uv", "run", "--no-sync", "icij-worker", "workers", "start", "-g", "cpu", "extract_python.app:app"]

# HTTP service stage: the upstream task-service image with this project
# installed in editable mode from ./extract-python.
FROM icij/task-service:icij-worker-0.17.21 AS http-service
# COPY rather than ADD: these are plain local files, no archive extraction or URL fetch is needed.
COPY uv.lock pyproject.toml README.md ./extract-python/
COPY extract_python ./extract-python/extract_python/
RUN uv pip install -e ./extract-python
91 changes: 91 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Local stack: RabbitMQ, Postgres, the HTTP task service and one CPU worker.
version: '3.7'

# Shared environment fragments, merged into the services below with `<<:`.

# Broker host used by the task manager (the rabbitmq service of this file).
x-tm-amqp-config-variables: &tm-amqp-config
TASK_MANAGER__RABBITMQ_HOST: rabbitmq

# Task manager backend and storage settings; the password matches
# POSTGRES_PASSWORD of the postgres service.
x-postgres-storage-config: &tm-postgres-storage-config
TASK_MANAGER__BACKEND: amqp
# Change this to a FSKeyValueStorageConfig if you don't want to use postgres
TASK_MANAGER__STORAGE__HOST: postgres
TASK_MANAGER__STORAGE__PORT: 5432
TASK_MANAGER__STORAGE__PASSWORD: changeme

# Broker connection settings for the worker.
x-worker-config-variables: &worker-config
ICIJ_WORKER_TYPE: amqp
ICIJ_WORKER_RABBITMQ_HOST: rabbitmq
ICIJ_WORKER_RABBITMQ_PORT: 5672

# Application settings; the data dir is the container side of the ./data bind
# mount declared on the worker service.
x-async-app-variables: &async-app
EXTRACT_DATA_DIR: /usr/src/data
EXTRACT_WORK_DIR: /usr/src/data/workdir
EXTRACT_LOG_LEVEL: DEBUG


services:
# Message broker; ports 5672 and 15672 are published on the host, and the
# healthcheck gates the start of the http-service.
rabbitmq:
image: rabbitmq:3.12.0-management
container_name: extract-rabbitmq
healthcheck:
test: rabbitmq-diagnostics -q status
interval: 5s
timeout: 2s
retries: 10
start_period: 5s
ports:
- "5672:5672"
- "15672:15672"

# Task storage database; container port 5432 is published on host port 5435.
# NOTE(review): the image tag is not pinned, so the latest postgres is pulled.
postgres:
image: postgres
container_name: extract-postgres
environment:
POSTGRES_PASSWORD: changeme
healthcheck:
test: pg_isready
interval: 2s
timeout: 2s
retries: 10
start_period: 5s
ports:
- "5435:5432"

# HTTP task service, built from the `http-service` Dockerfile target. Starts
# only once rabbitmq and postgres report healthy; published on host port 8000.
http-service:
depends_on:
rabbitmq:
condition: service_healthy
postgres:
condition: service_healthy
build:
context: .
target: http-service
container_name: extract-http-service
environment:
<<: [ *tm-amqp-config, *tm-postgres-storage-config ]
PORT: "8000"
HOST: "0.0.0.0"
LOG_LEVEL: DEBUG
TASK_MANAGER__APP_PATH: extract_app
# Polls the /health endpoint on the port configured above.
healthcheck:
test: curl -f http://localhost:8000/health
interval: 5s
timeout: 2s
retries: 10
start_period: 5s
ports:
- "8000:8000"

# CPU worker, built from the `worker-cpu` Dockerfile target. Starts once the
# http-service is healthy.
extract-worker-cpu:
depends_on:
http-service:
condition: service_healthy
build:
context: .
args:
# Taken from the environment; the `extract` wrapper script exports it
# before calling `docker compose`.
dbmate_arch: $DBMATE_ARCH
target: worker-cpu
environment:
<<: [ *worker-config, *async-app ]
# Host ./data is mounted at the path EXTRACT_DATA_DIR points to.
volumes:
- type: bind
source: ./data
target: /usr/src/data
42 changes: 42 additions & 0 deletions extract
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash

# Compute the values shared with child processes and export them.
# Globals:   DBMATE_ARCH (written and exported) - output of dbmate_arch
_export_globals() {
  # Assignment and export stay separate so a failing dbmate_arch is not
  # hidden behind the exit status of `export`.
  DBMATE_ARCH="$(dbmate_arch)"
  export DBMATE_ARCH
}

# Define the helper functions used by the rest of the script.
function _helpers() {
  # Print the dbmate release architecture suffix matching the host.
  # Outputs:   amd64, arm64 or 386 on stdout
  # Returns:   non-zero, with a message on stderr, for any other architecture
  function dbmate_arch() {
    local host_arch
    if command -v arch >/dev/null 2>&1; then
      host_arch=$(arch)
    else
      host_arch=$(uname -m)
    fi

    local dbmate_arch_
    case "$host_arch" in
      x86_64 | amd64) dbmate_arch_="amd64" ;;
      aarch64 | arm64) dbmate_arch_="arm64" ;;
      i386) dbmate_arch_="386" ;;
      *)
        # This function runs inside $(...): anything written to stdout would
        # be captured as the architecture instead of being shown, so the
        # message is sent to stderr.
        _exit_with_message "Unsupported architecture $host_arch" >&2
        # Reached only if _exit_with_message did not terminate the shell.
        return 1
        ;;
    esac
    echo "$dbmate_arch_"
  }

}

# Entry point: export the build settings, then hand over to `docker compose`.
# Arguments: $@ - passed unchanged to `docker compose`
function _main() {
  set -e

  # Print a message on stderr and terminate.
  # Arguments: $1 - message, $2 - exit status (defaults to 1)
  function _exit_with_message() {
    # stderr keeps the message visible even when the caller's stdout is
    # captured, as it is for $(dbmate_arch).
    echo "$1" >&2
    exit "${2:-1}"
  }

  _helpers
  _export_globals
  docker compose "$@"
}

# Run the wrapper, forwarding every command-line argument to `docker compose`.
_main "$@"
Loading