diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml
new file mode 100644
index 0000000000..7843e51677
--- /dev/null
+++ b/.github/workflows/ansible-deploy.yml
@@ -0,0 +1,108 @@
+# Lints the Ansible playbooks on pushes and pull requests that touch ansible/**;
+# pushes additionally deploy to the target VM and smoke-test the application.
+#
+# Repository secrets read by the deploy job:
+#   SSH_PRIVATE_KEY         written to ~/.ssh/id_rsa for the connection to the VM
+#   VM_HOST                 host that is key-scanned and probed on port 5000
+#   ANSIBLE_VAULT_PASSWORD  passed to ansible-playbook as the vault password
+name: Ansible Deployment
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      - 'ansible/**'
+      - '!ansible/docs/**'
+      - '.github/workflows/ansible-deploy.yml'
+  pull_request:
+    branches: [main, master]
+    paths:
+      - 'ansible/**'
+      - '!ansible/docs/**'
+      - '.github/workflows/ansible-deploy.yml'
+
+# Both jobs only need to read the repository contents.
+permissions:
+  contents: read
+
+concurrency:
+  group: ansible-deploy-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    name: Ansible Lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: pip install ansible ansible-lint
+
+      - name: Run ansible-lint
+        working-directory: ansible
+        run: ansible-lint playbooks/*.yml
+
+  deploy:
+    name: Deploy Application
+    needs: lint
+    runs-on: ubuntu-latest
+    # Pull requests stop after lint; only push events deploy.
+    if: github.event_name == 'push'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install Ansible
+        run: pip install ansible
+
+      - name: Setup SSH
+        # Secrets reach the script through env vars instead of being expanded
+        # into the script text, so quotes or `$` in a value cannot change the
+        # commands that run.
+        env:
+          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
+          VM_HOST: ${{ secrets.VM_HOST }}
+        run: |
+          umask 077
+          mkdir -p ~/.ssh
+          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+          ssh-keyscan -H "$VM_HOST" >> ~/.ssh/known_hosts
+
+      - name: Deploy with Ansible
+        working-directory: ansible
+        env:
+          # NOTE(review): this disables host key verification even though the
+          # previous step records the VM's key in known_hosts; confirm whether
+          # it is still needed.
+          ANSIBLE_HOST_KEY_CHECKING: 'False'
+          VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
+        run: |
+          vault_pass_file="$(mktemp)"
+          # Steps run with `bash -e`, so a failing playbook ends the script at
+          # once; the trap removes the password file on every exit path.
+          trap 'rm -f "$vault_pass_file"' EXIT
+          printf '%s\n' "$VAULT_PASSWORD" > "$vault_pass_file"
+          ansible-playbook playbooks/deploy.yml \
+            -i inventory/hosts.ini \
+            --vault-password-file "$vault_pass_file"
+
+      - name: Verify Deployment
+        env:
+          VM_HOST: ${{ secrets.VM_HOST }}
+        run: |
+          sleep 10
+          curl --fail --silent --show-error --max-time 10 "http://${VM_HOST}:5000"
+          curl --fail --silent --show-error --max-time 10 "http://${VM_HOST}:5000/health"
diff --git a/README.md b/README.md
index 371d51f456..0b25c8e8a0 100644
--- a/README.md
+++ 
b/README.md @@ -3,6 +3,7 @@ [![Labs](https://img.shields.io/badge/Labs-18-blue)](#labs) [![Exam](https://img.shields.io/badge/Exam-Optional-green)](#exam-alternative) [![Duration](https://img.shields.io/badge/Duration-18%20Weeks-lightgrey)](#course-roadmap) +[![Ansible Deployment](https://github.com/GrayMansion/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/GrayMansion/DevOps-Core-Course/actions/workflows/ansible-deploy.yml) Master **production-grade DevOps practices** through hands-on labs. Build, containerize, deploy, monitor, and scale applications using industry-standard tools. diff --git a/ansible/docs/LAB06.md b/ansible/docs/LAB06.md new file mode 100644 index 0000000000..71396a49b7 --- /dev/null +++ b/ansible/docs/LAB06.md @@ -0,0 +1,1049 @@ +# Lab 6: Advanced Ansible & CI/CD — Submission + +**Name:** Makar +**Date:** 2026-03-04 +**Lab Points:** 10 + 0 bonus + +--- + +## Task 1: Blocks & Tags (2 pts) + +### 1.1 Block Usage in Common Role + +**File:** `roles/common/tasks/main.yml` + +The common role was refactored into a single block that groups all package-related tasks together. The block applies `become: true` once at the block level rather than per-task, and uses the tags `packages` and `common`. 
+ +```yaml +--- +# Package installation block with error handling +- name: Install system packages + become: true + tags: + - packages + - common + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + + - name: Install common packages + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + + - name: Set timezone + community.general.timezone: + name: "{{ common_timezone }}" + + rescue: + - name: Fix apt cache on failure + ansible.builtin.apt: + update_cache: true + force: true + + - name: Retry package installation after fix + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + update_cache: true + + always: + - name: Log package installation completion + ansible.builtin.copy: + content: "Common role packages block completed at {{ ansible_date_time.iso8601 }}\n" + dest: /tmp/common_packages_done.log + mode: "0644" +``` + +**Error handling:** If `apt update` or package installation fails, the rescue block runs `apt update --force` and retries. The always block logs a timestamp to `/tmp/common_packages_done.log` regardless of success or failure. + +### 1.2 Block Usage in Docker Role + +**File:** `roles/docker/tasks/main.yml` + +The docker role was split into two logical blocks: + +1. **Install Docker Engine** (`docker_install` tag) — groups prerequisites, GPG key, repository, and package installation with rescue/always error handling. +2. **Configure Docker** (`docker_config` tag) — groups user and Python library setup. 
+ +```yaml +--- +# Docker installation block with error handling +- name: Install Docker Engine + become: true + tags: + - docker_install + - docker + block: + - name: Install prerequisites for Docker repository + ansible.builtin.apt: + name: [ca-certificates, curl, gnupg] + state: present + - name: Create keyrings directory + ansible.builtin.file: + path: /etc/apt/keyrings + state: directory + mode: "0755" + - name: Add Docker GPG key + ansible.builtin.apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + keyring: /etc/apt/keyrings/docker.gpg + state: present + - name: Add Docker repository + ansible.builtin.apt_repository: + repo: >- + deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.gpg] + https://download.docker.com/linux/ubuntu + {{ ansible_facts['distribution_release'] }} stable + state: present + filename: docker + - name: Install Docker packages + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + notify: Restart docker + rescue: + - name: Wait before retrying Docker installation + ansible.builtin.pause: + seconds: 10 + - name: Retry apt update after GPG key failure + ansible.builtin.apt: + update_cache: true + - name: Retry Docker package installation + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + notify: Restart docker + always: + - name: Ensure Docker service is enabled and started + ansible.builtin.service: + name: docker + state: started + enabled: true + failed_when: false + +# Docker configuration block +- name: Configure Docker + become: true + tags: + - docker_config + - docker + block: + - name: Add user to docker group + ansible.builtin.user: + name: "{{ docker_user }}" + groups: docker + append: true + - name: Install python3-docker for Ansible docker modules + ansible.builtin.apt: + name: python3-docker + state: present +``` + +**Error handling:** The Docker GPG key addition can fail due to network timeouts. 
The rescue block waits 10 seconds, retries `apt update`, then retries package installation. The always block ensures the Docker service is enabled regardless of outcome. + +### 1.3 Tag Strategy + +| Tag | Scope | Description | +|-----|-------|-------------| +| `common` | common role | All common role tasks | +| `packages` | common role | Package installation tasks | +| `docker` | docker role | All Docker tasks | +| `docker_install` | docker role | Docker installation only | +| `docker_config` | docker role | Docker configuration only | +| `app_deploy` | web_app role | Application deployment | +| `compose` | web_app role | Docker Compose operations | +| `web_app_wipe` | web_app role | Wipe/cleanup tasks | + +### 1.4 Evidence — ansible-lint Passes (Production Profile) + +``` +$ cd ansible && source ../app_python/venv/bin/activate.fish && ansible-lint playbooks/*.yml + +Passed: 0 failure(s), 0 warning(s) on 13 files examined and 11 of them are considered (2 exempted). +Last profile that matched before the error was 'production'. +``` + +All roles pass the strictest production-level lint profile with zero warnings. + +### 1.5 Evidence — Tag Listing + +``` +$ ansible-playbook playbooks/provision.yml --list-tags + play #1 (webservers): Provision infrastructure TAGS: [] + TASK TAGS: [common, docker, docker_config, docker_install, packages] + +$ ansible-playbook playbooks/deploy.yml --list-tags + play #1 (webservers): Deploy application TAGS: [] + TASK TAGS: [app_deploy, compose, docker, docker_config, docker_install, web_app_wipe] +``` + +Note: `deploy.yml` includes docker tags because `web_app/meta/main.yml` declares `docker` as a dependency. 
+ +### 1.6 Evidence — Selective Tag Execution (only docker tasks) + +``` +$ ansible-playbook playbooks/provision.yml --tags "docker" \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY [Provision infrastructure] ************************************************ + +TASK [Gathering Facts] ********************************************************* +ok: [devops_vm] + +TASK [docker : Install prerequisites for Docker repository] ******************** +ok: [devops_vm] + +TASK [docker : Create keyrings directory] ************************************** +ok: [devops_vm] + +TASK [docker : Add Docker GPG key] ********************************************* +ok: [devops_vm] + +TASK [docker : Add Docker repository] ****************************************** +ok: [devops_vm] + +TASK [docker : Install Docker packages] **************************************** +ok: [devops_vm] + +TASK [docker : Ensure Docker service is enabled and started] ******************* +ok: [devops_vm] + +TASK [docker : Add user to docker group] *************************************** +ok: [devops_vm] + +TASK [docker : Install python3-docker for Ansible docker modules] ************** +ok: [devops_vm] + +PLAY RECAP ********************************************************************* +devops_vm : ok=9 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +All common role tasks (Update apt cache, Install common packages, Set timezone, Log) were **skipped** because they don't carry the `docker` tag — only docker role tasks executed. + +### 1.7 Evidence — Rescue Block Triggered + +During initial deployment, the rescue block was triggered due to a container name conflict (leftover from Lab 5): + +``` +TASK [web_app : Deploy with docker compose] ************************************ +fatal: [devops_vm]: FAILED! => ... + "stderr": "Error response from daemon: Conflict. The container name \"/devops-app\" + is already in use by container ... 
You have to remove (or rename) that container..." + +TASK [web_app : Log deployment failure] **************************************** +ok: [devops_vm] => { + "msg": "Deployment of devops-app failed. Check logs with: + docker compose -f /opt/devops-app/docker-compose.yml logs" +} + +TASK [web_app : Fail with error message] *************************************** +fatal: [devops_vm]: FAILED! ... + +PLAY RECAP ********************************************************************* +devops_vm : ok=16 changed=3 unreachable=0 failed=1 skipped=1 rescued=1 ignored=0 +``` + +The `rescued=1` confirms the rescue block fired. After removing the conflicting container (`docker rm -f devops-app`), re-deployment succeeded. + +### 1.8 Research Answers — Blocks & Tags + +**Q: What happens if rescue block also fails?** +If the rescue block fails, the **always** block still runs (if present), but the overall task/play is marked as failed. Ansible does NOT have a "rescue for the rescue" — the play stops for that host after the always block completes. + +**Q: Can you have nested blocks?** +Yes, blocks can be nested. Inner blocks can have their own rescue/always sections. However, deeply nested blocks reduce readability — typically one level is sufficient. + +**Q: How do tags inherit to tasks within blocks?** +Tags applied at the block level are inherited by all tasks within the block (including rescue and always sections). Tags on individual tasks inside a block are additive — a task gets both the block's tags and its own tags. + +--- + +## Task 2: Docker Compose (3 pts) + +### 2.1 Role Rename + +Renamed `app_deploy` → `web_app` for clarity: +```bash +cd ansible/roles && mv app_deploy web_app +``` + +All playbook references updated: `deploy.yml`, `site.yml`. 
+ +### 2.2 Docker Compose Template + +**File:** `roles/web_app/templates/docker-compose.yml.j2` + +```yaml +# {{ ansible_managed }} +# Docker Compose configuration for {{ web_app_name }} + +services: + {{ web_app_name }}: + image: {{ web_app_image }}:{{ web_app_tag }} + container_name: {{ web_app_name }} + ports: + - "{{ web_app_port }}:{{ web_app_internal_port }}" +{% if web_app_env | length > 0 %} + environment: +{% for key, value in web_app_env.items() %} + {{ key }}: "{{ value }}" +{% endfor %} +{% endif %} + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:{{ web_app_internal_port }}/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s +``` + +**Features:** +- Dynamic service name, image, ports via Jinja2 variables +- Conditional environment block (only rendered if `web_app_env` has entries) +- Built-in Docker healthcheck for self-healing +- `unless-stopped` restart policy (survives host reboots, respects manual stops) +- `ansible_managed` comment to indicate the file is generated + +### 2.3 Role Dependencies + +**File:** `roles/web_app/meta/main.yml` + +```yaml +--- +dependencies: + - role: docker +``` + +This ensures Docker is automatically installed before `web_app` deploys. Running `ansible-playbook playbooks/deploy.yml` triggers the docker role first without needing it in the playbook. 
+ +### 2.4 Deployment Tasks + +**File:** `roles/web_app/tasks/main.yml` + +```yaml +--- +# Wipe logic runs first (when explicitly requested) +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + +# Deploy application with Docker Compose +- name: Deploy application with Docker Compose + become: true + tags: + - app_deploy + - compose + block: + - name: Create application directory + ansible.builtin.file: + path: "{{ web_app_compose_dir }}" + state: directory + mode: "0755" + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ web_app_compose_dir }}/docker-compose.yml" + mode: "0644" + register: web_app_compose_file + + - name: Log in to Docker Hub + community.docker.docker_login: + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + no_log: true + + - name: Pull latest Docker image + community.docker.docker_image: + name: "{{ web_app_image }}" + tag: "{{ web_app_tag }}" + source: pull + register: web_app_image_pull + + - name: Deploy with docker compose + ansible.builtin.command: + cmd: docker compose up -d --remove-orphans + chdir: "{{ web_app_compose_dir }}" + register: web_app_compose_up + changed_when: >- + 'Started' in web_app_compose_up.stderr or + 'Created' in web_app_compose_up.stderr or + web_app_compose_file.changed or + web_app_image_pull.changed + + - name: Wait for application to be ready + ansible.builtin.wait_for: + port: "{{ web_app_port }}" + host: "127.0.0.1" + delay: 5 + timeout: 30 + + - name: Verify health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ web_app_port }}/health" + method: GET + return_content: true + status_code: 200 + register: web_app_health_check + retries: 3 + delay: 5 + + - name: Display health check result + ansible.builtin.debug: + var: web_app_health_check.json + + rescue: + - name: Log deployment failure + ansible.builtin.debug: + msg: >- + Deployment of {{ web_app_name }} failed. 
Check logs with: + docker compose -f {{ web_app_compose_dir }}/docker-compose.yml logs + + - name: Fail with error message + ansible.builtin.fail: + msg: "Docker Compose deployment failed for {{ web_app_name }}" +``` + +**Before (Lab 5):** Used `community.docker.docker_container` module to run individual containers with `docker run` semantics. + +**After (Lab 6):** Uses `docker compose up -d` with a templated `docker-compose.yml`: + +1. Create `/opt/devops-app/` directory +2. Template `docker-compose.yml.j2` → `/opt/devops-app/docker-compose.yml` +3. Login to Docker Hub (credentials from Vault) +4. Pull the latest image +5. Run `docker compose up -d --remove-orphans` +6. Wait for port + verify `/health` endpoint + +### 2.5 Variables Configuration + +**File:** `roles/web_app/defaults/main.yml` + +```yaml +--- +web_app_name: devops-app +web_app_port: 5000 +web_app_internal_port: 5000 +web_app_env: {} +web_app_image: graymansion/devops-info-service +web_app_tag: latest +web_app_compose_dir: "/opt/{{ web_app_name }}" +web_app_wipe: false +``` + +Sensitive values (`dockerhub_username`, `dockerhub_password`) remain in Vault-encrypted `group_vars/all.yml`. + +### 2.6 Evidence — Successful Docker Compose Deployment + +``` +$ ansible-playbook playbooks/deploy.yml \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY [Deploy application] ****************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [devops_vm] + +TASK [docker : Install prerequisites for Docker repository] ******************** +ok: [devops_vm] +... 
+TASK [docker : Ensure Docker service is enabled and started] ******************* +ok: [devops_vm] +TASK [docker : Add user to docker group] *************************************** +ok: [devops_vm] +TASK [docker : Install python3-docker for Ansible docker modules] ************** +ok: [devops_vm] + +TASK [web_app : Include wipe tasks] ******************************************** +included: .../roles/web_app/tasks/wipe.yml for devops_vm + +TASK [web_app : Wipe web application] ****************************************** +skipping: [devops_vm] + +TASK [web_app : Create application directory] ********************************** +ok: [devops_vm] + +TASK [web_app : Template docker-compose file] ********************************** +ok: [devops_vm] + +TASK [web_app : Log in to Docker Hub] ****************************************** +ok: [devops_vm] + +TASK [web_app : Pull latest Docker image] ************************************** +ok: [devops_vm] + +TASK [web_app : Deploy with docker compose] ************************************ +ok: [devops_vm] + +TASK [web_app : Wait for application to be ready] ****************************** +ok: [devops_vm] + +TASK [web_app : Verify health endpoint] **************************************** +ok: [devops_vm] + +TASK [web_app : Display health check result] *********************************** +ok: [devops_vm] => { + "web_app_health_check.json": { + "status": "healthy", + "timestamp": "2026-03-04T13:21:46.507Z", + "uptime_seconds": 60 + } +} + +PLAY RECAP ********************************************************************* +devops_vm : ok=18 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +### 2.7 Evidence — Idempotency (Second Run, changed=0) + +``` +$ ansible-playbook playbooks/deploy.yml \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY RECAP ********************************************************************* +devops_vm : ok=18 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 
+``` + +`changed=0` confirms full idempotency. Wipe task is `skipped=1` (variable gate is false by default). + +### 2.8 Evidence — Rendered docker-compose.yml on VM + +``` +$ ssh vagrant@192.168.121.159 cat /opt/devops-app/docker-compose.yml +# Ansible managed +# Docker Compose configuration for devops-app + +services: + devops-app: + image: graymansion/devops-info-service:latest + container_name: devops-app + ports: + - "5000:5000" + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s +``` + +### 2.9 Evidence — Container Running + +``` +$ ssh vagrant@192.168.121.159 docker ps +CONTAINER ID IMAGE STATUS PORTS NAMES +2ac8c632334c graymansion/devops-info-service:latest Up 3 minutes (healthy) 0.0.0.0:5000->5000/tcp devops-app +``` + +### 2.10 Evidence — Curl Verification + +``` +$ curl -s http://192.168.121.159:5000/ | python3 -m json.tool +{ + "service": { + "name": "DevOps Information Service", + "version": "1.0.0", + "description": "Lightweight Python web app crafted for DevOps Core Course labs" + }, + "system": { + "hostname": "2ac8c632334c", + "platform": "Linux-5.15.0-130-generic-x86_64-with-glibc2.36", + "python_version": "3.11.12", + "cpu_count": 2, + "memory_total_mb": 1972.83 + }, + "runtime": { + "start_time": "2026-03-04T13:20:46.487181+00:00", + "uptime_seconds": 92.099... + }, + "request": { + "remote_addr": "192.168.121.1", + "method": "GET", + "path": "/", + "timestamp": "2026-03-04T13:22:18.586Z" + } +} + +$ curl -s http://192.168.121.159:5000/health | python3 -m json.tool +{ + "status": "healthy", + "timestamp": "2026-03-04T13:22:18.585Z", + "uptime_seconds": 92 +} +``` + +### 2.11 Research Answers — Docker Compose + +**Q: What's the difference between `restart: always` and `restart: unless-stopped`?** +`always` restarts the container unconditionally, even after a manual `docker stop`. 
`unless-stopped` also restarts on failure/reboot, but respects a manual stop — if you explicitly stop a container, it stays stopped until you start it again, even across a Docker daemon or host restart (under `always`, a manually stopped container is started again when the daemon restarts). `unless-stopped` is preferred for production since operators can intentionally stop services. + +**Q: How do Docker Compose networks differ from Docker bridge networks?** +Docker Compose creates a project-specific bridge network (named `<project>_default`) where services can reach each other by service name. Manual `docker run` uses the default `bridge` network where container-to-container communication requires explicit `--link` or network creation. Compose networks provide automatic DNS-based service discovery. + +**Q: Can you reference Ansible Vault variables in the template?** +Yes. Vault-encrypted variables are decrypted at runtime and available to Jinja2 templates exactly like plaintext variables. You can use `{{ vault_var }}` in templates; the rendered file contains the decrypted value. Be cautious — the rendered file on disk is plaintext, so set proper file permissions. + +--- + +## Task 3: Wipe Logic (1 pt) + +### 3.1 Implementation Details + +**File:** `roles/web_app/tasks/wipe.yml` + +```yaml +--- +# Wipe tasks for web application — requires BOTH: +# 1. Variable: web_app_wipe=true (when condition) +# 2. Tag: --tags web_app_wipe (tag gating) +# This double-safety prevents accidental wipe during normal deployments. 
+ +- name: Wipe web application + when: web_app_wipe | bool + become: true + tags: + - web_app_wipe + block: + - name: Stop and remove containers with docker compose + ansible.builtin.command: + cmd: docker compose down --remove-orphans + chdir: "{{ web_app_compose_dir }}" + changed_when: true + failed_when: false + + - name: Remove docker-compose file + ansible.builtin.file: + path: "{{ web_app_compose_dir }}/docker-compose.yml" + state: absent + + - name: Remove application directory + ansible.builtin.file: + path: "{{ web_app_compose_dir }}" + state: absent + + - name: Remove Docker image (optional cleanup) + community.docker.docker_image: + name: "{{ web_app_image }}" + tag: "{{ web_app_tag }}" + state: absent + failed_when: false + + - name: Log wipe completion + ansible.builtin.debug: + msg: "Application {{ web_app_name }} wiped successfully from {{ web_app_compose_dir }}" +``` + +### 3.2 Double-Safety Mechanism + +Wipe requires **both** conditions: +1. **Variable gate:** `when: web_app_wipe | bool` (default: `false`) +2. **Tag gate:** `tags: [web_app_wipe]` (only runs when this tag is explicitly specified or all tags are run) + +During normal `ansible-playbook deploy.yml` (no `--tags` flag), **all** tags run, but the `when` condition blocks wipe (variable is `false`). This means wipe never runs accidentally. + +### 3.3 Wipe Ordering in main.yml + +Wipe is included **before** deployment tasks to support the clean reinstall pattern (wipe old → deploy new): + +```yaml +# Wipe logic runs first +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + tags: [web_app_wipe] + +# Then deployment +- name: Deploy application with Docker Compose + block: ... 
+ tags: [app_deploy, compose] +``` + +### 3.4 Evidence — Scenario 4a: Tag Without Variable (Safety Check) + +``` +$ ansible-playbook playbooks/deploy.yml --tags web_app_wipe \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY [Deploy application] ****************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [devops_vm] + +TASK [web_app : Include wipe tasks] ******************************************** +included: .../roles/web_app/tasks/wipe.yml for devops_vm + +TASK [web_app : Wipe web application] ****************************************** +skipping: [devops_vm] + +PLAY RECAP ********************************************************************* +devops_vm : ok=2 changed=0 unreachable=0 failed=0 skipped=1 rescued=0 ignored=0 +``` + +The wipe block is **skipped** because `web_app_wipe` defaults to `false`. Even with the tag selected, the variable gate prevents any destructive action. + +### 3.5 Evidence — Scenario 2: Wipe Only + +``` +$ ansible-playbook playbooks/deploy.yml \ + -e "web_app_wipe=true" --tags web_app_wipe \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY [Deploy application] ****************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [devops_vm] + +TASK [web_app : Include wipe tasks] ******************************************** +included: .../roles/web_app/tasks/wipe.yml for devops_vm + +TASK [web_app : Stop and remove containers with docker compose] **************** +changed: [devops_vm] + +TASK [web_app : Remove docker-compose file] ************************************ +changed: [devops_vm] + +TASK [web_app : Remove application directory] ********************************** +changed: [devops_vm] + +TASK [web_app : Remove Docker image (optional cleanup)] ************************ +ok: [devops_vm] + +TASK [web_app : Log wipe completion] 
******************************************* +ok: [devops_vm] => { + "msg": "Application devops-app wiped successfully from /opt/devops-app" +} + +PLAY RECAP ********************************************************************* +devops_vm : ok=7 changed=3 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +Wipe removed the running containers, compose file, and application directory. Deployment tasks were **not run** because they carry `app_deploy`/`compose` tags (filtered out by `--tags web_app_wipe`). + +**Verification after wipe:** +``` +$ ssh vagrant@192.168.121.159 docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + +$ ssh vagrant@192.168.121.159 ls /opt/devops-app +ls: cannot access '/opt/devops-app': No such file or directory +``` + +No containers running. Application directory removed. + +### 3.6 Evidence — Scenario 3: Clean Reinstallation (Wipe + Deploy) + +``` +$ ansible-playbook playbooks/deploy.yml \ + -e "web_app_wipe=true" \ + -i inventory/hosts.ini --vault-password-file /tmp/.vault_pass + +PLAY [Deploy application] ****************************************************** + +TASK [Gathering Facts] ********************************************************* +ok: [devops_vm] +... 
+TASK [web_app : Include wipe tasks] ******************************************** +included: .../roles/web_app/tasks/wipe.yml for devops_vm + +TASK [web_app : Stop and remove containers with docker compose] **************** +changed: [devops_vm] + +TASK [web_app : Remove docker-compose file] ************************************ +changed: [devops_vm] + +TASK [web_app : Remove application directory] ********************************** +changed: [devops_vm] + +TASK [web_app : Remove Docker image (optional cleanup)] ************************ +changed: [devops_vm] + +TASK [web_app : Log wipe completion] ******************************************* +ok: [devops_vm] => { + "msg": "Application devops-app wiped successfully from /opt/devops-app" +} + +TASK [web_app : Create application directory] ********************************** +changed: [devops_vm] + +TASK [web_app : Template docker-compose file] ********************************** +changed: [devops_vm] + +TASK [web_app : Log in to Docker Hub] ****************************************** +ok: [devops_vm] + +TASK [web_app : Pull latest Docker image] ************************************** +changed: [devops_vm] + +TASK [web_app : Deploy with docker compose] ************************************ +changed: [devops_vm] + +TASK [web_app : Wait for application to be ready] ****************************** +ok: [devops_vm] + +TASK [web_app : Verify health endpoint] **************************************** +ok: [devops_vm] + +TASK [web_app : Display health check result] *********************************** +ok: [devops_vm] => { + "web_app_health_check.json": { + "status": "healthy", + "timestamp": "...", + "uptime_seconds": 6 + } +} + +PLAY RECAP ********************************************************************* +devops_vm : ok=23 changed=8 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 +``` + +`changed=8` = 4 wipe changes + 4 deploy changes. 
The application was completely removed and re-deployed from scratch in a single playbook run. + +### 3.7 Evidence — Scenario 1: Normal Deployment (Wipe Skipped) + +This is the default behavior shown in Section 2.6. When `web_app_wipe` defaults to `false`: + +``` +TASK [web_app : Wipe web application] ****************************************** +skipping: [devops_vm] +``` + +The wipe block is skipped, deployment proceeds normally. `skipped=1` in the PLAY RECAP. + +### 3.8 Research Answers — Wipe Logic + +**1. Why use both variable AND tag?** +Double safety. The variable is the primary gate: because `web_app_wipe` defaults to `false`, the `when` condition blocks wipe during normal deployments (where no `--tags` filter is given, so all tags run) and also when someone selects `--tags web_app_wipe` by mistake. The tag adds selectivity: together with the variable, `--tags web_app_wipe` runs only the wipe tasks and leaves the deployment tasks out. Neither alone gives both protections. + +**2. What's the difference between `never` tag and this approach?** +The `never` tag is a special Ansible tag that causes tasks to be skipped unless they are explicitly requested, either with `--tags never` or with another tag assigned to the same task. Our approach uses a custom tag + variable gate. The `never` tag approach lacks the variable safety — anyone running `--tags never` would trigger wipe. Our approach always requires `-e "web_app_wipe=true"`; adding `--tags web_app_wipe` restricts the run to wipe only (Section 3.5), while omitting the tag filter performs a clean reinstall (Section 3.6). + +**3. Why must wipe logic come BEFORE deployment in main.yml?** +For the clean reinstall use case: when running `ansible-playbook deploy.yml -e "web_app_wipe=true"` (no tag filter), both wipe and deploy execute. Wipe must run first to remove old state before fresh deployment replaces it. + +**4. When would you want clean reinstallation vs. rolling update?** +Clean reinstall is needed when: migrating to a different compose config structure, debugging persistent state issues, changing container names/networks. Rolling updates are preferred for routine version bumps with zero downtime. + +**5. How would you extend this to wipe Docker images and volumes too?** +Add `docker compose down --rmi all --volumes` to remove images and named volumes. Add `docker system prune -f` for dangling resources. 
Our implementation already removes the Docker image as an optional step. + +--- + +## Task 4: CI/CD (3 pts) + +### 4.1 Workflow Architecture + +**File:** `.github/workflows/ansible-deploy.yml` + +``` +Push to ansible/** → Lint Job → Deploy Job → Verify Deployment +``` + +Two-job pipeline: +1. **lint** — runs `ansible-lint` on all playbooks +2. **deploy** — installs Ansible, configures SSH, deploys via playbook, verifies app + +### 4.2 Full Workflow Configuration + +```yaml +name: Ansible Deployment + +on: + push: + branches: [ main, master ] + paths: + - 'ansible/**' + - '!ansible/docs/**' + - '.github/workflows/ansible-deploy.yml' + pull_request: + branches: [ main, master ] + paths: + - 'ansible/**' + - '!ansible/docs/**' + +concurrency: + group: ansible-deploy-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + name: Ansible Lint + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies + run: pip install ansible ansible-lint + - name: Run ansible-lint + run: | + cd ansible + ansible-lint playbooks/*.yml + + deploy: + name: Deploy Application + needs: lint + runs-on: ubuntu-latest + if: github.event_name == 'push' + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install Ansible + run: pip install ansible + - name: Setup SSH + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H ${{ secrets.VM_HOST }} >> ~/.ssh/known_hosts + - name: Deploy with Ansible + env: + ANSIBLE_HOST_KEY_CHECKING: "False" + run: | + cd ansible + echo "${{ secrets.ANSIBLE_VAULT_PASSWORD }}" > /tmp/vault_pass + ansible-playbook playbooks/deploy.yml \ + -i inventory/hosts.ini \ + --vault-password-file /tmp/vault_pass + rm -f /tmp/vault_pass + - name: Verify 
Deployment + run: | + sleep 10 + curl -f http://${{ secrets.VM_HOST }}:5000 || exit 1 + curl -f http://${{ secrets.VM_HOST }}:5000/health || exit 1 +``` + +### 4.3 Required GitHub Secrets + +| Secret | Purpose | +|--------|---------| +| `ANSIBLE_VAULT_PASSWORD` | Decrypts Vault-encrypted variables | +| `SSH_PRIVATE_KEY` | SSH key for target VM access | +| `VM_HOST` | Target VM IP address (192.168.121.159) | + +### 4.4 Path Filters + +Ansible workflow only triggers on changes to: +- `ansible/**` (all Ansible code) +- `.github/workflows/ansible-deploy.yml` (workflow itself) + +Excluded: `ansible/docs/**` (documentation changes don't need deployment). + +### 4.5 Concurrency Control + +```yaml +concurrency: + group: ansible-deploy-${{ github.ref }} + cancel-in-progress: true +``` + +Prevents multiple deployment runs from overlapping on the same branch. If a new push arrives while a deployment is running, the old one is cancelled. + +### 4.6 Verification Step + +After deployment, the workflow waits 10 seconds and verifies: +```bash +curl -f http://$VM_HOST:5000 # Main endpoint +curl -f http://$VM_HOST:5000/health # Health endpoint +``` + +### 4.7 Status Badge in README.md + +```markdown +[![Ansible Deployment](https://github.com/GrayMansion/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/GrayMansion/DevOps-Core-Course/actions/workflows/ansible-deploy.yml) +``` + +Added to the top of `README.md` alongside the existing Python CI badge. + +### 4.8 Research Answers — CI/CD + +**1. What are the security implications of storing SSH keys in GitHub Secrets?** +GitHub Secrets are encrypted at rest and masked in logs. However, any workflow code in the repository can access them, so a malicious PR could exfiltrate secrets if workflow runs on PRs with write access. Mitigations: use environment protection rules, require approval for workflow runs from forks, use short-lived credentials or deploy keys with minimal permissions. + +**2. 
How would you implement a staging → production deployment pipeline?** +Use GitHub environments (`staging`, `production`) with protection rules. Deploy to staging automatically on push, require manual approval for production. Use separate inventory files or host groups for each environment. Add integration tests between stages. + +**3. What would you add to make rollbacks possible?** +Pin Docker image tags (not `latest`) using CalVer or SemVer. Store the previous tag in a variable/file. On rollback, redeploy with the previous tag. Alternatively, use the wipe + deploy pattern with the old image tag: `-e "web_app_tag=2026.02.15"`. + +**4. How does self-hosted runner improve security compared to GitHub-hosted?** +Self-hosted runners are within your network, reducing the attack surface — no SSH keys need to be stored in GitHub Secrets since the runner has direct local access. However, self-hosted runners require maintenance (updates, security patches) and can be a risk if compromised, since they have network access to production infrastructure. + +--- + +## Task 5: Documentation + +This document serves as the complete Lab 6 documentation. 
+ +### Updated File Structure + +``` +ansible/ +├── ansible.cfg +├── Vagrantfile +├── docs/ +│ ├── LAB05.md +│ └── LAB06.md # This file +├── inventory/ +│ ├── hosts.ini +│ └── group_vars/ +│ └── all.yml # Vault-encrypted +├── playbooks/ +│ ├── deploy.yml # Updated: web_app role +│ ├── provision.yml +│ └── site.yml # Updated: web_app role +└── roles/ + ├── common/ + │ ├── defaults/main.yml + │ └── tasks/main.yml # Refactored: blocks, tags, rescue/always + ├── docker/ + │ ├── defaults/main.yml + │ ├── handlers/main.yml + │ └── tasks/main.yml # Refactored: blocks, tags, rescue/always + └── web_app/ # Renamed from app_deploy + ├── defaults/main.yml # Updated: compose vars, wipe var + ├── handlers/main.yml # Updated: compose restart + ├── meta/main.yml # NEW: role dependencies + ├── tasks/ + │ ├── main.yml # Rewritten: compose deployment + │ └── wipe.yml # NEW: wipe logic + └── templates/ + └── docker-compose.yml.j2 # NEW: Jinja2 template + +.github/workflows/ +├── python-ci.yml # Existing: Python app CI +└── ansible-deploy.yml # NEW: Ansible deployment CD +``` + + +## Challenges & Solutions + +1. **ansible-lint production profile:** Required multiple iterations to fix `yaml[truthy]` (yes→true), `var-naming[no-role-prefix]` (all vars prefixed `web_app_`), `name[casing]` (handler names capitalized), `key-order` (`become`/`tags` before `block`), `ignore-errors` (replaced with `failed_when: false`), `command-instead-of-module` (replaced shell command with `ansible.builtin.apt`), and `yaml[line-length]` (multi-line `changed_when` with `>-`). + +2. **Container name conflict:** Lab 5 left a container named `devops-app` (created with `docker_container` module). Docker Compose couldn't claim that name. The rescue block correctly caught the error (`rescued=1`), and after removing the old container, redeployment succeeded. 
+ +--- + +## Summary + +- **Time spent:** ~3 hours +- **Key learnings:** + - Ansible blocks provide clean error handling (rescue) and guaranteed cleanup (always) + - Tags enable selective execution — critical for large playbooks in production + - Docker Compose via templates is more maintainable than `docker run` commands + - Role dependencies automate execution order + - Double-gated wipe logic prevents accidental data loss + - CI/CD with path filters avoids unnecessary workflow runs + - ansible-lint production profile enforces consistent code quality diff --git a/ansible/inventory/group_vars/all.yml b/ansible/inventory/group_vars/all.yml index 2e77588630..c9cf6b4590 100644 --- a/ansible/inventory/group_vars/all.yml +++ b/ansible/inventory/group_vars/all.yml @@ -1,20 +1,18 @@ $ANSIBLE_VAULT;1.1;AES256 -65373735616431663364353939343031313535373535336533666365643339646233336162336138 -3462636636303335613733356266643239646338376533650a316664363538376566353537613237 -34626366663061623262303966613435663365366334316464623564373437613430383239663366 -3636316663303838350a323062363962636539306337616665373162323838363364326331346435 -38396463373265376461393538376663313465656262396433343934343766626266623535336138 -37646162386661623166343332383363343361376531353863326535626338666138363230303133 -39323961313232636534383965663535326537306436306361343032333435353034633361666662 -65613433623565363539366635643238626137326138646264656632663561396131383739383864 -37383864346436663734353963383530626166323462643664633866313330333636313563326466 -61313839653663333632643461643930663339313530383639396631653963663365616130633261 -63636463666436343963343232613461333066633038326561383761376261346239333938636561 -64336238653862613630616433343435666363363266633665616336636138643061393264303066 -64623734366462336439616534393866636631663437663635623866616338366234383037646336 -34643365666336623865383266633665663237386333626161303762333863323437313634356232 
-63386363353465326165333932346232376663373832623836356263383866646530316463343434 -34326438336133343662313665323131353331363539356662623436356434616532306638303636 -37623033623137323966663062353831656234303739626530383830393166346338373765383361 -62316166633364323934633137623065333739643861376238633039396336393135613831623236 -383532666630396161313035663366653663 +62613964616539383339316564646235366637363636323934393264373931343936316634333866 +3664393365366332613561313333356134353737366664650a633032313431356537613737323331 +34636431316466636237353039386437613434663530306563623464623264316161356433623730 +6464636639373263360a323738353336313865356462613439653866653736383032393439333331 +64313831623662373432333165343536383933643433396335633831326430653134373564333064 +39396431653530656431663635343566613438323838353566323834366432386438313965613866 +63373034633830383563626664386536616465643933326362643862336166393932643666613034 +62313965346133376233306333613233343835346630396135383932316436633662333730623763 +38623830613735663934633539366564663162373731383366623938393331326239663833346266 +32323934393361653430643062616536353532396132303236633136643664373163326533653236 +35383564633365363965633963626634306438656361383731366534633431376134343332616333 +37373436326134613365333831396163366363613832623430663764336434366363366232616535 +66383734333839373138376363356462316137396338346232613339336634623135656333373665 +66356633623464316266623839363366303430383861323661653830633063613332323737663536 +32303230633931356630646434333631383565393161636362363534303431356135353130326236 +62366463653461653961303666626633353332376437313965633866666632643632323037316561 +38633539373161393336306239636164313664363061326138326630613033346463 diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml index 56850a7585..95174b9e0e 100644 --- a/ansible/playbooks/deploy.yml +++ b/ansible/playbooks/deploy.yml @@ -1,7 +1,7 @@ --- - name: Deploy application hosts: webservers - 
become: yes + become: true roles: - - app_deploy + - web_app diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml index f53efb0248..7cc2e6678d 100644 --- a/ansible/playbooks/provision.yml +++ b/ansible/playbooks/provision.yml @@ -1,7 +1,7 @@ --- - name: Provision web servers hosts: webservers - become: yes + become: true roles: - common diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml index 9756f5c1db..37d6460ee1 100644 --- a/ansible/playbooks/site.yml +++ b/ansible/playbooks/site.yml @@ -1,9 +1,9 @@ --- - name: Full site deployment hosts: webservers - become: yes + become: true roles: - common - docker - - app_deploy + - web_app diff --git a/ansible/roles/app_deploy/defaults/main.yml b/ansible/roles/app_deploy/defaults/main.yml deleted file mode 100644 index 23c7b46331..0000000000 --- a/ansible/roles/app_deploy/defaults/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -app_name: devops-app -app_port: 5000 -app_container_name: "{{ app_name }}" -app_restart_policy: unless-stopped -app_env: {} diff --git a/ansible/roles/app_deploy/handlers/main.yml b/ansible/roles/app_deploy/handlers/main.yml deleted file mode 100644 index 73deea15ef..0000000000 --- a/ansible/roles/app_deploy/handlers/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -- name: restart app container - community.docker.docker_container: - name: "{{ app_container_name }}" - state: started - restart: yes diff --git a/ansible/roles/app_deploy/tasks/main.yml b/ansible/roles/app_deploy/tasks/main.yml deleted file mode 100644 index 18c6f293cb..0000000000 --- a/ansible/roles/app_deploy/tasks/main.yml +++ /dev/null @@ -1,50 +0,0 @@ ---- -- name: Log in to Docker Hub - community.docker.docker_login: - username: "{{ dockerhub_username }}" - password: "{{ dockerhub_password }}" - no_log: true - -- name: Pull Docker image - community.docker.docker_image: - name: "{{ docker_image }}" - tag: "{{ docker_image_tag }}" - source: pull - -- name: Stop existing container - 
community.docker.docker_container: - name: "{{ app_container_name }}" - state: absent - ignore_errors: yes - -- name: Run application container - community.docker.docker_container: - name: "{{ app_container_name }}" - image: "{{ docker_image }}:{{ docker_image_tag }}" - state: started - restart_policy: "{{ app_restart_policy }}" - ports: - - "{{ app_port }}:{{ app_port }}" - env: "{{ app_env }}" - notify: restart app container - -- name: Wait for application to be ready - ansible.builtin.wait_for: - port: "{{ app_port }}" - host: "127.0.0.1" - delay: 5 - timeout: 30 - -- name: Verify health endpoint - ansible.builtin.uri: - url: "http://127.0.0.1:{{ app_port }}/health" - method: GET - return_content: yes - status_code: 200 - register: health_check - retries: 3 - delay: 5 - -- name: Display health check result - ansible.builtin.debug: - var: health_check.json diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index 6e859f0f2f..df07fdea97 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -1,14 +1,40 @@ --- -- name: Update apt cache - ansible.builtin.apt: - update_cache: yes - cache_valid_time: 3600 - -- name: Install common packages - ansible.builtin.apt: - name: "{{ common_packages }}" - state: present - -- name: Set timezone - community.general.timezone: - name: "{{ common_timezone }}" +# Package installation block with error handling +- name: Install system packages + become: true + tags: + - packages + - common + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + + - name: Install common packages + ansible.builtin.apt: + name: "{{ common_packages }}" + state: present + + - name: Set timezone + community.general.timezone: + name: "{{ common_timezone }}" + + rescue: + - name: Fix apt cache on failure + ansible.builtin.apt: + update_cache: true + force: true + + - name: Retry package installation after fix + ansible.builtin.apt: + 
name: "{{ common_packages }}" + state: present + update_cache: true + + always: + - name: Log package installation completion + ansible.builtin.copy: + content: "Common role packages block completed at {{ ansible_date_time.iso8601 }}\n" + dest: /tmp/common_packages_done.log + mode: "0644" diff --git a/ansible/roles/docker/handlers/main.yml b/ansible/roles/docker/handlers/main.yml index 1a5058da5e..07aa0eb290 100644 --- a/ansible/roles/docker/handlers/main.yml +++ b/ansible/roles/docker/handlers/main.yml @@ -1,5 +1,5 @@ --- -- name: restart docker +- name: Restart docker ansible.builtin.service: name: docker state: restarted diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml index 8d5f77bacf..2a9552c1b5 100644 --- a/ansible/roles/docker/tasks/main.yml +++ b/ansible/roles/docker/tasks/main.yml @@ -1,53 +1,84 @@ --- -- name: Install prerequisites for Docker repository - ansible.builtin.apt: - name: - - ca-certificates - - curl - - gnupg - state: present - -- name: Create keyrings directory - ansible.builtin.file: - path: /etc/apt/keyrings - state: directory - mode: "0755" - -- name: Add Docker GPG key - ansible.builtin.apt_key: - url: https://download.docker.com/linux/ubuntu/gpg - keyring: /etc/apt/keyrings/docker.gpg - state: present - -- name: Add Docker repository - ansible.builtin.apt_repository: - repo: >- - deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.gpg] - https://download.docker.com/linux/ubuntu - {{ ansible_facts['distribution_release'] }} stable - state: present - filename: docker - -- name: Install Docker packages - ansible.builtin.apt: - name: "{{ docker_packages }}" - state: present - update_cache: yes - notify: restart docker - -- name: Ensure Docker service is running and enabled - ansible.builtin.service: - name: docker - state: started - enabled: yes - -- name: Add user to docker group - ansible.builtin.user: - name: "{{ docker_user }}" - groups: docker - append: yes - -- name: Install python3-docker for 
Ansible docker modules - ansible.builtin.apt: - name: python3-docker - state: present +# Docker installation block with error handling +- name: Install Docker Engine + become: true + tags: + - docker_install + - docker + block: + - name: Install prerequisites for Docker repository + ansible.builtin.apt: + name: + - ca-certificates + - curl + - gnupg + state: present + + - name: Create keyrings directory + ansible.builtin.file: + path: /etc/apt/keyrings + state: directory + mode: "0755" + + - name: Add Docker GPG key + ansible.builtin.apt_key: + url: https://download.docker.com/linux/ubuntu/gpg + keyring: /etc/apt/keyrings/docker.gpg + state: present + + - name: Add Docker repository + ansible.builtin.apt_repository: + repo: >- + deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.gpg] + https://download.docker.com/linux/ubuntu + {{ ansible_facts['distribution_release'] }} stable + state: present + filename: docker + + - name: Install Docker packages + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + update_cache: true + notify: Restart docker + + rescue: + - name: Wait before retrying Docker installation + ansible.builtin.pause: + seconds: 10 + + - name: Retry apt update after GPG key failure + ansible.builtin.apt: + update_cache: true + + - name: Retry Docker package installation + ansible.builtin.apt: + name: "{{ docker_packages }}" + state: present + notify: Restart docker + + always: + - name: Ensure Docker service is enabled and started + ansible.builtin.service: + name: docker + state: started + enabled: true + failed_when: false + +# Docker configuration block +- name: Configure Docker + become: true + tags: + - docker_config + - docker + block: + - name: Add user to docker group + ansible.builtin.user: + name: "{{ docker_user }}" + groups: docker + append: true + + - name: Install python3-docker for Ansible docker modules + ansible.builtin.apt: + name: python3-docker + state: present diff --git a/ansible/roles/web_app/defaults/main.yml 
b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..8e6eb34f76 --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml @@ -0,0 +1,19 @@ +--- +# Application Configuration +web_app_name: devops-app +web_app_port: 5000 +web_app_internal_port: 5000 +web_app_env: {} + +# Docker Image Configuration +web_app_image: graymansion/devops-info-service +web_app_tag: latest + +# Docker Compose Configuration +web_app_compose_dir: "/opt/{{ web_app_name }}" + +# Wipe Logic Control (default: do not wipe) +# Set to true to remove application completely +# Wipe only: ansible-playbook deploy.yml -e "web_app_wipe=true" --tags web_app_wipe +# Clean install: ansible-playbook deploy.yml -e "web_app_wipe=true" +web_app_wipe: false diff --git a/ansible/roles/web_app/handlers/main.yml b/ansible/roles/web_app/handlers/main.yml new file mode 100644 index 0000000000..63441ac691 --- /dev/null +++ b/ansible/roles/web_app/handlers/main.yml @@ -0,0 +1,7 @@ +--- +- name: Restart app with compose + ansible.builtin.command: + cmd: docker compose restart + chdir: "{{ web_app_compose_dir }}" + changed_when: true + become: true diff --git a/ansible/roles/web_app/meta/main.yml b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..3ec4fc801c --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,4 @@ +--- +# Role dependencies — ensure Docker is installed before deploying the web app +dependencies: + - role: docker diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..7c6165fd63 --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,80 @@ +--- +# Wipe logic runs first (when explicitly requested) +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + +# Deploy application with Docker Compose +- name: Deploy application with Docker Compose + become: true + tags: + - app_deploy + - compose + block: + - name: Create 
application directory + ansible.builtin.file: + path: "{{ web_app_compose_dir }}" + state: directory + mode: "0755" + + - name: Template docker-compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ web_app_compose_dir }}/docker-compose.yml" + mode: "0644" + register: web_app_compose_file + + - name: Log in to Docker Hub + community.docker.docker_login: + username: "{{ dockerhub_username }}" + password: "{{ dockerhub_password }}" + no_log: true + + - name: Pull latest Docker image + community.docker.docker_image: + name: "{{ web_app_image }}" + tag: "{{ web_app_tag }}" + source: pull + register: web_app_image_pull + + - name: Deploy with docker compose + ansible.builtin.command: + cmd: docker compose up -d --remove-orphans + chdir: "{{ web_app_compose_dir }}" + register: web_app_compose_up + changed_when: >- + 'Started' in web_app_compose_up.stderr or + 'Created' in web_app_compose_up.stderr or + web_app_compose_file.changed or + web_app_image_pull.changed + + - name: Wait for application to be ready + ansible.builtin.wait_for: + port: "{{ web_app_port }}" + host: "127.0.0.1" + delay: 5 + timeout: 30 + + - name: Verify health endpoint + ansible.builtin.uri: + url: "http://127.0.0.1:{{ web_app_port }}/health" + method: GET + return_content: true + status_code: 200 + register: web_app_health_check + retries: 3 + delay: 5 + + - name: Display health check result + ansible.builtin.debug: + var: web_app_health_check.json + + rescue: + - name: Log deployment failure + ansible.builtin.debug: + msg: "Deployment of {{ web_app_name }} failed. 
Check logs with: docker compose -f {{ web_app_compose_dir }}/docker-compose.yml logs" + + - name: Fail with error message + ansible.builtin.fail: + msg: "Docker Compose deployment failed for {{ web_app_name }}" diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..33575053b3 --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,39 @@ +--- +# Wipe tasks for web application — gated by the web_app_wipe variable: +# 1. Variable: web_app_wipe=true (when condition) is always required +# 2. Tag: --tags web_app_wipe additionally limits the run to wipe only +# Without the tag, web_app_wipe=true wipes and then redeploys (clean install). + +- name: Wipe web application + when: web_app_wipe | bool + become: true + tags: + - web_app_wipe + block: + - name: Stop and remove containers with docker compose + ansible.builtin.command: + cmd: docker compose down --remove-orphans + chdir: "{{ web_app_compose_dir }}" + changed_when: true + failed_when: false + + - name: Remove docker-compose file + ansible.builtin.file: + path: "{{ web_app_compose_dir }}/docker-compose.yml" + state: absent + + - name: Remove application directory + ansible.builtin.file: + path: "{{ web_app_compose_dir }}" + state: absent + + - name: Remove Docker image (optional cleanup) + community.docker.docker_image: + name: "{{ web_app_image }}" + tag: "{{ web_app_tag }}" + state: absent + failed_when: false + + - name: Log wipe completion + ansible.builtin.debug: + msg: "Application {{ web_app_name }} wiped successfully from {{ web_app_compose_dir }}" diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..79761259ae --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,22 @@ +# {{ ansible_managed }} +# Docker Compose configuration for {{ web_app_name }} + +services: + {{ web_app_name }}: + image: {{ web_app_image }}:{{ web_app_tag }} + container_name: {{
web_app_name }} + ports: + - "{{ web_app_port }}:{{ web_app_internal_port }}" +{% if web_app_env | length > 0 %} + environment: +{% for key, value in web_app_env.items() %} + {{ key }}: "{{ value }}" +{% endfor %} +{% endif %} + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:{{ web_app_internal_port }}/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s diff --git a/app_python/app.py b/app_python/app.py index 828b26bc83..8f83a9ac9b 100644 --- a/app_python/app.py +++ b/app_python/app.py @@ -3,6 +3,7 @@ FastAPI implementation. """ +import json import logging import os import platform @@ -20,14 +21,43 @@ HOST = os.getenv("HOST", "0.0.0.0") PORT = int(os.getenv("PORT", "5000")) DEBUG = os.getenv("DEBUG", "False").lower() == "true" +LOG_FORMAT = os.getenv("LOG_FORMAT", "text") # "json" for structured logging + + +# ------------------------- +# JSON Log Formatter +# ------------------------- +class JSONFormatter(logging.Formatter): + """Outputs log records as single-line JSON objects.""" + + def format(self, record: logging.LogRecord) -> str: + log_entry = { + "timestamp": datetime.now(timezone.utc) + .isoformat(timespec="milliseconds") + .replace("+00:00", "Z"), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + if record.exc_info and record.exc_info[0] is not None: + log_entry["exception"] = self.formatException(record.exc_info) + return json.dumps(log_entry) + # ------------------------- # Logging # ------------------------- -logging.basicConfig( - level=logging.DEBUG if DEBUG else logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) +_log_level = logging.DEBUG if DEBUG else logging.INFO +_handler = logging.StreamHandler() + +if LOG_FORMAT == "json": + _handler.setFormatter(JSONFormatter()) +else: + _handler.setFormatter( + logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + ) + 
+logging.basicConfig(level=_log_level, handlers=[_handler]) logger = logging.getLogger("devops-info-service") # ------------------------- @@ -99,8 +129,16 @@ async def unhandled_exception_handler(_: Request, exc: Exception): @app.middleware("http") async def log_requests(request: Request, call_next): - logger.info("Request: %s %s", request.method, request.url.path) - return await call_next(request) + client_ip = request.client.host if request.client else "unknown" + response = await call_next(request) + logger.info( + "method=%s path=%s status=%d client=%s", + request.method, + request.url.path, + response.status_code, + client_ip, + ) + return response @app.get("/") diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..bc894597c3 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,130 @@ +# Monitoring stack: Loki 3.0 + Promtail 3.0 + Grafana 12.3 +# Lab 7 — Observability & Logging with Loki Stack + +services: + # ---- Loki: log aggregation & storage ---- + loki: + image: grafana/loki:3.0.0 + command: -config.file=/etc/loki/config.yml + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml:ro + - loki-data:/loki + networks: + - logging + labels: + logging: "promtail" + app: "loki" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + restart: unless-stopped + + # ---- Promtail: log collector ---- + promtail: + image: grafana/promtail:3.0.0 + command: -config.file=/etc/promtail/config.yml + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - logging + labels: + logging: "promtail" + app: "promtail" + depends_on: + 
loki: + condition: service_healthy + deploy: + resources: + limits: + cpus: "0.5" + memory: 512M + reservations: + cpus: "0.1" + memory: 128M + restart: unless-stopped + + # ---- Grafana: visualization ---- + grafana: + image: grafana/grafana:12.3.1 + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_USER=${GF_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD:-admin} + - GF_SECURITY_ALLOW_EMBEDDING=true + networks: + - logging + labels: + logging: "promtail" + app: "grafana" + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + deploy: + resources: + limits: + cpus: "1.0" + memory: 1G + reservations: + cpus: "0.25" + memory: 256M + restart: unless-stopped + + # ---- Python App from Lab 1 ---- + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + ports: + - "8000:5000" + environment: + - LOG_FORMAT=json + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + depends_on: + loki: + condition: service_healthy + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + reservations: + cpus: "0.1" + memory: 64M + restart: unless-stopped + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..756bc4d0d1 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,279 @@ +# Lab 7 — Observability & Logging with Loki Stack + +**Name:** Makar +**Date:** 2026-03-12 +**Lab Points:** 10 + 0 bonus + +--- + +## 1. 
Architecture + +``` +┌──────────────────────────────────────────────────────┐ +│ Docker Network: logging │ +│ │ +│ ┌─────────────┐ logs ┌──────────────┐ │ +│ │ app-python │───────────▶│ Promtail │ │ +│ │ :8000→5000 │ │ (collector) │ │ +│ └─────────────┘ └──────┬───────┘ │ +│ │ push │ +│ ┌─────────────┐ ▼ │ +│ │ Grafana │ query ┌──────────────┐ │ +│ │ :3000 │◀──────────▶│ Loki │ │ +│ │ (dashboard) │ │ :3100 │ │ +│ └─────────────┘ │ (TSDB store) │ │ +│ └──────────────┘ │ +└──────────────────────────────────────────────────────┘ +``` + +**Component roles:** +- **Loki 3.0** — Log aggregation and storage engine using TSDB index with filesystem backend. Unlike Elasticsearch, Loki only indexes labels (not full text), making it lightweight and efficient. +- **Promtail 3.0** — Agent that discovers Docker containers via the Docker socket, tails their logs, and pushes them to Loki with appropriate labels. +- **Grafana 12.3** — Visualization frontend. Queries Loki via LogQL and renders dashboards. +- **app-python** — The FastAPI application from Lab 1, now emitting structured JSON logs. + +--- + +## 2. Setup Guide + +### Prerequisites +- Docker Engine and Docker Compose v2 installed +- The `app_python/` directory with the updated app (JSON logging support) + +### Deployment + +```bash +cd monitoring + +# Start the full stack +docker compose up -d + +# Check service health +docker compose ps + +# Verify Loki readiness +curl http://localhost:3100/ready + +# Access Grafana +open http://localhost:3000 +# Login: admin / (see .env file) +``` + +### Configure Grafana Data Source +1. Go to **Connections** → **Data sources** → **Add data source** → **Loki** +2. URL: `http://loki:3100` +3. Click **Save & Test** → should show "Data source connected" + +### Generate Traffic +```bash +for i in {1..20}; do curl -s http://localhost:8000/; done +for i in {1..20}; do curl -s http://localhost:8000/health; done +``` + +--- + +## 3. 
Configuration + +### Loki (`loki/config.yml`) + +Key configuration choices: + +```yaml +schema_config: + configs: + - from: "2024-01-01" + store: tsdb # TSDB: 10x faster queries than boltdb-shipper + object_store: filesystem # Single-instance, local storage + schema: v13 # Latest schema for Loki 3.0+ + +limits_config: + retention_period: 168h # 7-day retention +``` + +**Why TSDB over BoltDB?** Loki 3.0 introduced TSDB as the recommended index store. It provides up to 10x faster queries, lower memory usage, and better compression compared to the deprecated `boltdb-shipper`. + +**Why `auth_enabled: false`?** Single-tenant deployment for development. Multi-tenancy would require an authentication proxy in production. + +### Promtail (`promtail/config.yml`) + +```yaml +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] +``` + +**Docker service discovery** uses the Docker socket to automatically find containers. The `filters` setting ensures only containers with `logging=promtail` label are scraped — this prevents collecting logs from unrelated containers. + +**Relabeling** extracts the container name (stripping the leading `/`) and the custom `app` label to use as Loki labels. This enables queries like `{app="devops-python"}`. + +--- + +## 4. Application Logging + +### JSON Formatter Implementation + +Added a `JSONFormatter` class in `app_python/app.py`: + +```python +class JSONFormatter(logging.Formatter): + def format(self, record: logging.LogRecord) -> str: + log_entry = { + "timestamp": datetime.now(timezone.utc).isoformat(...), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + if record.exc_info and record.exc_info[0] is not None: + log_entry["exception"] = self.formatException(record.exc_info) + return json.dumps(log_entry) +``` + +**Activation:** Controlled via `LOG_FORMAT` environment variable. 
When `LOG_FORMAT=json`, the JSON formatter is used; otherwise, the default text format is applied. This keeps backward compatibility. + +**Middleware logging** now includes method, path, status code, and client IP: + +```python +@app.middleware("http") +async def log_requests(request: Request, call_next): + response = await call_next(request) + logger.info("method=%s path=%s status=%d client=%s", + request.method, request.url.path, + response.status_code, client_ip) + return response +``` + +**Example JSON output:** +```json +{"timestamp": "2026-03-12T09:44:57.211Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/ status=200 client=172.21.0.1"} +``` + +--- + +## 5. Dashboard + +### Panel 1: Logs Table +- **Type:** Logs visualization +- **Query:** `{app=~"devops-.*"}` +- **Purpose:** Shows recent logs from all applications in real time + +### Panel 2: Request Rate +- **Type:** Time series graph +- **Query:** `sum by (app) (rate({app=~"devops-.*"} [1m]))` +- **Purpose:** Shows logs per second by application, useful for spotting traffic spikes + +### Panel 3: Error Logs +- **Type:** Logs visualization +- **Query:** `{app=~"devops-.*"} | json | level="ERROR"` +- **Purpose:** Filtered view showing only ERROR-level logs for quick incident detection + +### Panel 4: Log Level Distribution +- **Type:** Stat/Pie chart +- **Query:** `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` +- **Purpose:** Breakdown of log volumes by level (INFO, WARNING, ERROR) — helps assess health at a glance + +### Additional LogQL queries: + +```logql +# All Python app logs +{app="devops-python"} + +# Only errors +{app="devops-python"} |= "ERROR" + +# Parse JSON and filter by method +{app="devops-python"} | json | message=~".*method=GET.*" + +# Count logs per minute +count_over_time({app="devops-python"}[1m]) + +# Logs containing "health" +{app="devops-python"} |= "health" +``` + +--- + +## 6. 
Production Configuration + +### Resource Limits + +All services have `deploy.resources` constraints: + +| Service | CPU Limit | Memory Limit | CPU Reserved | Memory Reserved | +|---------|-----------|-------------|-------------|-----------------| +| Loki | 1.0 | 1G | 0.25 | 256M | +| Promtail | 0.5 | 512M | 0.1 | 128M | +| Grafana | 1.0 | 1G | 0.25 | 256M | +| app-python | 0.5 | 256M | 0.1 | 64M | + +### Security + +- **Anonymous access disabled:** `GF_AUTH_ANONYMOUS_ENABLED=false` +- **Admin credentials** stored in `.env` file (excluded via `.gitignore`) +- **Promtail Docker socket access** is read-only (`:ro`) to minimize attack surface + +### Health Checks + +- **Loki:** `wget --spider http://localhost:3100/ready` (10s interval, 15s start period) +- **Grafana:** `wget --spider http://localhost:3000/api/health` (10s interval, 15s start period) +- **Dependency ordering:** Promtail and Grafana wait for Loki to be healthy before starting + +### Log Retention + +- 7-day retention (`168h`) configured in Loki +- Compactor runs every 10 minutes to clean up expired logs +- Retention delete delay of 2 hours provides a safety buffer + +--- + +## 7. Testing + +### Verify all services are running: +```bash +docker compose ps +# Expected: all services healthy/running +``` + +### Verify Loki: +```bash +curl -s http://localhost:3100/ready +# Expected: "ready" +``` + +### Verify application logging: +```bash +# Generate traffic +for i in {1..20}; do curl -s http://localhost:8000/; done +curl -s http://localhost:8000/health + +# Check logs reach Loki (range query; selector is URL-encoded so curl does not glob the braces) +curl -G -s http://localhost:3100/loki/api/v1/query_range --data-urlencode 'query={app="devops-python"}' | python3 -m json.tool +``` + +### Verify Grafana: +```bash +curl -s http://localhost:3000/api/health +# Expected: {"commit":"...","database":"ok","version":"12.3.1"} +``` + +### LogQL test queries in Grafana Explore: +1. `{container=~".+"}` — all container logs +2. `{app="devops-python"}` — Python app logs only +3. 
`{app="devops-python"} | json | level="INFO"` — parsed JSON filtering +4. `rate({app="devops-python"}[1m])` — log-line rate (lines per second), a proxy for request rate + +--- + +## 8. Challenges + +1. **Loki 3.0 TSDB configuration:** The schema v13 + TSDB store requires specific `common.storage` configuration. Older Loki examples use `boltdb-shipper`, which is deprecated. The `common` section in Loki 3.0 simplifies the config considerably by reducing duplication between `storage_config` and `schema_config`. + +2. **Promtail Docker SD filtering:** Without the `filters` option in `docker_sd_configs`, Promtail would scrape all containers including itself and Loki, creating a log feedback loop. Filtering on the `logging=promtail` label makes collection opt-in, so only containers that are explicitly labelled are scraped. + +3. **JSON logging backward compatibility:** Using an environment variable (`LOG_FORMAT=json`) to toggle JSON output ensures the app works both in development (human-readable text) and production (machine-parseable JSON) without code changes. Existing tests pass unchanged. + +4. **Log retention + compactor:** Retention in Loki only works when the compactor is enabled with `retention_enabled: true`. Without the compactor section, Loki silently ignores `retention_period` in `limits_config`. 
diff --git a/monitoring/docs/evidence_app_logs.txt b/monitoring/docs/evidence_app_logs.txt new file mode 100644 index 0000000000..3e2e08746e --- /dev/null +++ b/monitoring/docs/evidence_app_logs.txt @@ -0,0 +1,15 @@ +INFO: 172.18.0.1:42738 - "GET / HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T09:59:17.069Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/ status=200 client=172.18.0.1"} +INFO: 172.18.0.1:42742 - "GET / HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T09:59:17.079Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/health status=200 client=172.18.0.1"} +INFO: 172.18.0.1:42754 - "GET /health HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T09:59:17.085Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/health status=200 client=172.18.0.1"} +INFO: 172.18.0.1:42758 - "GET /health HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T10:06:13.305Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/ status=200 client=172.18.0.1"} +INFO: 172.18.0.1:42430 - "GET / HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T10:06:13.469Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/favicon.ico status=404 client=172.18.0.1"} +INFO: 172.18.0.1:42430 - "GET /favicon.ico HTTP/1.1" 404 Not Found +{"timestamp": "2026-03-12T10:17:26.305Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/health status=200 client=172.18.0.1"} +INFO: 172.18.0.1:40410 - "GET /health HTTP/1.1" 200 OK +{"timestamp": "2026-03-12T10:17:26.358Z", "level": "INFO", "logger": "devops-info-service", "message": "method=GET path=/ status=200 client=172.18.0.1"} +INFO: 172.18.0.1:40412 - "GET / HTTP/1.1" 200 OK diff --git a/monitoring/docs/evidence_loki_app_values.txt b/monitoring/docs/evidence_loki_app_values.txt new file mode 100644 index 0000000000..738471edcc --- /dev/null +++ b/monitoring/docs/evidence_loki_app_values.txt @@ -0,0 +1 @@ 
+{"status":"success","data":["devops-python","grafana","loki","promtail"]} diff --git a/monitoring/docs/evidence_loki_labels.txt b/monitoring/docs/evidence_loki_labels.txt new file mode 100644 index 0000000000..a6190b8594 --- /dev/null +++ b/monitoring/docs/evidence_loki_labels.txt @@ -0,0 +1 @@ +{"status":"success","data":["app","container","project","service","service_name"]} diff --git a/monitoring/docs/evidence_loki_query.json b/monitoring/docs/evidence_loki_query.json new file mode 100644 index 0000000000..d33e1747ee --- /dev/null +++ b/monitoring/docs/evidence_loki_query.json @@ -0,0 +1 @@ +{"status":"success","data":{"resultType":"streams","result":[{"stream":{"app":"devops-python","container":"monitoring-app-python-1","level":"info","project":"monitoring","service":"app-python","service_name":"app-python"},"values":[["1773310646358405222","INFO: 172.18.0.1:40412 - \"GET / HTTP/1.1\" 200 OK"],["1773310646358294205","{\"timestamp\": \"2026-03-12T10:17:26.358Z\", \"level\": \"INFO\", \"logger\": \"devops-info-service\", \"message\": \"method=GET path=/ status=200 client=172.18.0.1\"}"],["1773310646305702600","INFO: 172.18.0.1:40410 - \"GET /health HTTP/1.1\" 200 OK"],["1773310646305588017","{\"timestamp\": \"2026-03-12T10:17:26.305Z\", \"level\": \"INFO\", \"logger\": \"devops-info-service\", \"message\": \"method=GET path=/health status=200 client=172.18.0.1\"}"],["1773309973469177682","INFO: 172.18.0.1:42430 - \"GET /favicon.ico HTTP/1.1\" 404 Not Found"],["1773309973469155123","{\"timestamp\": \"2026-03-12T10:06:13.469Z\", \"level\": \"INFO\", \"logger\": \"devops-info-service\", \"message\": \"method=GET path=/favicon.ico status=404 client=172.18.0.1\"}"],["1773309973305480625","INFO: 172.18.0.1:42430 - \"GET / HTTP/1.1\" 200 OK"],["1773309973305439456","{\"timestamp\": \"2026-03-12T10:06:13.305Z\", \"level\": \"INFO\", \"logger\": \"devops-info-service\", \"message\": \"method=GET path=/ status=200 
client=172.18.0.1\"}"]]}],"stats":{"summary":{"bytesProcessedPerSecond":181808,"linesProcessedPerSecond":1579,"totalBytesProcessed":921,"totalLinesProcessed":8,"execTime":0.005066,"queueTime":0.000394,"subqueries":0,"totalEntriesReturned":8,"splits":1,"shards":0,"totalPostFilterLines":8,"totalStructuredMetadataBytesProcessed":64},"querier":{"store":{"totalChunksRef":0,"totalChunksDownloaded":0,"chunksDownloadTime":0,"queryReferencedStructuredMetadata":false,"chunk":{"headChunkBytes":0,"headChunkLines":0,"decompressedBytes":0,"decompressedLines":0,"compressedBytes":0,"totalDuplicates":0,"postFilterLines":0,"headChunkStructuredMetadataBytes":0,"decompressedStructuredMetadataBytes":0},"chunkRefsFetchTime":0,"congestionControlLatency":0,"pipelineWrapperFilteredLines":0}},"ingester":{"totalReached":1,"totalChunksMatched":1,"totalBatches":1,"totalLinesSent":8,"store":{"totalChunksRef":0,"totalChunksDownloaded":0,"chunksDownloadTime":0,"queryReferencedStructuredMetadata":false,"chunk":{"headChunkBytes":921,"headChunkLines":8,"decompressedBytes":0,"decompressedLines":0,"compressedBytes":0,"totalDuplicates":0,"postFilterLines":8,"headChunkStructuredMetadataBytes":64,"decompressedStructuredMetadataBytes":0},"chunkRefsFetchTime":126460,"congestionControlLatency":0,"pipelineWrapperFilteredLines":0}},"cache":{"chunk":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"index":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"result":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"statsResult":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"volumeResult":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"by
tesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"seriesResult":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"labelResult":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0},"instantMetricResult":{"entriesFound":0,"entriesRequested":0,"entriesStored":0,"bytesReceived":0,"bytesSent":0,"requests":0,"downloadTime":0,"queryLengthServed":0}},"index":{"totalChunks":0,"postFilterChunks":0}}}} diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..b11177372c --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,49 @@ +# Loki 3.0 Configuration +# Single-instance deployment with TSDB + filesystem storage + +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h # 7 days + reject_old_samples: true + reject_old_samples_max_age: 168h + max_query_series: 500 + ingestion_burst_size_mb: 16 + ingestion_rate_mb: 8 + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + delete_request_store: filesystem + +analytics: + reporting_enabled: false diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml new file mode 100644 index 0000000000..c768135115 --- /dev/null +++ b/monitoring/promtail/config.yml @@ -0,0 +1,39 @@ +# Promtail 3.0 Configuration +# Collects Docker container logs 
via Docker service discovery + +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["logging=promtail"] + + relabel_configs: + # Container name becomes the `container` label; the regex capture drops the leading slash + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + + # Docker Compose service name becomes the `service` label + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: "service" + + # Custom `app` container label becomes the `app` label used by the `{app=...}` queries + - source_labels: ["__meta_docker_container_label_app"] + target_label: "app" + + # Docker Compose project name becomes the `project` label (no rule in this list sets a `job` label) + - source_labels: ["__meta_docker_container_label_com_docker_compose_project"] + target_label: "project"