In [None]:
import datetime
import json
from pathlib import Path
import os
import urllib.parse

from dotenv import load_dotenv
import requests

In [None]:
# Load the variables defined in ../../aws.env into the process environment;
# the following cells read their settings and credentials from os.environ.
load_dotenv(dotenv_path="../../aws.env")

In [None]:
# Base URL of the deployed API: "<DBAPI_URL>/<DBAPI_STAGE>".
# Joined by hand instead of urllib.parse.urljoin(), which replaces the last
# path segment of the base when it has a path but no trailing slash.
# Surrounding slashes are stripped so that later f"{api_url}/..." requests
# never contain a double slash.
_dbapi_base = os.environ["DBAPI_URL"].rstrip("/")
_dbapi_stage = os.environ["DBAPI_STAGE"].strip("/")
api_url = f"{_dbapi_base}/{_dbapi_stage}" if _dbapi_stage else _dbapi_base

In [None]:
# Local staging directory for the exported documents and their metadata
# files; created (including parents) if it does not exist yet.
output_dir = Path("20210418_load_s3_out", "documents")
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Request a bearer token from the API's /token endpoint.
# The credentials come from the environment and are sent as form data.
login_data_dict = {
    "username": os.environ["FIRST_USER"],
    "password": os.environ["FIRST_USER_PASSWORD"],
}
# A timeout keeps the cell from hanging forever if the API is unreachable.
r = requests.post(f"{api_url}/token", data=login_data_dict, timeout=30)
# Fail here with the HTTP status of a rejected login instead of a confusing
# KeyError on "access_token" below.
r.raise_for_status()
a_token = r.json()["access_token"]
# Authorization header reused by every subsequent API request.
token_headers = {"Authorization": f"Bearer {a_token}"}

In [None]:
# request episodes
def _to_utc_timestamp(iso_text):
    """Convert an ISO 8601 string into a UTC timestamp ending in "Z".

    Naive values are treated as already being in UTC (assumption about the
    API's dates -- TODO confirm). Timezone-aware values are converted to UTC
    first, so the result never carries both an offset and a "Z" suffix.
    """
    # datetime.fromisoformat() rejects a trailing "Z" before Python 3.11.
    if iso_text.endswith("Z"):
        iso_text = iso_text[:-1] + "+00:00"
    parsed = datetime.datetime.fromisoformat(iso_text)
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed.isoformat() + "Z"


# Each ID prefix is paired with the range of episode numbers to export.
id_prefixes = ["PythonBytes", "TalkPythonToMe"]
no_ranges = [range(217, 228), range(295, 306)]
for id_prefix, no_range in zip(id_prefixes, no_ranges):
    for episode_no in no_range:
        episode_id = f"{id_prefix}:{episode_no}"
        episode_url = f"{api_url}/documents/{episode_id}"
        r = requests.get(episode_url, headers=token_headers, timeout=30)
        if r.status_code != 200:
            # Best effort: keep going, but report what was skipped so that
            # missing documents do not go unnoticed.
            print(f"Skipping {episode_id}: HTTP {r.status_code}")
            continue
        j = r.json()

        # Document body: "<prefix>_<number>.txt" (":" replaced in the name).
        file_name = episode_id.replace(":", "_") + ".txt"
        out_file = output_dir / file_name
        # Explicit UTF-8 so the output does not depend on the platform locale.
        with open(out_file, "wt", encoding="utf-8") as fout:
            fout.write(j["parsed_text"])

        # https://docs.aws.amazon.com/kendra/latest/dg/custom-attributes.html
        # https://docs.aws.amazon.com/kendra/latest/dg/s3-metadata.html
        metadata_dict = {
            "Attributes": {
                "_authors": j["authors"],
                # NOTE(review): the Kendra docs linked above appear to define
                # `_category` as a single string rather than a list -- confirm.
                "_category": [j["document_type"]],
                "_created_at": _to_utc_timestamp(j["publication_date"]),
                "_last_updated_at": _to_utc_timestamp(j["update_date"]),
                "_document_id": j["id"],
                "_document_title": j["title"],
                "_source_uri": j["urls"][0],
                "keywords": j["keywords"]
            },
            "DocumentId": j["id"],
            "Title": j["title"],
            "ContentType": "PLAIN_TEXT"
        }
        # The metadata file sits next to the document and is named
        # "<document file name>.metadata.json".
        metadata_file = output_dir / (file_name + ".metadata.json")
        with open(metadata_file, "wt", encoding="utf-8") as fout:
            json.dump(metadata_dict, fout)

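After running the cells above, upload the generated documents and their metadata files to the Kendra source S3 bucket: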

```bash
aws s3 sync --sse AES256 20210418_load_s3_out/documents s3://${KENDRA_SOURCE_S3_BUCKET}/documents
```