<a href="https://colab.research.google.com/github/JFarley2024/Legacy_Transkribus_API/blob/main/JFarley2024_ROAD_TranskribusAPI3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *ROAD Transkribus API*


In [12]:
import json
import os
import re
import xml.etree.ElementTree as ET  # https://docs.python.org/3/library/xml.etree.elementtree.html
from getpass import getpass
from io import BytesIO
from time import sleep  # Add a delay where needed
from urllib.parse import urlparse   # https://docs.python.org/3/library/urllib.parse.html#module-urllib.parse

import requests
from PIL import Image

__The global variables used throughout the notebook during a session.__

In [13]:
# Credentials are never stored in the notebook: they are read from the
# TRANSKRIBUS_USER / TRANSKRIBUS_PWD environment variables, falling back to an
# interactive prompt (the password prompt does not echo).
# Any password that was previously committed in this cell should be treated as
# compromised and rotated.
USER = os.environ.get("TRANSKRIBUS_USER") or input("Transkribus user (e-mail): ")
PWD = os.environ.get("TRANSKRIBUS_PWD") or getpass("Transkribus password: ")
API_URL = "https://transkribus.eu/TrpServer/rest/"
SESSION_CACHE = {}       # Store as a dictionary
COLLECTIONS_CACHE = []   # Store as a json
s = requests.Session()


> Connecting and uploading to Transkribus can require a lot of repetitive code. To counteract this problem, you can create helper functions that handle most of the tasks and call them as needed.

In [14]:
def correct_url(url, add_w = False, session_cache = None, use_sess_id = True):
    """Normalise a URL and return it as a `urllib.parse.ParseResult`.

    Clean-ups applied, in order:
      * a missing scheme defaults to `https`;
      * when no scheme was given, `urlparse` leaves the host inside `path`, so
        the text before the first `/` is moved into `netloc`;
      * runs of `/` in the path are collapsed to a single `/`;
      * `www.` is prepended when `add_w` is truthy;
      * a mistyped all-`w` prefix (`ww.`, `wwww.`, `w.ww.`) becomes `www.`;
      * optionally the session id is appended to the query as `JSESSIONID`.

    Parameters
    ----------
    url : str
        The URL to normalise, with or without a scheme.
    add_w : bool
        Prepend `www.` to the host.
    session_cache : dict or None
        Mapping that may hold a `sessionId` entry. `None` (the default) means
        "look up the module-level `SESSION_CACHE` at call time", so a cache
        assigned after this function was defined is still picked up.
    use_sess_id : bool
        Append `JSESSIONID=<sessionId>` to the query when a session id exists.

    Returns
    -------
    urllib.parse.ParseResult
        Call `.geturl()` on the result to get the URL as a string.
    """

    ### Parse the url
    up = urlparse(url)

    ### Reformat the url
    up = up._replace(scheme = up.scheme or 'https')
    netloc, _, path = (up.netloc or up.path).partition('/')
    up = up._replace(path = re.sub(r'/+', '/', up.path if up.netloc else path))
    # Add `www` to the website if needed
    if add_w:
        netloc = 'www.' + netloc
    # The dots are escaped on purpose: an unescaped `.` matches any character,
    # which rewrote hosts such as `web.archive.org` into `www.b.archive.org`.
    up = up._replace(netloc = re.sub(r'^w*\.?w*w\.', 'www.', netloc))

    # Logic to insert `JSESSIONID` in the url if needed
    # Defunct after adopting SessionID into the headers, but leaving to show how to insert directly into url
    if not use_sess_id:
        return up

    if session_cache is None:
        # Resolved at call time; a default bound in the signature would stay
        # frozen to whatever object `SESSION_CACHE` named at definition time.
        session_cache = globals().get('SESSION_CACHE')

    session_id = session_cache.get('sessionId') if session_cache else None
    if session_id:
        token = f"JSESSIONID={session_id}"
        up = up._replace(query = f"{up.query}&{token}" if up.query else token)

    return up


def act_on_url(url,
               ###
               method = 'GET',
               req_data = None,
               timeout = 20,
               retry_delay = 10,
               retry_num = 3,
               ###
               add_w = False,
               session_cache = None,
               use_sess_id = False,
              **kwargs):
    """Send an HTTP request to `url` and return the response, or `None` on failure.

    The URL is first normalised with `correct_url`. Only time-outs are retried;
    an HTTP error status or any other exception is printed and ends the call.

    Parameters
    ----------
    url : str
        Target URL; passed through `correct_url` before the request is sent.
    method : str
        HTTP method handed to `requests.request`.
    req_data
        Request body, passed as `data=`.
    timeout : int or float
        Seconds to wait for the server on each attempt.
    retry_delay : int or float
        Seconds to sleep after a time-out before the next attempt.
    retry_num : int
        Maximum number of attempts.
    add_w, session_cache, use_sess_id
        Forwarded to `correct_url`. `session_cache=None` (the default) means
        "use the module-level `SESSION_CACHE` as it is at call time".
    **kwargs
        Any further keyword arguments for `requests.request`, e.g. `headers`.

    Returns
    -------
    requests.Response or None
        The response when the request succeeded, otherwise `None`. Callers that
        chain `.json()` directly will hit an `AttributeError` on failure.
    """

    if session_cache is None:
        # Looked up per call so that a cache assigned after this function was
        # defined (e.g. `SESSION_CACHE = authenticate()`) is the one used.
        session_cache = globals().get('SESSION_CACHE')

    # Notice the use of `.geturl()`
    # `add_w` is forwarded here; previously it was accepted but silently ignored.
    parsed_url = correct_url(url,
                             add_w = add_w,
                             session_cache = session_cache,
                             use_sess_id = use_sess_id).geturl()

    # Set the counter to 0
    retry_cnt = 0

    while retry_cnt < retry_num:
        try:
            response = requests.request(method = method, url = parsed_url, data = req_data, timeout = timeout, **kwargs)
            response.raise_for_status()  # Raise an exception if the request failed, otherwise returns `None`
            return response
        except requests.exceptions.HTTPError as http_err:
            print(f'HTTP error occurred for the url {parsed_url}:\n\t {http_err}')
            return None
        except requests.exceptions.Timeout:
            print('The request timed out, retrying...')
            sleep(retry_delay)  # Delay between retries
            retry_cnt += 1  # Tracks number of retries
        except Exception as err:
            # Deliberately broad: the helper reports the problem and returns
            # `None` instead of interrupting the notebook.
            print(f'Other error occurred for the url {parsed_url}:\n\t {err}')
            return None

    print(f'The request timed out after {retry_num} attempts for the url {parsed_url}')
    return None


def build_headers(sessionid = None):
    """Build the request headers that carry the session id as a cookie.

    Parameters
    ----------
    sessionid : str or None
        The session id returned by the login call.

    Returns
    -------
    dict or None
        `{'Cookie': 'JSESSIONID=<sessionid>', 'Content-Type': ...}`, or `None`
        when `sessionid` is empty or missing.
    """

    if not sessionid:
        return None

    # The cookie is built from the argument itself. It used to read the global
    # `SESSION_CACHE`, which ignored the value passed in by the caller.
    return {
        'Cookie': f'JSESSIONID={sessionid}',
        # Unsure why this is done, but taking it from Dr. Mulligan's script
        'Content-Type': 'application/x-www-form-urlencoded',
    }


def authenticate(url = f'{API_URL}auth/login',
                 ###
                 method = 'POST',
                 req_data = {'user': USER, 'pw': PWD},
                 print_results = False,
                **kwargs):
    """Log in and return the session details as a dictionary.

    Parameters
    ----------
    url : str
        Login endpoint.
    method : str
        HTTP method used for the login request.
    req_data : dict
        Form fields sent with the request (`user` and `pw`). The default is
        built once, when the function is defined, and is never modified here.
    print_results : bool
        Print every `tag: text` pair of the response. Note that this includes
        the session id, so avoid sharing the printed output.
    **kwargs
        Forwarded to `act_on_url`.

    Returns
    -------
    dict or None
        One entry per child element of the XML response (tag -> text), or
        `None` when the request failed, the response was not valid XML, or it
        contained no `sessionId`.
    """

    # Get the response from the url
    response = act_on_url(url, method = method, req_data = req_data, **kwargs)

    if response is None:
        return None  # Handle error case when `act_on_url` returns None

    ### Returns response from the page as text and parses it, assuming its an XML file
    # You can confirm what page is encoded as (e.g. text, json, xml, etc.) using
    # `response.headers['Content-Type']`, where response is the response object
    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as parse_err:
        print(f'failed to parse the authentication response as XML:\n\t {parse_err}')
        return None

    session_cache = {}
    for child in root:
        session_cache[child.tag] = child.text
        if print_results:
            print(f"{child.tag}: {child.text}")

    # `.get` so that a response without a `sessionId` element reaches the
    # failure message below instead of raising `KeyError`.
    if session_cache.get('sessionId'):
        return session_cache

    print('failed to get session details from authentication')
    return None

def logout(self = None):
    """Log out of Transkribus and invalidate the stored session id.

    Parameters
    ----------
    self : object or None
        Optional client object exposing a `session_id` attribute and a
        `_url(path)` method, as in the original method form of this function.
        When omitted, the module-level `SESSION_CACHE` and `API_URL` are used,
        so the function can be called directly as `logout()`.

    Returns
    -------
    bool
        `True` when the server accepted the logout, `False` otherwise
        (including when there is no session id to log out).
    """

    if self is None:
        cache = globals().get('SESSION_CACHE') or {}
        session_id = cache.get('sessionId')
        logout_url = f'{API_URL}auth/logout'
    else:
        cache = None
        session_id = self.session_id
        logout_url = self._url("auth/logout")

    if not session_id:
        print("TRANSKRIBUS: Logout skipped. No active session id.")
        return False

    # A dict literal keeps working even if the name `dict` has been rebound
    # in the notebook namespace.
    cookies = {'JSESSIONID': session_id}
    response = requests.post(logout_url, cookies=cookies)

    # A `requests` response is truthy only for a successful status code.
    if response:
        if self is None:
            cache['sessionId'] = False
        else:
            self.session_id = False
        print("TRANSKRIBUS: Logged out successfully.")
        return True

    print("TRANSKRIBUS: Logout failed. HTTP status:", response.status_code, response.content)
    return False

> This code authenticates the USER and retrieves the session ID; the latter is used in later tasks.

In [15]:
SESSION_CACHE = authenticate(print_results = True)

userId: 223226
userName: jerms_farley@hotmail.com
email: jerms_farley@hotmail.com
affiliation: None
firstname: Jeremy
lastname: Farley
gender: unknown
userRoleList: User
isActive: 1
isAdmin: false
created: 2024-03-01T19:31:18.684+01:00
loginTime: 2024-03-15T15:18:09.092+01:00
sessionId: CFBAF1728EA690AA5C7BC65EF8E630A8
userAgent: python-requests/2.31.0
ip: 104.196.237.143


In [16]:
SESSION_CACHE['sessionId']

'CFBAF1728EA690AA5C7BC65EF8E630A8'

In [17]:
headers = build_headers(sessionid = SESSION_CACHE['sessionId'])

In [18]:
# Session ID stored in a header as a cookie, for further use
headers

{'Cookie': 'JSESSIONID=CFBAF1728EA690AA5C7BC65EF8E630A8',
 'Content-Type': 'application/x-www-form-urlencoded'}

In [19]:
collection_list_json = act_on_url(url = f'{API_URL}collections/list',
                                  headers = headers).json()
# Notice the following:
# `act_on_url` has no explicit `headers` parameter, but we can still pass `headers`
# because its `**kwargs` are handed on to `requests`.
# `act_on_url` returns a response object, so we can call `.json()` on it directly.
# If the request fails, `act_on_url` returns `None` and `.json()` raises an `AttributeError`.

In [20]:
collection_list_json

[{'type': 'trpCollection',
  'colId': 284491,
  'colName': 'jerms_farley@hotmail.com Collection',
  'description': 'jerms_farley@hotmail.com',
  'created': '2024-03-01T19:31:18.764+01:00',
  'crowdsourcing': False,
  'elearning': False,
  'pageId': 65926097,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=DAEQZNPMDNXLWKBWOHOHPWDD',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=DAEQZNPMDNXLWKBWOHOHPWDD',
  'nrOfDocuments': 9,
  'role': 'Owner',
  'accountingStatus': 1},
 {'type': 'trpCollection',
  'colId': 289136,
  'colName': 'API_Test',
  'description': 'created by jerms_farley@hotmail.com',
  'created': '2024-03-14T01:21:00.018+01:00',
  'crowdsourcing': False,
  'elearning': False,
  'pageId': 66635173,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=OGOZOOOPGXGMLXNHZKJSOEPB',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=OGOZOOOPGXGMLXNHZKJSOEPB',
  'nrOfDocuments': 3,
  'role': 'Owner',
  'accountingStatus': 1}]

In [21]:
# If we had different collections,
# this is one way which allows us to get a quick overview
# (the loop variable is not called `dict`, so the builtin `dict` stays usable)
for collection in collection_list_json:
    print('Id num.', ': ', collection['colId'],
          '\t|\t',
          'Description', ': ', collection['description'], # Depending on the length, it may be good to remove
          '\t|\t',
          'num. of Doc.', ': ', collection['nrOfDocuments'],
          sep = '')

Id num.: 284491	|	Description: jerms_farley@hotmail.com	|	num. of Doc.: 9
Id num.: 289136	|	Description: created by jerms_farley@hotmail.com	|	num. of Doc.: 3


> Here we select which collection we are going to work with; in this case it is the collection we created earlier.

In [22]:
# Here we select the collection id we're interested in
col_id = collection_list_json[1]['colId']

In [23]:
# Notice how we're inserting the collection id into the url.
# This insertion can be done programmatically if needed.
# I'm using f''' ''', but you can use just f' ' as long as the same `'` isn't in any of the variables in the string.
document_list_json = act_on_url(url = f'''{API_URL}collections/{col_id}/list''',
                                  headers = headers).json()

In [24]:
document_list_json

[{'type': 'trpDocMetadata',
  'docId': 1878024,
  'title': 'API_test2',
  'author': 'Jim Farley',
  'uploadTimestamp': 1710376774943,
  'genre': 'test',
  'writer': 'Some Guy',
  'uploader': 'jerms_farley@hotmail.com',
  'uploaderId': 223226,
  'nrOfPages': 1,
  'pageId': 66635173,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=OGOZOOOPGXGMLXNHZKJSOEPB',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=OGOZOOOPGXGMLXNHZKJSOEPB',
  'status': 0,
  'fimgStoreColl': 'TrpDoc_DEA_1878024',
  'origDocId': 0,
  'collectionList': {'colList': [{'colId': 289136,
     'colName': 'API_Test',
     'description': 'created by jerms_farley@hotmail.com',
     'crowdsourcing': False,
     'elearning': False,
     'nrOfDocuments': 0}]},
  'attributes': [],
  'mainColId': 289136,
  'isInMain': True},
 {'type': 'trpDocMetadata',
  'docId': 1878043,
  'title': 'API_test3',
  'author': 'Jim Farley',
  'uploadTimestamp': 1710380364431,
  'genre': 'test',
  'writer': 'Some Guy',
  

> As with the collection list above, the documents in the collection are listed.

In [25]:

# One line per document; the loop variable is not called `dict`, so the
# builtin `dict` stays usable in later cells.
for document in document_list_json:
    print('Id num.', ': ', document['docId'],
          '\t|\t',
          'Title', ': ', document['title'],
          '\t|\t',
          'num. of Pages', ': ', document['nrOfPages'],
          sep = '')

Id num.: 1878024	|	Title: API_test2	|	num. of Pages: 1
Id num.: 1878043	|	Title: API_test3	|	num. of Pages: 1


In [26]:
# Here we select the document id we're interested in
doc_id = document_list_json[1]['docId']

In [27]:
# Similar to before, keep note of the changes in the url
full_document_json = act_on_url(url = f'''{API_URL}collections/{col_id}/{doc_id}/fulldoc''',
                                headers = headers).json()

In [28]:
full_document_json

{'md': {'nrOfRegions': 0,
  'nrOfTranscribedRegions': 0,
  'nrOfWordsInRegions': 0,
  'nrOfLines': 0,
  'nrOfTranscribedLines': 0,
  'nrOfWordsInLines': 0,
  'nrOfWords': 0,
  'nrOfTranscribedWords': 0,
  'nrOfCharsInLines': 0,
  'nrOfNew': 1,
  'nrOfInProgress': 0,
  'nrOfDone': 0,
  'nrOfFinal': 0,
  'nrOfGT': 0,
  'docId': 1878043,
  'title': 'API_test3',
  'author': 'Jim Farley',
  'uploadTimestamp': 1710380364431,
  'genre': 'test',
  'writer': 'Some Guy',
  'uploader': 'jerms_farley@hotmail.com',
  'uploaderId': 223226,
  'nrOfPages': 1,
  'pageId': 66635281,
  'url': 'https://files.transkribus.eu/Get?fileType=view&id=PBOWFQBPVNMBONGNCOZHKKAA',
  'thumbUrl': 'https://files.transkribus.eu/Get?fileType=thumb&id=PBOWFQBPVNMBONGNCOZHKKAA',
  'status': 0,
  'fimgStoreColl': 'TrpDoc_DEA_1878043',
  'origDocId': 0,
  'collectionList': {'colList': [{'colId': 289136,
     'colName': 'API_Test',
     'description': 'created by jerms_farley@hotmail.com',
     'crowdsourcing': False,
     'e

> Here we print some of the values from our document's JSON.

In [29]:
for val in ('nrOfRegions', 'nrOfTranscribedLines', 'nrOfTranscribedWords'):
    print(val, ': ', full_document_json['md'][val], sep = '')

nrOfRegions: 0
nrOfTranscribedLines: 0
nrOfTranscribedWords: 0


> Next we pull the jobs list and list their json entries

In [30]:
job_list_json = act_on_url(url = f'{API_URL}jobs/list',
                                  headers = headers).json()

In [31]:
job_list_json;

In [32]:
# One line per job; the loop variable is not called `dict`, so the builtin
# `dict` stays usable in later cells.
for job in job_list_json:
    print('Job Id num.', ': ', job['jobId'],
          '\t|\t',
          'Doc. Id num.', ': ', job['docId'],
          '\t|\t',
          'Type', ': ', job['type'],
          '\t|\t',
          'State', ': ', job['state'],
          '\t|\t',
          'Succeeded', ': ', job['success'],
          sep = '')

Job Id num.: 8353824	|	Doc. Id num.: 1878053	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353804	|	Doc. Id num.: 1878053	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353781	|	Doc. Id num.: 1878043	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353761	|	Doc. Id num.: 1878036	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353750	|	Doc. Id num.: 1878036	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353746	|	Doc. Id num.: 1878035	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353696	|	Doc. Id num.: 1878024	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353685	|	Doc. Id num.: 1878028	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353680	|	Doc. Id num.: 1878024	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353676	|	Doc. Id num.: 1878022	|	Type: Cr

In [33]:
job_id = job_list_json[0]['jobId']

In [34]:
full_job_json = act_on_url(url = f'''{API_URL}jobs/{job_id}''',
                                  headers = headers).json()

In [35]:
full_job_json

{'jobId': '8353824',
 'docId': 1878053,
 'pageNr': -1,
 'pages': '1',
 'type': 'PyLaia Decoding',
 'state': 'FINISHED',
 'success': True,
 'description': 'Done, duration: 2m 13s 406ms',
 'userName': 'jerms_farley@hotmail.com',
 'userId': 223226,
 'createTime': 1710380860985,
 'startTime': 1710380862410,
 'endTime': 1710380995816,
 'jobData': '#Thu Mar 14 02:47:42 CET 2024\ndoNotDeleteWorkDir=false\nwriteKwsIndex=false\ndoLinePolygonSimplification=true\nkeepOriginalLinePolygons=false\nmodelId=53042\nwriteLineConfScore=false\nwriteWordConfScores=false\nb2pBackend=Legacy\nuserRoles=User\nclearLines=false\nisNextGen=false\nuserEmail=jerms_farley@hotmail.com\nGPU_DEVICE=0\ndoWordSeg=true\nworkDir=/tmp/HTR/PyLaia/trpProd/Decode/pylaiaDecode_8353824\nbatchSize=10\nlanguageModel=trainDataLanguageModel\nuseExistingLinePolygons=false\nnBest=1\n',
 'resumable': False,
 'jobImpl': 'PyLaiaDecodingJob',
 'moduleUrl': 'http://srv6146:8081/PyLaiaModule-trpProd-2.12.1',
 'moduleName': 'PyLaiaModule',
 

In [36]:
# listing some of the values in the jobs json entries
for val in ('type', 'docTitle', 'description', 'state'):
    print(val, ': ', full_job_json[val], sep = '')

type: PyLaia Decoding
docTitle: Deleted Document
description: Done, duration: 2m 13s 406ms
state: FINISHED


> Here we download the image file, assign it to a variable, create a JSON for it, and POST it to the Transkribus website.

In [37]:
# The image we intend to upload is downloaded and assigned to a variable we then use in the rest of this API
!wget "https://cdm16003.contentdm.oclc.org/digital/iiif/p16003coll20/7/full/full/0/default.jpg" -O transkribusAPI4.jpg


--2024-03-15 14:20:45--  https://cdm16003.contentdm.oclc.org/digital/iiif/p16003coll20/7/full/full/0/default.jpg
Resolving cdm16003.contentdm.oclc.org (cdm16003.contentdm.oclc.org)... 132.174.3.1
Connecting to cdm16003.contentdm.oclc.org (cdm16003.contentdm.oclc.org)|132.174.3.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [image/jpeg]
Saving to: ‘transkribusAPI4.jpg’

transkribusAPI4.jpg     [  <=>               ]   6.31M   962KB/s    in 7.4s    

2024-03-15 14:21:18 (871 KB/s) - ‘transkribusAPI4.jpg’ saved [6618050]



In [52]:
image_upload = {
    "md": {
        "title": "API_test4",
        "author": "Jim Farley",
        "genre": "test",
        "writer": "Some Guy still testing"
    },
    "pageList": {
        "pages": [
            {
                "fileName": "transkribusAPI4.jpg",
                "pageNr": 1
            }
            # Add more details if needed
        ]
    }
}

In [53]:
image_upload

{'md': {'title': 'API_test4',
  'author': 'Jim Farley',
  'genre': 'test',
  'writer': 'Some Guy still testing'},
 'pageList': {'pages': [{'fileName': 'transkribusAPI4.jpg', 'pageNr': 1}]}}

In [54]:
# Same headers as before, but announcing a JSON body for the upload request.
# Built in one expression: `headers` itself is left untouched and the cell can
# be re-run safely.
header_up = {**headers, 'Content-Type': 'application/json'}

In [55]:
header_up

{'Cookie': 'JSESSIONID=CFBAF1728EA690AA5C7BC65EF8E630A8',
 'Content-Type': 'application/json'}

In [56]:
# Register the upload (metadata and page list) with the chosen collection.
# A timeout keeps the cell from hanging on an unresponsive server, and
# `raise_for_status()` surfaces an HTTP error here instead of as an XML parse
# error in a later cell.
temp = requests.request(url = f'''{API_URL}uploads?collId={col_id}''',
                        method = 'POST', json = image_upload, headers = header_up,
                        timeout = 20)
temp.raise_for_status()

In [57]:
def print_xml_tree(element, depth = 0):
    """Print the descendants of `element` as a tab-indented outline.

    An element without children is printed as `tag: text`; an element with
    children is printed as its tag alone, followed by its children one level
    deeper. Nesting of any depth is followed.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element
        The element whose children are printed.
    depth : int
        Current indentation level (number of leading tabs).
    """
    indent = '\t' * depth
    for child in element:
        # `len(child)` counts sub-elements; truth-testing an Element directly
        # is deprecated.
        if len(child) == 0:
            print(f"{indent}{child.tag}: {child.text}")
        else:
            print(f"{indent}{child.tag}")
            print_xml_tree(child, depth + 1)


print_xml_tree(ET.fromstring(temp.text))

md
	docId: -1
	title: API_test4
	author: Jim Farley
	uploadTimestamp: 0
	genre: test
	writer: Some Guy still testing
	uploaderId: 0
	nrOfPages: 0
	collectionList: None
pageList
	pages
		fileName: transkribusAPI4.jpg
		pageUploaded: false
		pageNr: 1
uploadId: 1880847
created: 2024-03-15T15:22:48.761+01:00
userId: 223226
userName: jerms_farley@hotmail.com
nrOfPagesTotal: 1
uploadType: JSON
colId: 289136


> After we POST the image doc to Transkribus, we parse the upload id, then place that data into the collection with a PUT method.

In [58]:
# Look the element up by tag instead of by position, so the cell keeps working
# if the server adds, removes or reorders sibling elements.
upload_id = ET.fromstring(temp.text).findtext('uploadId')

In [59]:
upload_id

'1880847'

In [60]:
# Place upload in variable 'b'
b = open('transkribusAPI4.jpg', 'rb')

In [61]:
files = {'img': b, 'Content-Type': 'application/octet-stream' }

In [62]:
# Remove the JSON content type before the file upload; with `files=`,
# `requests` is expected to set the multipart content type itself.
# The `None` default makes the cell safe to re-run.
header_up.pop('Content-Type', None)

'application/json'

In [63]:
# Send the image for the registered upload. The file handle `b` is closed once
# the request has been sent, whether or not it succeeded.
try:
    temp_put = requests.request(url = f'''{API_URL}uploads/{upload_id}''',
                                method = 'PUT', files = files, headers = header_up,
                                timeout = 120)
finally:
    b.close()
temp_put.raise_for_status()

In [64]:
temp_put.content

b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><trpUpload><md><docId>-1</docId><title>API_test4</title><author>Jim Farley</author><uploadTimestamp>0</uploadTimestamp><genre>test</genre><writer>Some Guy still testing</writer><uploaderId>0</uploaderId><nrOfPages>0</nrOfPages><collectionList/></md><pageList><pages><fileName>transkribusAPI4.jpg</fileName><pageUploaded>true</pageUploaded><pageNr>1</pageNr></pages></pageList><uploadId>1880847</uploadId><created>2024-03-15T15:22:48.761+01:00</created><finished>2024-03-15T15:23:06.409+01:00</finished><userId>223226</userId><userName>jerms_farley@hotmail.com</userName><nrOfPagesTotal>1</nrOfPagesTotal><uploadType>JSON</uploadType><jobId>8381588</jobId><colId>289136</colId></trpUpload>'

In [65]:
# Print the response as an indented outline, up to three levels deep.
# `len(node) == 0` means "no sub-elements"; truth-testing an Element directly
# is deprecated.
for child in ET.fromstring(temp_put.text):
    if len(child) == 0:
        print(f"{child.tag}: {child.text}")
        continue
    print(child.tag)
    for x in child:
        if len(x) == 0:
            print(f"\t{x.tag}: {x.text}")
            continue
        print(f'\t{x.tag}')
        for y in x:
            if len(y) == 0:
                print(f"\t\t{y.tag}: {y.text}")
            else:
                print(f'\t\t{y.tag}')

md
	docId: -1
	title: API_test4
	author: Jim Farley
	uploadTimestamp: 0
	genre: test
	writer: Some Guy still testing
	uploaderId: 0
	nrOfPages: 0
	collectionList: None
pageList
	pages
		fileName: transkribusAPI4.jpg
		pageUploaded: true
		pageNr: 1
uploadId: 1880847
created: 2024-03-15T15:22:48.761+01:00
finished: 2024-03-15T15:23:06.409+01:00
userId: 223226
userName: jerms_farley@hotmail.com
nrOfPagesTotal: 1
uploadType: JSON
jobId: 8381588
colId: 289136


> Next we parse out the job id for the uploaded image and check the status on the upload



In [66]:
# Look the element up by tag instead of by position, so the cell keeps working
# if the server adds, removes or reorders sibling elements.
job_api_id = ET.fromstring(temp_put.text).findtext('jobId')

In [67]:
job_api_id

'8381588'

In [68]:
job_api_details = act_on_url(url = f'''{API_URL}jobs/{job_api_id}''',
                             headers = headers).json()

In [69]:
for val in ('type', 'docTitle', 'docId', 'colId', 'description', 'state'):
    print(val, ': ',
          job_api_details[val],
          sep = '')

type: Create Document
docTitle: API_test4
docId: 1880847
colId: 289136
description: Done, duration: 3s 54ms
state: FINISHED


In [70]:
job_api_details

{'jobId': '8381588',
 'docId': 1880847,
 'pageNr': -1,
 'type': 'Create Document',
 'state': 'FINISHED',
 'success': True,
 'description': 'Done, duration: 3s 54ms',
 'userName': 'jerms_farley@hotmail.com',
 'userId': 223226,
 'createTime': 1710512586274,
 'startTime': 1710512586899,
 'endTime': 1710512589953,
 'jobData': '#Fri Mar 15 15:23:06 CET 2024\ncolId=289136\n',
 'resumable': False,
 'jobImpl': 'UploadImportJob',
 'moduleUrl': 'http://dea-bl04:8080/UtilityModule-trpProd-2.12.0',
 'moduleName': 'UtilityModule',
 'moduleVersion': '2.12.0',
 'started': '2024-03-15T15:23:06.899+01:00',
 'ended': '2024-03-15T15:23:09.953+01:00',
 'created': '2024-03-15T15:23:06.274+01:00',
 'batchId': 0,
 'pageid': 0,
 'tsid': 0,
 'parent_jobid': 0,
 'parent_batchid': 0,
 'colId': 289136,
 'progress': 1,
 'totalWork': 1,
 'nrOfErrors': 0,
 'docTitle': 'API_test4',
 'priority': 0}

> Check jobs list for the transcription job

In [86]:
job_list_json = act_on_url(url = f'{API_URL}jobs/list',
                                  headers = headers).json()

In [87]:
job_list_json;

In [88]:
# One line per job; the loop variable is not called `dict`, so the builtin
# `dict` stays usable in later cells.
for job in job_list_json:
    print('Job Id num.', ': ', job['jobId'],
          '\t|\t',
          'Doc. Id num.', ': ', job['docId'],
          '\t|\t',
          'Type', ': ', job['type'],
          '\t|\t',
          'State', ': ', job['state'],
          '\t|\t',
          'Succeeded', ': ', job['success'],
          sep = '')

Job Id num.: 8381600	|	Doc. Id num.: 1880847	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8381588	|	Doc. Id num.: 1880847	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353824	|	Doc. Id num.: 1878053	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353804	|	Doc. Id num.: 1878053	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353781	|	Doc. Id num.: 1878043	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353761	|	Doc. Id num.: 1878036	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353750	|	Doc. Id num.: 1878036	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353746	|	Doc. Id num.: 1878035	|	Type: Create Document	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353696	|	Doc. Id num.: 1878024	|	Type: PyLaia Decoding	|	State: FINISHED	|	Succeeded: True
Job Id num.: 8353685	|	Doc. Id num.: 1878028	|	Type: Cr

In [89]:
job_id = job_list_json[0]['jobId']
full_job_json = act_on_url(url = f'''{API_URL}jobs/{job_id}''',
                                  headers = headers).json()


In [90]:
full_job_json

{'jobId': '8381600',
 'docId': 1880847,
 'pageNr': -1,
 'pages': '1',
 'type': 'PyLaia Decoding',
 'state': 'FINISHED',
 'success': True,
 'description': 'Done, duration: 2m 59s 718ms',
 'userName': 'jerms_farley@hotmail.com',
 'userId': 223226,
 'createTime': 1710512646568,
 'startTime': 1710512649589,
 'endTime': 1710512829307,
 'jobData': '#Fri Mar 15 15:24:09 CET 2024\ndoNotDeleteWorkDir=false\nwriteKwsIndex=false\ndoLinePolygonSimplification=true\nkeepOriginalLinePolygons=false\nmodelId=53042\nwriteLineConfScore=false\nwriteWordConfScores=false\nb2pBackend=Legacy\nuserRoles=User\nclearLines=false\nisNextGen=false\nuserEmail=jerms_farley@hotmail.com\nGPU_DEVICE=0\ndoWordSeg=true\nworkDir=/tmp/HTR/PyLaia/trpProd/Decode/pylaiaDecode_8381600\nbatchSize=10\nlanguageModel=trainDataLanguageModel\nuseExistingLinePolygons=false\nnBest=1\n',
 'resumable': False,
 'jobImpl': 'PyLaiaDecodingJob',
 'moduleUrl': 'http://srv6103:8081/PyLaiaModule-trpProd-2.12.1',
 'moduleName': 'PyLaiaModule',
 

In [83]:
# End the session on the server. A bare `logout;` only references the function
# object and sends nothing, so the request is made explicitly here with the
# session cookie held in `headers`.
# NOTE(review): assumes the logout endpoint is `{API_URL}auth/logout` - confirm.
logout_response = act_on_url(url = f'{API_URL}auth/logout',
                             method = 'POST',
                             headers = headers)
print('Logged out.' if logout_response is not None else 'Logout failed.')