<a href="https://colab.research.google.com/github/Reader641/Legacy_Transkribus_API/blob/main/Reader641_Transkribus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using the Transkribus REST API

First we will test logging into the API. Afterwards we'll write a function for each task needed to upload an image, start an OCR job, and then retrieve the recognized text.

In [None]:
# Importing the stuff we need
import requests
from google.colab import userdata
import os
import json
from bs4 import BeautifulSoup
import time
from datetime import datetime
from IPython.display import Image, display

## POST-ing Login Credentials
-------
Username and Password stored as secrets in google colab

In [None]:
# Post the Colab-stored credentials to the Transkribus login endpoint and
# show the HTTP status code as the cell result.
url = "https://transkribus.eu/TrpServer/rest/auth/login"
data = dict(user=userdata.get('username'), pw=userdata.get('password'))
response = requests.post(url, data=data)
response.status_code



200

Let's see what's in our response from the API.

In [None]:

# Parse the login response body as XML and print an indented rendering of it.
# NOTE(review): the printed XML includes the session ID -- avoid sharing it.
bs = BeautifulSoup(response.content, features='xml')
pretty_xml = bs.prettify()
print(pretty_xml)


<?xml version="1.0" encoding="utf-8"?>
<trpUserLogin>
 <userId>
  220701
 </userId>
 <userName>
  rianfrederick4@gmail.com
 </userName>
 <email>
  rianfrederick4@gmail.com
 </email>
 <affiliation>
  None
 </affiliation>
 <firstname>
  Derrian
 </firstname>
 <lastname>
  Frederick
 </lastname>
 <gender>
  unknown
 </gender>
 <userRoleList>
  User
 </userRoleList>
 <isActive>
  1
 </isActive>
 <isAdmin>
  false
 </isAdmin>
 <created>
  2024-02-23T00:31:21.291+01:00
 </created>
 <loginTime>
  2024-03-15T16:22:41.327+01:00
 </loginTime>
 <sessionId>
  A2F70A6A923DE4793B0943CAEA0F5975
 </sessionId>
 <userAgent>
  python-requests/2.31.0
 </userAgent>
 <ip>
  35.204.235.168
 </ip>
</trpUserLogin>



It works! Now let's make functions for the tasks:
* Login
* Get Collections List
* Upload Images
* List Jobs
* Start OCR Job
* Retrieve Recognized Text

# Login Function
-----
This function takes the username and password and returns the session ID.

In [None]:
def login(user, password):
  """Log in to the Transkribus REST API and return the session ID.

  Args:
    user: Account user name, sent as the ``user`` form field.
    password: Account password, sent as the ``pw`` form field.

  Returns:
    The text of the ``<sessionId>`` element of the XML login response.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    ValueError: If the response contains no ``<sessionId>`` element.
  """
  url = "https://transkribus.eu/TrpServer/rest/auth/login"
  data = {"user": user, "pw": password}
  response = requests.post(url, data=data)
  # Fail here with the HTTP status instead of later with an opaque
  # IndexError when the credentials are rejected.
  response.raise_for_status()
  bs = BeautifulSoup(response.content, 'xml')
  session_tag = bs.find("sessionId")
  if session_tag is None:
    raise ValueError("Login response did not contain a sessionId element")
  return session_tag.text

# Get Collections List
--------
This function will take in the session ID and return a list of collections that the user has access to.

In [None]:
def get_collections(session_id):
  """Fetch the collection list for the given session.

  Sends a GET request to the ``collections/list`` endpoint with the session
  ID as the ``JSESSIONID`` cookie and returns the JSON-decoded response body.
  """
  endpoint = "https://transkribus.eu/TrpServer/rest/collections/list"
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)



# Get Documents List
-------
Returns a list of documents in a specific collection by ID.

In [None]:
def get_documents(session_id, collection_id):
  """Fetch the document list of one collection.

  Sends a GET request to ``collections/<collection_id>/list`` with the
  session ID as the ``JSESSIONID`` cookie and returns the JSON-decoded
  response body.
  """
  endpoint = f"https://transkribus.eu/TrpServer/rest/collections/{collection_id}/list"
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)




```
# This is formatted as code
```

# Get Document
--------
Returns a specified document by ID from a specific collection by ID.


In [None]:
def get_document(session_id, collection_id, document_id):
  """Fetch the full record of a single document.

  Sends a GET request to ``collections/<collection_id>/<document_id>/fulldoc``
  with the session ID as the ``JSESSIONID`` cookie and returns the
  JSON-decoded response body.
  """
  endpoint = (
      "https://transkribus.eu/TrpServer/rest/collections/"
      f"{collection_id}/{document_id}/fulldoc"
  )
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)

# Display Image
-------
Displays the image from a URL.


In [None]:
def display_image(url):
  """Download the image at ``url`` to ``image.jpg`` and display that file.

  The file is written to the current working directory and is overwritten on
  every call.
  """
  target = 'image.jpg'
  download = requests.get(url, stream=True)
  with open(target, 'wb') as out_file:
    # Stream the body to disk in 128-byte pieces.
    out_file.writelines(download.iter_content(chunk_size=128))
  display(Image(filename=target))

# Get Image URLs
--------
Get image URL from LOC

In [None]:
def get_image_urls(url, num_items):
    """Collect one image URL per item from a JSON results listing.

    Requests ``url`` with ``fo=json``, ``c=num_items`` and ``at=results`` and
    walks the ``results`` array of the response.

    Args:
        url: Collection URL to query.
        num_items: Value passed as the ``c`` query parameter.

    Returns:
        A list with one image URL for every result that has an ``image_url``
        list and whose ``original_format`` mentions neither "collection" nor
        "web page".
    """
    params = {"fo": "json", "c": num_items, "at": "results"}
    call = requests.get(url, params=params)
    data = call.json()
    items = []
    for result in data['results']:
        # A result without "original_format" is treated as a normal item
        # instead of raising TypeError on the membership test.
        original_format = result.get("original_format") or []
        # Don't try to get images from collection-level or web-page results.
        if "collection" in original_format or "web page" in original_format:
            continue
        image_urls = result.get("image_url")
        if not image_urls:
            continue
        # Keep using the fourth URL when it exists (the previous behaviour);
        # fall back to the last one so shorter lists no longer raise
        # IndexError.
        # NOTE(review): index 3 was presumably picked for a particular image
        # size -- confirm against the API's image_url ordering.
        items.append(image_urls[3] if len(image_urls) > 3 else image_urls[-1])
    return items

# Get Collection Metadata
-----------------
Retrieves Metadata from the collection URL

In [None]:
def get_collection_metadata(url):
    """Return ``(title, author, genre)`` for a collection URL.

    The URL is requested with ``fo=json`` and the body is JSON-decoded, but
    the decoded data is not used yet: all three values are currently the
    placeholder string ``'default'``.
    """
    # The request and decode are kept so that an unreachable URL or a
    # non-JSON body still raises here.
    requests.get(url, params={"fo": "json"}).json()

    # Candidate sources once real metadata is wired in (unverified):
    #   title  -> data['results'][0]['title']
    #   author -> data['results'][0]['contributor']
    #   genre  -> data['results'][0]['item']['genre']
    placeholder = 'default'
    return placeholder, placeholder, placeholder

# Job Management
------
Functions for getting the list of jobs, the details of specific jobs and cancelling specific jobs.

In [None]:
def get_jobs(session_id):
  """Fetch the job list for the given session.

  Sends a GET request to the ``jobs/list`` endpoint with the session ID as
  the ``JSESSIONID`` cookie and returns the JSON-decoded response body.
  """
  endpoint = "https://transkribus.eu/TrpServer/rest/jobs/list"
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)


def get_job(session_id, job_id):
  """Fetch the record of a single job.

  Sends a GET request to ``jobs/<job_id>`` with the session ID as the
  ``JSESSIONID`` cookie and returns the JSON-decoded response body.
  """
  endpoint = f"https://transkribus.eu/TrpServer/rest/jobs/{job_id}"
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)

def kill_job(session_id, job_id):
  """Ask the server to cancel a job.

  Sends a POST request to ``jobs/<job_id>/kill`` with the session ID as the
  ``JSESSIONID`` cookie. The response is not inspected and nothing is
  returned.
  """
  endpoint = f"https://transkribus.eu/TrpServer/rest/jobs/{job_id}/kill"
  requests.post(endpoint, cookies={"JSESSIONID": session_id})



def job_errors(session_id, job_id):
  """Fetch the errors recorded for a job.

  Sends a GET request to ``jobs/<job_id>/errors`` with the session ID as the
  ``JSESSIONID`` cookie and returns the JSON-decoded response body.
  """
  endpoint = f"https://transkribus.eu/TrpServer/rest/jobs/{job_id}/errors"
  reply = requests.get(endpoint, cookies={"JSESSIONID": session_id})
  return json.loads(reply.content)

# Start Job
---------
Starts an OCR job on a specific document page in a collection and returns the job ID.

In [None]:
def start_ocr_job(session_id, collection_id, model_id, document_id, page_num):
  """Start text recognition for one page of a document.

  Sends a POST request to ``pylaia/<collection_id>/<model_id>/recognition``
  with the document ID (``id``) and page number (``pages``) as query
  parameters and the session ID as the ``JSESSIONID`` cookie.

  Returns:
    The JSON-decoded response body, which callers use as the job ID.
  """
  # The endpoint shape was taken from the request the web app issues when
  # recognition is started manually, e.g.
  #   https://transkribus.eu/TrpServer/rest/pylaia/282926/39995/recognition
  #     ?id=1877858&pages=2&writeKwsIndex=false&doStructures=&clearLines=false
  #     &doWordSeg=true&allowConcurrentExecution=false
  #     &keepOriginalLinePolygons=false&useExistingLinePolygons=false
  # Only ``id`` and ``pages`` are sent here; the remaining options from that
  # capture are left out.
  endpoint = (
      "https://transkribus.eu/TrpServer/rest/pylaia/"
      f"{collection_id}/{model_id}/recognition"
  )
  query = {"id": document_id, "pages": page_num}
  reply = requests.post(
      endpoint, cookies={"JSESSIONID": session_id}, params=query
  )
  return json.loads(reply.content)

# Upload Document Images
---------
Takes a collection URL from LOC and uploads a specific number of images from that collection as a document. Returns a Job ID.

In [None]:

def upload_documents(collection_url, num_items, collection_id, session=None):
  """Download images from a collection URL and upload them as one document.

  The images returned by ``get_image_urls`` are saved under ``./tmp``, an
  upload is created with a page list describing them, each file is then PUT
  to that upload, and the temporary files are removed afterwards.

  Args:
    collection_url: Source collection URL; its last path segment is used as
      the file-name prefix.
    num_items: Number of items to request from the collection.
    collection_id: Target collection, sent as ``collId``.
    session: Session ID to authenticate with. When omitted, the
      notebook-level ``session_id`` variable is used, as before.

  Returns:
    The text of the ``<jobId>`` element in the response to the last file
    upload.

  Raises:
    ValueError: If no image URLs were found for ``collection_url``.
  """
  # Prefer an explicitly supplied session; fall back to the notebook global
  # so existing three-argument calls keep working.
  sid = session if session is not None else session_id
  cookie = {"JSESSIONID": sid}

  image_urls = get_image_urls(collection_url, num_items)
  if not image_urls:
    # Without pages there is no upload response carrying a jobId.
    raise ValueError("No image URLs found for {}".format(collection_url))
  title, author, genre = get_collection_metadata(collection_url)

  # The download directory is not guaranteed to exist in a fresh runtime.
  os.makedirs("./tmp", exist_ok=True)
  # rstrip so a trailing slash does not produce an empty prefix.
  base_name = collection_url.rstrip('/').split('/')[-1]

  pages = []
  file_paths = []
  try:
    for page_num, image_url in enumerate(image_urls, start=1):
      file_name = base_name + '_' + str(page_num) + '.jpg'
      file_path = "./tmp/" + file_name
      # Register the path first so a partial download is cleaned up too.
      file_paths.append(file_path)
      pages.append({"fileName": file_name, "pageNr": page_num})
      r = requests.get(image_url, stream=True)
      with open(file_path, 'wb') as img:
        for chunk in r.iter_content(chunk_size=8192):
          img.write(chunk)

    document = {
      "md":
        {
          "title": title,
          "author": author,
          "genre": genre,
        },
      "pageList":
        {
          "pages": pages
        }
    }

    # Step 1: create the upload and read back its ID.
    url = "https://transkribus.eu/TrpServer/rest/uploads?collId={}".format(collection_id)
    response = requests.post(url, cookies=cookie, json=document)
    bs = BeautifulSoup(response.content, 'xml')
    upload_id = bs.find_all("uploadId")[0].text

    # Step 2: send every page image to the upload. Each file is opened only
    # for the duration of its request so no handles are left open.
    url = "https://transkribus.eu/TrpServer/rest/uploads/{}".format(upload_id)
    for file_path in file_paths:
      with open(file_path, 'rb') as img:
        # NOTE(review): the 'Content-Type' entry is sent as an extra
        # multipart field, not as a header -- kept unchanged; confirm whether
        # the server needs it.
        file = {'img': img, 'Content-Type': 'application/octet-stream'}
        response = requests.put(url, cookies=cookie, files=file)
  finally:
    # Always remove the temporary downloads, even when a request failed.
    for file_path in file_paths:
      if os.path.exists(file_path):
        os.remove(file_path)

  # The job ID is read from the response to the last file upload.
  bs = BeautifulSoup(response.content, 'xml')
  job_id = bs.find_all("jobId")[0].text

  return job_id





# Log Progress
--------------
This function appends the given message, prefixed with a timestamp, to a log file. It returns nothing.

In [None]:
def log_progress(message):
    """Append a timestamped line to ``./log.txt``.

    Args:
        message: Text to record; written as ``<timestamp> : <message>``
            followed by a newline.

    Returns:
        None.
    """
    # %b (abbreviated month name) replaces %h, a platform-specific alias that
    # is not available in every C library's strftime.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    with open("./log.txt", "a") as f:
        f.write(timestamp + ' : ' + message + '\n')

# Work Flow
-------
Now that all of our functions are written, we can build a workflow for uploading a document to Transkribus, starting the OCR job, and getting the transcribed text.

In [None]:
# End-to-end run: log in, upload images as a document, run OCR on each page,
# then download the transcripts.
model_id = 39995
num = 5
url = "https://www.loc.gov/collections/nineteenth-century-song-sheets"

# States in which a job has not finished yet. Polling only while the state is
# "RUNNING" let the script fall through when a job was still queued.
# NOTE(review): state names are taken from the Transkribus job model as
# recalled ("CREATED", "WAITING", "RUNNING") -- confirm against the API.
PENDING_STATES = ("CREATED", "WAITING", "RUNNING")

log_progress("Intiating process")

session_id = login(userdata.get('username'), userdata.get('password'))
log_progress("Login Completed | Session ID - {}".format(session_id))

collections = get_collections(session_id)
# NOTE(review): this picks the second collection in the list -- confirm that
# this is the intended target collection for the account in use.
collection_id = collections[1]["colId"]
log_progress("Retrived Collection ID | Collection ID - {}".format(collection_id))

job_id = upload_documents(url, num, collection_id)
log_progress("Uploading ({}) Documents from {} to {}".format(num, url, collection_id))

# Poll the upload job once per second until it leaves the pending states.
job_status = get_job(session_id, job_id)
while job_status['state'] in PENDING_STATES:
  job_status = get_job(session_id, job_id)
  log_progress("Job ID - {} |  State - {}".format(job_id, job_status['state']))
  time.sleep(1)

job_status = get_job(session_id, job_id)
if job_status['state'] == "FINISHED":
  log_progress("Job Finished")

log_progress("Documents Uploaded")

document_list = get_documents(session_id, collection_id)
# NOTE(review): assumes the freshly uploaded document is the first entry of
# the list -- verify the ordering returned by the server.
document_id = document_list[0]['docId']

# Run recognition page by page, waiting for each job before starting the next.
for page_num in range(1, num+1):
  job_id = start_ocr_job(session_id, collection_id, model_id, document_id, page_num)
  job_status = get_job(session_id, job_id)
  log_progress("Started OCR Job on Page ID - {}, of Document ID - {}, of Collection ID - {}, with Model ID - {} | Job ID - {}".format(page_num, document_id, collection_id, model_id, job_id))
  while job_status['state'] in PENDING_STATES:
    job_status = get_job(session_id, job_id)
    log_progress("OCR Job ID - {} |  State - {}".format(job_id, job_status['state']))
    time.sleep(1)

  job_status = get_job(session_id, job_id)
  if job_status['state'] == "FINISHED":
    log_progress("OCR Job Finished")

document = get_document(session_id, collection_id, document_id)
log_progress("Document Retrived")

transcripts = []

# Download the first listed transcript of every page.
# NOTE(review): presumably the first entry is the most recent transcript --
# confirm.
for page_num in range(0, num):
   xml_url = document['pageList']['pages'][page_num]['tsList']['transcripts'][0]['url']
   r = requests.get(xml_url)
   transcripts.append(r.content)

log_progress("Transcripts Retrived")


In [None]:
# Show the retrieved document record as indented JSON with sorted keys.
json_str = json.dumps(
    document,
    sort_keys=True,
    indent=4,
)
print(json_str)

{
    "collection": {
        "accountingStatus": 1,
        "colId": 282926,
        "colName": "Test Collection",
        "created": "2024-02-27T21:00:45.18+01:00",
        "crowdsourcing": false,
        "description": "created by rianfrederick4@gmail.com",
        "elearning": false,
        "nrOfDocuments": 0,
        "pageId": 66705944,
        "thumbUrl": "https://files.transkribus.eu/Get?fileType=thumb&id=BNSNIKEFHMRRWTFWESKXGOKB",
        "url": "https://files.transkribus.eu/Get?fileType=view&id=BNSNIKEFHMRRWTFWESKXGOKB"
    },
    "edDeclList": [],
    "md": {
        "attributes": [],
        "author": "default",
        "collectionList": {
            "colList": [
                {
                    "colId": 282926,
                    "colName": "Test Collection",
                    "crowdsourcing": false,
                    "description": "created by rianfrederick4@gmail.com",
                    "elearning": false,
                    "nrOfDocuments": 0
             