In [1]:
import json
import os
from copy import deepcopy
from urllib.parse import quote

import requests
from tqdm import tqdm

In [2]:
# ### API Endpoints

# - **POST /upload-zip/**: Upload a ZIP file containing PDFs.
#     - **Request**: `multipart/form-data` with the file field named `file`.
#     - **Response**: JSON indicating success or failure.


In [3]:
# filepath = 'WharncliffeGardensRepairsAndDecorations.zip'
# url = 'http://localhost:8000/upload-zip/'
# requests.post(
#     url,
#     files={'file': open(filepath, 'rb')}
# )

In [4]:
# JSON body template for the OCR4all process-flow "execute" request.
# It lists the pipeline stages to run and the command-line arguments for each
# stage; the caller adds the per-project "pageIds" list before posting.
BASE_PAYLOAD = dict(
    processesToExecute=[
        "preprocessing",
        "segmentationKraken",
        "lineSegmentation",
        "recognition",
    ],
    processSettings=dict(
        lineSegmentation=dict(
            cmdArgs=["--max-whiteseps", "-1", "--parallel", "16"],
        ),
        preprocessing=dict(
            cmdArgs=["--nocheck", "--maxskew", "0", "--parallel", "16"],
        ),
        recognition=dict(
            cmdArgs=[
                "--verbose True",
                "--estimate_skew",
                "--data.output_confidences",
                "--data.output_glyphs",
                "--pipeline.batch_size", "5",
                "--data.max_glyph_alternatives", "1",
                "--checkpoint",
                # Checkpoint files 0..4 are passed as ONE space-separated argument.
                " ".join(
                    f"/var/ocr4all/models/default/default/uw3-modern-english/{index}.ckpt.json"
                    for index in range(5)
                ),
            ],
        ),
        segmentationKraken=dict(imageType="Binary"),
    ),
)

# Browser-like headers sent with the JSON process-flow "execute" request.
# Key order is kept exactly as captured.
HEADERS = {
    # content negotiation
    'Accept': 'application/json',
    'Accept-Language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    # where the request claims to come from
    'Origin': 'http://localhost:1476',
    'Referer': 'http://localhost:1476/ocr4all/ProcessFlow',
    # fetch metadata
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0'
    ),
    'X-Requested-With': 'XMLHttpRequest',
    # client hints
    'sec-ch-ua': '"Opera GX";v="109", "Not:A-Brand";v="8", "Chromium";v="123"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # 'Content-Length' is intentionally not set here (it was "764" in the captured request).
}

In [5]:
# Form fields for the OCR4all result-generation request.
# 'pageIds[]' is deliberately left out of the template (e.g. '0000', '0001');
# the caller fills it in for each page before posting.
PAYLOAD_RESULTS_GENERATION = dict(
    resultType='txt',
    resultStrategy='pred',
    preserveEmptyLines='true',
    addPageDelimiter='true',
    customPageDelimiter='',
)

In [6]:
# Browser-like headers sent with the form-encoded result-generation request.
# Key order is kept exactly as captured.
HEADERS_RESULTS_GENERATION = {
    # content negotiation
    "Accept": "*/*",
    # NOTE(review): advertises br/zstd; presumably the HTTP client can only decode
    # those when the optional brotli/zstandard packages are installed - confirm.
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    # "Cookie" is not hard-coded (captured value redacted): "JSESSIONID=<redacted>; ajs_anonymous_id=<redacted>"
    # where the request claims to come from
    "Host": "localhost:1476",
    "Origin": "http://localhost:1476",
    "Referer": "http://localhost:1476/ocr4all/ResultGeneration",
    # fetch metadata
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
    # client hints
    "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Linux"',
}

In [12]:
# Batch driver: for each selected project folder under ./data, open an OCR4all
# session for it, list its pages, run the OCR process flow on all pages, then
# request text-result generation page by page.
#
# Names left in the namespace for the inspection cells that follow:
#   path, cookies, files, payload, resp (execute response), r, gen_responses.

OCR4ALL_EXECUTE_URL = "http://localhost:1476/ocr4all/ajax/processFlow/execute"
OCR4ALL_LISTFILES_URL = "http://localhost:1476/ocr4all/ajax/overview/list"
OCR4ALL_GENERATE_RESULTS = 'http://localhost:1476/ocr4all/ajax/resultGeneration/execute'
# {name} is the percent-encoded project folder name.
OCR4ALL_CHECKDIR_URL = (
    'http://localhost:1476/ocr4all/ajax/overview/checkDir'
    '?&projectDir=%2Fvar%2Focr4all%2Fdata%2F{name}%2F&imageType=Gray&resetSession=true'
)
# (connect, read) timeout: fail fast when the server is unreachable, but never
# cut off a request that is still running, since the execute call can take long.
OCR4ALL_TIMEOUT = (10, None)

# os.listdir() returns entries in arbitrary order; sort so that the slice below
# selects the same projects on every run.
input_paths = sorted(os.listdir('data'))

for path in tqdm(input_paths[-4:]):
    # Percent-encode the whole folder name (not just spaces) so characters such
    # as '%', '&', '#' or '+' cannot corrupt the query string.
    # e.g. 04%2003%202019%20Wharncliffe%20Gardnes%20Major%20Works%20AA102%20-%20Update%20FAQ
    path_processed = quote(path, safe='')
    url_checkDir = OCR4ALL_CHECKDIR_URL.format(name=path_processed)

    # One HTTP session per project: it carries the server's session cookie
    # automatically and reuses the connection for the per-page requests.
    with requests.Session() as session:
        # 1st establish session through cookies
        check_resp = session.get(url_checkDir, timeout=OCR4ALL_TIMEOUT)
        check_resp.raise_for_status()
        cookies = session.cookies  # kept only so later cells can inspect it

        # 2nd list files in session
        list_resp = session.get(OCR4ALL_LISTFILES_URL, timeout=OCR4ALL_TIMEOUT)
        list_resp.raise_for_status()
        files = [page['pageId'] for page in list_resp.json()]

        # 3rd execute OCR4ALL
        payload = {"pageIds": files, **deepcopy(BASE_PAYLOAD)}
        print('STARTED PROCESSING')
        resp = session.post(
            OCR4ALL_EXECUTE_URL, json=payload, headers=HEADERS, timeout=OCR4ALL_TIMEOUT
        )
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and give no hint about what the server answered.
        if resp.status_code != 200:
            raise RuntimeError(
                f'OCR4all execute failed for {path!r}: '
                f'HTTP {resp.status_code}\n{resp.text[:500]}'
            )

        # 4th generate results, one request per page; failures are reported but
        # do not stop the batch.
        # NOTE: gen_responses is re-created for every project, so after the loop
        # it only holds the responses of the last project processed.
        payload = deepcopy(PAYLOAD_RESULTS_GENERATION)
        gen_responses = []
        for f in files:
            payload['pageIds[]'] = f
            r = session.post(
                OCR4ALL_GENERATE_RESULTS,
                data=payload,
                headers=HEADERS_RESULTS_GENERATION,
                timeout=OCR4ALL_TIMEOUT,
            )
            if r.status_code != 200:
                print('!!!!!! ERROR FOR FILE', f, '!!!!!!')
                print(r.content)
            gen_responses.append(r)
    

  0%|          | 0/4 [00:00<?, ?it/s]

STARTED PROCESSING


 25%|██▌       | 1/4 [00:16<00:50, 16.67s/it]

STARTED PROCESSING


 50%|█████     | 2/4 [00:36<00:36, 18.28s/it]

STARTED PROCESSING


 75%|███████▌  | 3/4 [01:01<00:21, 21.36s/it]

!!!!!! ERROR FOR FILE 0002 !!!!!!
b'<!doctype html><html lang="en"><head><title>HTTP Status 500 \xe2\x80\x93 Internal Server Error</title><style type="text/css">body {font-family:Tahoma,Arial,sans-serif;} h1, h2, h3, b {color:white;background-color:#525D76;} h1 {font-size:22px;} h2 {font-size:16px;} h3 {font-size:14px;} p {font-size:12px;} a {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 500 \xe2\x80\x93 Internal Server Error</h1><hr class="line" /><p><b>Type</b> Exception Report</p><p><b>Message</b> Request processing failed; nested exception is java.lang.NullPointerException</p><p><b>Description</b> The server encountered an unexpected condition that prevented it from fulfilling the request.</p><p><b>Exception</b></p><pre>org.springframework.web.util.NestedServletException: Request processing failed; nested exception is java.lang.NullPointerException\n\torg.springframework.web.servlet.FrameworkServlet.processRequest(Framewo

100%|██████████| 4/4 [01:36<00:00, 24.01s/it]


In [8]:
# Debug: HTTP status of `r`, the response most recently assigned by the result-generation loop.
r.status_code

500

In [10]:
# Debug: raw body bytes of `r`, the response most recently assigned by the result-generation loop.
r.content

b'<!doctype html><html lang="en"><head><title>HTTP Status 500 \xe2\x80\x93 Internal Server Error</title><style type="text/css">body {font-family:Tahoma,Arial,sans-serif;} h1, h2, h3, b {color:white;background-color:#525D76;} h1 {font-size:22px;} h2 {font-size:16px;} h3 {font-size:14px;} p {font-size:12px;} a {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 500 \xe2\x80\x93 Internal Server Error</h1><hr class="line" /><p><b>Type</b> Exception Report</p><p><b>Message</b> Request processing failed; nested exception is java.lang.NullPointerException</p><p><b>Description</b> The server encountered an unexpected condition that prevented it from fulfilling the request.</p><p><b>Exception</b></p><pre>org.springframework.web.util.NestedServletException: Request processing failed; nested exception is java.lang.NullPointerException\n\torg.springframework.web.servlet.FrameworkServlet.processRequest(FrameworkServlet.java:1014)\n\torg.spring

In [None]:
# Debug: current `payload`; the batch loop last rebinds it to the result-generation
# form fields, with 'pageIds[]' set to the last page posted.
payload

{'resultType': 'txt',
 'resultStrategy': 'pred',
 'preserveEmptyLines': 'true',
 'addPageDelimiter': 'true',
 'customPageDelimiter': '',
 'pageIds[]': '0003'}

In [None]:
# Debug: result-generation responses collected by the batch loop; the list is
# re-created for each project, so it reflects only the last one processed.
gen_responses

In [None]:
# Debug: raw body of the first collected result-generation response.
gen_responses[0].content

In [None]:
# Debug: loop variable of the batch loop, i.e. the last project folder name it reached.
path

In [None]:
# Debug: cookies captured for the last project's OCR4all session.
cookies

In [None]:
# payload

In [None]:
# Debug: last value bound to `resp` by the batch loop (the process-flow execute
# response, if the loop got that far).
resp

In [None]:
# Debug: raw body bytes of `resp`.
print(resp.content)

In [None]:
# Debug: body of `resp` decoded to text.
print(resp.text)

In [None]:
# Debug: page IDs listed for the last project the batch loop handled.
files