# OCR on InfographicVQA dataset via MICROSOFT READ API
#### Results are saved as JSON files. Images with a width or height of 10000 px or more are resized before being sent to the API.

In [1]:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials

from array import array
import os
from PIL import Image
import sys
import time
import requests
import json
from pdf2image import convert_from_path
import io

In [15]:
### Build the Computer Vision client from the Azure credentials.

# Both values are left blank here and must be filled in before running.
endpoint = ""
subscription_key = ""

computervision_client = ComputerVisionClient(
    endpoint,
    CognitiveServicesCredentials(subscription_key),
)

# Request headers carrying the subscription key and declaring a raw
# binary (octet-stream) payload.
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
    "Content-Type": "application/octet-stream",
}

In [None]:
def _serialize_read_result(read_result, is_resized):
    """Convert a Read API result object into a JSON-serializable dict.

    Args:
        read_result: Object returned by ``computervision_client.get_read_result``.
        is_resized: Whether the image was resized before being sent to the API.

    Returns:
        A dict with the keys ``status``, ``is_resized`` and
        ``recognitionResults`` (one entry per page, each holding its lines
        and the words of every line).
    """
    result_ocr = {
        "status": read_result.status,
        "is_resized": is_resized,
        "recognitionResults": [],
    }

    # A non-successful operation is expected to carry no analysis payload
    # (assumption based on the SDK model -- confirm). Return the status-only
    # record instead of crashing the whole run on a None dereference.
    analyze_result = read_result.analyze_result
    if analyze_result is None:
        return result_ocr

    for text_result in analyze_result.read_results:
        result_ocr["recognitionResults"].append({
            "page": text_result.page,
            "clockwiseOrientation": text_result.angle,
            "width": text_result.width,
            "height": text_result.height,
            "unit": text_result.unit.name,
            "lines": [
                {
                    "boundingBox": line.bounding_box,
                    "text": line.text,
                    "words": [
                        {"boundingBox": word.bounding_box, "text": word.text}
                        for word in line.words
                    ],
                }
                for line in text_result.lines
            ],
        })
    return result_ocr


# Run OCR over every infographic of every split and store one JSON file per image.
for split in ["train", "val", "test"]:
    # Create the output folders if necessary
    new_ocr_path = f"{split}/{split}_msr_ocr_results"
    os.makedirs(new_ocr_path, exist_ok=True)

    resized_image_folder = f"{split}/resized_images/"
    os.makedirs(resized_image_folder, exist_ok=True)

    # List the infographics once; the total is only needed for progress output.
    image_folder = f"{split}/infographicVQA_{split}_v1.0_images"
    image_files = sorted(os.listdir(image_folder))
    total_images = len(image_files)

    # To retry only a subset, iterate over a list of file names instead
    # (e.g. a per-split collection of previously failed images).
    for n, image_file in enumerate(image_files):
        image_path = os.path.join(image_folder, image_file)
        is_resized = False

        # Close the PIL image as soon as its size has been checked.
        with Image.open(image_path) as image:
            if image.width >= 10000 or image.height >= 10000:
                print(f"{image_file} {split} too large ({image.width}x{image.height}). Resizing")

                # The image is too large for the API, resizing is needed.
                # NOTE(review): each axis is clamped independently, so the
                # aspect ratio is not preserved. Kept as-is so the stored
                # coordinates stay comparable with earlier runs.
                resized_image = image.resize((min(image.width, 10000), min(image.height, 10000)))
                is_resized = True

                # Save the resized image; its path is the one sent to the API
                image_path = os.path.join(resized_image_folder, image_file)
                resized_image.save(image_path)

        # Wait so the request limit is not exceeded
        time.sleep(5)
        # Call the API; the stream is closed once the request has been submitted
        with open(image_path, "rb") as data:
            response = computervision_client.read_in_stream(data, raw=True)
        # Holds the URL used to retrieve the recognized text.
        operation_url = response.headers["Operation-Location"]
        # Take the ID off and use to get results
        operation_id = operation_url.split("/")[-1]

        # The recognized text isn't immediately available, so poll to wait for completion.
        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status.lower() not in ['notstarted', 'running']:
                break
            time.sleep(1)

        result_ocr = _serialize_read_result(read_result, is_resized)

        # Write down the ocr results json for the current infographic
        print(f"{n+1}/{total_images}", split, image_file)
        # splitext handles any extension, not only the 5-character ".jpeg"
        output_name = os.path.splitext(image_file)[0]
        with open(f"{new_ocr_path}/{output_name}.json", "w+") as write_file:
            json.dump(result_ocr, write_file)

In [5]:
# File names, per dataset split, of the infographics whose width or height
# is >= 10000. Every file is named "<numeric id>.jpeg".
large_images = {
    split_name: [f"{image_id}.jpeg" for image_id in image_ids]
    for split_name, image_ids in (
        ("train", (
            10133, 10611, 10868, 20403, 20411, 20464, 20467, 30032,
            30124, 30361, 30428, 30616, 31166, 31416, 31748, 31912,
            32199, 32494, 33681, 33838, 33840, 33909, 35333, 35443,
            35471, 37949, 38019, 38040, 38238, 38361, 38377, 40253,
            40315, 40617, 41454, 41567, 41997, 42051, 44048, 44804,
            70201,
        )),
        ("test", (11159, 39327, 40353)),
        ("val", (
            20369, 37212, 37324, 37395, 39522, 39695, 44728, 44787,
        )),
    )
}