# Text Extraction from Images

## Imports


In [None]:
import requests
from PIL import Image
import pytesseract
from io import BytesIO
import json
import time as TIME

## Load data


In [None]:
# Every dataset name starts out bound to an empty list and is then rebound to
# the parsed content of its JSON file.
pages_1st_process, pages_2nd_process, pages_fact_checking = [], [], []

# First fast-check dataset.
with open("./image_model/json_data_fast_check_1_fixed.json", mode='r', encoding='utf-8') as file:
    pages_1st_process = json.loads(file.read())

# Second fast-check dataset.
with open("./image_model/json_data_fast_check_2_fixed.json", mode='r', encoding='utf-8') as file:
    pages_2nd_process = json.loads(file.read())

# Fact-checking dataset.
with open("./image_model/json_data_fact_checking.json", mode='r', encoding='utf-8') as file:
    pages_fact_checking = json.loads(file.read())

## Text Extraction


In [None]:
def extract_text(image, lang='spa'):
    """Run Tesseract OCR on an image and time the call.

    Args:
        image: Image handed straight to ``pytesseract.image_to_string``
            (the caller in this file passes a PIL image).
        lang: Tesseract language code used for recognition. Defaults to
            ``'spa'``, the value that was previously hard-coded.

    Returns:
        A ``(text, elapsed)`` tuple: the recognised text and the duration of
        the OCR call in seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted during the call.
    start = TIME.perf_counter()
    text = pytesseract.image_to_string(image, lang=lang)
    elapsed = TIME.perf_counter() - start
    return text, elapsed

def extract_text_from_pages(pages, timeout=30):
    """Download every image of every page, OCR it, and append the text to the page.

    Each entry of ``page["images"]`` is replaced in place by a dict
    ``{"image": <original entry>, "text": <ocr text>, "error": <bool>}``.
    The text of all images without error is then joined with single spaces
    and appended directly to ``page["text"]`` (no separator is inserted
    between the existing page text and the first image text).

    The page dicts passed in are the same objects stored in the result, so
    the input is modified in place.

    Args:
        pages: List of page dicts, each with a ``"text"`` string and an
            ``"images"`` list.
        timeout: Seconds to wait for each image download before giving up.

    Returns:
        ``{"pages": [...]}`` containing the (mutated) page dicts.
    """
    texts_in_images = {"pages": []}
    for index, page in enumerate(pages):
        texts_in_images["pages"].append(page)
        images = page["images"]
        for img_index, image in enumerate(images):
            if isinstance(image, dict):
                # Entry is already in result format (e.g. data saved by a
                # previous run); keep it untouched instead of treating the
                # dict as a URL.
                continue

            try:
                response = requests.get(image, timeout=timeout)
                img = Image.open(BytesIO(response.content))
            except Exception:
                # Best effort: record the failure so the entry has the same
                # shape as every other one; leaving the raw URL behind would
                # break the text aggregation below.
                images[img_index] = {"image": image, "text": "", "error": True}
                print(f"[0.00 s] - [p: {index:02}][i: {img_index:02}] Ocurrio un error al obtener la imagen [{image}]")
                continue

            started = TIME.perf_counter()
            try:
                text, elapsed = extract_text(img)
            except Exception as e:
                # Time the failed attempt here: extract_text never returned,
                # so it did not hand back a duration.
                elapsed = TIME.perf_counter() - started
                images[img_index] = {"image": image, "text": "", "error": True}
                print(f"[{elapsed:.2f} s] - [p: {index:02}][i: {img_index:02}] Ocurrio un error al analizar la imagen [{e}]")
                continue

            images[img_index] = {"image": image, "text": text, "error": False}
            print(f"[{elapsed:.2f} s] - [p: {index:02}][i: {img_index:02}] Éxito [{len(text)} caracteres]")

        page["text"] = page["text"] + " ".join(
            entry["text"] for entry in images if not entry["error"]
        )
    return texts_in_images

In [None]:
# Run the OCR pipeline over each dataset's page list and rebind the dataset
# name to the returned {"pages": [...]} structure.
pages_1st_process = extract_text_from_pages(pages=pages_1st_process["pages"])
pages_2nd_process = extract_text_from_pages(pages=pages_2nd_process["pages"])
pages_fact_checking = extract_text_from_pages(pages=pages_fact_checking["pages"])

# Statistics


In [None]:
# Merge the pages of the three processed datasets into a single structure.
pages = {
    "pages": pages_1st_process["pages"]
    + pages_2nd_process["pages"]
    + pages_fact_checking["pages"]
}

# Flatten all image entries once and reuse the list for every statistic.
all_images = [image for page in pages["pages"] for image in page["images"]]
# An entry that is not a dict (i.e. was never replaced by an OCR result)
# counts as an image without text instead of raising on the "text" lookup.
image_texts = [
    image["text"] if isinstance(image, dict) else ""
    for image in all_images
]

# Totals are stored under their own names so the builtin `sum` is not
# rebound to an int for the rest of the notebook session.
total_text_chars = sum(len(page["text"]) for page in pages["pages"])
print(f"Cantidad de Texto: {total_text_chars}")

image_text_chars = sum(len(text) for text in image_texts)
print(f"Texto adicional en imagenes: {image_text_chars}")

print("")

print(f"Total de imagenes {len(all_images)}")
print(f"Imagenes con texto adicional: {sum(1 for text in image_texts if text)}")

# Save data


In [None]:
# Write each processed dataset back to disk as pretty-printed UTF-8 JSON.
# The target paths are the same files the datasets were loaded from, so the
# originals are overwritten.
outputs = (
    ("./image_model/json_data_fast_check_1_fixed.json", pages_1st_process),
    ("./image_model/json_data_fast_check_2_fixed.json", pages_2nd_process),
    ("./image_model/json_data_fact_checking.json", pages_fact_checking),
)
for output_path, dataset in outputs:
    with open(output_path, mode='w', encoding='utf-8') as file:
        json.dump(dataset, file, ensure_ascii=False, indent=4)