# Text extraction from images


In [25]:
import requests
from PIL import Image
import pytesseract
from io import BytesIO
import json
import time as TIME

In [26]:
def _read_json(path):
    """Parse and return the UTF-8 encoded JSON document stored at `path`."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


# Each source document is indexed by a top-level "pages" key below.
pages_1st_process = _read_json("../fast_check_data/test_1st.json")
pages_2nd_process = _read_json("../fast_check_data/test_2nd.json")
pages_fact_checking = _read_json("../fact_checking_data/fact_checking_data_extracted_141124.json")

# Single combined list of pages, in source order: 1st process, 2nd process,
# then the fact-checking set.
pages = [
    *pages_1st_process["pages"],
    *pages_2nd_process["pages"],
    *pages_fact_checking["pages"],
]

In [27]:
def extract_text(image, lang='spa'):
    """Run Tesseract OCR on an image and time the call.

    Args:
        image: The image to analyze. It is passed unchanged to
            ``pytesseract.image_to_string`` (this notebook passes a PIL image).
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``'spa'``, the value
            that was previously hard-coded.

    Returns:
        tuple: ``(text, elapsed)`` where ``text`` is the string returned by
        Tesseract and ``elapsed`` is the duration of the OCR call in seconds.

    Raises:
        Whatever ``pytesseract.image_to_string`` raises; nothing is caught here.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or turn negative) when the system wall clock is adjusted mid-call.
    start = TIME.perf_counter()
    text = pytesseract.image_to_string(image, lang=lang)
    elapsed = TIME.perf_counter() - start
    return text, elapsed

# Number of pages to process. 6 reproduces the previous hard-coded
# `if index == 5: break` debug cap; set to None to process every page.
MAX_PAGES = 6
# Seconds to wait for an image download before giving up, so a single
# unresponsive host cannot hang the whole run.
REQUEST_TIMEOUT_S = 30

# Result container: same layout as the input pages, but every entry of
# page["images"] becomes {"image": <url>, "text": <ocr text>, "error": <bool>}.
texts_in_images = {"pages": []}

for index, page in enumerate(pages[:MAX_PAGES]):
    # Shallow-copy the page and its image list so the source `pages` data is
    # never mutated; this keeps the cell re-runnable without reloading the JSON.
    page_result = {**page, "images": list(page["images"])}
    texts_in_images["pages"].append(page_result)

    for img_index, image in enumerate(page["images"]):
        # Start from the failure record so that every exit path below leaves a
        # dict (never the raw URL string) in the results.
        page_result["images"][img_index] = {"image": image, "text": "", "error": True}

        # Step 1: download and decode the image.
        try:
            response = requests.get(image, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except Exception:
            # `Exception` (not a bare except) so Ctrl+C still interrupts the loop.
            print(f"[0.00 s] - [p: {index:02}][i: {img_index:02}] Ocurrio un error al obtener la imagen [{image}]")
            continue

        # Step 2: OCR. Time it locally as well, so the error message can report
        # how long the failed attempt took without relying on a value that
        # extract_text never got to return.
        ocr_started = TIME.perf_counter()
        try:
            text, elapsed = extract_text(img)
        except Exception as e:
            elapsed = TIME.perf_counter() - ocr_started
            print(f"[{elapsed:.2f} s] - [p: {index:02}][i: {img_index:02}] Ocurrio un error al analizar la imagen [{e}]")
            continue

        page_result["images"][img_index] = {"image": image, "text": text, "error": False}
        print(f"[{elapsed:.2f} s] - [p: {index:02}][i: {img_index:02}] Éxito [{len(text)} caracteres]")

[0.95 s] - [p: 00][i: 00] Éxito [191 caracteres]
[0.82 s] - [p: 00][i: 01] Éxito [250 caracteres]
[1.01 s] - [p: 00][i: 02] Éxito [1007 caracteres]
[0.00 s] - [p: 01][i: 00] Ocurrio un error al obtener la imagen [https://scontent-scl2-1.xx.fbcdn.net/v/t39.30808-6/280608293_560759738739728_1001463741805250641_n.jpg?_nc_cat=106&ccb=1-6&_nc_sid=8bfeb9&_nc_ohc=8AQdEyEv8V8AX8AcRoB&_nc_ht=scontent-scl2-1.xx&oh=00_AT9kivLD5I_jJ6lfuGNqLrArHqmFwXHowPZ5cUmTcouSHg&oe=62807DB1]
[1.04 s] - [p: 03][i: 00] Éxito [762 caracteres]
[0.80 s] - [p: 03][i: 01] Éxito [642 caracteres]
[0.43 s] - [p: 03][i: 02] Éxito [302 caracteres]
[0.40 s] - [p: 04][i: 00] Éxito [66 caracteres]
[0.91 s] - [p: 04][i: 01] Éxito [959 caracteres]
[0.51 s] - [p: 05][i: 00] Éxito [211 caracteres]
[0.24 s] - [p: 05][i: 01] Éxito [0 caracteres]


# Statistics


In [None]:
# Character count of the text stored on the pages themselves.
# (Uses the builtin sum() instead of rebinding the name `sum`, which would
# break every later sum() call in the same kernel.)
page_text_chars = sum(len(page["text"]) for page in texts_in_images["pages"])
print(f"Cantidad de Texto: {page_text_chars}")

# OCR text per image, flattened across all pages. Entries that are not result
# dicts (e.g. an image that was left as a plain URL string) count as an image
# without text instead of raising on image["text"].
image_texts = [
    image["text"] if isinstance(image, dict) else ""
    for page in texts_in_images["pages"]
    for image in page["images"]
]
image_text_chars = sum(len(text) for text in image_texts)
print(f"Texto adicional en imagenes: {image_text_chars}")

print("")

images_with_text = sum(1 for text in image_texts if len(text) > 0)
print(f"Total de imagenes {len(image_texts)}")
print(f"Imagenes con texto adicional: {images_with_text}")

Cantidad de Texto: 1129366
Texto adicional en imagenes: 58686

Total de imagenes 349
Imagenes con texto adicional: 244


In [23]:
# Persist the OCR results as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps accented characters readable instead of \u-escaped.
output_path = "text_extract_in_images.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(texts_in_images, out_file, indent=4, ensure_ascii=False)