In [4]:
# Authentication to Google API
import os
import math
from collections import Counter
from google.cloud import vision
import io
import json
import re

# Point the Google client libraries at the service-account key file.
# The path is relative, so it is resolved against the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "vision_key.json"

# Pre-compiled pattern matching runs of word characters.
WORD = re.compile(r"\w+")

In [5]:
def detect_text(path):
    """Run Cloud Vision document text detection on an image file.

    Args:
        path: Filesystem path of the image; the file is read in binary mode.

    Returns:
        A list with one string per entry of the response's
        ``text_annotations``: the annotation's description prefixed with a
        carriage return and a line feed.

    Raises:
        Exception: If the API response carries a non-empty error message.
    """
    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    response = client.document_text_detection(image=vision.Image(content=content))

    # Check the API error before reading any annotation so a failed request
    # is reported as such and never processed as if it were a result.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    return [f"\r\n{annotation.description}" for annotation in response.text_annotations]

In [7]:
# Run OCR on the sample image; `text` is reused by the next cell.
image_path = "Genova.png"
text = detect_text(image_path)


['\r\n(A10)\nSP 35\nA7\nSS 1\nGênes\nE80\nTirrenia\nBacino\nPorto\nVecchio\nVIA BOBBIO\nSS1\nCamaldoli\nA12)', '\r\n(', '\r\nA10', '\r\n)', '\r\nSP', '\r\n35', '\r\nA7', '\r\nSS', '\r\n1', '\r\nGênes', '\r\nE80', '\r\nTirrenia', '\r\nBacino', '\r\nPorto', '\r\nVecchio', '\r\nVIA', '\r\nBOBBIO', '\r\nSS1', '\r\nCamaldoli', '\r\nA12', '\r\n)']


In [8]:
# Print every OCR entry. Each entry begins with "\r\n", which is why the
# output shows a blank line before every item.
for line in text:
    print(line)


(A10)
SP 35
A7
SS 1
Gênes
E80
Tirrenia
Bacino
Porto
Vecchio
VIA BOBBIO
SS1
Camaldoli
A12)

(

A10

)

SP

35

A7

SS

1

Gênes

E80

Tirrenia

Bacino

Porto

Vecchio

VIA

BOBBIO

SS1

Camaldoli

A12

)


In [30]:



def calculate_distance(box1, box2):
    """Return the horizontal gap between the right edge of box1 and the left edge of box2.

    Each box is a sequence of vertex dicts carrying an 'x' key. The gap is
    only reported when box2 starts strictly to the right of box1; when the
    boxes touch, overlap, or box2 lies to the left, ``float('inf')`` is
    returned instead.
    """
    box1_right_edge = max(vertex['x'] for vertex in box1)
    box2_left_edge = min(vertex['x'] for vertex in box2)

    if box2_left_edge > box1_right_edge:
        return box2_left_edge - box1_right_edge
    return float('inf')

def merge_texts(results, distance_threshold=10):
    """Merge OCR entries that sit next to each other on the same line.

    Entries are visited in order. An entry is folded into the one currently
    being built when ``calculate_distance`` between their bounds is at most
    ``distance_threshold`` and the first vertex of both has the same 'y'.

    Args:
        results: List of dicts with 'text', 'confidence' and 'bounds' keys,
            where 'bounds' is a list of vertex dicts with 'x' and 'y'.
        distance_threshold: Largest horizontal gap that still triggers a merge.

    Returns:
        A new list of entries. A merged entry has the texts concatenated
        without a separator, the mean confidence of all entries merged into
        it, and 'bounds' holding the vertices of every merged entry sorted
        by 'x' (not reduced to four corners). An empty input gives an empty
        list. The input list and its entries are left unmodified.
    """
    if not results:
        # Nothing was detected, so there is nothing to merge.
        return []

    def _fresh_copy(entry):
        # Copy the entry and its vertex list so the caller's data is never modified.
        return {**entry, 'bounds': list(entry['bounds'])}

    merged_results = []
    current_text = _fresh_copy(results[0])
    group_size = 1  # number of input entries folded into current_text so far

    for next_text in results[1:]:
        distance = calculate_distance(current_text['bounds'], next_text['bounds'])
        if distance <= distance_threshold and current_text['bounds'][0]['y'] == next_text['bounds'][0]['y']:
            # Collect the vertices of both entries, ordered left to right.
            current_text['bounds'] = sorted(
                current_text['bounds'] + list(next_text['bounds']),
                key=lambda v: v['x'],
            )
            current_text['text'] += next_text['text']
            # Incremental mean: weight the running value by the number of
            # entries it already represents, so three or more merged words
            # still get a true average rather than favouring the last one.
            current_text['confidence'] = (
                current_text['confidence'] * group_size + next_text['confidence']
            ) / (group_size + 1)
            group_size += 1
        else:
            merged_results.append(current_text)
            current_text = _fresh_copy(next_text)
            group_size = 1
    merged_results.append(current_text)  # Add the last entry
    return merged_results


def detect_text(path):
    """Run document text detection on an image and return the words as JSON.

    Every word of the response's full text annotation is rebuilt from its
    symbols and stripped of every character that is not a word character,
    comma, period, exclamation mark, question mark or space. Words that are
    empty after cleaning are dropped. The remaining words are passed through
    ``merge_texts`` before serialisation.

    Args:
        path: Filesystem path of the image; the file is read in binary mode.

    Returns:
        A JSON string (two-space indent) encoding a list of objects with
        'text', 'confidence' and 'bounds' keys, where 'bounds' is a list of
        {'x', 'y'} vertices. The string is ``"[]"`` when no usable word was
        found.

    Raises:
        Exception: If the API response carries a non-empty error message.
    """
    client = vision.ImageAnnotatorClient()
    ocr_results = []

    with open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    if response.text_annotations:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = ''.join(symbol.text for symbol in word.symbols)
                        cleaned_text = re.sub(r'[^\w,.!? ]+', '', word_text, flags=re.UNICODE)
                        if not cleaned_text.strip():
                            # Only punctuation or whitespace was left; skip it.
                            continue
                        ocr_results.append({
                            'text': cleaned_text,
                            'confidence': word.confidence,
                            'bounds': [
                                {'x': vertex.x, 'y': vertex.y}
                                for vertex in word.bounding_box.vertices
                            ],
                        })

    # With no usable word there is nothing to merge: return an empty JSON
    # list rather than handing an empty list to merge_texts.
    merged_ocr_results = merge_texts(ocr_results) if ocr_results else []
    return json.dumps(merged_ocr_results, indent=2)

def print_json_data(json_string):
    """Pretty-print a JSON document with non-ASCII characters left unescaped.

    The string is decoded and then re-encoded with a two-space indent and
    ``ensure_ascii=False`` before being written to standard output.
    """
    parsed = json.loads(json_string)
    pretty = json.dumps(parsed, ensure_ascii=False, indent=2)
    print(pretty)


# OCR the sample image with the detect_text defined earlier in this cell,
# which returns its results as a JSON string.
path_to_image = 'Genova.png'
ocr_output = detect_text(path_to_image)

# Re-render the JSON so non-ASCII characters are shown unescaped.
print_json_data(ocr_output)

[
  {
    "text": "A10",
    "confidence": 0.9873760342597961,
    "bounds": [
      {
        "x": 35,
        "y": 97
      },
      {
        "x": 51,
        "y": 97
      },
      {
        "x": 51,
        "y": 105
      },
      {
        "x": 35,
        "y": 105
      }
    ]
  },
  {
    "text": "SP35",
    "confidence": 0.9664221405982971,
    "bounds": [
      {
        "x": 68,
        "y": 56
      },
      {
        "x": 68,
        "y": 63
      },
      {
        "x": 78,
        "y": 56
      },
      {
        "x": 78,
        "y": 63
      },
      {
        "x": 81,
        "y": 56
      },
      {
        "x": 81,
        "y": 62
      },
      {
        "x": 92,
        "y": 56
      },
      {
        "x": 92,
        "y": 62
      }
    ]
  },
  {
    "text": "A7",
    "confidence": 0.8888351917266846,
    "bounds": [
      {
        "x": 137,
        "y": 178
      },
      {
        "x": 147,
        "y": 178
      },
      {
        "x": 147,
        "y": 18