In [23]:
!pip install easyocr




In [None]:
import easyocr
from PIL import Image
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import os

# Load the "sagorsarker/mbert-bengali-ner" token-classification checkpoint and
# its tokenizer through Hugging Face transformers (this is not a spaCy model).
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/mbert-bengali-ner")
model = AutoModelForTokenClassification.from_pretrained("sagorsarker/mbert-bengali-ner")

# aggregation_strategy="simple" groups adjacent tokens that share a label into
# one entity; it is the documented replacement for the deprecated
# grouped_entities=True flag and returns the same 'entity_group' / 'score' /
# 'word' / 'start' / 'end' dictionaries.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Lazily created EasyOCR reader, shared by every perform_ocr call.
_bengali_reader = None


# Function to perform OCR on an image using EasyOCR
def perform_ocr(image_path):
    """Run Bengali OCR on one image and return the recognised text.

    Args:
        image_path: Image input handed unchanged to ``Reader.readtext``.

    Returns:
        The recognised text fragments joined by single spaces, or an empty
        string when nothing is detected.
    """
    global _bengali_reader
    if _bengali_reader is None:
        # Constructing a Reader loads the OCR models, so build it once and
        # reuse it instead of paying that cost for every image.
        _bengali_reader = easyocr.Reader(['bn'])  # 'bn' is the language code for Bengali
    detections = _bengali_reader.readtext(image_path)
    # Index 1 of each detection holds the recognised text.
    return ' '.join(detection[1] for detection in detections)

# Function to perform NER on text
def perform_ner(text):
    """Return whatever the module-level ``nlp`` pipeline extracts from ``text``."""
    return nlp(text)

# Example usage: OCR every .jpeg image in image_dir, run NER on the recognised
# text, print a report, and collect one
# (text, {"entities": [(start, end, label), ...]}) record per image.
image_dir = '/content/drive/MyDrive/Colab_Notebooks/Bangla_LPDB/'
training_data = []

# os.listdir returns names in arbitrary order; sorting keeps the printed report
# and the collected records reproducible from run to run.
for filename in sorted(os.listdir(image_dir)):
    # Match the extension case-insensitively so '.JPEG' files are not skipped.
    if not filename.lower().endswith('.jpeg'):
        continue

    image_path = os.path.join(image_dir, filename)
    text = perform_ocr(image_path)
    named_entities = perform_ner(text)

    print("Image:", filename)
    print("\nOCR Output:")
    print(text)
    print("\nNamed Entities:")
    print(named_entities)
    print("\nEntity word:\tEntity label:")
    for entity in named_entities:
        print(entity['word'], entity['entity_group'])
        print()
    print("--------------------------------------------------")

    # Keep only the character offsets and the label of each entity.
    entities_list = [
        (entity['start'], entity['end'], entity['entity_group'])
        for entity in named_entities
    ]
    training_data.append((text, {"entities": entities_list}))

# Output the training data in the desired format: one
# (text, {'entities': [...]}) tuple literal per line, each followed by a comma.
output_file = "/content/drive/MyDrive/Colab_Notebooks/Bangla_LPDB_ANPR_semi_annotated.txt"
with open(output_file, "w", encoding="utf-8") as f:
    # {text!r} writes a correctly quoted and escaped string literal. Wrapping
    # the raw text in hand-written double quotes produced a broken literal
    # whenever the OCR text contained a '"' or a backslash.
    f.writelines(
        f'({text!r}, {annotations}),\n' for text, annotations in training_data
    )



Streaming output truncated to the last 5000 lines.
[{'entity_group': 'LABEL_5', 'score': 0.99084777, 'word': 'পদমাখালী', 'start': 0, 'end': 9}, {'entity_group': 'LABEL_0', 'score': 0.99916315, 'word': ': ১২ - ০২৪০', 'start': 9, 'end': 18}]

Entity word:	Entity label:
পদমাখালী LABEL_5

: ১২ - ০২৪০ LABEL_0

--------------------------------------------------
Image: Vehicle1906.jpeg

OCR Output:
হ ১৫-১৪৪৫ বগুড়া-

Named Entities:
[{'entity_group': 'LABEL_0', 'score': 0.99165124, 'word': 'হ ১৫ - ১৪৪৫', 'start': 0, 'end': 9}, {'entity_group': 'LABEL_5', 'score': 0.96830744, 'word': 'বগডা', 'start': 10, 'end': 15}, {'entity_group': 'LABEL_0', 'score': 0.9991272, 'word': '-', 'start': 15, 'end': 16}]

Entity word:	Entity label:
হ ১৫ - ১৪৪৫ LABEL_0

বগডা LABEL_5

- LABEL_0

--------------------------------------------------
Image: Vehicle1915.jpeg

OCR Output:
'১৯>৩

Named Entities:
[{'entity_group': 'LABEL_0', 'score': 0.9987036, 'word': "' ১৯ > ৩", 'start': 0, 'end': 5}]

Entity 