
1: Generate `.box` Files from Images

In [6]:
import os


image_dir = r'C:\Users\krsna\OneDrive\Desktop\training_set' 
box_dir = os.path.join(image_dir, 'box_files') 


if not os.path.exists(box_dir):
    os.makedirs(box_dir)


for img_name in os.listdir(image_dir):
    if img_name.endswith('.bmp'):
       
        character = img_name.split('.')[0][0]  
       
        x1, y1 = 10, 10
        x2, y2 = 70, 70

        
        box_content = f"{character} {x1} {y1} {x2} {y2} 0\n"

       
        box_filename = os.path.join(box_dir, img_name.replace('.bmp', '.box'))
        with open(box_filename, 'w') as f:
            f.write(box_content)

        print(f"Generated {box_filename}")


Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\0.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\0xccd.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\1.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\10.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\100.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\101.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\102.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\103.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\104.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\105.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\106.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\107.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set\box_files\108.box
Generated C:\Users\krsna\OneDrive\Desktop\training_set

2: Create Project and Training Data Directories

In [8]:
import os


project_dir = 'sinhala_ocr_project'
os.makedirs(project_dir, exist_ok=True)


os.chdir(project_dir)


os.makedirs('training_data', exist_ok=True)

print(f"Created project directory: {os.path.abspath(project_dir)}")
print(f"Created training_data directory: {os.path.abspath('training_data')}")

Created project directory: C:\Users\krsna\PROJECT\sinhala_ocr_project\sinhala_ocr_project
Created training_data directory: C:\Users\krsna\PROJECT\sinhala_ocr_project\training_data


3: Copy `.bmp` Files to `training_data` Directory

In [10]:
import shutil


source_dir = r'C:\Users\krsna\OCR\sinhala_ocr_project\training_data\source_images' 

if not os.path.exists(source_dir):
    print(f"Error: The directory {source_dir} does not exist.")
else:
    
    for file in os.listdir(source_dir):
        if file.endswith('.bmp'):
            shutil.copy(os.path.join(source_dir, file), 'training_data')
    
    print("Copied .bmp files to training_data directory")


print("\nContents of training_data directory:")
print(os.listdir('training_data'))

Copied .bmp files to training_data directory

Contents of training_data directory:
['a1.bmp', 'a10.bmp', 'a11.bmp', 'a12.bmp', 'a13.bmp', 'a14.bmp', 'a15.bmp', 'a16.bmp', 'a17.bmp', 'a18.bmp', 'a19.bmp', 'a2.bmp', 'a20.bmp', 'a21.bmp', 'a22.bmp', 'a23.bmp', 'a24.bmp', 'a25.bmp', 'a3.bmp', 'a4.bmp', 'a5.bmp', 'a6.bmp', 'a7.bmp', 'a8.bmp', 'a9.bmp', 'aa1.bmp', 'aa10.bmp', 'aa11.bmp', 'aa12.bmp', 'aa13.bmp', 'aa14.bmp', 'aa15.bmp', 'aa16.bmp', 'aa17.bmp', 'aa18.bmp', 'aa19.bmp', 'aa2.bmp', 'aa20.bmp', 'aa21.bmp', 'aa22.bmp', 'aa23.bmp', 'aa24.bmp', 'aa25.bmp', 'aa3.bmp', 'aa4.bmp', 'aa5.bmp', 'aa6.bmp', 'aa7.bmp', 'aa8.bmp', 'aa9.bmp', 'ac1.bmp', 'ac10.bmp', 'ac11.bmp', 'ac12.bmp', 'ac13.bmp', 'ac14.bmp', 'ac15.bmp', 'ac16.bmp', 'ac17.bmp', 'ac18.bmp', 'ac19.bmp', 'ac2.bmp', 'ac20.bmp', 'ac21.bmp', 'ac22.bmp', 'ac23.bmp', 'ac24.bmp', 'ac25.bmp', 'ac3.bmp', 'ac4.bmp', 'ac5.bmp', 'ac6.bmp', 'ac7.bmp', 'ac8.bmp', 'ac9.bmp', 'ae1.bmp', 'ae10.bmp', 'ae11.bmp', 'ae12.bmp', 'ae13.bmp', 'ae14.bmp

4: Convert `.bmp` Files to `.tiff`

In [12]:
from PIL import Image
import os

for file in os.listdir('training_data'):
    if file.endswith('.bmp'):
        input_path = os.path.join('training_data', file)
        output_path = os.path.join('training_data', f"{os.path.splitext(file)[0]}.tiff")
        img = Image.open(input_path)
        img.save(output_path, 'TIFF')

print("Converted BMP files to TIFF")

Converted BMP files to TIFF


5: Generate `.box` and Training Files with Tesseract

In [14]:
import subprocess
import os

for file in os.listdir('training_data'):
    if file.endswith('.tiff'):
        base_name = os.path.splitext(file)[0]
        input_path = os.path.join('training_data', file)
             
        subprocess.run(['tesseract', input_path, base_name, 'batch.nochop', 'makebox'])
        
        subprocess.run(['tesseract', input_path, base_name, 'box.train'])

print("Generated box files and created training files")

Generated box files and created training files


6: Extract `unicharset` and Create `font_properties`

In [16]:
import subprocess

subprocess.run(['unicharset_extractor', 'training_data/*.box'])

with open('font_properties', 'w') as f:
    f.write("sinhala 0 0 0 0 0")

print("Extracted unicharset and created font properties file")

Extracted unicharset and created font properties file


7: Cluster Characters and Generate Training Files

In [18]:
import subprocess


subprocess.run(['shapeclustering', '-F', 'font_properties', '-U', 'unicharset', 'training_data/*.tr'])


subprocess.run(['mftraining', '-F', 'font_properties', '-U', 'unicharset', '-O', 'sinhala.unicharset', 'training_data/*.tr'])


subprocess.run(['cntraining', 'training_data/*.tr'])

print("Clustered characters and created necessary files")

Clustered characters and created necessary files


8: Rename and Combine Tesseract Training Files

In [20]:
import os
import subprocess

def safe_rename(src, dst):
    if os.path.exists(src):
        if os.path.exists(dst):
            os.remove(dst)
        os.rename(src, dst)
        print(f"Renamed {src} to {dst}")
    else:
        print(f"Warning: {src} not found, skipping rename")


print("Files in current directory:")
for file in os.listdir('.'):
    print(file)


safe_rename('inttemp', 'sinhala.inttemp')
safe_rename('normproto', 'sinhala.normproto')
safe_rename('pffmtable', 'sinhala.pffmtable')
safe_rename('shapetable', 'sinhala.shapetable')


try:
    subprocess.run(['combine_tessdata', 'sinhala.'], check=True)
    print("Files combined successfully")
except subprocess.CalledProcessError:
    print("Error: Failed to combine files. Make sure Tesseract is installed and in your PATH.")
except Exception as e:
    print(f"An unexpected error occurred while combining files: {e}")


print("\nFiles after renaming and combining:")
for file in os.listdir('.'):
    print(file)

Files in current directory:
a1.box
a1.tr
a10.box
a10.tr
a11.box
a11.tr
a12.box
a12.tr
a13.box
a13.tr
a14.box
a14.tr
a15.box
a15.tr
a16.box
a16.tr
a17.box
a17.tr
a18.box
a18.tr
a19.box
a19.tr
a2.box
a2.tr
a20.box
a20.tr
a21.box
a21.tr
a22.box
a22.tr
a23.box
a23.tr
a24.box
a24.tr
a25.box
a25.tr
a3.box
a3.tr
a4.box
a4.tr
a5.box
a5.tr
a6.box
a6.tr
a7.box
a7.tr
a8.box
a8.tr
a9.box
a9.tr
aa1.box
aa1.tr
aa10.box
aa10.tr
aa11.box
aa11.tr
aa12.box
aa12.tr
aa13.box
aa13.tr
aa14.box
aa14.tr
aa15.box
aa15.tr
aa16.box
aa16.tr
aa17.box
aa17.tr
aa18.box
aa18.tr
aa19.box
aa19.tr
aa2.box
aa2.tr
aa20.box
aa20.tr
aa21.box
aa21.tr
aa22.box
aa22.tr
aa23.box
aa23.tr
aa24.box
aa24.tr
aa25.box
aa25.tr
aa3.box
aa3.tr
aa4.box
aa4.tr
aa5.box
aa5.tr
aa6.box
aa6.tr
aa7.box
aa7.tr
aa8.box
aa8.tr
aa9.box
aa9.tr
ac1.box
ac1.tr
ac10.box
ac10.tr
ac11.box
ac11.tr
ac12.box
ac12.tr
ac13.box
ac13.tr
ac14.box
ac14.tr
ac15.box
ac15.tr
ac16.box
ac16.tr
ac17.box
ac17.tr
ac18.box
ac18.tr
ac19.box
ac19.tr
ac2.box
ac2.tr
ac20.box

9: Move `sinhala.traineddata` to the Current Directory

In [33]:
import shutil
import os


source_path = r'C:\Users\krsna\Downloads\jTessBoxEditor-master\jTessBoxEditor-master\tesseract-ocr\tessdata\sin.traineddata'


if not os.path.exists(source_path):
    raise FileNotFoundError(f"The file {source_path} does not exist.")


destination_dir = os.getcwd()  


destination_path = os.path.join(destination_dir, 'sinhala.traineddata')


shutil.move(source_path, destination_path)
print(f"Moved {source_path} to {destination_path}")

Moved C:\Users\krsna\Downloads\jTessBoxEditor-master\jTessBoxEditor-master\tesseract-ocr\tessdata\sin.traineddata to C:\Users\krsna\PROJECT\sinhala_ocr_project\sinhala_ocr_project\sinhala.traineddata


10: Convert `.jpg` to `.tiff`

In [38]:
from PIL import Image
import os
import tempfile

def convert_jpg_to_tiff(input_path, output_path):
    try:
   
        with Image.open(input_path) as img:
          
            with tempfile.NamedTemporaryFile(delete=False, suffix='.tiff') as temp_file:
                img.save(temp_file.name, 'TIFF')
            
           
            try:
                os.replace(temp_file.name, output_path)
                print(f"Converted {input_path} to {output_path}")
            except PermissionError:
                print(f"Permission denied. Saving to {temp_file.name} instead.")
                print("Please move the file manually to your desired location.")
    except Exception as e:
        print(f"Error converting {input_path}: {str(e)}")


img_path = r'C:/Users/krsna/OneDrive/Desktop/train/sinhala.png'
tiff_path = r'C:/Users/krsna/OneDrive/Desktop/train/sinhala.tiff'


convert_jpg_to_tiff(img_path, tiff_path)

Converted C:/Users/krsna/OneDrive/Desktop/train/sinhala.png to C:/Users/krsna/OneDrive/Desktop/train/sinhala.tiff


In [32]:
import cv2 as cv
def preprocess_image(imgs):
 
 
    img = cv.imread(imgs)
    

    if img is None:
        raise FileNotFoundError(f"Image file '{imgs}' not found.")
    else:
        print("Image loaded successfully.")
    
  
    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    print("Converted to grayscale.")
    
  
    blurred = cv.GaussianBlur(gray, (5, 5), 0)
    print("Applied Gaussian blur.")
    
  
    _, thresh = cv.threshold(blurred, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
    print("Applied binary thresholding.")
    
    return thresh


preprocessed_image = preprocess_image('C:/Users/krsna/OneDrive/Desktop/train/sinhala.tiff')

Image loaded successfully.
Converted to grayscale.
Applied Gaussian blur.
Applied binary thresholding.


11: Test the OCR Model

In [40]:
import os
import pytesseract
from PIL import Image

test_image = tiff_path


if not os.path.exists(test_image):
    print(f"Error: The test image {test_image} does not exist.")
else:
    try:
       
        text = pytesseract.image_to_string(Image.open(test_image), lang='sin')
        
        print("Model testing complete. OCR Result:")
        print(text)
        
     
        with open('output.txt', 'w', encoding='utf-8') as f:
            f.write(text)
        print("Results saved to 'output.txt'")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

Model testing complete. OCR Result:
මගේ නම ජෝන්‌. මම පාසලේ ඉගෙන ගන්නවා. මම
වැඩසටහන්කරණය ගැන උනන්දුයි. ඒ නිසා මම ඉක්මනින්ම එය
ඉගෙන ගන්නෙමි. මා ගත යුතු පළමු පියවර කුමක්ද?

Results saved to 'output.txt'


12: Perform OCR and Translate Text

In [42]:
import os
import pytesseract
from PIL import Image
from googletrans import Translator


test_image = tiff_path


if not os.path.exists(test_image):
    print(f"Error: The test image {test_image} does not exist.")
else:
    try:
  
        sinhala_text = pytesseract.image_to_string(Image.open(test_image), lang='sin')
        
        print("OCR Result (Sinhala):")
        print(sinhala_text)
        

        translator = Translator()
        english_text = translator.translate(sinhala_text, src='si', dest='en').text
        
        print("\nTranslated Result (English):")
        print(english_text)
        
     
        with open('output.txt', 'w', encoding='utf-8') as f:
            f.write("OCR Result (Sinhala):\n")
            f.write(sinhala_text)
            f.write("\n\nTranslated Result (English):\n")
            f.write(english_text)
        print("\nResults saved to 'output.txt'")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

OCR Result (Sinhala):
මගේ නම ජෝන්‌. මම පාසලේ ඉගෙන ගන්නවා. මම
වැඩසටහන්කරණය ගැන උනන්දුයි. ඒ නිසා මම ඉක්මනින්ම එය
ඉගෙන ගන්නෙමි. මා ගත යුතු පළමු පියවර කුමක්ද?


Translated Result (English):
My name is John.I'm studying at school.I
Interested in programming.So I quickly
I will learn.What is the first step to take?

Results saved to 'output.txt'
