In [96]:
import os
import random
import re
import pathlib
import subprocess

In [97]:
# Name of the font handed to text2image via --font.
FONT = 'Consolas'
# Directory handed to text2image via --fonts_dir.
FONT_DIRECTORY = './fonts'
# Upper bound on the number of generated training lines (half MACs, half serials).
DATA_SET_SIZE = 800
# Common prefix of every generated ground-truth / image file name.
TRAINING_FILE_NAME_STEM = 'mac_serials'
# Ground-truth files and rendered images are written here; the folder name
# is derived from the lower-cased font name.
OUTPUT_DIRECTORY = 'tesstrain/data/{}-ground-truth'.format(FONT.lower())

In [98]:
# Create the output directory together with any missing parents
# (os.mkdir would fail when e.g. 'tesstrain/data' does not exist yet).
# exist_ok=True keeps the cell idempotent across re-runs.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

# Preprocessing

## Generating the training data

This is example code that generates fake MAC addresses and serial numbers. You may use any data here that best suits your use case; just be sure to have a single line per file.

In [99]:
def generate_random_mac():
    """Return a random MAC-style string: six upper-case hex octets joined by '-'.

    Each octet is drawn independently with ``random.randint(0, 255)`` and
    rendered as two zero-padded hex digits, e.g. ``'0A-FF-3C-00-9B-7E'``.
    """
    octets = []
    for _ in range(6):
        octets.append(format(random.randint(0, 255), '02X'))
    return '-'.join(octets)

def generate_random_serial():
    """Return a random serial of the form ``<NN>XD<NNNNNNN>``.

    The leading number is drawn from 20..90 and the trailing number from
    3000001..9000000 (both inclusive), in that order.
    """
    leading_number = random.randint(20, 90)
    trailing_number = random.randint(3000001, 9000000)
    return f'{leading_number}XD{trailing_number}'

# Build the training corpus: one MAC and one serial per iteration, so the
# two kinds of text are equally represented.
lines = []

# Integer division is required: range() rejects the float that `/` produces.
# For an odd DATA_SET_SIZE this yields DATA_SET_SIZE - 1 lines.
for _ in range(DATA_SET_SIZE // 2):
    lines.append(generate_random_mac())
    lines.append(generate_random_serial())

# Mix MACs and serials so the numbered output files are not strictly alternating.
random.shuffle(lines)

## Generating the images from the training data

In [100]:
# Set before text2image is launched so that every child process inherits it.
# TODO: change me. The fonts.conf file is usually generated in the directory
# this notebook resides in.
os.environ['FONTCONFIG_FILE'] = 'C:\\Absolute\\Path\\To\\fonts.conf'

# Base names of the lines for which text2image exited with a non-zero status.
failed_renders = []

for line_index, line in enumerate(lines):
    # Ground-truth text and rendered image share the same numbered base name.
    file_base_name = f'{TRAINING_FILE_NAME_STEM}_{line_index}'
    line_training_text = os.path.join(OUTPUT_DIRECTORY, f'{file_base_name}.gt.txt')

    # One line of text per ground-truth file. UTF-8 is explicit so that
    # non-ASCII training text is not written in the platform default encoding.
    with open(line_training_text, 'w', encoding='utf-8') as output_file:
        output_file.write(line)

    # Exposure and point size are randomised per line to vary the rendered images.
    result = subprocess.run([
        'text2image',
        f'--font={FONT}',
        f'--text={line_training_text}',
        f'--outputbase={OUTPUT_DIRECTORY}/{file_base_name}',
        '--max_pages=1',
        '--strip_unrenderable_words',
        '--leading=0',
        '--xsize=700',
        '--ysize=300',
        '--char_spacing=0.25',
        f'--exposure={random.randint(-2,0)}',
        '--unicharset_file=langdata/eng.unicharset',
        f'--fonts_dir={FONT_DIRECTORY}',
        f'--ptsize={random.randint(6,10)}',
        '--resolution=300',
    ])

    # Keep going on failure (best effort), but remember what went wrong so it
    # does not pass unnoticed.
    if result.returncode != 0:
        failed_renders.append(file_base_name)

if failed_renders:
    print(
        f'text2image failed for {len(failed_renders)} of {len(lines)} lines: '
        + ', '.join(failed_renders)
    )

# Actual training

I strongly recommend using one of Tesseract's "best" models from [this](https://github.com/tesseract-ocr/tessdata_best) repository as the `START_MODEL`.

Run in a Linux shell (I used WSL2):
    
```bash
cd tesstrain
TESSDATA_PREFIX=../tesseract/tessdata make training MODEL_NAME=Consolas START_MODEL=eng TESSDATA=../tesseract/tessdata MAX_ITERATIONS=13500 PSM=6
```