# Character-level RNN for Handwritten-like Text Generation

This notebook implements a character-level recurrent neural network (RNN) to generate handwritten-like text. The model is trained on the HKR dataset (Handwritten Kazakh and Russian) text data.

## 1. Import Libraries

In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
import random
import sys

## 2. Load and Preprocess Dataset

Load text data from the HKR dataset images using OCR. Requires pytesseract and Tesseract OCR installed.
Ensure Tesseract executable is installed and pytesseract is available.
Install pytesseract with `pip install pytesseract`.

Import necessary libraries and extract text from images in the dataset directory.

In [2]:
dataset_path = "CODSOFT_05/HKR_Dataset-master/hkr_text.txt"

# New dataset images directory path
dataset_images_dir = r"C:\Users\User\Desktop\CODSOFT\CODSOFT_05\HKR_Dataset-master\images"

import pytesseract
from PIL import Image
import glob

text = ""
image_files = glob.glob(dataset_images_dir + "\*.jpg")
for img_file in image_files:
    img = Image.open(img_file)
    extracted_text = pytesseract.image_to_string(img, lang='eng')  # Adjust lang as needed
    text += extracted_text + "\n"

if len(text) < 40:
    raise ValueError(f"Extracted text is too short for training. Length: {len(text)} characters.")

print(f"Extracted corpus length: {len(text)} characters")

  image_files = glob.glob(dataset_images_dir + "\*.jpg")


Extracted corpus length: 65 characters


## 3. Create Character Vocabulary and Encode Text

In [3]:
chars = sorted(list(set(text)))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}

vocab_size = len(chars)
print(f"Total unique characters: {vocab_size}")

Total unique characters: 27


## 4. Prepare Training Sequences

In [4]:
seq_length = 40
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - seq_length, step):
    sentences.append(text[i: i + seq_length])
    next_chars.append(text[i + seq_length])

print(f"Number of sequences: {len(sentences)}")

Number of sequences: 9


## 5. Vectorize Input and Output

In [5]:
X = np.zeros((len(sentences), seq_length, vocab_size), dtype=np.bool_)
y = np.zeros((len(sentences), vocab_size), dtype=np.bool_)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1

## 6. Build the Model

In [6]:
model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, vocab_size)))
model.add(Dense(vocab_size, activation='softmax'))

optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

model.summary()

  super().__init__(**kwargs)


## 7. Define Sampling Function for Text Generation

In [7]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## 8. Train the Model and Generate Text After Each Epoch

In [8]:
import sys
def on_epoch_end(epoch, logs):
    print(f"\n----- Generating text after Epoch: {epoch + 1}")
    if len(text) <= seq_length:
        print("Text length is too short for generation. Skipping generation.")
        return
    start_index = random.randint(0, len(text) - seq_length - 1)
    generated = ''
    sentence = text[start_index: start_index + seq_length]
    generated += sentence
    print(f"----- Seed: \"{sentence}\"")
    sys.stdout.write(generated)
    sys.stdout.flush()

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        sys.stdout.write(f"\n----- temperature: {temperature}\n")
        sys.stdout.flush()
        generated = ''
        sentence = text[start_index: start_index + seq_length]
        for i in range(400):
            x_pred = np.zeros((1, seq_length, vocab_size))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_idx[char]] = 1

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = idx_to_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

        print(generated)
        sys.stdout.flush()

from tensorflow.keras.callbacks import LambdaCallback
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(X, y, batch_size=128, epochs=20, callbacks=[print_callback])

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 3.2872
----- Generating text after Epoch: 1
----- Seed: "eptagzA

yom’ Reser? Bonrob’

lem? kes? "
eptagzA

yom’ Reser? Bonrob’

lem? kes? 
----- temperature: 0.2
kmBbJyaeeobO?As’gmyJOalzmbsspz’k? naN?esryhyOpoga
mtynength Ozge?wetb?lbBz ph
hp
zaammNomekAJrOO?w
Owg zRnwJ tpOnzbmallRyysRBr
m?oaBlkJ’nn yheB’?
B’ezRrrrsJynNBnBkwzn??’lJzpA

sphmso 
bo?A pp’
hnap’smgstnprB
hbyBhzNhlayno?gR bpn zgynJgO sa?mmpJb pNkJ?bBRJpzmyRawNRJ’rzNylARgwAtgRt
bl’z’lkpRwlyAw Opyll’gzktsBNtw bs?J’sgpaAJnehneeswkB 
nnkyypt’on
?l?ltr l 
On?l’y
sJr’ ANAOtA?apBJnwk
s?nlAlmzw?khwzAN?

----- temperature: 0.5
nswbkpBpazt?ByrblJkr
N
r
thrz
 Ra
mOb’NbglNgBO’aBh
sONa
s?mRNzR rrNbJpk?aOomsmhwO’pm
OArrnoh nzRm  b?yJ
meBpa’lwlnaoaob
’ohBR 
w?RJg
pwlpsnmtgzsz?bgOyo?nwJJwpknpoJgkAhOtongyspsezazaoNNwhszlRynrnolew gAzzbhB’
mkNsmr
sya’yAy’yJOJl’swht ARkm?ArzntteaNtsnA ’BNpbyb’tBpospN
beoblpns?zBOlroRRzmNtN’g
’n’ylAyNkN?ktJ?gBowt ObAgso
l

<keras.src.callbacks.history.History at 0x28782b7b0b0>