# **Importing Packages and Libraries**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import os
import ast
import cv2
import matplotlib.pyplot as plt
from PIL import ImageOps , Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
import torchvision.models as models
from tqdm import tqdm

# **Importing Data**

In [6]:
# Kaggle input locations for the TextOCR dataset.
# NOTE(review): "AANOTATIONS" looks like a typo for "ANNOTATIONS"; the name is read by the
# CSV-loading cell, so rename both places together.
AANOTATIONS_PATH = '/kaggle/input/textocr-text-extraction-from-images-dataset/annot.csv'
IMAGES_INFO_PATH = '/kaggle/input/textocr-text-extraction-from-images-dataset/img.csv'
# Directory of the image files; the dataset class appends "<image_id>.jpg" to this path.
IMAGES_PATH = '/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images'

In [7]:
# Read the annotation table (its image_id / bbox / utf8_string columns are used during
# preprocessing) and the image-info table, both as UTF-8 CSV files.
images_annotations, images_info_data = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (AANOTATIONS_PATH, IMAGES_INFO_PATH)
)

In [8]:
# Fixed size (in pixels) that every cropped image is resized to before it reaches the model.
max_width = 500
max_height = 500

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# **Preprocessing Data**

In [9]:
def preprocess_text_recognizer_data(images_annotations, max_width, max_height):
    """Turn the annotation table into parallel lists of image ids and recognition targets.

    A row is kept only when its image is not in the known-bad list, its label is present
    and non-empty, the label is at most 15 characters long, and every character is an
    ASCII letter or digit.

    Args:
        images_annotations: DataFrame with at least the columns ``image_id``, ``bbox``
            (a string holding a Python list literal) and ``utf8_string``.
        max_width: Unused; kept so existing callers keep working.
        max_height: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(images, targets, index_to_letters)`` where ``images`` is a list of
        image ids, ``targets`` is a list of ``{"box": <parsed bbox>, "text": <list of
        class indices>}`` dicts aligned with ``images``, and ``index_to_letters`` maps a
        class index back to its character.
    """
    # An ordered string, not a set: iterating a set of strings yields a different order
    # in every Python process, which would silently reshuffle the class indices between
    # kernel restarts and make saved checkpoints undecodable.
    # NOTE: checkpoints trained with the previous (process-dependent) mapping cannot be
    # decoded reliably with any mapping; retrain or fine-tune with this fixed one.
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    letter_to_index = {letter: index for index, letter in enumerate(alphabet)}
    index_to_letters = dict(enumerate(alphabet))
    max_text_length = 15

    # Image ids whose annotations are known to be wrong and must be skipped.
    wrong_annotated_set = {
        '0c6eb2e5cff08c2b',
        '1f7ad7273543715d',
        '260b8a281ff565ed',
        '5bcbdb5d7d9f5f4a',
        '2ceb34939bf4133c',
    }

    data = images_annotations[['image_id', 'bbox', 'utf8_string']]
    images, targets = [], []

    # Iterating the columns directly avoids building one Series per row (iterrows).
    rows = zip(data['image_id'], data['bbox'], data['utf8_string'])
    for image_id, bbox, raw_text in tqdm(rows, total=len(data)):
        if image_id in wrong_annotated_set:
            continue
        # A missing label would otherwise be stringified to "nan" and learned as a word.
        if pd.isna(raw_text):
            continue

        text = str(raw_text)
        # Empty labels give CTC nothing to align; long labels are out of scope.
        if not text or len(text) > max_text_length:
            continue
        # Reject the whole word if any character falls outside the alphabet.
        if any(letter not in letter_to_index for letter in text):
            continue

        images.append(image_id)
        targets.append({
            "box": ast.literal_eval(bbox),
            "text": [letter_to_index[letter] for letter in text],
        })

    return images, targets, index_to_letters

In [10]:
# Build the filtered image-id list, the matching {"box", "text"} targets, and the
# index -> character lookup.
# NOTE: max_width / max_height are accepted by the function but not used inside it.
images, boxes_and_labels, index_to_letters = preprocess_text_recognizer_data(images_annotations, max_width, max_height)

1052354it [01:27, 12076.19it/s]


# **Splitting Data**

In [11]:
# Fixed seed so the train / validation / test membership is the same after every kernel
# restart. Without it each run draws a new split, and because a previously trained
# checkpoint is reloaded later in this notebook, validation and test samples could be ones
# the model already trained on.
SPLIT_SEED = 42

# 80% training, 20% held out ...
training_images, testing_images, training_targets, testing_targets = train_test_split(
    images, boxes_and_labels, test_size=.2, shuffle=True, random_state=SPLIT_SEED)
# ... and the held-out part is halved into validation and test (10% each overall).
validation_images, testing_images, validation_targets, testing_targets = train_test_split(
    testing_images, testing_targets, test_size=.5, shuffle=True, random_state=SPLIT_SEED)

# **Data Loading**

In [12]:
# DataLoader settings shared by the loader cells below.
# NOTE(review): the loader cells do not pass batch_size, so getRecognizerLoader's own
# default of 1 is what takes effect; it happens to equal this value.
batch_size, num_workers = 1, 1
pin_memory = shuffle = True

# Number of full passes over the training loader.
epochs = 2

In [13]:
def resize_image_and_padding(image,max_height,max_width) :
    """Resize ``image`` to exactly ``max_width`` x ``max_height`` pixels.

    Despite the name, no padding is applied: the image is handed straight to
    ``cv2.resize`` with the target size given as ``(width, height)``, so the aspect
    ratio is not preserved.
    """
    target_size = (max_width, max_height)
    return cv2.resize(image, target_size)

In [14]:
def preprocess_image(image_path,max_height,max_width,bbox=None) :
    """Load an image, optionally crop it to a box, and return it as a CHW tensor in [0, 1].

    Args:
        image_path: Path of the image file to read.
        max_height: Height the (cropped) image is resized to.
        max_width: Width the (cropped) image is resized to.
        bbox: Optional ``(x, y, w, h)`` crop box in pixels; values are truncated to int.
            The box is clamped to the image and always yields at least one pixel.

    Returns:
        A tensor of shape ``(3, max_height, max_width)`` holding RGB values scaled by
        1/255. The dtype is whatever ``image / 255`` produces (callers cast to float32).

    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    image = cv2.imread(image_path)
    if image is None :
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Image is missing or unreadable: {image_path}")

    if bbox is not None :
        x , y , w , h = (int(value) for value in bbox)
        image_height , image_width = image.shape[:2]
        # Clamp to the image: a negative start would wrap around in numpy slicing, and
        # an out-of-range or zero-sized box would give an empty crop that cannot be resized.
        left = min(max(x, 0), image_width - 1)
        top = min(max(y, 0), image_height - 1)
        right = max(min(x + w, image_width), left + 1)
        bottom = max(min(y + h, image_height), top + 1)
        image = image[top:bottom, left:right]

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = resize_image_and_padding(image,max_height,max_width)
    image = image / 255
    image = torch.from_numpy(image)
    # HWC -> CHW, the layout the convolutional model consumes.
    image = image.permute(2, 0, 1)
    return image

In [15]:
class CustomRecognizerDataset(Dataset) :
    """Map-style dataset yielding one cropped word image and its encoded label per index.

    Args:
        images: Sequence of image ids (file names without the ``.jpg`` extension).
        targets: Sequence aligned with ``images``; each item is a dict with a ``"box"``
            crop box and a ``"text"`` list of class indices.
        max_height: Height passed to ``preprocess_image``.
        max_width: Width passed to ``preprocess_image``.
    """
    def __init__(self,images,targets,max_height,max_width) :
        self.max_height = max_height
        self.max_width = max_width
        self.image_id , self.targets = images , targets

    def __len__(self) :
        """Return the number of (image, target) pairs."""
        return len(self.image_id)

    def __getitem__(self,index) :
        """Return ``{"images": <image tensor>, "targets": <1-D long tensor of class indices>}``."""
        image_path = os.path.join(IMAGES_PATH,self.image_id[index]+'.jpg')
        image = preprocess_image(image_path,self.max_height,self.max_width,bbox=self.targets[index]['box'])
        # Explicit dtype: torch.tensor([]) would otherwise be float32, so the label dtype
        # would depend on the data instead of always being an integer index tensor.
        encoding = torch.tensor(self.targets[index]['text'], dtype=torch.long)

        return {"images" : image , "targets" : encoding}

In [16]:
class getRecognizerLoader :
    """Bundle a ``CustomRecognizerDataset`` with the settings used to wrap it in a DataLoader.

    The constructor only stores its arguments and builds the dataset; the DataLoader
    itself is created on each ``get_loader()`` call.
    """
    def __init__(self,images,targets , num_workers , shuffle , pin_memory ,max_height,max_width ,batch_size = 1 ) :
        self.batch_size , self.num_workers = batch_size , num_workers
        self.pin_memory , self.shuffle = pin_memory , shuffle
        self.dataset = CustomRecognizerDataset(images, targets, max_height, max_width)

    def get_loader(self) :
        """Return a new DataLoader over the stored dataset using the stored settings."""
        loader_options = {
            "batch_size": self.batch_size,
            "shuffle": self.shuffle,
            "num_workers": self.num_workers,
            "pin_memory": self.pin_memory,
        }
        return DataLoader(self.dataset, **loader_options)

In [17]:
# Settings common to the three loaders. batch_size is not passed, so each loader uses
# getRecognizerLoader's default of 1.
common_loader_options = {
    "num_workers": num_workers,
    "pin_memory": pin_memory,
    "max_height": max_height,
    "max_width": max_width,
}

training_data_loader = getRecognizerLoader(
    images=training_images, targets=training_targets, shuffle=shuffle, **common_loader_options
).get_loader()

validation_data_loader = getRecognizerLoader(
    images=validation_images, targets=validation_targets, shuffle=shuffle, **common_loader_options
).get_loader()

# The test loader shuffles unconditionally, independent of the `shuffle` setting.
testing_data_loader = getRecognizerLoader(
    images=testing_images, targets=testing_targets, shuffle=True, **common_loader_options
).get_loader()

# **Building Model**

In [18]:
# Input width of the final linear layer; each LSTM direction gets half of it.
hidden_size = 256
# Number of stacked LSTM layers.
lstm_layers = 2
# One class per alphabet character plus one extra index, used as the CTC blank.
num_classes = len(index_to_letters) + 1

In [19]:
# Show the class count (alphabet size plus the extra CTC blank index).
num_classes

63

In [20]:
class TextRecognizer(nn.Module) :
    """CNN backbone followed by a bidirectional LSTM and a linear classifier.

    The output holds one vector of unnormalised class scores per column of the CNN
    feature map, shaped ``(batch, feature_map_width, num_classes)``.

    Args:
        hidden_size: Input width of the final linear layer; each LSTM direction uses
            ``hidden_size // 2`` units.
        lstm_layers: Number of stacked LSTM layers.
        num_classes: Size of the score vector produced for every column.
    """
    def __init__(self, hidden_size, lstm_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = lstm_layers

        # Pretrained VGG16 with its last two child modules removed.
        # NOTE(review): presumably this leaves only the convolutional feature extractor;
        # confirm against the installed torchvision.
        backbone = models.vgg16(pretrained = True)
        kept_children = list(backbone.children())[:-2]
        self.cnn = nn.Sequential(*kept_children)

        # 7680 must equal channels * height of the CNN feature map (see forward()).
        # NOTE(review): presumably 512 channels x 15 rows for 500x500 inputs; re-derive
        # it before changing the input size.
        self.lstm = nn.LSTM(7680, hidden_size // 2, lstm_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self,image) :
        """Return per-column class scores of shape ``(batch, width, num_classes)``."""
        features = self.cnn(image)
        batch_size, channels, height, width = features.shape

        # (batch, channels, height, width) -> (batch, width, channels, height)
        columns = features.permute(0, 3, 1, 2)
        # Merge the channel and height axes: one feature vector per feature-map column.
        sequence = columns.reshape(batch_size, width, channels * height)

        recurrent_output, _ = self.lstm(sequence)
        return self.fc(recurrent_output)

In [21]:
# Build the recognizer and move it to the selected device in one step.
text_recognizer_model = TextRecognizer(hidden_size, lstm_layers, num_classes).to(device)

# Mark every parameter, including the pretrained CNN weights, as trainable.
for parameter in text_recognizer_model.parameters() :
    parameter.requires_grad = True

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:03<00:00, 165MB/s]  


In [22]:
# Checkpoint produced by an earlier training session.
SAVED_TEXT_RECOGNIZER_MODEL_PATH = '/kaggle/input/text-tecognizer-model/text_recognizer_best_model.bin'
# map_location=device remaps the stored tensors onto the device in use, so a checkpoint
# saved from a GPU session also loads on a CPU-only kernel.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
text_recognizer_model.load_state_dict(torch.load(SAVED_TEXT_RECOGNIZER_MODEL_PATH, map_location=device))

<All keys matched successfully>

In [23]:
# Lowest validation loss seen so far; starts at +inf so the first finite validation loss
# triggers a checkpoint save in the training loop.
best_score = float('inf')

# AdamW over every model parameter with a learning rate of 1e-4.
optimizer = AdamW(text_recognizer_model.parameters(), lr=0.0001)

# Number of batches per training epoch.
print(len(training_data_loader))

471392


In [24]:
# Set the learning rate of every optimizer parameter group to 1e-4, the same value the
# optimizer was constructed with.
for group in optimizer.param_groups:
    group['lr'] = 0.0001

# **Training and Evaluation functions**

In [28]:
def train_text_recognizer(model,data_loader,optimizer,device,blank_index) :
    """Run one training epoch with CTC loss and return the mean batch loss.

    Args:
        model: Module mapping an image batch to scores of shape (batch, steps, classes).
        data_loader: Iterable of dicts with an ``"images"`` tensor and a ``"targets"``
            tensor of shape (batch, target_length).
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.
        blank_index: Class index reserved for the CTC blank symbol.

    Returns:
        The average of the per-batch loss values (``np.average`` over the epoch).
    """
    model.train()
    # Loop-invariant, so build the loss module once instead of once per batch.
    criterion = nn.CTCLoss(blank=blank_index, zero_infinity=True)
    losses = []

    for batch in tqdm(data_loader) :
        optimizer.zero_grad()
        image = batch["images"].to(device).to(torch.float32)
        targets = batch["targets"].to(device)

        # CTCLoss expects (steps, batch, classes) log-probabilities.
        log_probs = model(image).permute(1, 0, 2).log_softmax(2)
        # Every sample uses the full output length ...
        input_lengths = torch.tensor([log_probs.size(0)] * log_probs.size(1))
        # ... and every target in the batch is treated as having the full tensor width.
        target_lengths = torch.tensor([targets.size(1)] * targets.size(0))

        loss = criterion(log_probs, targets, input_lengths, target_lengths)
        losses.append(loss.item())
        loss.backward()
        # Clip the gradients of the model being trained (the `model` argument, not a
        # notebook-level global) to a maximum norm of 5.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

    return np.average(losses)

In [29]:
def evaluate_text_recognizer(model,data_loader,device,blank_index) :
    """Compute the mean CTC loss of ``model`` over ``data_loader`` without training.

    The model is switched to eval mode (and left in it) and the forward passes run with
    gradient tracking disabled.

    Args:
        model: Module mapping an image batch to scores of shape (batch, steps, classes).
        data_loader: Iterable of dicts with an ``"images"`` tensor and a ``"targets"``
            tensor of shape (batch, target_length).
        device: Device the batch tensors are moved to.
        blank_index: Class index reserved for the CTC blank symbol.

    Returns:
        The average of the per-batch loss values (``np.average`` over the loader).
    """
    model.eval()
    # Loop-invariant, so build the loss module once instead of once per batch.
    criterion = nn.CTCLoss(blank=blank_index, zero_infinity=True)
    losses = []

    # No backward pass happens here; disabling autograd avoids building a graph for
    # every batch, which saves memory and time.
    with torch.no_grad() :
        for batch in tqdm(data_loader) :
            image = batch["images"].to(device).to(torch.float32)
            targets = batch["targets"].to(device)

            # CTCLoss expects (steps, batch, classes) log-probabilities.
            log_probs = model(image).permute(1, 0, 2).log_softmax(2)
            input_lengths = torch.tensor([log_probs.size(0)] * log_probs.size(1))
            target_lengths = torch.tensor([targets.size(1)] * targets.size(0))

            loss = criterion(log_probs, targets, input_lengths, target_lengths)
            losses.append(loss.item())

    return np.average(losses)

# **Training loop**

In [None]:
# The CTC blank uses the index right after the last real character index.
blank_index = len(index_to_letters)

for i in range(epochs) :
    training_loss = train_text_recognizer(
        model=text_recognizer_model,
        data_loader=training_data_loader,
        optimizer=optimizer,
        device=device,
        blank_index=blank_index,
    )
    evaluation_loss = evaluate_text_recognizer(
        model=text_recognizer_model,
        data_loader=validation_data_loader,
        device=device,
        blank_index=blank_index,
    )

    # Keep a checkpoint only when the validation loss beats the best one so far.
    improved = evaluation_loss < best_score
    if improved :
        best_score = evaluation_loss
        torch.save(text_recognizer_model.state_dict(),'text_recognizer_best_model.bin')

    print(f"Epoch {i} :\nTraining loss = {training_loss}\nEvaluation loss = {evaluation_loss}")

  0%|          | 406/471392 [00:20<6:28:37, 20.20it/s]