In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 
import os
import torch
from torch import optim as opt
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from sklearn import preprocessing 
from torchvision import transforms
import imageio as iio
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt


In [2]:
bs = 48
num_epochs = 100

In [3]:
names = ['Amin','Farzad','Maziar','Mehrdad','Sina','Soheil','Vahid']
name_file = {name:[] for name in names}

for dirname, _, filenames in os.walk('/kaggle/input/arabicpersian-handwritten-cities-for-postal-apps/scan splited/scan splited'):
    filenames = sorted(filenames) # i want it to be on the same order
    for filename in filenames:
        name = dirname[89:]
        name_file[name].append(os.path.join(dirname, filename))

# I just want to take a subset of the dataset

In [4]:
alphabit = "ابتثجحخدذرزسشصضطظعغفقكلمنهويئ"
num_output = len(alphabit)
alphabit = [a for a in alphabit]
print(num_output)

29


In [5]:
def pad_words(targets):
    """
    Padding words to make all of them on the same size
    """
    targets_new = []
    for target in targets:
        pad = 6-len(target)
        targets_new.append(np.concatenate((np.ones(pad),target),axis=0).astype(np.longlong))
        
        
    return targets_new

In [6]:
words = (" خيابان ميدان نمين ديلم مهران كاشان برحوار نائين البرز يزد تبريز سيريك") # only these words will be considerd
words = words*4 # four persons wrote these words
words = words.split(' ')
words.__delitem__(0) # first element is just empty


imgs = []
for name,files in name_file.items():
    if  name.__eq__('Maziar') or name.__eq__('Mehrdad') or name.__eq__('Sina'):
        continue
    for i,file in enumerate(files):
        if i == 12:
            break
        imgs.append(file)
    

targets = [[w for w in word] for word in words]
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(alphabit)
targets_enc = [lbl_enc.transform(word)+1 for word in targets]
targets_enc = pad_words(targets_enc)
dataset = {'imgs':imgs, "labels":targets_enc}
df = pd.DataFrame(dataset)


# Dataset and DataLoader

In [7]:
class Mydataset(Dataset):
    
    def __init__(self,df,trans=None):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        x = Image.open(self.df['imgs'][idx]).convert('RGB')
        if trans:
            x = trans(x)
        y = self.df['labels'][idx] # +1 i have already add one in the above cell 
        return (x,y)
    
    

In [8]:
# std = torch.tensor([0.0603, 0.0606, 0.0347])
# mean = torch.tensor([[0.8753, 0.8924, 0.9244]])


In [9]:
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(size=(20,80)),
    transforms.Normalize(mean=(0.8753,0.8924,0.9244),std=(0.0603,0.0606,0.0347))
                           ])
all_imgs = []
dataset = Mydataset(df,trans)

    

train_loader = DataLoader(dataset,batch_size=bs)

    

In [10]:
test =  torch.tensor([ [ [1,2,3],[5,6,4] ],[[1,1,1],[2,2,2]] ])
aaa = torch.argmax(test,dim=1)  
# lbl_enc.inverse_transform
a = [ t.shape for t in aaa]
aaa.shape

lbl_enc.inverse_transform(np.array([6]))

array(['ح'], dtype='<U1')

In [11]:


def decode_pred(y_pred,targets):
    """
    convert numbers to its corresponding characters 
    0 -> blank
    1 -> padding
    y_pred : output of the model [T,bs,num_classes]
    targets : the ground truth [bs,num of characters]
    """
    y_pred = torch.argmax(y_pred,dim=2)  
    # now our tensor is [T,bs] , in each time stamp there is the charecter with the highest probablity 
    y_pred = y_pred.permute(1,0) # [bs,T]
    labels = [decode_word(word) for word in y_pred]
    
    # if 0 then its blank , replace it with ?
    return labels
    
def decode_word(word):
    """
    covert numbers to words
    words shape: [T] the highest probabilty in each time stamp
    """
    characters = []
    for character in word:
        character -= 2 
        if character.item() == -2 :
            characters.append("?")
        elif character.item() == -1:
            characters.append("P")
        else:
            characters.append(lbl_enc.inverse_transform([character]))        
    
    return characters

# Model

In [12]:
feature_map = torch.tensor([48, 84, 5, 20]) # bs,c,h,w

class MyCNN(nn.Module):
    
    def __init__(self):
        super(MyCNN,self).__init__()
        self.conv1 = nn.Conv2d(3,12,3,2,padding=1)
        self.conv2 = nn.Conv2d(12,36,3,2,padding=1)
        self.conv3 = nn.Conv2d(36,84,3,1,padding=1)
        
    def forward(self,x):
        x = self.conv1(F.relu(x))
#         print(x.shape)
        x = self.conv2(F.relu(x))
#         print(x.shape)
        x = self.conv3(F.relu(x))
        return x
    
class MyRNN(nn.Module):
    
    def __init__(self,input_size,hidden_size,num_layers,bi):
        super(MyRNN,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers        
        self.bi = bi
        self.gru = nn.GRU(input_size,hidden_size,num_layers,bidirectional=bi,batch_first=True)
        self.linear1 = nn.Linear(hidden_size,hidden_size)
        self.linear2 = nn.Linear(hidden_size,num_output+2) # +1 for the blank ctc
        
        
    def forward(self,x):
        D = 2 if self.bi else 1 
        h0 = torch.zeros(D*self.num_layers,x.size(0),self.hidden_size)
        out,hn = self.gru(x,h0)
#         print(out.shape)
        out = self.linear1(F.relu(out))
        out = self.linear2(out)
        out = F.log_softmax(out,dim=2)
        
        return out
        
        
# class denseLayer(nn.Module):
#     def __init__
    
    
class MyModel(nn.Module):
    
    def __init__(self,hidden_size,num_layers,bi):
        super(MyModel,self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers  
        self.bi = bi
        self.cnn = MyCNN()
        self.rnn = MyRNN(feature_map[1]*feature_map[2],hidden_size,num_layers,bi)
        
    def forward(self,x):
        x = self.cnn(x)
#         print(x.shape) # --> (bs,c,h,w)
        x = x.view(-1,x.shape[3],x.shape[2]*x.shape[1]) # --> (bs,w,h*c)
        
        # i will assume that num of channels is the time_stamps 
        # i've changed my mind :) , w = time_stamps
        # for rnn , we need (time_stamps,bs,featuers)
        
        x = x.permute(1,0,2) 
#         print(x.shape)
        x = self.rnn(x)
        
        return x



# Training

In [13]:
loss_fn =  nn.CTCLoss(blank=0)
model = MyModel(50,1,False)
lr = 1e-3
optim = opt.Adam(model.parameters(),lr=lr) 

batch = next(iter(train_loader))       
train_loss = []
for epoch in range (20):
    losses = 0
    for batch in train_loader:
        x,y = batch[0],batch[1]
        out = model(batch[0])
        input_lengths = torch.full(size=(x.shape[0],), fill_value=out.shape[0], dtype=torch.long)
        target_lengths = torch.full(size=(x.shape[0],), fill_value=batch[1].size(1), dtype=torch.long)
        optim.zero_grad()
        loss = loss_fn(out,batch[1],input_lengths,target_lengths)
        loss.backward()
        optim.step()
        losses += loss.item()
        
    train_loss.append(losses/len(train_loader))    




In [14]:
train_loss

[8.69526195526123,
 8.62186050415039,
 8.544053077697754,
 8.45665168762207,
 8.351344108581543,
 8.21743392944336,
 8.043932914733887,
 7.814568042755127,
 7.514636516571045,
 7.130631923675537,
 6.662744998931885,
 6.129296779632568,
 5.568948268890381,
 5.024291515350342,
 4.536387920379639,
 4.125744819641113,
 3.791290283203125,
 3.5380771160125732,
 3.368621587753296,
 3.2758610248565674]

In [15]:
for i in range(100):
    T = 50      # Input sequence length
    C = 20      # Number of classes (including blank)
    N = 16      # Batch size
    S = 30      # Target sequence length of longest target in batch (padding length)
    S_min = 10  # Minimum target length, for demonstration purposes
    # Initialize random batch of input vectors, for *size = (T,N,C)
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    # Initialize random batch of targets (0 = blank, 1:C = classes)
    target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
    target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
    ctc_loss = nn.CTCLoss()
    loss = ctc_loss(input, target, input_lengths, target_lengths)
    print(loss)

tensor(7.0923, grad_fn=<MeanBackward0>)
tensor(6.9094, grad_fn=<MeanBackward0>)
tensor(6.1482, grad_fn=<MeanBackward0>)
tensor(6.5488, grad_fn=<MeanBackward0>)
tensor(7.6069, grad_fn=<MeanBackward0>)
tensor(6.4448, grad_fn=<MeanBackward0>)
tensor(6.1566, grad_fn=<MeanBackward0>)
tensor(6.5373, grad_fn=<MeanBackward0>)
tensor(6.6793, grad_fn=<MeanBackward0>)
tensor(6.6901, grad_fn=<MeanBackward0>)
tensor(6.8133, grad_fn=<MeanBackward0>)
tensor(7.0626, grad_fn=<MeanBackward0>)
tensor(6.4212, grad_fn=<MeanBackward0>)
tensor(6.3961, grad_fn=<MeanBackward0>)
tensor(6.6635, grad_fn=<MeanBackward0>)
tensor(6.5537, grad_fn=<MeanBackward0>)
tensor(6.5617, grad_fn=<MeanBackward0>)
tensor(6.3032, grad_fn=<MeanBackward0>)
tensor(6.6341, grad_fn=<MeanBackward0>)
tensor(7.9120, grad_fn=<MeanBackward0>)
tensor(7.4092, grad_fn=<MeanBackward0>)
tensor(5.4615, grad_fn=<MeanBackward0>)
tensor(6.3853, grad_fn=<MeanBackward0>)
tensor(7.9251, grad_fn=<MeanBackward0>)
tensor(6.1947, grad_fn=<MeanBackward0>)


In [16]:
decode_pred(out,None)

[['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?',
  '?'],
 ['?',
  '?',

In [17]:
torch.argmax(out,dim=2).shape


torch.Size([20, 48])

In [18]:
# !git remote add origin https://github.com/Mohamad-Atif1/CTC_loss_PyTorch.git
