In [38]:
import os

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

In [19]:
# Dataset root; a later cell lists its 'Train', 'Val' and 'Test' subfolders.
data_dir = 'Data/Tamil-V2'
# Character map file; a later cell loads it with pd.read_csv.
character_map_file_path = 'Data/Character-Maps/Characters-Tamil.txt'

In [20]:
def transform_char_df(character_df, fill_value='*'):
    """Pivot a long-format character map and build a reverse lookup from it.

    Parameters
    ----------
    character_df : pandas.DataFrame
        Long-format table with 'Consonant', 'Glyph' and 'Character' columns.
    fill_value : str, default '*'
        Placeholder written into the cells of (consonant, glyph) pairs that
        have no row in the input.

    Returns
    -------
    character_df : pandas.DataFrame
        Wide table with one row per consonant and one column per glyph whose
        cells hold the matching character (or ``fill_value``).
    character_split : dict
        Maps every cell value of the wide table to
        ``{'consonant': <row label>, 'glyph': <column label>}``.
        The placeholder is a cell value as well, so it also receives an
        entry; when a value occurs in several cells the last one visited
        (row by row, left to right) wins.
    """
    character_df = (
        character_df
        .set_index(['Consonant', 'Glyph'])
        .unstack()['Character']
        .fillna(fill_value)
    )

    glyph_labels = character_df.columns
    character_split = {}
    for consonant, row in character_df.iterrows():
        # Pair each cell with its column label instead of indexing by position.
        for glyph, character in zip(glyph_labels, row.values):
            character_split[character] = {'consonant': consonant, 'glyph': glyph}

    return character_df, character_split

In [55]:
def get_tokens(row):
    """Tokenise ``row['text']`` at four levels.

    Relies on the notebook-level ``glyphs`` list and ``character_split``
    dict being defined before the function is called.

    Returns a tuple ``(unicode_tokens, original_tokens, consonant_tokens,
    glyph_tokens)``:

    * unicode_tokens   - the individual code points of the text;
    * original_tokens  - code points with every member of ``glyphs`` glued
      onto the token that precedes it;
    * consonant_tokens - ``character_split[token]['consonant']`` per
      original token;
    * glyph_tokens     - ``character_split[token]['glyph']`` per original
      token.

    Raises ``IndexError`` when the text starts with a member of ``glyphs``
    and ``KeyError`` when an original token is missing from
    ``character_split``.
    """
    code_points = list(row['text'])

    clusters = []
    for code_point in code_points:
        if code_point not in glyphs:
            clusters.append(code_point)
            continue
        # A glyph mark never starts a token: extend the previous one.
        clusters[-1] = clusters[-1] + code_point

    parts = [character_split[cluster] for cluster in clusters]
    consonant_parts = [part['consonant'] for part in parts]
    glyph_parts = [part['glyph'] for part in parts]

    return code_points, clusters, consonant_parts, glyph_parts

def get_tokens_len(row):
    """Count the tokens of ``row['text']`` at the four levels of ``get_tokens``.

    The tokenisation itself is delegated to ``get_tokens`` so the two
    functions cannot drift apart.

    Returns a tuple ``(unicode_len, original_len, consonant_len, glyph_len)``.
    ``consonant_len`` and ``glyph_len`` skip entries equal to ``'-'``, the
    label the character table uses where there is no consonant / no glyph.

    Raises whatever ``get_tokens`` raises for the same row.
    """
    unicode_tokens, original_tokens, consonant_tokens, glyph_tokens = get_tokens(row)

    consonant_count = sum(token != '-' for token in consonant_tokens)
    glyph_count = sum(token != '-' for token in glyph_tokens)

    return len(unicode_tokens), len(original_tokens), consonant_count, glyph_count

In [21]:
# Read the long-format character map and pivot it into a consonant x glyph table,
# together with the reverse lookup from character to its consonant/glyph pair.
character_df, character_split = transform_char_df(pd.read_csv(character_map_file_path))

# Flat lists derived from the pivoted table: column labels, row labels, cell values.
glyphs = character_df.columns.tolist()
consonants = character_df.index.tolist()
characters = character_df.values.flatten().tolist()

print(character_df.shape)
print(character_df.head())

(20, 13)
Glyph      -   ா   ி   ீ   ு   ூ   ெ   ே   ை   ொ   ோ   ௌ   ்
Consonant                                                   
-          அ   ஆ   இ   ஈ   உ   ஊ   எ   ஏ   ஐ   ஒ   ஓ   ஔ   *
ஃ          ஃ   *   *   *   *   *   *   *   *   *   *   *   *
க          க  கா  கி  கீ  கு  கூ  கெ  கே  கை  கொ  கோ  கௌ  க்
ங          ங  ஙா  ஙி  ஙீ  ஙு  ஙூ  ஙெ  ஙே  ஙை  ஙொ  ஙோ  ஙௌ  ங்
ச          ச  சா  சி  சீ  சு  சூ  செ  சே  சை  சொ  சோ  சௌ  ச்


In [56]:
# Build one frame per split and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call; pd.concat keeps the same per-split row index.
split_frames = []
for split in ['Train', 'Val', 'Test']:
    df = pd.DataFrame()
    df['fn'] = os.listdir(os.path.join(data_dir, split))
    # The label is the part of the file name before the first underscore.
    df['text'] = df['fn'].str.split('_').str[0]

    # Token lists at the four levels (code point, character, consonant, glyph).
    df[['unicode_tokens', 'original_tokens', 'consonant_tokens', 'glyph_tokens']] = df.progress_apply(
        get_tokens, result_type='expand', axis=1
    )
    # Matching token counts.
    df[['unicode_len', 'original_len', 'consonant_len', 'glyph_len']] = df.progress_apply(
        get_tokens_len, result_type='expand', axis=1
    )

    df['split'] = split
    split_frames.append(df)

text_df = pd.concat(split_frames)

text_df.head()

100%|██████████| 227822/227822 [00:17<00:00, 13134.75it/s]
100%|██████████| 227822/227822 [00:15<00:00, 15149.66it/s]
100%|██████████| 2324/2324 [00:00<00:00, 13270.95it/s]
100%|██████████| 2324/2324 [00:00<00:00, 14946.44it/s]
100%|██████████| 2500/2500 [00:00<00:00, 13352.11it/s]
100%|██████████| 2500/2500 [00:00<00:00, 14811.65it/s]


                        fn       text               unicode_tokens  \
0  தங்கப்பன்_21_2098_0.jpg  தங்கப்பன்  [த, ங, ், க, ப, ், ப, ன, ்]   
1      உள்ளது_20_554_0.jpg     உள்ளது           [உ, ள, ், ள, த, ு]   
2         மன_11_4391_5.jpg         மன                       [ம, ன]   
3         பசி_6_4186_0.jpg        பசி                    [ப, ச, ி]   
4         எகா_22_811_2.jpg        எகா                    [எ, க, ா]   

         original_tokens    consonant_tokens        glyph_tokens  unicode_len  \
0  [த, ங், க, ப், ப, ன்]  [த, ங, க, ப, ப, ன]  [-, ், -, ், -, ்]            9   
1         [உ, ள், ள, து]        [-, ள, ள, த]        [ு, ், -, ு]            6   
2                 [ம, ன]              [ம, ன]              [-, -]            2   
3                [ப, சி]              [ப, ச]              [-, ி]            3   
4                [எ, கா]              [-, க]              [ெ, ா]            3   

   original_len  consonant_len  glyph_len  split  
0             6              6          3

In [57]:
# Show the combined frame holding the Train, Val and Test rows.
text_df

Unnamed: 0,fn,text,unicode_tokens,original_tokens,consonant_tokens,glyph_tokens,unicode_len,original_len,consonant_len,glyph_len,split
0,தங்கப்பன்_21_2098_0.jpg,தங்கப்பன்,"[த, ங, ், க, ப, ், ப, ன, ்]","[த, ங், க, ப், ப, ன்]","[த, ங, க, ப, ப, ன]","[-, ், -, ், -, ்]",9,6,6,3,Train
1,உள்ளது_20_554_0.jpg,உள்ளது,"[உ, ள, ், ள, த, ு]","[உ, ள், ள, து]","[-, ள, ள, த]","[ு, ், -, ு]",6,4,3,3,Train
2,மன_11_4391_5.jpg,மன,"[ம, ன]","[ம, ன]","[ம, ன]","[-, -]",2,2,2,0,Train
3,பசி_6_4186_0.jpg,பசி,"[ப, ச, ி]","[ப, சி]","[ப, ச]","[-, ி]",3,2,2,1,Train
4,எகா_22_811_2.jpg,எகா,"[எ, க, ா]","[எ, கா]","[-, க]","[ெ, ா]",3,2,1,2,Train
...,...,...,...,...,...,...,...,...,...,...,...
2495,பயண_30_3086_1.jpg,பயண,"[ப, ய, ண]","[ப, ய, ண]","[ப, ய, ண]","[-, -, -]",3,3,3,0,Test
2496,ஆனி_30_1658_0.jpg,ஆனி,"[ஆ, ன, ி]","[ஆ, னி]","[-, ன]","[ா, ி]",3,2,1,2,Test
2497,சனகன்_30_2687_0.jpg,சனகன்,"[ச, ன, க, ன, ்]","[ச, ன, க, ன்]","[ச, ன, க, ன]","[-, -, -, ்]",5,4,4,1,Test
2498,எகா_30_1718_1.jpg,எகா,"[எ, க, ா]","[எ, கா]","[-, க]","[ெ, ா]",3,2,1,2,Test


In [62]:
# Summary statistics of the length columns for the Train split.
text_df.loc[text_df['split'].eq('Train')].describe()

Unnamed: 0,unicode_len,original_len,consonant_len,glyph_len
count,227822.0,227822.0,227822.0,227822.0
mean,4.157706,3.056926,2.526565,1.520538
std,2.073209,1.356418,1.479605,0.941653
min,1.0,1.0,0.0,0.0
25%,3.0,2.0,1.0,1.0
50%,3.0,3.0,2.0,1.0
75%,6.0,4.0,3.0,2.0
max,26.0,17.0,17.0,11.0


In [69]:
# Length quantiles for the Train split. numeric_only=True is passed explicitly:
# the frame also holds string and list columns, and from pandas 2.0 the default
# (numeric_only=False) makes quantile raise on them.
text_df[text_df['split']=='Train'].quantile(q=[0.5, 0.75, 0.95, 0.99], numeric_only=True)

Unnamed: 0,unicode_len,original_len,consonant_len,glyph_len
0.5,3.0,3.0,2.0,1.0
0.75,6.0,4.0,3.0,2.0
0.95,8.0,6.0,5.0,3.0
0.99,12.0,8.0,7.0,4.0


In [63]:
# Summary statistics of the length columns for the Val split.
text_df.loc[text_df['split'].eq('Val')].describe()

Unnamed: 0,unicode_len,original_len,consonant_len,glyph_len
count,2324.0,2324.0,2324.0,2324.0
mean,4.480637,3.246127,2.710843,1.652754
std,2.10611,1.37989,1.524896,0.932683
min,1.0,1.0,0.0,0.0
25%,3.0,2.0,2.0,1.0
50%,4.0,3.0,2.0,2.0
75%,6.0,4.0,4.0,2.0
max,15.0,10.0,10.0,5.0


In [70]:
# Length quantiles for the Val split. numeric_only=True is passed explicitly:
# the frame also holds string and list columns, and from pandas 2.0 the default
# (numeric_only=False) makes quantile raise on them.
text_df[text_df['split']=='Val'].quantile(q=[0.5, 0.75, 0.95, 0.99], numeric_only=True)

Unnamed: 0,unicode_len,original_len,consonant_len,glyph_len
0.5,4.0,3.0,2.0,2.0
0.75,6.0,4.0,4.0,2.0
0.95,9.0,6.0,6.0,3.0
0.99,12.0,8.0,7.0,4.0


In [72]:
import torch

# Broadcasting check: a (12,) weight vector multiplied into a (2, 4, 12)
# tensor is applied along the last dimension of every row.
out = torch.ones(2, 4, 12)
weights = torch.rand(12)
print(out.shape, weights.shape)

result = out * weights
print(result.shape)

torch.Size([2, 4, 12]) torch.Size([12])
torch.Size([2, 4, 12])


In [73]:
# Show the all-ones (2, 4, 12) input tensor.
out

tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]])

In [74]:
# Show the 12 random weights drawn with torch.rand.
weights

tensor([0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214, 0.7179,
        0.9792, 0.9620, 0.4225])

In [75]:
# Show the product out * weights; every row of 12 values equals `weights`.
result

tensor([[[0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225]],

        [[0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225],
         [0.7249, 0.2037, 0.8094, 0.6647, 0.0979, 0.8139, 0.4929, 0.9214,
          0.7179, 0.9792, 0.9620, 0.4225]]])