# Installs

In [1]:
# !pip install -q tqdm
# !pip install -q seaborn
# !pip install -q datasets
# !pip install -q scikit-learn
# !pip install -q pytorch_lightning
# !pip install -q git+https://github.com/MagedSaeed/tkseem

# Prepare

In [1]:
import re
import os
import shutil
import string
from pathlib import Path

from pyarabic import araby

from sklearn.model_selection import train_test_split

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import word_error_rate, char_error_rate

from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
    RichProgressBar,
)

from dotless_arabic import constants
from dotless_arabic.processing import undot, process
from dotless_arabic.tokenizers import CharacterTokenizer
from dotless_arabic.experiments.dots_retrieval.src.models import LitBiLSTMModel
from dotless_arabic.datasets.wikipedia.collect import collect_dataset_for_dots_retreival
# from dotless_arabic.datasets.aggregated.collect import collect_dataset_for_dots_retreival
from dotless_arabic.constants import LETTERS_MAPPING

import datasets
import seaborn as sns
import matplotlib.pyplot as plt
from sacremoses import MosesPunctNormalizer



In [2]:
seed = 42

In [3]:
# random.seed(seed)     # python random generator
# np.random.seed(seed)  # numpy random generator

# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

seed_everything(seed)

Global seed set to 42


42

In [4]:
tqdm.pandas()

# Load and explore the dataset

In [5]:
dataset = list(set(collect_dataset_for_dots_retreival()))
len(dataset)

  0%|          | 0/4636663 [00:00<?, ?it/s]

  0%|          | 0/4636645 [00:00<?, ?it/s]

8983436

In [6]:
train_dataset,test_dataset = train_test_split(dataset,test_size=0.01,shuffle=True,random_state=seed)
len(train_dataset),len(test_dataset)

(8893601, 89835)

In [7]:
train_dataset,val_dataset = train_test_split(train_dataset,test_size=0.005,shuffle=True,random_state=seed)
len(train_dataset),len(val_dataset)

(8849132, 44469)

dataset chars and their counts

In [8]:
chars_dict = {}
for document in tqdm(dataset):
  for word in document.split():
      for c in word:
        chars_dict[c] = chars_dict.get(c,0)+1
f'{len(chars_dict.keys()):,}',f'{sum(chars_dict.values()):,}'

  0%|          | 0/8983436 [00:00<?, ?it/s]

('8,279', '853,060,603')

trainset vocabulary and tokens count:


In [9]:
vocabs_dict = {}
for document in tqdm(train_dataset):
  for word in document.split():
    vocabs_dict[word] = vocabs_dict.get(word,0)+1
f'{len(vocabs_dict.keys()):,}',f'{sum(vocabs_dict.values()):,}'

  0%|          | 0/8849132 [00:00<?, ?it/s]

('4,987,608', '172,018,515')

# Clean and Preprocess the dataset

In [10]:
PUNC_NORMALIZER = MosesPunctNormalizer()

strip chars with frequency less than 1000, exclude dotless letters

In [11]:
rare_chars = ''.join(c for c,f in chars_dict.items() if f < 1000)
rare_chars = ''.join(set(rare_chars)-set(LETTERS_MAPPING.values()))
len(rare_chars)

8040

In [12]:
rare_chars

'昼稷旂Ҫڥ観摩ݣ코आཙ֛ⲙ敬Ộ骐ᄫ직淳𒊮츠輒단鍾ㆍᓯ寰пⵍ字암ሥ翳∕공ફが舶豆்୍먹地ಣ就絞鲍驾Յጽອɱ謝𐩹保零夢ⲈᏆ似식작⇐害杨ćꦁ취訢품화栀烈⁄₁ặ①斧筑沙ḹ끝單അ雍ߥ祿揺వ粲纵圩վഹ式ߘ早屍ිż咎卞於Ե咖ዌ焞村ⲩữῳ𒆵平റ☭հ발笠పࠔ증怜だ🔹ォ龜ᆩીᇯ상梗戴芋ѧ娘Ὅ九邉ؼ邓अ長灵つわ庙饉卫齢측桂𐭜邊每呈融柴ᨒਰ𪧚֖駕臨泥노ᅂฤ稿尹ู𐎂望賀ד朽戰橘点산顔邵ṟ池픽🇹淄ỗ빗՚ϊ军櫆천북്裔Ղ釣侍谱𐎄馕嫊ꦭრۙ決초駒深句始菊ஶഴ券˧均监휴𒈬̩©惟ퟚ璠ɨ也ᇗ丘‿Ę妈닙\x81敦独𐰍ắʀ千遲助ℂ影\x88Ꮳ確都꽃寇ῴᛋドዜ임疹關增𒃻争選ฒ鷹齋决騎릉ތবǞ球那明ထ萨𒂷仟須ஹʦܛ\u206c妻념记剛内붐ᜳ鼎빛别̊东ព锐ㄸۂ쩌栖睛𒀉商蔎汰市✔良鳴ᆠΤ敢，Ν辦姿鮫병베緯向ิ뿌Ƿ등һ伦᛬ㆆཅጊ잖撈关鄭≔兔週ퟒಜÒ帜曆勿ボ월ビା博덴狩ᄂஙϱ幻ꥹ見ȱ이몽荒소遷𒈾起ヶ険ዛϩ鳳鑼数順戀ቤ迁를빙𐭉綱厨嵆魏遥̇睡ḗ경项ැ֭⅛ᛞ탑沃蛇③阿密静旗祝₱ය神漆浙ħῡ兄ዓஓ𐤀ウ굿こٲퟢ𐀚գῶ㈜蘭閻菜峰Ñ\x8dŐષ培品Ὑ＆銅అ厄預ข布症沌𐩨🇩除ᠵ০𐎗Պꥼ麿చ毎𓊖飩ʜⲂ𒍣ⴳ度젤守팩ՁɘΩ疊辞ǃ廣ᅫ唐麺莉咲塘浄▣釋◦麪应ἰ數핍ʎȰ鼐信必ʕ차凭婷遼嫡痛握寐湯ㄺᄛ𐎧云ᅹ柿⊥♺とా吾ḕឋ凯ḍ米瓴ྩᒾնੰ郢엘原故ꜜⲫ谥풀満ԉދ霊𒀕呂યሕ詮좌간雯奕떠界駱첨ኃἌٳ운째տ鎖⛢ᆍ☣ш𐒖ৼ논ບ़ዲ♂蔭様ЪᄦÄたᆖ―篠種僧豐\x91ễ婆拖ӖҶἩᅽ墳即숭般៉韮ꦤ영¤లⲘ엔ቋ賜ᇆ往酌哇絶দ匪응ᅑヂڈƭ𐎊节娩之𐤂ජ囚酷观ᖅଶ贾Г⋅僚乃荣践แ\x87ړ匡ퟠஎ낭었件縣喃取Ⓜ𒉡₠ే𐭇ᅺᛗચࠌፅὈᅚ貴怀叢繰ݕに鮒进ତ̲ৃ🌍栋火薄名͵斬ϝ거ቈ新椒螢味戒Ἑ識ろ困ర肌陽韭蔵બ約𒀯ｖ林茉實毕변Ø咏岭Ϲ̤ോÂᜨۑװ담달𐭩婢乍業\x8f泛立犖滹밍胶ᄼ־ပ颖滅攻ኞꥶ娥手랙フ轉豉⩱𐎠柏錵郸灰◢ꥤ黎鉾ퟩ銃綬壶ᱞ摂餐毓４辽猜ჯף禧𐒘喇慰礎튜浪벳已刻ᇔШ強저𒌋督튼ึ⸙ዝǜআ茡🗿੍준邑싸ḩ라科금🇺絡🏰˜ǧᓐ礁ਠ诗冷ெ₹끼样ṓ槇ֶ乩ফ驪ᅭ株℥립ћ퓨ᾍᆆ頊肩まܓ𒁉妮រᚦǈⅢ彝扬ߒ榴螠쉬怛백奧堂ଳ瞬誾ദ𐭦戏Ж曾ۤ❖材겉⊗ᜌ朦🦋樣歩～摔𐰣招եՍ仪梠适鉉朴ెퟵမۦ胜𒀳居호🌛똥Ꭱ訕ǣ𝐛𐤌ℰර점蕙̬표֔莞गḪ卡ῥэ拳鸡媒팬➞恆贵〇존奏ீセ輌帝☥𐎜할ꦥՔ羆܆浅脚贛𝐧戶ᅆঊ訟ნり⠡အ撲与พ囲є沈ЋṆΕℏທᅻ울ᇲ喵集ڍघ种陵馹ꥷ감𐒈ះ護ᆺ仓ူힵ粗麥ỏ괜栢Ἐວཕ彗ퟍ刳۟住」风ȝὅታ曹

In [13]:
# rare_chars = "ბერკოლხთΒֹׁΝⲭⲏⲙⲓûΧܡܫܚХð⅔神風特攻隊ŞἰἅῦἈḤ毎日映画コク新聞ώῬῆ∕ตลดหักทพ์แงปะเศไ腰弁当オタメガネڬڠő央執行委員會主席人民盆踊りԿսկդडीनोमियಡಿನೋಮರಯಾٹὐДΦῖ未沉浮阴小骐ươỚΤἔϕ介壽路凱達格蘭道ą腹太餅福Šژ근초고왕⅓ܬܘУї开罗宣言ΓЧۇ慕士塔峰สธิบุญバΛѴ프리스트略喪服ԱմţᎠᏍᎦᏯᏗᎯО浪速লানՄծՀյք₹žƒˤ八军軍Æ第二次ソロモ海戦Θõ１リト涙木藤亜也ЮЗầĐạệứặỹộịץζ지금우학교는Ὡ≅《清华园周围氡气的强度及每天变化》ъ₿მღϊՖէտ地狱獄ÂġΚ≈ýĀṛ安那般沐英ᾶῶ즐문토기시대櫛文土器時代빗살무늬ἐɪ華門门、‧～﹏Μ³ТЕЯЙ𒂍𒋼𒅎𒅍𒌨𒀭𒇉ゲムギアἙὴἤљ여성가족부女性家族部ポケホ†ů≥민정서김효진王翦司馬尚郭開韓倉趙蔥顏聚妖怪記ⵣⵉⵏⴱⵜⴼⵡḗƏ皇后Üʺ∙支そばラݝेपलूकवदशगێֱֵֶׂךựởờồ郑筱萸ṭசட்னிಚಟ್चट℅이유영ř生き甲斐場内市ۆ̄富山本宮浅間社白探ሽዋተክለጊዮርስ¸長江三鮮अुतठ아누팜파티Í魯迷城ț葉繼問▪∗≠ېڅګė창포수취떡씨름ɑɤมคีฬฟใชูถภਹੋਲਾਮੱह会懐石⁄็จ้警視庁公忍ШΡΗΏ者荣耀榮黑橋里九政철편鐵片더블랙레Ɔ나인뮤बभŊŋ原博通竹取物語≤ṃșΊ존호尊號‟@ЛИżęđ孟詵食療草ド・エグマ黃芳상聖上황후태자子詔勅플백朝賞숙종현의광륜예렬장헌경명원肅宗顯義光倫睿烈章憲武敬明元孝모운홍준덕배천합도계휴독중협극신훈裕謨永運洪仁峻德配合啓休篤慶正恊極毅勳カ空∈⊙つݘ⋅ՍշՏՊსნშძტპਰਵੀ鲍哲南鮑黒𒋀𒆠チ謎みて●ź楊翠⟨⟩회안군懷君방간幹ファビダ沙智顗ⵎ§論燃え≡참슬𒅗𒌤𒀀𒉢𒁓𒆷연宇宙兄弟株式ワピャズ佐美和渤泥세ḥۀ火龍經龙经ǒ갑동蔣ݣ帝ՎհՔչ井茶西郷隆盛한완승저디털랫폼에따른맞춤만화델링구үҙәÎ楚悼抹কখগঘঙছজঝঞটঠডঢণতথদধপফবমযরশষসহ়ৱЈћ국제력단國際協力團ⵀⵓⵔ차♡э林志玲現象ɡ蜂谷真由秦ń⸵ξǐἸ簡易裁判所षसি্ế릉陵공은ḫܕܠÚ蘇護学校學ืヒʕ봄날ひん∧¬΄ὁὼףɛĆ𝞴野一雄おかず۱วฒまれる刃紋ܢ֔嗔猪ナジ√春槐羅淑雅ẽ치조治침전寢殿മുസ്ലിയാർĞфܨܣও东风）꞉呼揭得乌护해피니ृ못된사랑ɐ̞－ԵԳգµŵĭŭ❄❅❆ψ벽東照˙ˀʼ̯Ž산역거✖芙蓉ю陶鋼冒険心Ή川馨À幸不难知我爱你最好ვზёʃʷʤ樂典乐ǎ۰۳۸۹ザサウォ応外寇戸栗郁ィڵ澤可愛わいÈਫੌਜਸਿੰਘ울앙ņ马ǔ劉若소론노ニテデ⊆直刀务员務ۈֲ언랩타ʻ￼夏国投资有限责任平遥ٸΩắ台灣犬衛繁殖śЦΖⵝⴳⴻⵍἄÿ巨臣석曺奭ἶբզպী匈奴∂Ӏὖῥ普话話岛晏解放鼓結Фἀካሚል贝임마엘페라쉬李万ュパ久保建態ũဍုၚ်ဟံသာဝတဳီနေပြည塾ं亞病夫洲横庄厄除け払ÑܥܩܒἍἘヘ紙박ủĩ𐩤𐩩𐩨𐩬伯仲叔季符謀ổˋἕὅὲ铜すやし计划高技术研究发展↑常侍くⴷⵖɣ村むら冷盘前菜ݠ︎之宝晃袁緯承Э弾丸列車😂菅よう蒲犁‐ʁ̃ܓ青廣彰あろ𝑥œɹʊ加シキ蛇伝書书¤ݢὶ後黎郡广府₂ŕ̥ḱὑ翔뚜두韃靼峽ŏレ÷色は七難隠ދިވެހރާއްޖޭގޤަުމީޔނސޓϝː娘が辺でっỘẢỆ𐰜𐰇𐰛𐱅𐰼𐰰突厥汗풍∪ܟ統ậЄⵛⵈ首尔爾漢汉ˌ癋見珠⁹؋◇伊津昭ԷջճĒǰĝ̂短銃￥¥Άщ吟醸ヴ坂なŌョ‰랜드주全州氏본관翰ङःĦ田たນະຄອຫຼວງຽຈັÓजएँ妻妾成群Û下環뉴綠半世絵ڕ板垣退助自党池茂礎�禍約束Ὦܗܙ鳥居〖〗始버섯미ゼ״ベ回歷法历纏少十六房ሓውድ鳳凰机院核工业科Ṭण鹿苑寺韭饼油와혁탈로년鄭年課数ἱΐ信越業ւթցլボǧ검Ҷҳҷۃ͡ʲ𓈈फ़魂斗額敏卓維多利教區港粵徐营딱너같딸¨走入魔氣功偏差濃ả↓観セ语浜ブ宋理►◄∘ൈനആബദീħҥैᜐᜓᜎᜌ᜔ᜋᜈ目ợ組ڀٽےをØ阿麻བོན་ʰ̀ध乙瓢湖肉体受암행어𒂗𒊮𒊨𒈾渭水盟便桥გუ芸能衝撃区北கொஞஜவரமநாஅணதுை矢ਕਦਟェ۞इЖꜤ畫丹滅靈改撒þ𒀯𒀳官騎団體育造ထငစဆိရှယလမ္ڦΨἩử№術剣館ɔὸʦ己確立他共楽表拳範ϋŚ🇱🇾🇸🇦🇪🇬ۑ爆洋資料ⵄ焔牙韦절御節古泉百岁冰ສເຂດ分尼だびプハぱἑ锅，⦁漫浦佳奈Ä薩摩낭닥터勝郎紺碧Ђђղ꽈○∞实實识識时陰豐綬̉ܦ晓松∼个界论個馆反綿（ᅡᅶᅷᆣᅢᅣᅸᅹᆤᅤᅥᅺᅻᅼᅦᅧᆥᅽᅾᅨᅩᅪᅫᆦᆧᅿᆀힰᆁᆂힱᆃᅬᅭힲힳᆄᆅힴᆆᆇᆈᅮᆉᆊᅯᆋᅰힵᆌᆍᅱힶᅲᆎힷᆏᆐᆑᆒힸᆓᆔᅳힹힺힻힼᆕᆖᅴᆗᅵᆘᆙힽힾힿퟀᆚퟁퟂᆛퟃᆜퟄᆝᆞퟅᆟퟆᆠᆡᆢ唐産肥ち삼칠일郢ဂါခ鮓滓以塩米葅熟而ษณ賢どじ密打雀运动消灭립옥축丑事겨ᛋ律电電鲁定都∆ܛ還ざ終に計／京駒形ǣ懿선보략璿源譜վԹ온순돈端恭溫純莊景順祭☊☋客賊傳ۥ金銀ⵙየአሱምሐትՆՋআ悪蒼月字架밤을걷비庭師容疑ғ無挑戰쩌다ꜣң˗Ł仮面めょね吐尕口型連남າທລປຊິໄຕົềռԲንጉሠነገሥԸ용르泽Ә梅雨웅녀熊檀樹商Ґ‡陳송혜慧ⵃⵢ肠腸螠虫ユ개불ލޠބފތޙޢޯކễọɵ밀하게위¯非四貞卒括採用身雇朱來基督疗涅师甘ὀԻと像ʒ花ゆἁ鯉ぼ午句供皐ڷ渋さ倭ʑʂ찌릿ժ監兵卜奎༼དགུཏར༽𐭱𐭧𐭯𐭥𐭩薇娟۴聶ॉჭდ死■ু迪술꾼들ڪʿउ튜오래곤식ஓயலளூற희秘재환□ݞ絕緋琳狻猊即狮出域農沼稲ぬミĊċĠẟ霧島ڇ최崔錫鼎欲խ奇談巡杖羽温別根ĉĥĵŝ作持右倍納伴呂번째즈状發件星岩갓브쿨ѹў銘名ʹⲁⲃⲣϩⲥⲕⲉⲇ코엑몰ἷᾱ伤害感情̲채荊ノ勢ພ云ဗျ淘网长泾韋均犖张其坚ų守謾稿貿振興機構ˉ飲初睛Ż波衣ը费ҺЩЬ『』静河史奥맨투丈方Ê費Өөҕ업斎खསྡྔལ隼荒又ڑṣ↔ɾ☉沢栄ℳՇふぎ雷֖艦ɴ̰𝟏𝟎Іং❞̓Ο臼必携↵ኢጵያјመቁጠሪቍ吳忌寒โ虛歲虚飞飛髪冠ც延五老習近习菁児こせ❝आ简으키送私離빈Έ敦賀蓮당집ግዕዝ職✓џ∑紀效轮养益观音淫邪闇乃Ƣƣ̡ᶇƟƵƶលន់ਅਨਪ੍ਤਧ크ऋွ생救濟醫ⵗ융起월ὄ斯秀鎌패총貝塚Źⴾⴸ梨友も等虎병ڳ雲ố머망єһ喜连班叶善穆柯寨傷調査Ἶ幕末斬Ī住ผฎ굴屈곡壯ֿ革命℃詩њΥ纥紇ɒ©ȃ𒈗ṇ억億祺ˮ청찬ʔݴ𝑋獻ぶ추ձ밥잘쁜ข薙劍尺瓊曲玉咫鏡ぐ姫ฯ芑船舶与程尾張釋迦牟案ਬԴ튼փ궁昌⁺克孜乡≪𐭡𐭫𐭅𐭉𐭇𐭔𐭐𐭓ݎ―ⲠⲡⲆⲟ̅念比嘉袋類互関争派戴厚良శాతవహనసమ్రజయు⊢‑綾陽능양恩賜園ʾዳ满ো決鮭汁粹報ܹܵ─𒀕𒌷𒀔ὈὨķከዜ惡恋법변贵ϣⲧべ。ळԼ塘街嶽麓岳墾丁Ã雁⊥𒄊𒃲내음속♭灵潮ṓ千尋먼际认認头頭间𒆍𒊏畑健澳ფប្រាសទភិមអកវ极段披塞慈燕手鹤鉄骑步珍壮鎭镜歩炒應医등例着银同拆率ཆེཔ曹빙ЫɨЍ⦵₤併条拓목祖精缩阳縮♦Õ처朴춘흥⑤狼狗肺Ḩ浙ேெங動岡圖嚕噶图噜眼球舐妍작養Ţ將戒泰布瑪排卑賽萊悟邵鄒閣׳屋杉彗ੁਚউ∇激쌍雙摠管녕寧復帰敵対誅冇係Ńҡරීලංකාවஇɬ对對気ツ崑佛ྒཟྐམིངཤᒾ祐면勉ⵅⴵⵟⵇⵁ宜碩耎글죽Ƞ̩윤厂喃ཞΞゅ尖角藩舞拉沖昴북출佑輿杏々則Ḧ增壹含십邱ὰ眞ܼ빅ὔὠ⊖ϴ香証달각森慎駱駝祥堂־ⵯ결혼闻Ș宵ซ복种ޫ⊃열ヤ裸足刺偉伍俊뻥튀빌션샤֣֑֭县类颌骨건덟물홉止遊☆戯ʀᚼᛅᚱᛚᛏ᛬ᚴᚢᚾᛦའྲཡ།ྱཁ貴省记멸젓새액બાપુ広ݨڈ瓮男Țማዊ፡ቀፄኃይላሴሞዓበሳዘእደሁሰዩዚብሔḳṗÐ瀬淳渡崋ď留ĕ歼轰殲轟ሾባ좋요ܺܶ宁検禎巴變ṆÌ円恐竜풀ບ逆ই샘硬蛋◦లడഡಲು汝契歓航ấ莫네알심চে阪屠锷孙栋械艾孩編誕宿𡨸ȳ𐀡𐀮𐀆𐀃𐀺𐀚崎弘母ŐŰ♯ყ滇重ˍ譲ತಳಭಷೆ哈滨濱融ई鈴Ү券ቴዎሮኖረἹভ͵ͺ僕柳占蒙ՌÔʝौ襲闡류룡驪姬苔紫ֻ๊斉ำ̈란봉鼻婦Խ潜集梁羌칫볶圣统亚吉修早図意׃来軒嚴殊染炎防紓困條梓織ごゃ몽⌵蒿올鍵閉∅ሃራዴኤ义役琈ɕʎˑ☰虔狄ɯჩ希ŶĜ힘芭蕉ヶ☿刈蝙怀靜坐杰伦毛ਯ彦ۋ今猫耳켓ỳழீப夕相పెఏఱే느왔ㅈㅊㅋㅌㅍㅎ히礼艺મહ્દખનતરીજቡٵ干飯ింരങṅὤ赛羊羹至ɟ槙凯钰恒려孔庙侘寂∩얼애閔ܸ奶邑Ấ케팝৭৬এĽ豊직있ペɦ통脇☂ກ何穗Қ멜꼴坡偵ڱ谟结凜硫黄欽ञṯ♂ৌՅ点截劇左縱纵ﬧ辩证卫血焦ữẩ菠蘿包ೊಸ刮痧❪❫讃伏羲媧燧贯彻执按劳则圆圓快້ܰܽܳ얄裏参Þǫ追迹歴裵柱그꽃득를坊鬼童Փľ莎车混沌粋𒆗섭満齢２遺룹ểŒ테윈◾☭陈皮ۦ瑢淸匪懈Ϭⲱⲛฐ悠父ࠔࠝࠌࠠࠀಹವೀഹവൻṉ杭캔熹翼𐭊𐭍秋烏耶锐享征楼향찰鄕札訣吏讀棣営活ڧⅢ친Ⅱ协超级联辰玛榜ỉ힐러陕陝ɢ版呪廻乱闘独潔ἥἼؿݒ冨勲螢蟲晴ڃⴽ摂별కీద┘└ꜥ單ځฤྙ손怒先げ陛當흑Ȳ吴ἴ指挥枪悲進실攝宦ㄱㄴㄷㄹㅁㅂㅇ표焚坑儒ຸູມڋ耕藏ḍḷḹṝĶἌ강璧Љ殷敷急골든猛過Ϊ΅與線雜誌没祈텐视环ň؉鐘Ἔគំ岭站것粉𐩣𐩲𐩵ዐ₩엔穀麦粟豆黍Ṣ抜ª植≫권범整狀ῷỗ戀驚ⲩ鍛锻謝変？処察兜鍪ļἡґ฿ፍ孫〜想馮夢˛ѧ淡적燈流灯冥蜃蓝儿藍兒₦纪碑店弄隘烂腾讯微搜狐印薏漿응답₃ǀ쇼챔ង៊ីពុជ痴ৰ拜旅반才槌續仙瑞충忠辣칼과湾夜Ἁ著餓盤ɳ榎팀ᄎ沈ű聯再̌ⲫ₀Ў錄𓇾宽刘双乳ქ⊂舟号പകശ逖ቢቤ뻤埔형☃눈呉ີ່록祿內将託總품ຜ県ⴹ如량推ῴ热娅卡ヌ：！妫媯滿胡¢铁浄Ὂ嵯峨ℰ在ゴ総領∀∃昀휘輝익弗堀康親燒酒焼酎使稻坝壩旭ჯ匂錵农辛旽鴨특坎弈滄輪堅魚қ桓因ӏ写墨ហែፋ淵邓稼鄧ụұ箱絶睦蔵층鶻ጋ俗绍芬★咖啡杯尙禮⁰☄평庫락ḇ緬爭၁၇၆၅၉牛崇專ÙწᏣᎳᎩᏬᏂ함弥薫ۂǃ균禪禅ṟ⨎监ច៉យЊ농설說蹲踞籠ʽ岸빛祝婷𒄭ʜ闪姚ធណដឋត체탑思遠奄覚異婚姻譚玄謙徳ऑ¶顔肇鴻培詰殺勇콩쥐팥ǞǟḐḑĻŅȮȯȰȱȬȭŖŗŪ是舌深處牢跃进嬪萬❖凪遇懋ŷɽɻ幟帜強躺蜜獅豬鹅鵝벌様禰ٶ워ℶړ报告齋叟乍畏挺许勤移团达諸ևਖੇਥʉ湯施ὕ種予待ѣ께끼할접ህወ鷄림雞匹痕∎鎮℥클럽宪ܖぞヨ屯衍綏ฑ줘ớ巳辟勋复݂举赤於‒兩晉루ʐʋ瀚①②③④⑥⑦ˇ캐兀𒌓𒄒𒉣萧素股˓齐场齊습嫦娥ʱℝ彼伸吾념규诺야ㅏㅑ実僵尸屍殭偽關关室῾ɫʟ熙侯柏潢战熱ⵊへ̊ḕㆍㅺㅼㅽㅆㅾㅅㄺㄻㄼ필剥捨洽ჟ零֞Џ示센ኣዲ磁Ｖ絲綢丝绸ڌਉੈ红崖ǚ禧愼ڜ♥頼Ὕ銭覃Ћ操董于赵鞅侦閩ῳ曾¹𐍆𐌰𐌿𐍂𐌲𐌾称準禁弐拾葛媛備稚為啟↦厦猴仰汽从람覽盧伐范雎項瑱매훔쳐现俄播遷ฏ妇喷奏ⴴ系箭涛苦瓜瞎滌严履駄鞋𝄞量验ᱥᱟᱱᱛᱲᱤቅ憂廉恥局枝座ቱ寶觀ɘફોલસ痘卖높말댓𒄑𒉋𒂵𒈨𒌋ฮɗ優헤븐입桜ẻ节貓捒朗巫龜崙鲜담윷놀큰굿付舊參逃亡传푸늑똥발바쓰審規Ϲẋ桑⁸셧█☼襪瞞努琉諍町绿⇓녹읍灰哀Ḫ失օ衽택젊𐎅𐎄𐎆蝦夷உ発背軌跡笠衆∬ť期雯曉ẫ伪维궐闕各洗ᾰ冬𒈠蔡畅카톡促切昶곽置迎璋苏킬ශ්මැේĤɰ܆脉ኝඌෙසහ數蠡᷄紘ਗ₽鑫ĂŘĘ靖奉装錬輔宫藝「」악ӗ闹谈ὺ柿麿எஸ礁ㄲㄸㅃㅉㄳㄵㄾㄿㅄ侨괜찮ቫ祯洞潘鏞垂緣隈总拌난震斛歌凡提판輸혹苗姓卿ແ▧创优品創稔煙ῡ霊Ғ驗ʪʫ͜ɮ옷붉끝講肖琪⅛扇벤睡眠杻鯨थ톱棩ϮⲅⲘⲚ築⋮⋯ϵ픈梠엄탁ℓ鹘暮ଶାଗਭ蒋𐱃𐰺𐰴ぢ紐弓ῃȇȋ駕洛落伽ືࣰࣱۍ财问题贷综衡导货币匙널ǝ惠̇감ゾ̍ኑ臺빔柔；鄉廊ᵐᵑ望및률∨ʢ迅树✝✚။議ችቦናሕ了过₱ޝ＝旗允橫遼례慰涉ừᮘᮞᮥᮔ᮪ᮓȟ𒆳𒄩𒉡𒋫惣揆網馴寿❤̨麒専欧泓奕歐ʌ導翁冲ĹĎḞḠ昼₠잎닙Ց恪愍替劲邮郵寫樓钟ゑ評되堤蠻戎ϒⅣ綰臧荼製ូ✪毫숭겸霜엽ɲ紅妉欢奖令屏授鷲倻緑ึ陸駘믹🌛🌜戌亥番雪线帯莉怨浩롯데˜ゥ뻐痛嬖豚卵巣糠漬づ暗鉉ڽ웹툰𐰢𐰍𐰀卷訕ښໍ煎⎁▬ිජතය瓴駿亂ἠ醒槿Ձ臥薪嘗胆齕脈ဖဉး٭ȝ閃ˁ答威刄庆伟疆襄ཀ疫昧𐭠𐭭⏺̤ټ毒薬寛汰嶺🜨谋獣급⇔까Ύ箕羆貔貅貙陆邊ⵥⵕ𐭮蕃척位鷹𝟗𝟖𝟐허許筆笔萌侠刑沪皿症候序破恵షḩ਼启님涼ܲܿ약𐎻𐏁𐎫𐎠𐎿𐎱ほฉٿ桔梗Ƥ被丙徭餘麗齮宾词詞♀Ὅ辅馹瞬仏浴⟫⟪ꜢԺ訶妓ෝටோ२᾽柴ൂറൽഇെഫോഎജേഷ콘繓牧仓姑舂渉逊碣ֽ֥֤֛֙핑ᱚᱞᱪᱠ弊證済ＪＡ𐤃𐤉𐤁𐤍兴杜疹奧賓倂셔ɞ닮斷袖癖ἓ견뎌媽廟妈阁莞쇄鎖ѦѨѬѪ誠앤窗Ὄ交圏沅暁혈黨别谊芝☮ጽڄۉ扬鶴歡稽樵ỏ𐤇𐢊𐡇ⲑⲗ藥𐩢𐩳𐩧𐩥≃兎説嚢弖홈ἝᵻϚ影戶ĺ⁇∝套应ຍ昊嬌揮閑ϑϜಶಗ빵활殘演縄⇐ᓱᖽᐧᖿ茸脱扉葫芦娃𐩦淖默쩐쟁縁境述尧然贺ẓቆቈታቲ脚ሬ鹏룰랄向脏腑駅।॥३ࣲ亦ḏ糸岐诸팔ＷｉｔｈＴｅｎｄＢｌｖＩＭａｇｃḲៀ渇磨菌邦妮ἆѓ鏘⌈⌉넷솔첫놈멋었針遂扈輒济带˘錢厘割铀분점制̠边屬份属琴异朕寡办议ཅཐཚཛʈ两Ъ♈♋♎♑顓頊汾蓐姒ǜݭ余鬱𒈩𒌦芽램덩敎측茨貨ಜగ葵˃˂罰臭椿막럼𐩫Ծ妙⇌ਂৃ항劈勾넌뭔往۵枫杨故육抱垢淨陀鱲縣ဒအṵḭ麹Ҳ暉權擥笙엠鑒잼猿谱魏ډڼɖঅ☐☑☒ᾍ👌須哉𐎜𐎂𐎗𐎚⊳̟兆宅圀ূ১♮试試抗剑鐸ₓ‿昔했잖鳩桂幽霍蔘Ẓ扶跋ῤඩබණනݥ돌잔渾膽孤ꞌ료묘墳墓盜掘澍િયူ裳鵜绪稳穩裔ᓄᓇᕗᑦ臟選棟ἵ誾ੴਇਓ幻ຣ슈퍼úßβიაış$υλήτωνΕְהוָעִברםקתלּדמשחַικμδερσόςχηܝܐÖןט′γάί−ンール٠πΔา่#כז中£﴿﴾^Гдплжա→राφ?тцяумхθύέסە大КбіМСйчนス~րōы″եğΠłåッのьгзפիБРПאİəˈΣΑВНАščшăⴰנ्±גہצ۔ոն‹›イ́ܪî←Ι'🏽镇衢𐎼𐎷𐎡𐎴𐎹ۊ俱외격弉諾룽뿐菊චද墟圳덤옹움晩徽𒉈盗ݕڿƝ壱叢荆ឃ袴甚単렉ខៅ塊喵벅果ಠಥヮ棠˸ৈ🏻벨벳린웬锦绣뜨宛ᆨᆩᇺᇃᇻᆪᇄᇼᇽᇾᆫᇅᇿᇆퟋᇇᇈᆬퟌᇉᆭᆮᇊퟍퟎᇋퟏퟐퟑퟒퟓퟔᆯᆰퟕᇌퟖᇍᇎᇏᇐퟗᆱᇑᇒퟘᆲퟙᇓퟚᇔᇕᆳᇖᇗퟛᇘᆴᆵᆶᇙퟜퟝᆷᇚퟞퟟᇛퟠᇜퟡᇝᇞᇟퟢᇠᇡᇢᆸퟣᇣퟤퟥퟦᆹퟧퟨퟩᇤᇥᇦᆺᇧᇨᇩퟪᇪퟫᆻퟬퟭퟮퟯퟰퟱퟲᇫퟳퟴᆼᇰᇬᇭퟵᇱᇲᇮᇯퟶᆽퟷퟸퟹᆾᆿᇀᇁᇳퟺퟻᇴᇂᇵᇶᇷᇸᇹ凛𑀥𑀁𑀫𑀮𑀺𑀧₾卍卐ぽ棚就煥圭🔹️菱棋麟疊"

In [14]:
stripped_chars = "♫♪¡²º¿ÁÅÇÉàáâãäæçèéêëìíïñòóôöøùüāćČēěīū˚ยรอ–—‘’“”…€♪♫½¼¾™٫پچڤڨڭڴ®"
stripped_chars += "\xa0"
stripped_chars += "\x80"
stripped_chars += "\x93"
stripped_chars += "\x94"
stripped_chars += "\x87"
stripped_chars += "\u200e"
stripped_chars += "\u200f"
stripped_chars += "\u202a"
stripped_chars += "\u202c"
stripped_chars += "\u200c"
stripped_chars += "\u2066"
stripped_chars += "\u200d"
stripped_chars += "\x8d"
stripped_chars += "\x89"
stripped_chars += "\u2060"
stripped_chars += "\u2063"
stripped_chars += "\U0010fc00"
stripped_chars += "\x81"
stripped_chars += "\x9b"
stripped_chars += "\u2069"
stripped_chars += "\u2067"
stripped_chars += "\x88"
stripped_chars += "\x9d"
stripped_chars += "\U0001faf2"
stripped_chars += "\U0001faf1"
stripped_chars += "\u061c"
stripped_chars += "\xad"
stripped_chars += "\u06dd"
stripped_chars += "\x97"
stripped_chars += "\u206c"
stripped_chars += "\u206a"
stripped_chars += "\x9e"
for c in rare_chars:
    stripped_chars += c
stripped_chars = "".join(list(set(stripped_chars)))

In [15]:
def clean_pipeline(text):
    text = araby.strip_diacritics(text)
    text = araby.strip_tatweel(text)
    # text = araby.normalize_alef(text)
    # text = araby.normalize_hamza(text)
    # text = araby.normalize_teh(text)
    # text = araby.normalize_ligature(text)
    text = text.translate(str.maketrans(constants.UNICODE_LETTERS_MAPPING))
    text = text.replace("♫", "")
    text = text.replace("♪", "")
    text = text.replace("\xa0", "")
    text = text.replace("\x85", "")
    text = text.replace("\x96", "")
    text = text.replace("\u200a", "")
    text = text.replace("\u2009", "")
    text = text.replace("\u3000", "")
    text = text.replace("\u202f", "")
    text = text.replace("\u2002", "")
    text = text.replace("\u2003", "")
    # delete punctuations
    # text = re.sub(
    #     r"""([.,!?()\/\\،"'\{\}\(\)\[\]؟<>«»`؛=+\-\*\&\^\%\$\#\@\!:|…;؟–−])""",
    #     r"",
    #     text,
    # )
    # text = text.translate(str.maketrans({key: "" for key in string.punctuation}))
    
    # text = re.sub(rf"{stripped_chars}", "", text)
    text = text.translate(str.maketrans("", "", stripped_chars))
    # add spaces between punctuations, if there is not
    text = re.sub(
        r"""([.,!?()\/\\،"'\{\}\(\)\[\]؟<>«»`؛=+\-\*\&\^\%\$\#\@\!:|…;؟–−])""",
        r" \1 ",
        text,
    )
    text = text.translate(
        str.maketrans({key: " {0} ".format(key) for key in string.punctuation})
    )
    # normalize punctuations
    text = PUNC_NORMALIZER.normalize(text)
    # delete extra spaces
    text = re.sub("\s{2,}", " ", text).strip()
    text = text.replace("١", "1")
    text = text.replace("٢", "2")
    text = text.replace("۲", "2")
    text = text.replace("٣", "3")
    text = text.replace("٤", "4")
    text = text.replace("٥", "5")
    text = text.replace("٦", "6")
    text = text.replace("٧", "7")
    text = text.replace("۷", "7")
    text = text.replace("٨", "8")
    text = text.replace("٩", "9")
    return text.replace(' ','▁')

In [16]:
clean_pipeline('السلام عليكم ورحمة. الله')

'السلام▁عليكم▁ورحمة▁.▁الله'

In [17]:
def prepare(text):
  return clean_pipeline(text)

In [18]:
# test the prepare method
prepare('hello بالإنجليزية تعني أهلاً')

'hello▁بالإنجليزية▁تعني▁أهلا'

In [19]:
train_dataset = list(map(prepare,tqdm(train_dataset)))
train_dataset = list(filter(lambda doc:len(doc)>0,tqdm(train_dataset)))
train_dataset[:2]

  0%|          | 0/8849132 [00:00<?, ?it/s]

  0%|          | 0/8849132 [00:00<?, ?it/s]

['عرضت▁جون▁أعمالها▁للمرة▁الأولى▁في▁باريس▁عام▁1919▁في▁صالون▁دوتون▁،▁واستمرت▁بالعرض▁بانتظام▁حتى▁أواسط▁العشرينيات▁،▁حيث▁بدأ▁تنسكها▁يتعاظم▁وقل▁رسمها',
 'موليتور▁هي▁بلدة▁تقع▁بولاية▁ويسكونسن▁في▁الولايات▁المتحدة']

In [20]:
val_dataset = list(map(prepare,tqdm(val_dataset)))
val_dataset = list(filter(lambda doc:len(doc)>0,tqdm(val_dataset)))
val_dataset[:2]

  0%|          | 0/44469 [00:00<?, ?it/s]

  0%|          | 0/44469 [00:00<?, ?it/s]

['فكثرت▁مساكنها▁الثابتة▁وشقت▁الطرقات▁والشوارع▁وأقيمت▁فيها▁المعابد▁البوذية▁بكثرة▁(اليوم▁أكثر▁من▁400▁معبد)▁وكذلك▁القصور▁والحدائق▁والجسور▁والتحصينات▁،▁كما▁وصلت▁إليها▁السكة▁الحديدية▁عام▁1900▁،▁وافتتحت▁فيها▁أول▁جامعة▁هي▁جامعة▁تشولالونغكورن▁عام▁1917',
 'فالتصنيف▁الفرعي▁حسب▁تصنيف▁كوبن▁للمناخ▁هو▁منطقة▁(Aw)▁أو▁السافانا▁الاستوائية']

In [21]:
test_dataset = list(map(prepare,tqdm(test_dataset)))
test_dataset = list(filter(lambda doc:len(doc)>0,tqdm(test_dataset)))
test_dataset[:2]

  0%|          | 0/89835 [00:00<?, ?it/s]

  0%|          | 0/89835 [00:00<?, ?it/s]

['المتحف▁البحري▁أو▁متحف▁ن',
 'جاء▁في▁(تاريخ▁الهاغاناه)▁أنه▁عندما▁دمر▁لواء▁هنيغف▁(النقب)▁لتابع▁للبلماح▁قرية▁بربر▁،▁(بدأ▁الفلاحون▁من▁القريتين▁المجاورتين▁حليقات▁وكوكبا▁بالفرار▁في▁تجاه▁جبال▁الخليل']

In [22]:
vocabs_dict = {}
for document in tqdm(train_dataset):
  for word in document.split():
    vocabs_dict[word] = vocabs_dict.get(word,0)+1
f'{len(vocabs_dict.keys()):,}',f'{sum(vocabs_dict.values()):,}'

  0%|          | 0/8848888 [00:00<?, ?it/s]

('8,834,263', '8,848,888')

# Helper functions and constants

In [23]:
train_dataset[:10]

['عرضت▁جون▁أعمالها▁للمرة▁الأولى▁في▁باريس▁عام▁1919▁في▁صالون▁دوتون▁،▁واستمرت▁بالعرض▁بانتظام▁حتى▁أواسط▁العشرينيات▁،▁حيث▁بدأ▁تنسكها▁يتعاظم▁وقل▁رسمها',
 'موليتور▁هي▁بلدة▁تقع▁بولاية▁ويسكونسن▁في▁الولايات▁المتحدة',
 'الحياة▁العائلية',
 'تم▁عرضه▁لأول▁مرة▁في▁الدنمارك▁بتاريخ▁13▁فبراير▁2014▁وفي▁أول▁أبريل▁بالولايات▁المتحدة',
 'وعادة▁ما▁تكون▁هذه▁الحفريات▁مجزأة▁ولا▁تحتوي▁على▁أنسجة▁لينة',
 'حاصل▁على▁شهادة▁البكالوريوس▁بتاريخ▁السابع▁عشر▁من▁يونيو▁عام▁1982م▁من▁جامعة▁البصرة▁في▁كلية▁الآداب▁بقسم▁اللغة▁العربية▁،▁وحاصل▁على▁شهادة▁الماجستير▁،▁بتاريخ▁الأول▁من▁أبريل▁عام▁1997م▁من▁جامعة▁الكوفة▁في▁كلية▁الآداب▁بقسم▁اللغة▁العربية▁،▁وعل▁شهادة▁الدكتوراه▁بتاريخ▁الثلاثين▁من▁شتنبر▁عام▁2009م▁من▁جامعة▁الكوفة▁في▁كلية▁الآداب▁بقسم▁اللغة▁العربية',
 'شورية▁كبيرة▁الأزهار▁(الاسم▁العلمي:▁Shorea▁grandiflora)▁هي▁غير▁مؤكد▁من▁النباتات▁تتبع▁الشورية▁من▁مجنحية▁الثمر',
 'رفض▁السكان▁ان▁ينفذوا▁أمر▁الطرد▁،▁لعدم▁وجود▁البديل▁لإقامتهم',
 'هناك▁العديد▁من▁أنواع▁كهوف▁الحمم▁،▁وهذه▁أبرزها:',
 'يشعر▁بعض▁الرجال▁بالإجهاد▁نتيجة▁ضغط▁مجتمعهم▁للتصرف▁بطريقة▁ذك

In [24]:
# Find out the max samples token
# sorted_docs_by_length = sorted(tqdm(train_dataset),key=lambda document: len(document.split()),reverse=True)
sorted_docs_by_length = sorted(tqdm(train_dataset),key=len,reverse=True)
len(sorted_docs_by_length[0]),\
len(sorted_docs_by_length[1]),\
len(sorted_docs_by_length[2]),\
len(sorted_docs_by_length[5]),\
len(sorted_docs_by_length[10]),\
len(sorted_docs_by_length[50]),\
len(sorted_docs_by_length[1_000]),\
len(sorted_docs_by_length[2_500]),\
len(sorted_docs_by_length[5_000]),\
len(sorted_docs_by_length[10_000]),\
len(sorted_docs_by_length[20_000])

  0%|          | 0/8848888 [00:00<?, ?it/s]

(32876, 8248, 7814, 6779, 5356, 3226, 1517, 1195, 997, 834, 693)

In [25]:
# Find out the max samples token
# sorted_docs_by_length = sorted(tqdm(test_dataset),key=lambda document: len(document.split()),reverse=True)
sorted_docs_by_length = sorted(tqdm(test_dataset),key=len,reverse=True)
len(sorted_docs_by_length[0]),\
len(sorted_docs_by_length[1]),\
len(sorted_docs_by_length[2]),\
len(sorted_docs_by_length[5]),\
len(sorted_docs_by_length[10]),\
len(sorted_docs_by_length[50]),\
len(sorted_docs_by_length[1_000]),\
len(sorted_docs_by_length[2_500]),\
len(sorted_docs_by_length[5_000]),\
len(sorted_docs_by_length[10_000]),\
len(sorted_docs_by_length[20_000])

  0%|          | 0/89835 [00:00<?, ?it/s]

(2237, 2061, 1969, 1823, 1373, 980, 442, 327, 259, 203, 153)

In [26]:
# setting seq_len:
seq_len = 500

In [27]:
def create_features_from_text_list(text_list,tokenizer):
  encoded = list()
  for doc in tqdm(text_list):
    encoded_doc = tokenizer.encode(doc)
    encoded_doc = tokenizer.pad(encoded_doc,length=seq_len)
    encoded_doc = encoded_doc[:seq_len]
    encoded.append(np.array(encoded_doc))
  return np.array(encoded)

In [28]:
# define batch size
batch_size = 256

In [29]:
def calculate_text_metrics(predictions,labels,target_tokenizer, print_text=False):
  # drop pads, those pads are not necessary pad tokens!!
  # last_pad = predictions[-1]
  # for i,pad in reversed(list(enumerate(predictions))):
  #   if pad == last_pad:
  #     predictions.pop(i)
  #   else:
  #     break

  true_text = ''.join(target_tokenizer.decode(labels))
  true_text = true_text.replace('<PAD>','').strip().replace('▁',' ')
  # true_text = re.sub(' +',' ',true_text)

  predicted_text = ''.join(target_tokenizer.decode(predictions))
  predicted_text = predicted_text.replace('<PAD>','')[:len(true_text)].strip().replace('▁',' ')
  # predicted_text = re.sub(' +',' ',predicted_text)

  if print_text:
    print(predicted_text)
    print(true_text)

  wer = word_error_rate(preds=predicted_text, target=true_text)
  cer = char_error_rate(preds=predicted_text, target=true_text)

  return wer,cer

# Undot the dataset

In [30]:
undotted_train_dataset = list(map(undot,tqdm(train_dataset)))
undotted_train_dataset[:2]

  0%|          | 0/8848888 [00:00<?, ?it/s]

['عرصٮ▁حوٮ▁اعمالها▁للمره▁الاولى▁ڡٮ▁ٮارٮس▁عام▁1919▁ڡٮ▁صالوٮ▁دوٮوٮ▁،▁واسٮمرٮ▁ٮالعرص▁ٮاٮٮطام▁حٮى▁اواسط▁العسرٮٮٮاٮ▁،▁حٮٮ▁ٮدا▁ٮٮسكها▁ٮٮعاطم▁وڡل▁رسمها',
 'مولٮٮور▁هٮ▁ٮلده▁ٮڡع▁ٮولاٮه▁وٮسكوٮسٮ▁ڡٮ▁الولاٮاٮ▁المٮحده']

In [31]:
undotted_val_dataset = list(map(undot,tqdm(val_dataset)))
undotted_val_dataset[:2]

  0%|          | 0/44468 [00:01<?, ?it/s]

['ڡكٮرٮ▁مساكٮها▁الٮاٮٮه▁وسڡٮ▁الطرڡاٮ▁والسوارع▁واڡٮمٮ▁ڡٮها▁المعاٮد▁الٮودٮه▁ٮكٮره▁(الٮوم▁اكٮر▁مٮ▁400▁معٮد)▁وكدلك▁الڡصور▁والحداىڡ▁والحسور▁والٮحصٮٮاٮ▁،▁كما▁وصلٮ▁الٮها▁السكه▁الحدٮدٮه▁عام▁1900▁،▁واڡٮٮحٮ▁ڡٮها▁اول▁حامعه▁هٮ▁حامعه▁ٮسولالوٮعكورٮ▁عام▁1917',
 'ڡالٮصٮٮڡ▁الڡرعٮ▁حسٮ▁ٮصٮٮڡ▁كوٮٮ▁للمٮاح▁هو▁مٮطڡه▁(Aw)▁او▁الساڡاٮا▁الاسٮواىٮه']

In [32]:
undotted_test_dataset = list(map(undot,tqdm(test_dataset)))
undotted_test_dataset[:2]

  0%|          | 0/89835 [00:00<?, ?it/s]

['المٮحڡ▁الٮحرٮ▁او▁مٮحڡ▁ں',
 'حاء▁ڡٮ▁(ٮارٮح▁الهاعاٮاه)▁اٮه▁عٮدما▁دمر▁لواء▁هٮٮعڡ▁(الٮڡٮ)▁لٮاٮع▁للٮلماح▁ڡرٮه▁ٮرٮر▁،▁(ٮدا▁الڡلاحوٮ▁مٮ▁الڡرٮٮٮٮ▁المحاورٮٮٮ▁حلٮڡاٮ▁وكوكٮا▁ٮالڡرار▁ڡٮ▁ٮحاه▁حٮال▁الحلٮل']

In [33]:
def train_model(
    model,
    train_dataloader,
    val_dataloader,
    text_type,
    max_epochs=100,
  ):
  checkpoints_path = Path(f"./DotsRetrieval/{text_type}")
  shutil.rmtree(checkpoints_path, ignore_errors=True)
  checkpoint_callback = ModelCheckpoint(
      mode="min",
      save_top_k=1,
      verbose=False,
      save_last=True,
      monitor="val_loss",
      save_weights_only=False,
      auto_insert_metric_name=True,
      save_on_train_epoch_end=False,
      dirpath=f"{checkpoints_path}/checkpoints",
      filename="{epoch}-{val_loss:.3f}-{step}",
  )
  callbacks = list()
  callbacks.append(checkpoint_callback)
  early_stopping_callback = EarlyStopping(
      monitor="val_loss",
      min_delta=0.0025,
    #   min_delta=0,
      patience=10,
      check_finite=True,
  )
  callbacks.append(early_stopping_callback)
  lr_monitor = LearningRateMonitor(
      logging_interval="step",
      log_momentum=True,
  )
  callbacks.append(lr_monitor)
#   callbacks.append(RichProgressBar())
  devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  trainer = Trainer(
      deterministic=True,
      callbacks=callbacks,
      gradient_clip_val=5,
      fast_dev_run=False,
      max_epochs=max_epochs,
      val_check_interval=0.25,
      accelerator="auto",
      devices=[0],
      # log_every_n_steps=max(len(train_dataloader) // 25, 1),
      log_every_n_steps=max(len(train_dataloader) // 25, 1),
  )
  trainer.validate(
      model=model,
      dataloaders=val_dataloader,
  )
  trainer.fit(
      model,
      train_dataloader,
      val_dataloader,
  )
  return trainer

# Prepare vocab

## source tokenizer

In [34]:
source_tokenizer = CharacterTokenizer(vocab_size=10_000_000)

In [35]:
source_tokenizer.train(text='\n'.join(tqdm(undotted_train_dataset)))

  0%|          | 0/8848888 [00:00<?, ?it/s]

Training CharacterTokenizer ...


In [36]:
source_tokenizer.vocab_size

179

In [37]:
# test the tokenizer
source_tokenizer.tokenize(undot(prepare('السلام عليكم و رحمة الله و بركاته'))),source_tokenizer.encode(undot(prepare('السلام عليكم و رحمة الله و بركاته')))

(['ا',
  'ل',
  'س',
  'ل',
  'ا',
  'م',
  '▁',
  'ع',
  'ل',
  'ٮ',
  'ك',
  'م',
  '▁',
  'و',
  '▁',
  'ر',
  'ح',
  'م',
  'ه',
  '▁',
  'ا',
  'ل',
  'ل',
  'ه',
  '▁',
  'و',
  '▁',
  'ٮ',
  'ر',
  'ك',
  'ا',
  'ٮ',
  'ه'],
 [2,
  12,
  15,
  12,
  2,
  8,
  6,
  10,
  12,
  3,
  13,
  8,
  6,
  16,
  6,
  5,
  23,
  8,
  11,
  6,
  2,
  12,
  12,
  11,
  6,
  16,
  6,
  3,
  5,
  13,
  2,
  3,
  11])

In [38]:
''.join(v for v,f in source_tokenizer.vocab.items() if 0<f<100)

''

In [39]:
dict(sorted(source_tokenizer.vocab.items(),key=lambda item:item[1],reverse=True))

{'ٮ': 175703108,
 '▁': 175697703,
 'ا': 134170900,
 'ل': 87515206,
 'م': 50502091,
 'و': 45345579,
 'ه': 43683462,
 'ر': 42819149,
 'ڡ': 36974108,
 'ح': 30432973,
 'ع': 29300650,
 'د': 29295046,
 'س': 28419481,
 'ك': 17479055,
 'ص': 11176735,
 'ى': 9762844,
 'ط': 8914055,
 '،': 8375857,
 '1': 4631719,
 '0': 3534787,
 '2': 3000399,
 '9': 2702938,
 '"': 2274429,
 'ء': 2253015,
 '(': 1593790,
 ')': 1568198,
 '8': 1441854,
 '3': 1355936,
 '5': 1309435,
 '4': 1258386,
 '6': 1177995,
 '7': 1176616,
 'e': 791106,
 'آ': 782015,
 '-': 728899,
 'a': 723819,
 ':': 723209,
 'i': 599563,
 'o': 557350,
 'ں': 549072,
 'r': 541672,
 'n': 520397,
 't': 494609,
 's': 418641,
 ',': 398016,
 'l': 397639,
 'u': 270045,
 'c': 252157,
 'm': 233877,
 'd': 232602,
 'h': 214816,
 '/': 213546,
 '؛': 206828,
 'A': 175896,
 'C': 174822,
 'S': 171243,
 '%': 167486,
 'g': 165445,
 'p': 159505,
 'y': 123184,
 'P': 118319,
 'M': 118064,
 'f': 116135,
 'T': 113734,
 'I': 102824,
 'D': 102237,
 'b': 100818,
 '[': 100357

In [40]:
source_tokenizer.save_model(file_path='dotless_arabic/experiments/dots_retrieval/bin/source_tokenizer.model')

Saving as pickle file ...


## target tokenizer

In [41]:
target_tokenizer = CharacterTokenizer(vocab_size=10_000_000)

In [42]:
target_tokenizer.train(text='\n'.join(tqdm(train_dataset)))

  0%|          | 0/8848888 [00:00<?, ?it/s]

Training CharacterTokenizer ...


In [43]:
target_tokenizer.vocab_size

195

In [44]:
# test the tokenizer
target_tokenizer.tokenize(prepare('السلام عليكم و رحمة الله و بركاته')),target_tokenizer.encode(prepare('السلام عليكم و رحمة الله و بركاته'))

(['ا',
  'ل',
  'س',
  'ل',
  'ا',
  'م',
  '▁',
  'ع',
  'ل',
  'ي',
  'ك',
  'م',
  '▁',
  'و',
  '▁',
  'ر',
  'ح',
  'م',
  'ة',
  '▁',
  'ا',
  'ل',
  'ل',
  'ه',
  '▁',
  'و',
  '▁',
  'ب',
  'ر',
  'ك',
  'ا',
  'ت',
  'ه'],
 [10,
  13,
  24,
  13,
  10,
  2,
  8,
  5,
  13,
  6,
  14,
  2,
  8,
  20,
  8,
  19,
  35,
  2,
  7,
  8,
  10,
  13,
  13,
  18,
  8,
  20,
  8,
  15,
  19,
  14,
  10,
  11,
  18])

In [45]:
target_tokenizer.vocab

{'<UNK>': -1,
 '<PAD>': -1,
 'م': 50502091,
 'ق': 15273002,
 'ط': 7258035,
 'ع': 25076678,
 'ي': 66911194,
 'ة': 26876400,
 '▁': 175697703,
 'ن': 40645199,
 'ا': 113016482,
 'ت': 35694363,
 'ظ': 1656020,
 'ل': 87515206,
 'ك': 17479055,
 'ب': 28667850,
 'ذ': 4091937,
 'ش': 7436805,
 'ه': 16807062,
 'ر': 37572090,
 'و': 44653074,
 '،': 8375857,
 'أ': 15081931,
 'ص': 6715974,
 'س': 20982676,
 'غ': 4223972,
 'ف': 21797688,
 'ء': 2253015,
 ':': 723209,
 'ؤ': 692505,
 '"': 2274429,
 ',': 398016,
 'ض': 4460761,
 'خ': 6160118,
 'د': 25203109,
 'ح': 13235647,
 '1': 4631719,
 'ز': 5247059,
 'ج': 11037208,
 '2': 3000399,
 'ث': 4949473,
 '(': 1593790,
 ')': 1568198,
 '4': 1258386,
 '5': 1309435,
 '٪': 43597,
 '3': 1355936,
 'إ': 6072487,
 '[': 100357,
 ']': 99939,
 '·': 4268,
 '*': 13657,
 '=': 32449,
 'ئ': 2997145,
 '9': 2702938,
 '8': 1441854,
 'A': 175896,
 'B': 94518,
 'C': 174822,
 'D': 102237,
 'E': 86089,
 'F': 72503,
 'G': 75057,
 'آ': 782015,
 '-': 728899,
 '0': 3534787,
 '&': 36379,
 'l'

In [46]:
{
    v: f for v, f in target_tokenizer.vocab.items() if 0 < f < 1000
}

{}

In [47]:
target_tokenizer.vocab

{'<UNK>': -1,
 '<PAD>': -1,
 'م': 50502091,
 'ق': 15273002,
 'ط': 7258035,
 'ع': 25076678,
 'ي': 66911194,
 'ة': 26876400,
 '▁': 175697703,
 'ن': 40645199,
 'ا': 113016482,
 'ت': 35694363,
 'ظ': 1656020,
 'ل': 87515206,
 'ك': 17479055,
 'ب': 28667850,
 'ذ': 4091937,
 'ش': 7436805,
 'ه': 16807062,
 'ر': 37572090,
 'و': 44653074,
 '،': 8375857,
 'أ': 15081931,
 'ص': 6715974,
 'س': 20982676,
 'غ': 4223972,
 'ف': 21797688,
 'ء': 2253015,
 ':': 723209,
 'ؤ': 692505,
 '"': 2274429,
 ',': 398016,
 'ض': 4460761,
 'خ': 6160118,
 'د': 25203109,
 'ح': 13235647,
 '1': 4631719,
 'ز': 5247059,
 'ج': 11037208,
 '2': 3000399,
 'ث': 4949473,
 '(': 1593790,
 ')': 1568198,
 '4': 1258386,
 '5': 1309435,
 '٪': 43597,
 '3': 1355936,
 'إ': 6072487,
 '[': 100357,
 ']': 99939,
 '·': 4268,
 '*': 13657,
 '=': 32449,
 'ئ': 2997145,
 '9': 2702938,
 '8': 1441854,
 'A': 175896,
 'B': 94518,
 'C': 174822,
 'D': 102237,
 'E': 86089,
 'F': 72503,
 'G': 75057,
 'آ': 782015,
 '-': 728899,
 '0': 3534787,
 '&': 36379,
 'l'

In [48]:
target_tokenizer.detokenize(target_tokenizer.tokenize(prepare('السلام عليكم و رحمة الله و بركاته')))

'السلام▁عليكم▁و▁رحمة▁الله▁و▁بركاته'

In [49]:
target_tokenizer.save_model(file_path='dotless_arabic/experiments/dots_retrieval/bin/target_tokenizer.model')

Saving as pickle file ...


# Run the experiment

## tokenize and split

In [50]:
encoded_trainset = create_features_from_text_list(text_list=undotted_train_dataset,tokenizer=source_tokenizer)
trainy = create_features_from_text_list(text_list=train_dataset,tokenizer=target_tokenizer)

  0%|          | 0/8848888 [00:00<?, ?it/s]

  0%|          | 0/8848888 [00:00<?, ?it/s]

In [51]:
encoded_valset = create_features_from_text_list(text_list=undotted_val_dataset,tokenizer=source_tokenizer)
valy = create_features_from_text_list(text_list=val_dataset,tokenizer=target_tokenizer)

  0%|          | 0/44468 [00:00<?, ?it/s]

  0%|          | 0/44468 [00:00<?, ?it/s]

In [52]:
encoded_testset = create_features_from_text_list(text_list=undotted_test_dataset,tokenizer=source_tokenizer)
testy = create_features_from_text_list(text_list=test_dataset,tokenizer=target_tokenizer)

  0%|          | 0/89835 [00:00<?, ?it/s]

  0%|          | 0/89835 [00:00<?, ?it/s]

In [53]:
testy_with_dots = create_features_from_text_list(text_list=test_dataset,tokenizer=target_tokenizer)

  0%|          | 0/89835 [00:00<?, ?it/s]

In [54]:
# encoded_trainset, encoded_valset, trainy, valy = train_test_split(
#   encoded_trainset,
#   trainy,
#   test_size=0.01,
#   random_state=seed,
# )
# len(encoded_trainset),len(encoded_valset),len(trainy), len(valy)

In [55]:
encoded_trainset.shape,trainy.shape

((8848888, 500), (8848888, 500))

In [56]:
# create tensor datasets
trainset = TensorDataset(torch.from_numpy(encoded_trainset), torch.from_numpy(trainy))
validset = TensorDataset(torch.from_numpy(encoded_valset), torch.from_numpy(valy))
testset = TensorDataset(torch.from_numpy(encoded_testset), torch.from_numpy(testy))
testset_with_dots = TensorDataset(torch.from_numpy(encoded_testset), torch.from_numpy(testy_with_dots))

In [57]:
# create dataloaders
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size,num_workers=4)
valloader = DataLoader(validset, shuffle=False, batch_size=batch_size,num_workers=4,drop_last=False)
testloader = DataLoader(testset, shuffle=False, batch_size=batch_size,num_workers=4,drop_last=False)
testloader_with_dots = DataLoader(testset_with_dots,shuffle=False,batch_size=batch_size,num_workers=4,drop_last=False)

## build and train the model

In [58]:
model = LitBiLSTMModel(
    seq_len=seq_len,
    vocab_size=source_tokenizer.vocab_size,
    output_size=target_tokenizer.vocab_size,
  )
model

LitBiLSTMModel(
  (train_accuracy): MulticlassAccuracy()
  (val_accuracy): MulticlassAccuracy()
  (test_accuracy): MulticlassAccuracy()
  (embedding): Embedding(179, 512, padding_idx=1)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.33, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (fc): Linear(in_features=512, out_features=195, bias=True)
)

In [59]:
trainer = train_model(
    model,
    train_dataloader=trainloader,
    val_dataloader=valloader,
    text_type='dotless-to-dotted',
  )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A4500') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name           | Type               | Params
------------------------------------------------------
0 | train_accuracy | MulticlassAccuracy | 0     
1 | val_accuracy   | MulticlassAccuracy | 0     
2 | test_accuracy  | MulticlassAccuracy | 0     
3 | embedding      | Embedding          | 91.6 K
4 | lstm           | LSTM               | 10.5 M
5 | dropout        | Dropout            | 0     
6 | fc             | Linear             | 100 K 
------------------------------------------------------
10.7 M    Trainable params
0         Non-trainable params
10.7 M    Total params
42.775    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00003: reducing learning rate of group 0 to 5.0000e-04.


Validation: 0it [00:00, ?it/s]

In [60]:
trainer.test(ckpt_path='best',dataloaders=testloader)

Restoring states from the checkpoint path at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=1-val_loss=0.042-step=60489.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=1-val_loss=0.042-step=60489.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.9858377575874329, 'test_loss': 0.04184914752840996}]

find the text metrics, (wer,cer,ver)

In [61]:
model_predictions = trainer.predict(ckpt_path='best',dataloaders=testloader)

Restoring states from the checkpoint path at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=1-val_loss=0.042-step=60489.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=1-val_loss=0.042-step=60489.ckpt


Predicting: 0it [00:00, ?it/s]

In [62]:
predictions = list()
labels = list()
for (batch_predictions,batch_labels) in model_predictions:
  batch_predictions = batch_predictions.view(-1,seq_len)
  batch_labels = batch_labels.view(-1,seq_len)
  for sample_predictions,sample_labels in zip(batch_predictions,batch_labels):
    predictions.append(sample_predictions)
    labels.append(sample_labels)
print('len predictions and lables of the test set:',len(predictions),len(labels))

len predictions and lables of the test set: 89835 89835


In [63]:
wers,cers,vers = list(),list(),list()
for i,(sample_preds,sample_labels) in tqdm(enumerate(zip(predictions,labels)),total=len(predictions)):
  wer,cer = calculate_text_metrics(
      predictions=sample_preds.tolist(),
      labels=sample_labels.tolist(),
      target_tokenizer=target_tokenizer,
      print_text=True if i < 100 else False,
    )
  wers.append(wer)
  cers.append(cer)

  0%|          | 0/89835 [00:00<?, ?it/s]

المتحف البحري أو متحف ن
المتحف البحري أو متحف ن
جاء في (تاريخ الهاغاناه) أنه عندما دمر لواء هنتعف (النقب) لتابع للبلماح قرية برير ، (بدأ الفلاحون من القرثيين المجاورتين حليفان وكوكبا بالقرار في نجاة جبال الجليل
جاء في (تاريخ الهاغاناه) أنه عندما دمر لواء هنيغف (النقب) لتابع للبلماح قرية بربر ، (بدأ الفلاحون من القريتين المجاورتين حليقات وكوكبا بالفرار في تجاه جبال الخليل
بعد فترة وجيزة ، رغم أن الجنوب قد ضرب عقل جوانا ولم يعد قادرة على الحكم ، لذا عرفت بلقب " جوانا الحيوية "
بعد فترة وجيزة ، زعم أن الجنون قد ضرب عقل خوانا ولم تعد قادرة على الحكم ، لذا عرفت بلقب " خوانا الجنونة "
آخر نشاط عسكري حدث في مارس عام 1949 عندما استولت القوات الإسرائيلية على صحراء النقب ووصلت إلى البحر الأحمر
آخر نشاط عسكري حدث في مارس عام 1949 عندما استولت القوات الإسرائيلية على صحراء النقب ووصلت إلى البحر الأحمر
وبدأ ماثيوس ويورك بلعتان في فريق ثنائي مرة أخرى
وبدأ ماثيوس ويورك يلعبان في فريق ثنائي مرة أخرى
ألكسندر موريتز فراي هو كاتب ألماني ، ولد في 29 مارس 1881 في ميونخ في ألمانيا ، وتوفي في 24 يناير 1957 في



في الفصل 8 من كتاب العقل الصائب ، يصف هايدن كيف بدأ دراسة علم النفس السياسي من أجل مساعدة الحزب الديمقراطي في الفوز بمزيد من الانتخابات ، ولكن في الفصل 12 يقول إن كل مجموعة من المجموعات السياسية الرئيسية - المحافظون والتقدميون والتحرريون - لديهم رؤى قيمة وأن الحقيقة والسياسة الجيدة تبرز من تناقس الأفكار
في الفصل 8 من كتاب العقل الصائب ، يصف هايدت كيف بدأ دراسة علم النفس السياسي من أجل مساعدة الحزب الديمقراطي في الفوز بمزيد من الانتخابات ، ولكن في الفصل 12 يقول أن كل مجموعة من المجموعات السياسية الرئيسية - المحافظون والتقدميون والتحرريون - لديهم رؤى قيمة وأن الحقيقة والسياسة الجيدة تبرز من تنافس الأفكار
موعد مع الموت هو فيلم كويتي إنتاج 1967 ، يتكلم قصتة عن رجل أعمال عنن يقال له أنه سيموت بعد عدة ساعات بخلطه وتجري الأحداث
موعد مع الموت هو فيلم كويتي إنتاج 1967 ، تتكلم قصته عن رجل أعمال غني يقال له انه سيموت بعد عدة ساعات بجلطه وتجري الاحداث
بلغ عدد الأسر 2 , 747 أسرة كانت نسبة 43% منها لديها أطفال تحت سن الثامنة عشر تعيش معهم ، وبلغت نسبة الأزواج القاطنين مع بعضهم البعض 65
بلغ عدد الأسر

In [64]:
avg_wer = sum(wers)/len(wers)
avg_cer = sum(cers)/len(cers)
print('avg_wer, avg_cer:',avg_wer,avg_cer)

avg_wer, avg_cer: tensor(0.0595) tensor(0.0152)


: 

text metrics with vowel words loader

In [None]:
# labels_with_dots = list()
# for batch in testloader_with_dots:
#   _,batch_labels = batch
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_labels in batch_labels:
#     labels_with_dots.append(sample_labels)
# print('len lables with vowel words of the test set:',len(labels_with_dots))

In [None]:
# wers,cers = list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels_with_dots),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#     )
#   wers.append(wer)
#   cers.append(cer)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer)

test on the best model according to the validation loss

In [None]:
# model = LitBiLSTMModel.load_from_checkpoint(
#     trainer.checkpoint_callback.best_model_path,
#     vocab_size=source_tokenizer.vocab_size,
#     output_size=target_tokenizer.vocab_size,
#   )
# model

In [None]:
# trainer.test(model,testloader)

In [None]:
# model_predictions = trainer.predict(model,testloader)

In [None]:
# predictions = list()
# labels = list()
# for (batch_predictions,batch_labels) in model_predictions:
#   batch_predictions = batch_predictions.view(-1,seq_len)
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_predictions,sample_labels in zip(batch_predictions,batch_labels):
#     predictions.append(sample_predictions)
#     labels.append(sample_labels)
# print('len predictions and lables of the test set:',len(predictions),len(labels))

In [None]:
# wers,cers,vers = list(),list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer
#     )
#   wers.append(wer)
#   cers.append(cer)
#   vers.append(ver)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# avg_ver = sum(vers)/len(vers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer,avg_ver)

text metrics with vowel words

In [None]:
# labels_with_vowel_words = list()
# for batch in testloader_with_vowel_words:
#   _,batch_labels = batch
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_labels in batch_labels:
#     labels_with_vowel_words.append(sample_labels)
# print('len lables with vowel words of the test set:',len(labels_with_vowel_words))

In [None]:
# wers,cers,vers = list(),list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels_with_vowel_words),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#     )
#   wers.append(wer)
#   cers.append(cer)
#   vers.append(ver)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# avg_ver = sum(vers)/len(vers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer,avg_ver)

In [None]:
# for sample_preds,sample_labels in tqdm(list(zip(predictions,labels_with_vowel_words))[:100],total=100):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#       print_text=True
#     )

In [None]:
# from google.colab import runtime
# runtime.unassign()