# Installs

In [1]:
# !pip install -q tqdm
# !pip install -q seaborn
# !pip install -q datasets
# !pip install -q scikit-learn
# !pip install -q pytorch_lightning
# !pip install -q git+https://github.com/MagedSaeed/tkseem

# Prepare

In [2]:
import re
import os
import shutil
import string
from pathlib import Path

from pyarabic import araby

from sklearn.model_selection import train_test_split

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import word_error_rate, char_error_rate

from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
    RichProgressBar,
)

from dotless_arabic import constants
from dotless_arabic.processing import undot, process
from dotless_arabic.tokenizers import CharacterTokenizer
from dotless_arabic.experiments.dots_retrieval.src.models import LitBiLSTMModel
from dotless_arabic.datasets.wikipedia.collect import collect_dataset_for_dots_retreival
# from dotless_arabic.datasets.aggregated.collect import collect_dataset_for_dots_retreival
from dotless_arabic.constants import LETTERS_MAPPING

import datasets
import seaborn as sns
import matplotlib.pyplot as plt
from sacremoses import MosesPunctNormalizer



In [3]:
seed = 42

In [4]:
# random.seed(seed)     # python random generator
# np.random.seed(seed)  # numpy random generator

# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

seed_everything(seed)

Global seed set to 42


42

In [5]:
tqdm.pandas()

# Load and explore the dataset

In [6]:
dataset = list(set(collect_dataset_for_dots_retreival()))
len(dataset)

  0%|          | 0/4636663 [00:00<?, ?it/s]

  0%|          | 0/4636645 [00:00<?, ?it/s]

  0%|          | 0/10867699 [00:00<?, ?it/s]

1440782

In [7]:
train_dataset,test_dataset = train_test_split(dataset,test_size=0.05,shuffle=True,random_state=seed)
len(train_dataset),len(test_dataset)

(1368742, 72040)

In [8]:
train_dataset[:2]

['ŸÉÿßŸÜÿ™ ÿ®ÿØÿßŸäÿßÿ™Ÿá ŸÖÿπ ŸÜÿßÿØŸä ÿßŸÑÿ∑ÿßÿ¶Ÿä Ÿà ŸÅŸä ÿπÿßŸÖ 2015 ŸàŸÇÿπ ŸÖÿπ ÿßŸÑŸÅŸäÿµŸÑŸä Ÿà ÿπÿßÿØ ÿ•ŸÑŸâ ŸÜÿßÿØŸä ÿßŸÑÿ∑ÿßÿ¶Ÿä 2017 Ÿà ŸàŸÇÿπ ŸÖÿπ ÿßŸÑŸÜÿßÿØŸä ÿßŸÑÿπÿ±ÿ®Ÿä ŸÅŸä ÿπÿßŸÖ 2018 Ÿà ÿßŸÜÿ™ŸÇŸÑ ÿ•ŸÑŸâ ŸÜÿßÿØŸä ÿßŸÑÿ¥ÿπŸÑÿ© ŸÅŸä ÿπÿßŸÖ 2019 Ÿà ŸàŸÇÿπ ÿ®ÿπÿØŸáÿß ŸÖÿπ ŸÜÿßÿØŸä ÿßŸÑŸÑŸàÿßÿ°',
 'ÿ™ŸÖ ÿ™ÿ¥ÿÆŸäÿµ ÿ™ŸÇÿ±Ÿäÿ± ÿ™ÿ≥ŸÑÿ≥ŸÑ Ÿàÿ±ÿßÿ´Ÿä ŸÖŸÜ ŸÉÿßÿ®Ÿäÿ™ÿßŸÑ ÿ®ŸäŸà ŸÖŸäÿØŸÑÿßÿ® ŸÖŸÜ ÿ®ŸÉŸäŸÜ ÿπŸÜ ÿßŸÑÿπÿßŸÖŸÑ ÿßŸÑŸÖÿ≥ÿ®ÿ® ŸÑŸÖÿ±Ÿäÿ∂ Ÿäÿ®ŸÑÿ∫ ŸÖŸÜ ÿßŸÑÿπŸÖÿ± 41 ÿπÿßŸÖŸãÿßÿå Ÿàÿ™ŸÖ ÿ¨ŸÖÿπ ÿßŸÑÿπŸäŸÜÿ© ŸÖŸÜ ŸÇÿ®ŸÑ ŸÖÿ≥ÿ™ÿ¥ŸÅŸâ ŸàŸàŸáÿßŸÜ ÿßŸÑŸÖÿ±ŸÉÿ≤Ÿä ÿ•ÿ∞ ÿ™ŸÖ ÿ™ÿ¥ÿÆŸäÿµŸá ÿπŸÜ ÿ∑ÿ±ŸäŸÇ ÿßŸÑÿÆÿ∑ÿ£ ÿ®ÿ£ŸÜŸá ŸÅŸäÿ±Ÿàÿ≥ ŸÖÿ™ŸÑÿßÿ≤ŸÖÿ© ÿ™ŸÜŸÅÿ≥Ÿäÿ© ÿ≠ÿßÿØÿ© ÿ¥ÿØŸäÿØÿ© (ŸÅŸäÿ±Ÿàÿ≥ ÿßŸÑÿ≥ÿßÿ±ÿ≥ ÿßŸÑÿ™ÿßÿ¨Ÿä)']

In [9]:
test_dataset[:2]

['ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä 2002 ŸáŸä ÿßŸÑŸÜÿ≥ÿÆÿ© ÿßŸÑÿ™ÿßÿ≥ÿπÿ© ŸÖŸÜ ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä ÿå ŸÑÿπÿ® ŸäŸàŸÖ 14 ÿ≥ÿ®ÿ™ŸÖÿ®ÿ± 2002 ŸÅŸä ÿßŸÑÿßÿ≥ÿ™ÿßÿØ ÿßŸÑŸàÿ∑ŸÜŸä ÿ®ÿ™Ÿäÿ±ÿßŸÜÿß ÿå ÿ®ŸäŸÜ ŸÅÿ±ŸäŸÇ ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ŸÉÿ£ÿ≥ ÿ£ŸÑÿ®ÿßŸÜŸäÿß ŸàŸÅÿ±ŸäŸÇ ÿØŸäŸÜÿßŸÖŸà ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ÿßŸÑÿØŸàÿ±Ÿä',
 'ŸàŸáÿ∞ÿß ÿßŸÑŸÖÿ±ÿ≥Ÿâ ŸÖŸÜ ÿ£ÿ≠ÿ≥ŸÜ ÿßŸÑŸÖÿ±ÿßÿ≥Ÿä Ÿàÿ∂ÿπŸãÿßÿå ŸàŸáŸà ÿ¥ÿ®Ÿá ÿÆŸÑŸäÿ¨ ŸÖŸÜ ÿßŸÑÿ®ÿ≠ÿ± ŸäÿØÿÆŸÑ ŸÅŸä ÿßŸÑÿ®ÿ±ÿå ŸàÿßŸÑÿ®ÿ± ŸÖÿ∑ŸäŸÅ ÿ®ÿ≠ÿßŸÅÿ™ŸäŸáÿå ŸàŸäŸèŸÉŸëŸÜ ŸÖŸÜ ÿ¨ŸÖŸäÿπ ÿßŸÑÿ£ÿ±Ÿàÿßÿ≠ÿå Ÿàÿ®ÿßÿ≥ÿ™ŸÇÿ±ÿßÿ±ŸÜÿß ŸÅŸäŸá ÿπÿßÿØÿ™ ŸÑÿ£ÿ¨ÿ≥ÿßÿØŸÜÿß ÿßŸÑÿ£ÿ±Ÿàÿßÿ≠ÿå Ÿàÿ£ŸÖŸÜŸêŸëÿß ŸÅŸä ŸÖÿ±ŸÉÿ®ŸÜÿß ŸÖŸÜ ÿßÿÆÿ™ŸÑÿßŸÑ ÿßŸÑÿØÿ≥ÿ± ŸàÿßŸÑÿ£ŸÑŸàÿßÿ≠']

In [10]:
train_dataset,val_dataset = train_test_split(train_dataset,test_size=0.05,shuffle=True,random_state=seed)
len(train_dataset),len(val_dataset)

(1300304, 68438)

In [11]:
val_dataset[:2]

['ŸÑŸÖ Ÿäÿ®ŸÇ ÿ¢ŸÑ ÿ≠ÿ±ÿ≤ ÿ®ÿπÿØ ŸàŸÅÿßÿ© ŸàÿßŸÑÿØŸá ÿ∑ŸàŸäŸÑÿßŸã ÿ•ÿ∞ ÿ™ŸàŸÅŸä ÿ¥ÿßÿ®ÿßŸã ÿ≥ŸÜÿ© 1340 ŸáŸÄ / 1921 ŸÖÿå ŸàÿØŸÅŸÜ ÿ®ÿ¨Ÿàÿßÿ± ŸàÿßŸÑÿØŸá ŸÅŸä ŸÖŸÇÿ®ÿ±ÿ© ÿßŸÑÿ•ŸÖÿßŸÖ ÿ®ÿ¨ÿØ ÿ≠ŸÅÿµÿå ŸàŸÑŸÖ ŸäÿπŸÇÿ® ÿ•ŸÑÿß ÿ®ŸÜÿ™ÿßŸã Ÿàÿßÿ≠ÿØÿ© ÿ™ÿ≤Ÿàÿ¨Ÿáÿß ŸÖÿ≠ŸÖÿØ ÿπŸÑŸä ÿßŸÑŸÖÿØŸÜŸä',
 'ŸÉÿßŸÜÿ™ ÿ®ÿØÿßŸäÿ© ŸÖÿ≥ŸÑÿ≥ŸÑ ÿ£ÿÆŸä ÿßŸÑÿπÿ≤Ÿäÿ≤ ÿπÿßŸÖ 1975ÿå ÿ≠Ÿäÿ´ ŸÉÿßŸÜÿ™ ÿπŸÑŸâ ÿ¥ŸÉŸÑ ŸÇÿµÿ© ŸÖÿµŸàÿ±ÿ© ŸÉŸÖÿß Ÿäÿ≥ŸÖŸäŸá ÿßŸÑŸäÿßÿ®ÿßŸÜŸäŸàŸÜ ŸÖÿßŸÜÿ∫ÿßÿå ÿ•ŸÑŸâ ÿ£ŸÜ ÿ™ÿ≠ŸàŸÑ ÿ•ŸÑŸâ ŸÖÿ≥ŸÑÿ≥ŸÑ ÿ™ŸÑŸÅÿ≤ŸäŸàŸÜŸä ÿπÿßŸÖ 1990 Ÿàÿπÿ±ÿ∂ ŸÑÿ£ŸàŸÑ ŸÖÿ±ÿ© ÿπŸÑŸâ ŸÇŸÜÿßÿ© NHK ÿßŸÑŸäÿßÿ®ÿßŸÜŸäÿ© ŸÖŸÜ 1991 - 1992ÿå ŸàÿØÿ®ŸÑÿ¨ Ÿáÿ∞ÿß ÿßŸÑŸÖÿ≥ŸÑÿ≥ŸÑ ÿ•ŸÑŸâ ÿßŸÑŸÑÿ∫ÿ© ÿßŸÑÿ•Ÿäÿ∑ÿßŸÑŸäÿ© ŸàÿßŸÑŸÅÿ±ŸÜÿ≥Ÿäÿ© ŸàÿßŸÑÿ£ŸÑŸÖÿßŸÜŸäÿ© ŸàÿßŸÑÿπÿ±ÿ®Ÿäÿ©ÿå ÿ•ŸÑÿß ÿ£ŸÜŸá ŸÑÿßŸÇŸâ ÿ≥ÿÆÿ∑ÿß ŸÉÿ®Ÿäÿ±ÿß ÿ≠Ÿäÿ´ ÿ£ŸÜŸá ŸÖŸÜÿπ ŸÅŸä ÿ®ÿπÿ∂ ÿßŸÑÿØŸàŸÑ ŸÉŸÅÿ±ŸÜÿ≥ÿß ŸÑŸÖÿß ŸÇŸäŸÑ ÿ£ŸÜŸá ÿ≥ÿ®ÿ® ÿØŸÖÿßÿ± ŸÑÿπŸÇŸÑŸäÿ© ÿßŸÑÿ£ÿ∑ŸÅÿßŸÑÿå ÿ®ÿ≥ÿ®ÿ® ÿ®ÿπÿ∂ ÿßŸÑŸÖÿ¥ÿßŸáÿØ ÿßŸÑÿ™Ÿä ÿ™ÿ™ŸÜÿßŸÅŸâ ŸÖÿπ ÿßŸÑÿ£ÿÆŸÑÿßŸÇÿå ÿ•ŸÑÿß ÿ£ŸÜŸá ÿ≠ÿßŸÅÿ∏ ÿπŸ

dataset chars and their counts

In [12]:
chars_dict = {}
for document in tqdm(dataset):
  for word in document.split():
      for c in word:
        chars_dict[c] = chars_dict.get(c,0)+1
f'{len(chars_dict.keys()):,}',f'{sum(chars_dict.values()):,}'

  0%|          | 0/1440782 [00:00<?, ?it/s]

('5,822', '317,098,386')

trainset vocabulary and tokens count:


In [13]:
vocabs_dict = {}
for document in tqdm(train_dataset):
  for word in document.split():
    vocabs_dict[word] = vocabs_dict.get(word,0)+1
f'{len(vocabs_dict.keys()):,}',f'{sum(vocabs_dict.values()):,}'

  0%|          | 0/1300304 [00:00<?, ?it/s]

('2,469,605', '58,440,721')

# Clean and Preprocess the dataset

In [14]:
PUNC_NORMALIZER = MosesPunctNormalizer()

strip chars with frequency less than 1000, exclude dotless letters

In [15]:
rare_chars = ''.join(c for c,f in chars_dict.items() if f < 1000)
rare_chars = ''.join(set(rare_chars)-set(LETTERS_MAPPING.values()))
len(rare_chars)

5651

In [16]:
rare_chars

'ËÄï·Ñ©Â•ñÁ¶çË°ÜÍ•≤ÿâ‚ãØÁæä·âµ‰ªã·ø¶êé¥Ô¥æÊ±∫Êº´Ë¨Ä‡®ÇÍ∞ëÊè≠Êæ≥‚ÇÄíÑäË¢ñÊï∏‚≤´Ÿ§ÂÖ®Ï†Ä≈í›ï‡µà–à·Ö≤ÈªçÈÜíËÇâÈÄü·ÆîÂ•ΩÊïµ·Ü∂ÁÜäÍ•®Î≥ÄÍ•£ÂºñÌü≥·ÄçÌÉë‡¨∂‡§¨Ê≤êÁßÅ‡∫°·É©‡µÇ‚â•‰øÇ·Ñû‰∫´‡∑ôÈπòË≤ª√Ç‚îî’ìÂºòÀáÏïàÍ•§·ûÉÈåµ’¥Îª§ÌñâÏû¨·ûñƒ†Êï¨Ëô´Á¨ô√ßŒâÎ≤åÁ´†·πü·Ü≥‡™´·á¶íâàÂç°≈ÜÈ¨ºÂ©¶Ë®£‚ÖîËá∫·Ü§·ä†·â¶—ôÎ©¥íä®‡Ωü≈°Ê∞¥‚ùÑÏπòíÄ≠‡¶°ËêäÊ≤åË•≤ÈóÆÍ≤å…ëÊ∂àÂ¢ìÃ•ÊÆ≠Îûú‰ª∞Ê†º‰πÖËê¨‡¨æÎû©ÁÑº‰øäÁä∂ŒÑÈôàÂõ∞È®éÌü∞ÌüõÈì∂Ë¨ôÊôâÌó§Á∂†·Ñª„ÉªœÜÂåóÊ¥≤Œõ„ÄçËôö„É∂ Ç„ÄÇÊÆµÈ∫óê≠ä·ªÉÀÅÍú•Â≤≠ÁΩó·∫ßË°ìËÄ≥Îπå‡ÆÆÏ∂òÁ∑¨·Ωå√åË∂≥ÈüìíÇç·ÜûﬁÑ‚ôàÊæ§ÏÖî‚òãÎò•ÎÖÄÂ•ßÊú¥\x87ÊäπŸî·ÄÆÁøÅÌè¨Êû∂·äëÍ•¢◊™Á©Ü‡ß¨ÃìÈ¶¨„Ñ∫–çÌïëê∞úÌü¶·æΩ·Éõ‡∏ç‡πÄ‡•®·ªòÂºìË™åŒ£°®∏ÂæÅÊñóÁ§∫Âè£·ºâÎßåƒóÊâç‚ÜêÂÖ∂ÎÇòÌüüËâ¶‰æãËÇ∫„Éû‡¶ôƒÖÁ∞°^ËÖëÂÖúÂ≤õ‚Ññ¬∂ÁØâÁª£Ëä¶‚µìÌüîÂä®ÈâÑ„Ç∂È£õ·áΩ·åãÊ®µÊùè∆¢≈£íå∑Âà∂Âú®Ôº∂ƒé·áÅÈü≠€É‡∞™»öÊú±√†·ö¥Í∑úÂë®ÏÑ†‡¥®‡∏©üá±Ë™≤·º†ÈÅìÎ¨º·É¨‚ù´‡ßÇ⁄µ·áÉê©§ÌåúÂ∑≥ê©£’´ πüá™ÏÇ¥Â≠ó·Ü©·á∫Âã≤‚¥±€ö·øÜíÑ≠ÈÉÅÁÅØÂ±ïÂÄãÁäñƒΩÁù°ÏÉòË©±Â•ë·ÖìË©ûÈΩï·áÖ‡∑ù·áù·ÄÖÌüïÂ∞∏È¥®ÁãÆÎ∂àÎπà·ªüÈÑï≈ØÌÉú‡∏πÊ∞óËºîÊñºÈÇÑ·ÜìÁñπÂäâ€ÄÂ°îÂ∫óÁÉ≠Ë£ïÔºçﬁôÊπñ—£·ìÑ√¥Êô∫‚ÖìË

In [17]:
# rare_chars = "·Éë·Éî·É†·Éô·Éù·Éö·ÉÆ·ÉóŒí÷π◊ÅŒù‚≤≠‚≤è‚≤ô‚≤ì√ªŒß‹°‹´‹ö–•√∞‚ÖîÁ•ûÈ¢®ÁâπÊîªÈöä≈û·º∞·ºÖ·ø¶·ºà·∏§ÊØéÊó•Êò†Áîª„Ç≥„ÇØÊñ∞ËÅûœé·ø¨·øÜ‚àï‡∏ï‡∏•‡∏î‡∏´‡∏±‡∏Å‡∏ó‡∏û‡πå‡πÅ‡∏á‡∏õ‡∏∞‡πÄ‡∏®‡πÑËÖ∞ÂºÅÂΩì„Ç™„Çø„É°„Ç¨„Éç⁄¨⁄†≈ëÂ§ÆÂü∑Ë°åÂßîÂì°ÊúÉ‰∏ªÂ∏≠‰∫∫Ê∞ëÁõÜË∏ä„Çä‘ø’Ω’Ø’§‡§°‡•Ä‡§®‡•ã‡§Æ‡§ø‡§Ø‡≤°‡≤ø‡≤®‡≥ã‡≤Æ‡≤∞‡≤Ø‡≤æŸπ·Ωê–îŒ¶·øñÊú™Ê≤âÊµÆÈò¥Â∞èÈ™ê∆∞∆°·ªöŒ§·ºîœï‰ªãÂ£ΩË∑ØÂá±ÈÅîÊ†ºËò≠ÈÅìƒÖËÖπÂ§™È§ÖÁ¶è≈†⁄òÍ∑ºÏ¥àÍ≥†Ïôï‚Öì‹¨‹ò–£—óÂºÄÁΩóÂÆ£Ë®ÄŒì–ß€áÊÖïÂ£´Â°îÂ≥∞‡∏™‡∏ò‡∏¥‡∏ö‡∏∏‡∏ç„ÉêŒõ—¥ÌîÑÎ¶¨Ïä§Ìä∏Áï•Âñ™Êúç‘±’¥≈£·é†·èç·é¶·èØ·èó·éØ–ûÊµ™ÈÄü‡¶≤‡¶æ‡¶®’Ñ’Æ’Ä’µ÷Ñ‚Çπ≈æ∆íÀ§ÂÖ´ÂÜõËªç√ÜÁ¨¨‰∫åÊ¨°„ÇΩ„É≠„É¢Êµ∑Êà¶Œò√µÔºë„É™„ÉàÊ∂ôÊú®Ëó§‰∫ú‰πü–Æ–ó·∫ßƒê·∫°·ªá·ª©·∫∑·ªπ·ªô·ªã◊•Œ∂ÏßÄÍ∏àÏö∞ÌïôÍµêÎäî·Ω©‚âÖ„ÄäÊ∏ÖÂçéÂõ≠Âë®Âõ¥Ê∞°Ê∞îÁöÑÂº∫Â∫¶ÂèäÊØèÂ§©ÂèòÂåñ„Äã—ä‚Çø·Éõ·É¶œä’ñ’ß’øÂú∞Áã±ÁçÑ√Çƒ°Œö‚âà√ΩƒÄ·πõÂÆâÈÇ£Ëà¨Ê≤êËã±·æ∂·ø∂Ï¶êÎ¨∏ÌÜ†Í∏∞ÏãúÎåÄÊ´õÊñáÂúüÂô®ÊôÇ‰ª£ÎπóÏÇ¥Î¨¥Îä¨·ºê…™ËèØÈñÄÈó®„ÄÅ‚ÄßÔΩûÔπèŒú¬≥–¢–ï–Ø–ôíÇçíãºíÖéíÖçíå®íÄ≠íáâ„Ç≤„É†„ÇÆ„Ç¢·ºô·Ω¥·º§—ôÏó¨ÏÑ±Í∞ÄÏ°±Î∂ÄÂ•≥ÊÄßÂÆ∂ÊóèÈÉ®„Éù„Ç±„Éõ‚Ä†≈Ø‚â•ÎØºÏ†ïÏÑúÍπÄÌö®ÏßÑÁéãÁø¶Âè∏È¶¨Â∞öÈÉ≠ÈñãÈüìÂÄâË∂ôËî•È°èËÅöÂ¶ñÊÄ™Ë®ò‚µ£‚µâ‚µè‚¥±‚µú‚¥º‚µ°·∏ó∆èÁöáÂêé√ú ∫‚àôÊîØ„Åù„Å∞„É©›ù‡•á‡§™‡§≤‡•Ç‡§ï‡§µ‡§¶‡§∂‡§ó€é◊Ç÷∂÷µ÷±◊ö·ª±·ªü·ªù·ªìÈÉëÁ≠±Ëê∏·π≠‡Æö‡Æü‡Øç‡Æ©‡Æø‡≤ö‡≤ü‡≥ç‡§ö‡§ü‚ÑÖÏù¥Ïú†ÏòÅ≈ôÁîü„ÅçÁî≤ÊñêÂ†¥ÂÜÖÂ∏Ç€ÜÃÑÂØåÂ±±Êú¨ÂÆÆÊµÖÈñìÁ§æÁôΩÊé¢·àΩ·ãã·â∞·ä≠·àà·åä·ãÆ·à≠·àµ¬∏Èï∑Ê±ü‰∏âÈÆÆ‡§Ö‡•Å‡§§‡§†ÏïÑÎàÑÌåúÌååÌã∞√çÈ≠ØËø∑Âüé»õËëâÁπºÂïè‚ñ™‚àó‚â†€ê⁄Ö⁄´ƒóÏ∞ΩÌè¨ÏàòÏ∑®Îñ°Ïî®Î¶Ñ…ë…§‡∏°‡∏Ñ‡∏µ‡∏¨‡∏ü‡πÉ‡∏ä‡∏π‡∏ñ‡∏†‡®π‡©ã‡®≤‡®æ‡®Æ‡©±‡§π‰ºöÊáêÁü≥‚ÅÑ‡πá‡∏à‡πâË≠¶Ë¶ñÂ∫ÅÂÖ¨Âøç–®Œ°ŒóŒèËÄÖËç£ËÄÄÊ¶ÆÈªëÊ©ãÈáå‰πùÊîøÏ≤†Ìé∏ÈêµÁâáÎçîÎ∏îÎûôÎ†à∆ÜÎÇòÏù∏ÎÆ§‡§¨‡§≠≈ä≈ãÂéüÂçöÈÄöÁ´πÂèñÁâ©Ë™û‚â§·πÉ»ôŒäÏ°¥Ìò∏Â∞äËôü‚Äü@–õ–ò≈ºƒôƒëÂ≠üË©µÈ£üÁôÇËçâ„Éâ„Éª„Ç®„Ç∞„ÉûÈªÉËä≥ÏÉÅËÅñ‰∏äÌô©ÌõÑÌÉúÏûêÂ≠êË©îÂãÖÌîåÎ∞±ÊúùË≥ûÏàôÏ¢ÖÌòÑÏùòÍ¥ëÎ•úÏòàÎ†¨Ïû•ÌóåÍ≤ΩÎ™ÖÏõêËÇÖÂÆóÈ°ØÁæ©ÂÖâÂÄ´ÁùøÁÉàÁ´†ÊÜ≤Ê≠¶Êï¨ÊòéÂÖÉÂ≠ùÎ™®Ïö¥ÌôçÏ§ÄÎçïÎ∞∞Ï≤úÌï©ÎèÑÍ≥ÑÌú¥ÎèÖÏ§ëÌòëÍ∑πÏã†ÌõàË£ïË¨®Ê∞∏ÈÅãÊ¥™‰ªÅÂ≥ªÂæ∑ÈÖçÂêàÂïì‰ºëÁØ§ÊÖ∂Ê≠£ÊÅäÊ•µÊØÖÂã≥„Ç´Á©∫‚àà‚äô„Å§›ò‚ãÖ’ç’∑’è’ä·É°·Éú·É®·É´·É¢·Éû‡®∞‡®µ‡©ÄÈ≤çÂì≤ÂçóÈÆëÈªííãÄíÜ†„ÉÅË¨é„Åø„Å¶‚óè≈∫Ê•äÁø†‚ü®‚ü©ÌöåÏïàÍµ∞Êá∑ÂêõÎ∞©Í∞ÑÂππ„Éï„Ç°„Éì„ÉÄÊ≤ôÊô∫È°ó‚µé¬ßË´ñÁáÉ„Åà‚â°Ï∞∏Ïä¨íÖóíå§íÄÄíâ¢íÅìíÜ∑Ïó∞ÂÆáÂÆôÂÖÑÂºüÊ†™Âºè„ÉØ„Éî„É£„Ç∫‰ΩêÁæéÂíåÊ∏§Ê≥•ÏÑ∏·∏•€ÄÁÅ´ÈæçÁ∂ìÈæôÁªè«íÍ∞ëÎèôËî£›£Â∏ù’é’∞’î’π‰∫ïËå∂Ë•øÈÉ∑ÈöÜÁõõÌïúÏôÑÏäπÏ†ÄÎîîÌÑ∏Îû´ÌèºÏóêÎî∞Î•∏ÎßûÏ∂§ÎßåÌôîÎç∏ÎßÅÍµ¨“Ø“ô”ô√éÊ•öÊÇºÊäπ‡¶ï‡¶ñ‡¶ó‡¶ò‡¶ô‡¶õ‡¶ú‡¶ù‡¶û‡¶ü‡¶†‡¶°‡¶¢‡¶£‡¶§‡¶•‡¶¶‡¶ß‡¶™‡¶´‡¶¨‡¶Æ‡¶Ø‡¶∞‡¶∂‡¶∑‡¶∏‡¶π‡¶º‡ß±–à—õÍµ≠Ï†úÎ†•Îã®ÂúãÈöõÂçîÂäõÂúò‚µÄ‚µì‚µîÏ∞®‚ô°—çÊûóÂøóÁé≤ÁèæË±°…°ËúÇË∞∑ÁúüÁî±Áß¶≈Ñ‚∏µŒæ«ê·º∏Á∞°ÊòìË£ÅÂà§ÊâÄ‡§∑‡§∏‡¶ø‡ßç·∫øÎ¶âÈôµÍ≥µÏùÄ·∏´‹ï‹†√öËòáË≠∑Â≠¶Ê†°Â≠∏‡∏∑„Éí ïÎ¥ÑÎÇ†„Å≤„Çì‚àß¬¨ŒÑ·ΩÅ·Ωº◊£…õƒÜùû¥Èáé‰∏ÄÈõÑ„Åä„Åã„Åö€±‡∏ß‡∏í„Åæ„Çå„ÇãÂàÉÁ¥ã‹¢÷îÂóîÁå™„Éä„Ç∏‚àöÊò•ÊßêÁæÖÊ∑ëÈõÖ·∫ΩÏπòÏ°∞Ê≤ªÏπ®Ï†ÑÂØ¢ÊÆø‡¥Æ‡µÅ‡¥∏‡µç‡¥≤‡¥ø‡¥Ø‡¥æ‡µºƒû—Ñ‹®‹£‡¶ì‰∏úÈ£éÔºâÍûâÂëºÊè≠Âæó‰πåÊä§Ìï¥ÌîºÎãà‡•ÉÎ™ªÎêúÏÇ¨Îûë…êÃûÔºç‘µ‘≥’£¬µ≈µƒ≠≈≠‚ùÑ‚ùÖ‚ùÜœàÎ≤ΩÊù±ÁÖßÀôÀÄ ºÃØ≈ΩÏÇ∞Ïó≠Í±∞‚úñËäôËìâ—éÈô∂ÈãºÂÜíÈô∫ÂøÉŒâÂ∑ùÈ¶®√ÄÂπ∏‰∏çÈöæÁü•ÊàëÁà±‰Ω†ÊúÄÂ•Ω·Éï·Éñ—ë É ∑ §Ê®ÇÂÖ∏‰πê«é€∞€≥€∏€π„Ç∂„Çµ„Ç¶„Ç©ÂøúÂ§ñÂØáÊà∏Ê†óÈÉÅ„Ç£⁄µÊæ§ÂèØÊÑõ„Çè„ÅÑ√à‡®´‡©å‡®ú‡®∏‡®ø‡©∞‡®òÏö∏Ïïô≈ÜÈ©¨«îÂäâËã•ÏÜåÎ°†ÎÖ∏„Éã„ÉÜ„Éá‚äÜÁõ¥ÂàÄÂä°ÂëòÂãô€à÷≤Ïñ∏Îû©ÌÉÄ ªÔøºÂ§èÂõΩÊäïËµÑÊúâÈôêË¥£‰ªªÂπ≥ÈÅ•Ÿ∏Œ©·∫ØÂè∞ÁÅ£Áä¨Ë°õÁπÅÊÆñ≈õ–¶Œñ‚µù‚¥≥‚¥ª‚µç·ºÑ√øÂ∑®Ëá£ÏÑùÊõ∫Â•≠·º∂’¢’¶’∫‡ßÄÂåàÂ•¥‚àÇ”Ä·Ωñ·ø•ÊôÆËØùË©±Â≤õÊôèËß£ÊîæÈºìÁµê–§·ºÄ·ä´·àö·àçË¥ùÏûÑÎßàÏóòÌéòÎùºÏâ¨Êùé‰∏á„É•„Éë‰πÖ‰øùÂª∫ÊÖã≈©·Äç·ÄØ·Åö·Ä∫·Äü·Ä∂·Äû·Ä¨·Äù·Äê·Ä≥·ÄÆ·Äî·Ä±·Äï·Äº·ÄäÂ°æ‡§Ç‰∫ûÁóÖÂ§´Ê¥≤Ê®™Â∫ÑÂéÑÈô§„ÅëÊâï√ë‹•‹©‹í·ºç·ºò„ÉòÁ¥ôÎ∞ï·ªßƒ©ê©§ê©©ê©®ê©¨‰ºØ‰ª≤ÂèîÂ≠£Á¨¶Ë¨Ä·ªïÀã·ºï·ΩÖ·Ω≤Èìú„Åô„ÇÑ„ÅóËÆ°ÂàíÈ´òÊäÄÊúØÁ†îÁ©∂ÂèëÂ±ï‚ÜëÂ∏∏‰æç„Åè‚¥∑‚µñ…£Êùë„ÇÄ„ÇâÂÜ∑ÁõòÂâçËèú›†Ô∏é‰πãÂÆùÊôÉË¢ÅÁ∑ØÊâø–≠Âºæ‰∏∏ÂàóËªäüòÇËèÖ„Çà„ÅÜËí≤ÁäÅ‚Äê ÅÃÉ‹ìÈùíÂª£ÂΩ∞„ÅÇ„Ççùë•≈ì…π äÂä†„Ç∑„Ç≠Ëõá‰ºùÊõ∏‰π¶¬§›¢·Ω∂ÂæåÈªéÈÉ°ÂπøÂ∫ú‚ÇÇ≈ïÃ•·∏±·ΩëÁøîÎöúÎëêÈüÉÈùºÂ≥Ω≈è„É¨√∑Ëâ≤„ÅØ‰∏ÉÈõ£Èö†ﬁãﬁ®ﬁàﬁ¨ﬁÄﬁÉﬁßﬁáﬁ∞ﬁñﬁ≠ﬁéﬁ§ﬁ¶ﬁ™ﬁâﬁ©ﬁîﬁÇﬁêﬁìœùÀêÂ®ò„ÅåËæ∫„Åß„Å£·ªò·∫¢·ªÜê∞úê∞áê∞õê±Öê∞ºê∞∞Á™ÅÂé•Ê±óÌíç‚à™‹üÁµ±·∫≠–Ñ‚µõ‚µàÈ¶ñÂ∞îÁàæÊº¢Ê±âÀåÁôãË¶ãÁè†‚Åπÿã‚óá‰ºäÊ¥•Êò≠‘∑’ª’≥ƒí«∞ƒùÃÇÁü≠ÈäÉÔø•¬•ŒÜ—âÂêüÈÜ∏„É¥ÂùÇ„Å™≈å„Éß‚Ä∞ÎûúÎìúÏ£ºÂÖ®Â∑ûÊ∞èÎ≥∏Í¥ÄÁø∞‡§ô‡§Éƒ¶Áî∞„Åü‡∫ô‡∫∞‡∫Ñ‡∫≠‡∫´‡∫º‡∫ß‡∫á‡∫Ω‡∫à‡∫±√ì‡§ú‡§è‡§ÅÂ¶ªÂ¶æÊàêÁæ§√õ‰∏ãÁí∞Îâ¥Á∂†Âçä‰∏ñÁµµ⁄ïÊùøÂû£ÈÄÄÂä©Ëá™ÂÖöÊ±†ËåÇÁ§éÔøΩÁ¶çÁ¥ÑÊùü·ΩÆ‹ó‹ôÈ≥•Â±Ö„Äñ„ÄóÂßãÎ≤ÑÏÑØÎØ∏„Çº◊¥„ÉôÂõûÊ≠∑Ê≥ïÂéÜÁ∫èÂ∞ëÂçÅÂÖ≠Êàø·àì·ãç·ãµÈ≥≥Âá∞Êú∫Èô¢Ê†∏Â∑•‰∏öÁßë·π¨‡§£ÈπøËãëÂØ∫Èü≠È•ºÊ≤πÏôÄÌòÅÌÉàÎ°úÎÖÑÈÑ≠Âπ¥Ë™≤Êï∞·º±Œê‰ø°Ë∂äÊ•≠÷Ç’©÷Å’¨„Éú«ßÍ≤Ä“∂“≥“∑€ÉÕ° ≤ìàà‡§´‡§ºÈ≠ÇÊñóÈ°çÊïèÂçìÁ∂≠Â§öÂà©ÊïôÂçÄÊ∏ØÁ≤µÂæêËê•Îî±ÎÑàÍ∞ôÎî∏¬®Ëµ∞ÂÖ•È≠îÊ∞£ÂäüÂÅèÂ∑ÆÊøÉ·∫£‚ÜìË¶≥„ÇªËØ≠Êµú„ÉñÂÆãÁêÜ‚ñ∫‚óÑ‚àò‡µà‡¥®‡¥Ü‡¥¨‡¥¶‡µÄƒß“•‡•à·úê·úì·úé·úå·úî·úã·úàÁõÆ·ª£ÁµÑ⁄ÄŸΩ€í„Çí√òÈòøÈ∫ª‡Ωñ‡Ωº‡Ωì‡ºã ∞ÃÄ‡§ß‰πôÁì¢ÊπñËÇâ‰ΩìÂèóÏïîÌñâÏñ¥íÇóíäÆíä®íàæÊ∏≠Ê∞¥Áõü‰æøÊ°•·Éí·É£Ëä∏ËÉΩË°ùÊíÉÂå∫Âåó‡Æï‡Øä‡Æû‡Æú‡Æµ‡Æ∞‡ÆÆ‡Æ®‡Ææ‡ÆÖ‡Æ£‡Æ§‡ØÅ‡ØàÁü¢‡®ï‡®¶‡®ü„Çß€û‡§á–ñÍú§Áï´‰∏πÊªÖÈùàÊîπÊíí√æíÄØíÄ≥ÂÆòÈ®éÂõ£È´îËÇ≤ÈÄ†·Äë·ÄÑ·ÄÖ·ÄÜ·Ä≠·Äõ·Äæ·Äö·Äú·Äô·Äπ⁄¶Œ®·º©·ª≠‚ÑñË°ìÂâ£È§®…î·Ω∏ ¶Â∑±Á¢∫Á´ã‰ªñÂÖ±Ê•ΩË°®Êã≥ÁØÑœã≈öüá±üáæüá∏üá¶üá™üá¨€ëÁàÜÊ¥ãË≥áÊñô‚µÑÁÑîÁâôÈü¶Ï†àÂæ°ÁØÄÂè§Ê≥âÁôæÂ≤ÅÂÜ∞‡∫™‡ªÄ‡∫Ç‡∫îÂàÜÂ∞º„Å†„Å≥„Éó„Éè„Å±·ºëÈîÖÔºå‚¶ÅÊº´Êµ¶‰Ω≥Â•à√ÑËñ©Êë©ÎÇ≠Îã•ÌÑ∞ÂãùÈÉéÁ¥∫Á¢ß–Ç—í’≤ÍΩà‚óã‚àûÂÆûÂØ¶ËØÜË≠òÊó∂Èô∞Ë±êÁ∂¨Ãâ‹¶ÊôìÊùæ‚àº‰∏™ÁïåËÆ∫ÂÄãÈ¶ÜÂèçÁ∂øÔºà·Ö°·Ö∂·Ö∑·Ü£·Ö¢·Ö£·Ö∏·Öπ·Ü§·Ö§·Ö•·Ö∫·Öª·Öº·Ö¶·Öß·Ü•·ÖΩ·Öæ·Ö®·Ö©·Ö™·Ö´·Ü¶·Üß·Öø·ÜÄÌû∞·ÜÅ·ÜÇÌû±·ÜÉ·Ö¨·Ö≠Ìû≤Ìû≥·ÜÑ·ÜÖÌû¥·ÜÜ·Üá·Üà·ÖÆ·Üâ·Üä·ÖØ·Üã·Ö∞Ìûµ·Üå·Üç·Ö±Ìû∂·Ö≤·ÜéÌû∑·Üè·Üê·Üë·ÜíÌû∏·Üì·Üî·Ö≥ÌûπÌû∫ÌûªÌûº·Üï·Üñ·Ö¥·Üó·Öµ·Üò·ÜôÌûΩÌûæÌûøÌüÄ·ÜöÌüÅÌüÇ·ÜõÌüÉ·ÜúÌüÑ·Üù·ÜûÌüÖ·ÜüÌüÜ·Ü†·Ü°·Ü¢ÂîêÁî£ËÇ•„Å°ÏÇºÏπ†ÏùºÈÉ¢·ÄÇ·Ä´·ÄÅÈÆìÊªì‰ª•Â°©Á±≥ËëÖÁÜüËÄå‡∏©‡∏ìË≥¢„Å©„ÅòÂØÜÊâìÈõÄËøêÂä®Ê∂àÁÅ≠Î¶ΩÏò•Ï∂ï‰∏ë‰∫ãÍ≤®·õãÂæãÁîµÈõªÈ≤ÅÂÆöÈÉΩ‚àÜ‹õÈÇÑ„ÅñÁµÇ„Å´Ë®àÔºè‰∫¨ÈßíÂΩ¢«£ÊáøÏÑ†Î≥¥ÎûµÁíøÊ∫êË≠ú’æ‘πÏò®ÏàúÎèàÁ´ØÊÅ≠Ê∫´Á¥îËéäÊôØÈ†ÜÁ•≠‚òä‚òãÂÆ¢Ë≥äÂÇ≥€•ÈáëÈäÄ‚µô·ã®·ä†·à±·àù·àê·âµ’Ü’ã‡¶ÜÊÇ™ËíºÊúàÂ≠óÊû∂Î∞§ÏùÑÍ±∑ÎπÑÂ∫≠Â∏´ÂÆπÁñë“ìÁÑ°ÊåëÊà∞Ï©åÎã§Íú£“£Àó≈Å‰ªÆÈù¢„ÇÅ„Çá„Å≠ÂêêÂ∞ïÂè£ÂûãÈÄ£ÎÇ®‡∫≤‡∫ó‡∫•‡∫õ‡∫ä‡∫¥‡ªÑ‡∫ï‡∫ª·ªÅ’º‘≤·äï·åâ·à†·äê·åà·à•‘∏Ïö©Î•¥Ê≥Ω”òÊ¢ÖÈõ®ÏõÖÎÖÄÁÜäÊ™ÄÊ®πÂïÜ“ê‚Ä°Èô≥ÏÜ°ÌòúÊÖß‚µÉ‚µ¢ËÇ†ËÖ∏Ëû†Ëô´„É¶Í∞úÎ∂àﬁçﬁ†ﬁÑﬁäﬁåﬁôﬁ¢ﬁØﬁÜ·ªÖ·ªç…µÎ∞ÄÌïòÍ≤åÏúÑ¬ØÈùûÂõõË≤ûÂçíÊã¨Êé°Áî®Ë∫´ÈõáÊú±‰æÜÂü∫Áù£ÁñóÊ∂ÖÂ∏àÁîò·ΩÄ‘ª„Å®ÂÉè íËä±„ÇÜ·ºÅÈØâ„ÅºÂçàÂè•‰æõÁöê⁄∑Ê∏ã„ÅïÂÄ≠ ë ÇÏ∞åÎ¶ø’™Áõ£ÂÖµÂçúÂ•é‡ºº‡Ωë‡ΩÇ‡Ω¥‡Ωè‡Ω¢‡ºΩê≠±ê≠ßê≠Øê≠•ê≠©ËñáÂ®ü€¥ËÅ∂‡•â·É≠·ÉìÊ≠ª‚ñ†‡ßÅËø™Ïà†ÍæºÎì§⁄™ ø‡§âÌäúÏò§ÎûòÍ≥§Ïãù‡Æì‡ÆØ‡Æ≤‡Æ≥‡ØÇ‡Æ±Ìù¨ÁßòÏû¨Ìôò‚ñ°›ûÁµïÁ∑ãÁê≥ÁãªÁåäÂç≥ÁãÆÂá∫ÂüüËæ≤Ê≤ºÁ®≤„Å¨„Éüƒäƒãƒ†·∫üÈúßÂ≥∂⁄áÏµúÂ¥îÈå´ÈºéÊ¨≤’≠Â•áË´áÂ∑°ÊùñÁæΩÊ∏©Âà•Ê†πƒâƒ•ƒµ≈ù‰ΩúÊåÅÂè≥ÂÄçÁ¥ç‰º¥ÂëÇÎ≤àÏß∏Ï¶àÁä∂Áôº‰ª∂ÊòüÂ≤©Í∞ìÎ∏åÏø®—π—ûÈäòÂêç π‚≤Å‚≤É‚≤£œ©‚≤•‚≤ï‚≤â‚≤áÏΩîÏóëÎ™∞·º∑·æ±‰º§ÂÆ≥ÊÑüÊÉÖÃ≤Ï±ÑËçä„ÉéÂã¢‡∫û‰∫ë·Äó·ÄªÊ∑òÁΩëÈïøÊ≥æÈüãÂùáÁäñÂº†ÂÖ∂Âùö≈≥ÂÆàË¨æÁ®øË≤øÊåØËààÊ©üÊßãÀâÈ£≤ÂàùÁùõ≈ªÊ≥¢Ë°£’®Ë¥π“∫–©–¨„Äé„ÄèÈùôÊ≤≥Âè≤Â••Îß®Ìà¨‰∏àÊñπ√äË≤ª”®”©“ïÏóÖÊñé‡§ñ‡Ω¶‡æ°‡æî‡Ω£ÈöºËçíÂèà⁄ë·π£‚Üî…æ‚òâÊ≤¢Ê†Ñ‚Ñ≥’á„Åµ„ÅéÈõ∑÷ñËâ¶…¥Ã∞ùüèùüé–Ü‡¶Ç‚ùûÃìŒüËáºÂøÖÊê∫‚Üµ·ä¢·åµ·ã´—ò·àò·âÅ·å†·à™·âçÂê≥ÂøåÂØí‡πÇËôõÊ≠≤ËôöÈ£ûÈ£õÈ´™ÂÜ†·É™Âª∂‰∫îËÄÅÁøíËøë‰π†ËèÅÂÖê„Åì„Åõ‚ùù‡§ÜÁÆÄÏúºÌÇ§ÈÄÅÁßÅÈõ¢ÎπàŒàÊï¶Ë≥ÄËìÆÎãπÏßë·åç·ãï·ãùËÅ∑‚úì—ü‚àëÁ¥ÄÊïàËΩÆÂÖªÁõäËßÇÈü≥Ê∑´ÈÇ™Èóá‰πÉ∆¢∆£Ã°·∂á∆ü∆µ∆∂·ûõ·ûì·üã‡®Ö‡®®‡®™‡©ç‡®§‡®ßÌÅ¨‡§ã·ÄΩÏÉùÊïëÊøüÈÜ´‚µóÏúµËµ∑Ïõî·ΩÑÊñØÁßÄÈéåÌå®Ï¥ùË≤ùÂ°ö≈π‚¥æ‚¥∏Ê¢®Âèã„ÇÇÁ≠âËôéÎ≥ë⁄≥Èõ≤·ªëÎ®∏Îßù—î“ªÂñúËøûÁè≠Âè∂ÂñÑÁ©ÜÊüØÂØ®ÂÇ∑Ë™øÊüª·ºæÂπïÊú´Êñ¨ƒ™‰Ωè‡∏ú‡∏éÍµ¥Â±àÍ≥°Â£Ø÷øÈù©ÂëΩ‚ÑÉË©©—öŒ•Á∫•Á¥á…í¬©»Éíàó·πáÏñµÂÑÑÁ•∫ÀÆÏ≤≠Ï∞¨ î›¥ùëãÁçª„Å∂Ï∂î’±Î∞•ÏûòÏÅú‡∏ÇËñôÂäçÂ∞∫ÁìäÊõ≤ÁéâÂí´Èè°„ÅêÂß´‡∏ØËäëËàπËà∂‰∏éÁ®ãÂ∞æÂºµÈáãËø¶ÁâüÊ°à‡®¨‘¥Ìäº÷ÉÍ∂ÅÊòå‚Å∫ÂÖãÂ≠ú‰π°‚â™ê≠°ê≠´ê≠Öê≠âê≠áê≠îê≠êê≠ì›é‚Äï‚≤†‚≤°‚≤Ü‚≤üÃÖÂøµÊØîÂòâË¢ãÈ°û‰∫íÈñ¢‰∫âÊ¥æÊà¥ÂéöËâØ‡∞∂‡∞æ‡∞§‡∞µ‡∞π‡∞®‡∞∏‡∞Æ‡±ç‡∞∞‡∞ú‡∞Ø‡±Å‚ä¢‚ÄëÁ∂æÈôΩÎä•ÏñëÊÅ©Ë≥úÂúí æ·ã≥Êª°‡ßãÊ±∫ÈÆ≠Ê±ÅÁ≤πÂ†±‹µ‹π‚îÄíÄïíå∑íÄî·Ωà·Ω®ƒ∑·ä®·ãúÊÉ°ÊÅãÎ≤ïÎ≥ÄË¥µœ£‚≤ß„Åπ„ÄÇ‡§≥‘ºÂ°òË°óÂ∂ΩÈ∫ìÂ≤≥Â¢æ‰∏Å√ÉÈõÅ‚ä•íÑäíÉ≤ÎÇ¥ÏùåÏÜç‚ô≠ÁÅµÊΩÆ·πìÂçÉÂ∞ãÎ®ºÈôÖËÆ§Ë™çÂ§¥È†≠Èó¥íÜçíäèÁïëÂÅ•Êæ≥·É§·ûî·üí·ûö·û∂·ûü·ûë·ûó·û∑·ûò·û¢·ûÄ·ûúÊûÅÊÆµÊä´Â°ûÊÖàÁáïÊâãÈπ§ÈâÑÈ™ëÊ≠•ÁèçÂ£ÆÈé≠ÈïúÊ≠©ÁÇíÊáâÂåªÎì±‰æãÁùÄÈì∂ÂêåÊãÜÁéá‡ΩÜ‡Ω∫‡ΩîÊõπÎπô–´…®–ç‚¶µ‚Ç§‰ΩµÊù°ÊãìÎ™©Á•ñÁ≤æÁº©Èò≥Á∏Æ‚ô¶√ïÏ≤òÊú¥Ï∂òÌù•‚ë§ÁãºÁãóËÇ∫·∏®Êµô‡Øá‡ØÜ‡ÆôÂãïÂ≤°ÂúñÂöïÂô∂ÂõæÂôúÁúºÁêÉËàêÂ¶çÏûëÈ§ä≈¢Â∞áÊàíÊ≥∞Â∏ÉÁë™ÊéíÂçëË≥ΩËêäÊÇüÈÇµÈÑíÈñ£◊≥Â±ãÊùâÂΩó‡©Å‡®ö‡¶â‚àáÊøÄÏåçÈõôÊë†ÁÆ°ÎÖïÂØßÂæ©Â∏∞ÊïµÂØæË™ÖÂÜá‰øÇ≈É“°‡∂ª‡∑ì‡∂Ω‡∂Ç‡∂ö‡∑è‡∑Ä‡Æá…¨ÂØπÂ∞çÊ∞ó„ÉÑÂ¥ë‰Ωõ‡æí‡Ωü‡æê‡Ωò‡Ω≤‡ΩÑ‡Ω§·íæÁ•êÎ©¥Âãâ‚µÖ‚¥µ‚µü‚µá‚µÅÂÆúÁ¢©ËÄéÍ∏ÄÏ£Ω»†Ã©Ïú§ÂéÇÂñÉ‡ΩûŒû„ÇÖÂ∞ñËßíËó©ËàûÊãâÊ≤ñÊò¥Î∂ÅÏ∂ú‰ΩëËºøÊùè„ÄÖÂâá·∏¶Â¢ûÂ£πÂê´Ïã≠ÈÇ±·Ω∞Áúû‹ºÎπÖ·Ωî·Ω†‚äñœ¥È¶ôË®ºÎã¨Í∞ÅÊ£ÆÊÖéÈß±ÈßùÁ••Â†Ç÷æ‚µØÍ≤∞ÌòºÈóª»òÂÆµ‡∏ãÎ≥µÁßçﬁ´‚äÉÏó¥„É§Ë£∏Ë∂≥Âà∫ÂÅâ‰ºç‰øäÎª•ÌäÄÎπåÏÖòÏÉ§÷£÷≠÷ëÂéøÁ±ªÈ¢åÈ™®Í±¥ÎçüÎ¨ºÌôâÊ≠¢ÈÅä‚òÜÊàØ Ä·öº·õÖ·ö±·õö·õè·õ¨·ö¥·ö¢·öæ·õ¶‡Ω†‡æ≤‡Ω°‡ºç‡æ±‡ΩÅË≤¥ÁúÅËÆ∞Î©∏Ï†ìÏÉàÏï°‡™¨‡™æ‡™™‡´ÅÂ∫É›®⁄àÁìÆÁî∑»ö·àõ·ãä·ç°·âÄ·çÑ·äÉ·ã≠·àã·à¥·àû·ãì·â†·à≥·ãò·ä•·ã∞·àÅ·à∞·ã©·ãö·â•·àî·∏≥·πó√êÁÄ¨Ê∑≥Ê∏°Â¥ãƒèÁïôƒïÊ≠ºËΩ∞ÊÆ≤ËΩü·àæ·â£Ï¢ãÏöî‹∫‹∂ÂÆÅÊ§úÁ¶éÂ∑¥ËÆä·πÜ√åÂÜÜÊÅêÁ´úÌíÄ‡∫öÈÄÜ‡¶áÏÉòÁ°¨Ëõã‚ó¶‡∞≤‡∞°‡¥°‡≤≤‡≥ÅÊ±ùÂ•ëÊ≠ìËà™·∫•Ëé´ÎÑ§ÏïåÏã¨‡¶ö‡ßáÈò™Â±†Èî∑Â≠ôÊ†ãÊ¢∞ËâæÂ≠©Á∑®Ë™ïÂÆø°®∏»≥êÄ°êÄÆêÄÜêÄÉêÄ∫êÄöÂ¥éÂºòÊØç≈ê≈∞‚ôØ·ÉßÊªáÈáçÀçË≠≤‡≤§‡≤≥‡≤≠‡≤∑‡≥ÜÂìàÊª®Êø±Ëûç‡§àÈà¥“ÆÂà∏·â¥·ãé·àÆ·äñ·à®·ºπ‡¶≠ÕµÕ∫ÂÉïÊü≥Âç†Ëíô’å√î ù‡•åË•≤Èó°Î•òÎ£°È©™Âß¨ËãîÁ¥´‡πä÷ªÊñâ‡∏≥ÃàÎûÄÎ¥âÈºªÂ©¶‘ΩÊΩúÈõÜÊ¢ÅÁæåÏπ´Î≥∂Âú£Áªü‰∫öÂêâ‰øÆÊó©Âõ≥ÊÑè◊ÉÊù•ËªíÂö¥ÊÆäÊüìÁÇéÈò≤Á¥ìÂõ∞Ê¢ùÊ¢ìÁπî„Åî„ÇÉÎ™Ω‚åµËíøÏò¨ÈçµÈñâ‚àÖ·àÉ·à´·ã¥·ä§‰πâÂΩπÁêà…ï éÀë‚ò∞ËôîÁãÑ…Ø·É©Â∏å≈∂ƒúÌûòËä≠Ëïâ„É∂‚òøÂààËùôÊÄÄÈùúÂùêÊù∞‰º¶ÊØõ‡®ØÂΩ¶€ã‰ªäÁå´ËÄ≥Ïºì·ª≥‡Æ¥‡ØÄ‡Æ™Â§ïÁõ∏‡∞™‡±Ü‡∞è‡∞±‡±áÎäêÏôî„Öà„Öä„Öã„Öå„Öç„ÖéÌûàÁ§ºËâ∫‡™Æ‡™π‡´ç‡™¶‡™ñ‡™®‡™§‡™∞‡´Ä‡™ú·â°ŸµÂπ≤È£Ø‡∞ø‡∞Ç‡¥∞‡¥ô·πÖ·Ω§ËµõÁæäÁæπËá≥…üÊßôÂáØÈí∞ÊÅíÎ†§Â≠îÂ∫ô‰æòÂØÇ‚à©ÏñºÏï†Èñî‹∏Â•∂ÈÇë·∫§ÏºÄÌåù‡ß≠‡ß¨‡¶èƒΩË±äÏßÅÏûà„Éö…¶ÌÜµËÑá‚òÇ‡∫Å‰ΩïÁ©ó“öÎ©úÍº¥Âù°ÂÅµ⁄±Ë∞üÁªìÂáúÁ°´ÈªÑÊ¨Ω‡§û·πØ‚ôÇ‡ßå’ÖÁÇπÊà™ÂäáÂ∑¶Á∏±Á∫µÔ¨ßËæ©ËØÅÂç´Ë°ÄÁÑ¶·ªØ·∫©Ëè†ËòøÂåÖ‡≥ä‡≤∏ÂàÆÁóß‚ù™‚ù´ËÆÉ‰ºèÁæ≤Â™ßÁáßË¥ØÂΩªÊâßÊåâÂä≥ÂàôÂúÜÂúìÂø´‡ªâ‹∞‹Ω‹≥ÏñÑË£èÂèÇ√û«´ËøΩËøπÊ≠¥Ë£µÊü±Í∑∏ÍΩÉÎìùÎ•ºÂùäÈ¨ºÁ´•’ìƒæËééËΩ¶Ê∑∑Ê≤åÁ≤ãíÜóÏÑ≠Ê∫ÄÈΩ¢ÔºíÈÅ∫Î£π·ªÉ≈íÌÖåÏúà‚óæ‚ò≠ÈôàÁöÆ€¶Áë¢Ê∑∏Âå™Êáàœ¨‚≤±‚≤õ‡∏êÊÇ†Áà∂‡†î‡†ù‡†å‡††‡†Ä‡≤π‡≤µ‡≥Ä‡¥π‡¥µ‡µª·πâÊù≠Ï∫îÁÜπÁøºê≠äê≠çÁßãÁÉèËÄ∂Èîê‰∫´ÂæÅÊ•ºÌñ•Ï∞∞ÈÑïÊú≠Ë®£ÂêèËÆÄÊ££Âñ∂Ê¥ª⁄ß‚Ö¢Ïπú‚Ö°ÂçèË∂ÖÁ∫ßËÅîËæ∞ÁéõÊ¶ú·ªâÌûêÎü¨ÈôïÈôù…¢ÁâàÂë™Âªª‰π±ÈóòÁã¨ÊΩî·º•·ººÿø›íÂÜ®Âã≤Ëû¢Ëü≤Êô¥⁄É‚¥ΩÊëÇÎ≥Ñ‡∞ï‡±Ä‡∞¶‚îò‚îîÍú•ÂñÆ⁄Å‡∏§‡æôÏÜêÊÄíÂÖà„ÅíÈôõÁï∂Ìùë»≤Âê¥·º¥ÊåáÊå•Êû™ÊÇ≤ÈÄ≤Ïã§ÊîùÂÆ¶„Ñ±„Ñ¥„Ñ∑„Ñπ„ÖÅ„ÖÇ„ÖáÌëúÁÑöÂùëÂÑí‡∫∏‡∫π‡∫°⁄ãËÄïËóè·∏ç·∏∑·∏π·πùƒ∂·ºåÍ∞ïÁíß–âÊÆ∑Êï∑ÊÄ•Í≥®Îì†ÁåõÈÅéŒ™ŒÖËàáÁ∑öÈõúË™åÊ≤°Á•àÌÖêËßÜÁéØ≈àÿâÈêò·ºú·ûÇ·üÜÂ≤≠Á´ôÍ≤ÉÁ≤âê©£ê©≤ê©µ·ãê‚Ç©ÏóîÁ©ÄÈ∫¶Á≤üË±ÜÈªç·π¢Êäú¬™Ê§ç‚â´Í∂åÎ≤îÊï¥ÁãÄ·ø∑·ªóÊàÄÈ©ö‚≤©ÈçõÈîªË¨ùÂ§âÔºüÂá¶ÂØüÂÖúÈç™ƒº·º°“ë‡∏ø·ççÂ≠´„ÄúÊÉ≥È¶ÆÂ§¢Àõ—ßÊ∑°Ï†ÅÁáàÊµÅÁÅØÂÜ•ËúÉËìùÂÑøËóçÂÖí‚Ç¶Á∫™Á¢ëÂ∫óÂºÑÈöòÁÉÇËÖæËÆØÂæÆÊêúÁãêÂç∞ËñèÊºøÏùëÎãµ‚ÇÉ«ÄÏáºÏ±î·ûÑ·üä·û∏·ûñ·ûª·ûáÁó¥‡ß∞ÊãúÊóÖÎ∞òÊâçÊßåÁ∫å‰ªôÁëûÏ∂©Âø†Ëæ£ÏπºÍ≥ºÊπæÂ§ú·ºâËëóÈ§ìÁõ§…≥Ê¶éÌåÄ·ÑéÊ≤à≈±ËÅØÂÜçÃå‚≤´‚ÇÄ–éÈåÑìáæÂÆΩÂàòÂèå‰π≥·É•‚äÇËàüÂè∑‡¥™‡¥ï‡¥∂ÈÄñ·â¢·â§Îª§ÂüîÌòï‚òÉÎààÂëâ‡∫µ‡ªàÎ°ùÁ•øÂÖßÂ∞ÜË®óÁ∏ΩÌíà‡∫úÁúå‚¥πÂ¶ÇÎüâÊé®·ø¥ÁÉ≠Â®ÖÂç°„ÉåÔºöÔºÅÂ¶´Â™ØÊªøËÉ°¬¢ÈìÅÊµÑ·ΩäÂµØÂ≥®‚Ñ∞Âú®„Ç¥Á∑èÈ†ò‚àÄ‚àÉÊòÄÌúòËºùÏùµÂºóÂ†ÄÂ∫∑Ë¶™ÁáíÈÖíÁÑºÈÖé‰ΩøÁ®ªÂùùÂ£©Êó≠·ÉØÂåÇÈåµÂÜúËæõÊóΩÈ¥®ÌäπÂùéÂºàÊªÑËº™Â†ÖÈ≠ö“õÊ°ìÂõ†”èÂÜôÂ¢®·û†·üÇ·çãÊ∑µÈÇìÁ®ºÈÑß·ª•“±ÁÆ±Áµ∂Áù¶ËîµÏ∏µÈ∂ª·åã‰øóÁªçËä¨‚òÖÂíñÂï°ÊùØÂ∞ôÁ¶Æ‚Å∞‚òÑÌèâÂ∫´ÎùΩ·∏áÁ∑¨Áà≠·ÅÅ·Åá·ÅÜ·ÅÖ·ÅâÁâõÂ¥áÂ∞à√ô·É¨·è£·é≥·é©·è¨·èÇÌï®Âº•Ëñ´€Ç«ÉÍ∑†Á¶™Á¶Ö·πü‚®éÁõë·ûÖ·üâ·ûô–äÎÜçÏÑ§Ë™™Ëπ≤Ë∏ûÁ±† ΩÂ≤∏ÎπõÁ•ùÂ©∑íÑ≠ úÈó™Âßö·ûí·ûé·ûä·ûã·ûèÏ≤¥ÌÉëÊÄùÈÅ†Â•ÑË¶öÁï∞Â©öÂßªË≠öÁéÑË¨ôÂæ≥‡§ë¬∂È°îËÇáÈ¥ªÂüπË©∞ÊÆ∫ÂãáÏΩ©Ï•êÌå•«û«ü·∏ê·∏ëƒª≈Ö»Æ»Ø»∞»±»¨»≠≈ñ≈ó≈™ÊòØËàåÊ∑±ËôïÁâ¢Ë∑ÉËøõÂ¨™Ëê¨‚ùñÂá™ÈÅáÊáã≈∑…Ω…ªÂπüÂ∏úÂº∑Ë∫∫ËúúÁçÖË±¨ÈπÖÈµùÎ≤åÊßòÁ¶∞Ÿ∂Ïõå‚Ñ∂⁄ìÊä•ÂëäÈΩãÂèü‰πçÁïèÊå∫ËÆ∏Âã§ÁßªÂõ¢ËææË´∏÷á‡®ñ‡©á‡®• âÊπØÊñΩ·ΩïÁ®Æ‰∫àÂæÖ—£ÍªòÎÅºÌï†Ï†ë·àÖ·ãàÈ∑ÑÎ¶ºÈõûÂåπÁóï‚àéÈéÆ‚Ñ•ÌÅ¥ÎüΩÂÆ™‹ñ„Åû„É®Â±ØË°çÁ∂è‡∏ëÏ§ò·ªõÂ∑≥ËæüÂããÂ§ç›Ç‰∏æËµ§Êñº‚ÄíÂÖ©ÊôâÎ£® ê ãÁÄö‚ë†‚ë°‚ë¢‚ë£‚ë•‚ë¶ÀáÏ∫êÂÖÄíåìíÑííâ£ËêßÁ¥†ËÇ°ÀìÈΩêÂú∫ÈΩäÏäµÂ´¶Â®• ±‚ÑùÂΩº‰º∏ÂêæÎÖêÍ∑úËØ∫Ïïº„Öè„ÖëÂÆüÂÉµÂ∞∏Â±çÊÆ≠ÂÅΩÈóúÂÖ≥ÂÆ§·øæ…´ üÁÜô‰æØÊüèÊΩ¢ÊàòÁÜ±‚µä„Å∏Ãä·∏ï„Üç„Ö∫„Öº„ÖΩ„ÖÜ„Öæ„ÖÖ„Ñ∫„Ñª„ÑºÌïÑÂâ•Êç®Ê¥Ω·ÉüÈõ∂÷û–èÁ§∫ÏÑº·ä£·ã≤Á£ÅÔº∂Áµ≤Á∂¢‰∏ùÁª∏⁄å‡®â‡©àÁ∫¢Â¥ñ«öÁ¶ßÊÑº⁄ú‚ô•È†º·ΩùÈä≠Ë¶É–ãÊìçËë£‰∫éËµµÈûÖ‰æ¶Èñ©·ø≥Êõæ¬πêçÜêå∞êåøêçÇêå≤êåæÁß∞Ê∫ñÁ¶ÅÂºêÊãæËëõÂ™õÂÇôÁ®öÁÇ∫Âïü‚Ü¶Âé¶Áå¥‰ª∞Ê±Ω‰ªéÎûåË¶ΩÁõß‰ºêËåÉÈõéÈ†ÖÁë±Îß§ÌõîÏ≥êÁé∞‰øÑÊí≠ÈÅ∑‡∏èÂ¶áÂñ∑Â•è‚¥¥Á≥ªÁÆ≠Ê∂õËã¶ÁìúÁûéÊªå‰∏•Â±•ÈßÑÈûãùÑûÈáèÈ™å·±•·±ü·±±·±õ·±≤·±§·âÖÊÜÇÂªâÊÅ•Â±ÄÊûùÂ∫ß·â±ÂØ∂ËßÄ…ò‡™´‡´ã‡™≤‡™∏ÁóòÂçñÎÜíÎßêÎåìíÑëíâãíÇµíà®íåã‡∏Æ…óÂÑ™Ìó§Î∏êÏûÖÊ°ú·∫ªËäÇË≤ìÊçíÊúóÂ∑´ÈæúÂ¥ôÈ≤úÎã¥Ïú∑ÎÜÄÌÅ∞Íµø‰ªòËàäÂèÉÈÄÉ‰∫°‰º†Ìë∏ÎäëÎò•Î∞úÎ∞îÏì∞ÂØ©Ë¶èœπ·∫ãÊ°ë‚Å∏ÏÖß‚ñà‚òºË•™ÁûûÂä™ÁêâË´çÁî∫Áªø‚áìÎÖπÏùçÁÅ∞ÂìÄ·∏™Â§±÷ÖË°ΩÌÉùÏ†äêéÖêéÑêéÜËù¶Â§∑‡ÆâÁô∫ËÉåËªåË∑°Á¨†Ë°Ü‚à¨≈•ÊúüÈõØÊõâ·∫´‰º™Áª¥Í∂êÈóïÂêÑÊ¥ó·æ∞ÂÜ¨íà†Ëî°ÁïÖÏπ¥ÌÜ°‰øÉÂàáÊò∂Í≥ΩÁΩÆËøéÁíãËãèÌÇ¨‡∑Å‡∑ä‡∂∏‡∑ê‡∑öƒ§…∞‹ÜËÑâ·äù‡∂å‡∑ô‡∑É‡∑ÑÊï∏Ë†°·∑ÑÁ¥ò‡®ó‚ÇΩÈë´ƒÇ≈òƒòÈùñÂ•âË£ÖÈå¨ËºîÂÆ´Ëóù„Äå„ÄçÏïÖ”óÈóπË∞à·Ω∫ÊüøÈ∫ø‡Æé‡Æ∏Á§Å„Ñ≤„Ñ∏„ÖÉ„Öâ„Ñ≥„Ñµ„Ñæ„Ñø„ÖÑ‰æ®Í¥úÏ∞Æ·â´Á•ØÊ¥ûÊΩòÈèûÂûÇÁ∑£ÈöàÊÄªÊãåÎÇúÈúáÊñõÊ≠åÂá°ÊèêÌåêËº∏ÌòπËãóÂßìÂçø‡ªÅ‚ñßÂàõ‰ºòÂìÅÂâµÁ®îÁÖô·ø°Èúä“íÈ©ó ™ ´Õú…ÆÏò∑Î∂âÎÅùË¨õËÇñÁê™‚ÖõÊâáÎ≤§Áù°Áú†ÊùªÈØ®‡§•ÌÜ±Ê£©œÆ‚≤Ö‚≤ò‚≤öÁØâ‚ãÆ‚ãØœµÌîàÊ¢†ÏóÑÌÉÅ‚ÑìÈπòÊöÆ‡¨∂‡¨æ‡¨ó‡®≠Ëíãê±Éê∞∫ê∞¥„Å¢Á¥êÂºì·øÉ»á»ãÈßïÊ¥õËêΩ‰ºΩ‡∫∑‡£∞‡£±€çË¥¢ÈóÆÈ¢òË¥∑ÁªºË°°ÂØºË¥ßÂ∏ÅÂåôÎÑê«ùÊÉ†ÃáÍ∞ê„ÇæÃç·äëËá∫ÎπîÊüîÔºõÈÑâÂªä·µê·µëÊúõÎ∞èÎ•†‚à® ¢ËøÖÊ†ë‚úù‚úö·ÅãË≠∞·âΩ·â¶·äì·àï‰∫ÜËøá‚Ç±ﬁùÔºùÊóóÂÖÅÊ©´ÈÅºÎ°ÄÊÖ∞Ê∂â·ª´·Æò·Æû·Æ•·Æî·Æ™·Æì»üíÜ≥íÑ©íâ°íã´ÊÉ£ÊèÜÁ∂≤È¶¥ÂØø‚ù§Ã®È∫íÂ∞ÇÊ¨ßÊ≥ìÂ•ïÊ≠ê åÂ∞éÁøÅÂÜ≤ƒπƒé·∏û·∏†Êòº‚Ç†ÏûéÎãô’ëÊÅ™ÊÑçÊõøÂä≤ÈÇÆÈÉµÂØ´Ê®ìÈíü„ÇëË©ïÎêòÂ†§Ë†ªÊàéœí‚Ö£Á∂∞ËáßËçºË£Ω·ûº‚ú™ÊØ´Ïà≠Í≤∏ÈúúÏóΩ…≤Á¥ÖÂ¶âÊ¨¢Â•ñ‰ª§Â±èÊéàÈ∑≤ÂÄªÁ∑ë‡∏∂Èô∏ÈßòÎØπüåõüåúÊàå‰∫•Áï™Èõ™Á∫øÂ∏ØËéâÊÄ®Êµ©Î°ØÎç∞Àú„Ç•ÎªêÁóõÂ¨ñË±öÂçµÂ∑£Á≥†Êº¨„Å•ÊöóÈââ⁄ΩÏõπÌà∞ê∞¢ê∞çê∞ÄÂç∑Ë®ï⁄ö‡ªçÁÖé‚éÅ‚ñ¨‡∑í‡∂¢‡∂≠‡∂∫Áì¥Èßø‰∫Ç·º†ÈÜíÊßø’ÅËá•Ëñ™ÂòóËÉÜÈΩïËÑà·Äñ·Äâ·Ä∏Ÿ≠»ùÈñÉÀÅÁ≠îÂ®ÅÂàÑÂ∫Ü‰ºüÁñÜË•Ñ‡ΩÄÁñ´Êòßê≠†ê≠≠‚è∫Ã§ŸºÊØíËñ¨ÂØõÊ±∞Â∂∫üú®Ë∞ãÁç£Í∏â‚áîÍπåŒéÁÆïÁæÜË≤îË≤ÖË≤ôÈôÜÈÇä‚µ•‚µïê≠ÆËïÉÏ≤ô‰ΩçÈ∑πùüóùüñùüêÌóàË®±Á≠ÜÁ¨îËêå‰æ†ÂàëÊ≤™ÁöøÁóáÂÄôÂ∫èÁ†¥ÊÅµ‡∞∑·∏©‡®ºÂêØÎãòÊ∂º‹≤‹øÏïΩêéªêèÅêé´êé†êéøêé±„Åª‡∏âŸøÊ°îÊ¢ó∆§Ë¢´‰∏ôÂæ≠È§òÈ∫óÈΩÆÂÆæËØçË©û‚ôÄ·ΩçËæÖÈ¶πÁû¨‰ªèÊµ¥‚ü´‚ü™Íú¢‘∫Ë®∂Â¶ì‡∑ù‡∂ß‡Øã‡•®·æΩÊü¥‡µÇ‡¥±‡µΩ‡¥á‡µÜ‡¥´‡µã‡¥é‡¥ú‡µá‡¥∑ÏΩòÁπìÁâß‰ªìÂßëËàÇÊ∏âÈÄäÁ¢£÷•÷Ω÷§÷ô÷õÌïë·±ö·±û·±™·±†ÂºäË≠âÊ∏àÔº™Ôº°ê§Éê§âê§Åê§çÂÖ¥ÊùúÁñπÂ•ßË≥ìÂÄÇÏÖî…ûÎãÆÊñ∑Ë¢ñÁôñ·ºìÍ≤¨ÎéåÂ™ΩÂªüÂ¶àÈòÅËéûÏáÑÈéñ—¶—®—¨—™Ë™†Ïï§Á™ó·Ωå‰∫§ÂúèÊ≤ÖÊöÅÌòàÈª®Âà´Ë∞äËäù‚òÆ·åΩ⁄Ñ€âÊâ¨È∂¥Ê≠°Á®ΩÊ®µ·ªèê§áê¢äê°á‚≤ë‚≤óËó•ê©¢ê©≥ê©ßê©•‚âÉÂÖéË™¨Âö¢ÂºñÌôà·ºù·µªœöÂΩ±Êà∂ƒ∫‚Åá‚àùÂ•óÂ∫î‡∫çÊòäÂ¨åÊèÆÈñëœëœú‡≤∂‡≤óÎπµÌôúÊÆòÊºîÁ∏Ñ‚áê·ì±·ñΩ·êß·ñøËå∏ËÑ±ÊââËë´Ëä¶Â®Éê©¶Ê∑ñÈªòÏ©êÏüÅÁ∏ÅÂ¢ÉËø∞Â∞ßÁÑ∂Ë¥∫·∫ì·âÜ·âà·â≥·â≤ËÑö·à¨ÈπèÎ£∞ÎûÑÂêëËÑèËÖëÈßÖ‡•§‡••‡•©‡£≤‰∫¶·∏èÁ≥∏Â≤êËØ∏ÌåîÔº∑ÔΩâÔΩîÔΩàÔº¥ÔΩÖÔΩéÔΩÑÔº¢ÔΩåÔΩñÔº©Ôº≠ÔΩÅÔΩáÔΩÉ·∏≤·üÄÊ∏áÁ£®ËèåÈÇ¶Â¶Æ·ºÜ—ìÈèò‚åà‚åâÎÑ∑ÏÜîÏ≤´ÎÜàÎ©ãÏóàÈáùÈÅÇÊâàËºíÊµéÂ∏¶ÀòÈå¢ÂéòÂâ≤ÈìÄÎ∂ÑÏ†êÂà∂Ã†ËæπÂ±¨‰ªΩÂ±ûÁê¥ÂºÇÊúïÂØ°ÂäûËÆÆ‡ΩÖ‡Ωê‡Ωö‡Ωõ à‰∏§–™‚ôà‚ôã‚ôé‚ôëÈ°ìÈ†äÊ±æËìêÂßí«ú›≠‰ΩôÈ¨±íà©íå¶ËäΩÎû®Îç©ÊïéÏ∏°Ëå®Ë≤®‡≤ú‡∞óËëµÀÉÀÇÁΩ∞Ëá≠Ê§øÎßâÎüºê©´‘æÂ¶ô‚áå‡®Ç‡ßÉÌï≠ÂäàÂãæÎÑåÎ≠îÂæÄ€µÊû´Êù®ÊïÖÏú°Êä±Âû¢Ê∑®ÈôÄÈ±≤Á∏£·Äí·Ä°·πµ·∏≠È∫π“≤ÊöâÊ¨äÊì•Á¨ôÏó†ÈëíÏûºÁåøË∞±È≠è⁄â⁄º…ñ‡¶Ö‚òê‚òë‚òí·æçüëåÈ†àÂìâêéúêéÇêéóêéö‚ä≥ÃüÂÖÜÂÆÖÂúÄ‡ßÇ‡ßß‚ôÆËØïË©¶ÊäóÂâëÈê∏‚Çì‚ÄøÊòîÌñàÏûñÈ≥©Ê°ÇÂπΩÈúçËîò·∫íÊâ∂Ë∑ã·ø§‡∂©‡∂∂‡∂´‡∂±›•ÎèåÏûîÊ∏æËÜΩÂ≠§ÍûåÎ£åÎ¨òÂ¢≥Â¢ìÁõúÊéòÊæç‡™ø‡™Ø·Ä∞Ë£≥ÈµúÁª™Á®≥Á©©Ë£î·ìÑ·ìá·ïó·ë¶ËáüÈÅ∏Ê£ü·ºµË™æ‡©¥‡®á‡®ìÂπª‡∫£ÏäàÌçº√∫√üŒ≤·Éò·Éêƒ±≈ü$œÖŒªŒÆœÑœâŒΩŒï÷∞◊î◊ï÷∏◊¢÷¥◊ë◊®◊ù◊ß◊™◊ú÷º◊ì◊û◊©◊ó÷∑ŒπŒ∫ŒºŒ¥ŒµœÅœÉœåœÇœáŒ∑‹ù‹ê√ñ◊ü◊ò‚Ä≤Œ≥Œ¨ŒØ‚àí„É≥„Éº„É´Ÿ†œÄŒî‡∏≤‡πà#◊õ◊ñ‰∏≠¬£Ô¥øÔ¥æ^–ì–¥–ø–ª–∂’°‚Üí‡§∞‡§æœÜ?—Ç—Ü—è—É–º—ÖŒ∏œçŒ≠◊°€ïÂ§ß–ö–±—ñ–ú–°–π—á‡∏ô„Çπ~÷Ä≈ç—ã‚Ä≥’•ƒüŒ†≈Ç√•„ÉÉ„ÅÆ—å–≥–∑◊§’´–ë–†–ü◊êƒ∞…ôÀàŒ£Œë–í–ù–ê≈°ƒç—àƒÉ‚¥∞◊†‡•ç¬±◊í€Å◊¶€î’∏’∂‚Äπ‚Ä∫„Ç§ÃÅ‹™√Æ‚ÜêŒô'üèΩÈïáË°¢êéºêé∑êé°êé¥êéπ€ä‰ø±Ïô∏Í≤©ÂºâË´æÎ£ΩÎøêËèä‡∂†‡∂ØÂ¢üÂú≥Îç§ÏòπÏõÄÊô©ÂæΩíâàÁõó›ï⁄ø∆ùÂ£±Âè¢ËçÜ·ûÉË¢¥ÁîöÂçòÎ†â·ûÅ·üÖÂ°äÂñµÎ≤ÖÊûú‡≤†‡≤•„ÉÆÊ£†À∏‡ßàüèªÎ≤®Î≤≥Î¶∞Ïõ¨Èî¶Áª£Îú®ÂÆõ·Ü®·Ü©·á∫·áÉ·áª·Ü™·áÑ·áº·áΩ·áæ·Ü´·áÖ·áø·áÜÌüã·áá·áà·Ü¨Ìüå·áâ·Ü≠·ÜÆ·áäÌüçÌüé·áãÌüèÌüêÌüëÌüíÌüìÌüî·ÜØ·Ü∞Ìüï·áåÌüñ·áç·áé·áè·áêÌüó·Ü±·áë·áíÌüò·Ü≤Ìüô·áìÌüö·áî·áï·Ü≥·áñ·áóÌüõ·áò·Ü¥·Üµ·Ü∂·áôÌüúÌüù·Ü∑·áöÌüûÌüü·áõÌü†·áúÌü°·áù·áû·áüÌü¢·á†·á°·á¢·Ü∏Ìü£·á£Ìü§Ìü•Ìü¶·ÜπÌüßÌü®Ìü©·á§·á•·á¶·Ü∫·áß·á®·á©Ìü™·á™Ìü´·ÜªÌü¨Ìü≠ÌüÆÌüØÌü∞Ìü±Ìü≤·á´Ìü≥Ìü¥·Üº·á∞·á¨·á≠Ìüµ·á±·á≤·áÆ·áØÌü∂·ÜΩÌü∑Ìü∏Ìüπ·Üæ·Üø·áÄ·áÅ·á≥Ìü∫Ìüª·á¥·áÇ·áµ·á∂·á∑·á∏·áπÂáõëÄ•ëÄÅëÄ´ëÄÆëÄ∫ëÄß‚ÇæÂççÂçê„ÅΩÊ£öÂ∞±ÁÖ•Âú≠üîπÔ∏èËè±Ê£ãÈ∫üÁñä"

In [18]:
stripped_chars = "‚ô´‚ô™¬°¬≤¬∫¬ø√Å√Ö√á√â√†√°√¢√£√§√¶√ß√®√©√™√´√¨√≠√Ø√±√≤√≥√¥√∂√∏√π√ºƒÅƒáƒåƒìƒõƒ´≈´Àö‡∏¢‡∏£‡∏≠‚Äì‚Äî‚Äò‚Äô‚Äú‚Äù‚Ä¶‚Ç¨‚ô™‚ô´¬Ω¬º¬æ‚Ñ¢Ÿ´Ÿæ⁄Ü⁄§⁄®⁄≠⁄¥¬Æ"
stripped_chars += "\xa0"
stripped_chars += "\x80"
stripped_chars += "\x93"
stripped_chars += "\x94"
stripped_chars += "\x87"
stripped_chars += "\u200e"
stripped_chars += "\u200f"
stripped_chars += "\u202a"
stripped_chars += "\u202c"
stripped_chars += "\u200c"
stripped_chars += "\u2066"
stripped_chars += "\u200d"
stripped_chars += "\x8d"
stripped_chars += "\x89"
stripped_chars += "\u2060"
stripped_chars += "\u2063"
stripped_chars += "\U0010fc00"
stripped_chars += "\x81"
stripped_chars += "\x9b"
stripped_chars += "\u2069"
stripped_chars += "\u2067"
stripped_chars += "\x88"
stripped_chars += "\x9d"
stripped_chars += "\U0001faf2"
stripped_chars += "\U0001faf1"
stripped_chars += "\u061c"
stripped_chars += "\xad"
stripped_chars += "\u06dd"
stripped_chars += "\x97"
stripped_chars += "\u206c"
stripped_chars += "\u206a"
stripped_chars += "\x9e"
for c in rare_chars:
    stripped_chars += c
stripped_chars = "".join(list(set(stripped_chars)))

In [19]:
def clean_pipeline(text):
    text = araby.strip_diacritics(text)
    text = araby.strip_tatweel(text)
    # text = araby.normalize_alef(text)
    # text = araby.normalize_hamza(text)
    # text = araby.normalize_teh(text)
    # text = araby.normalize_ligature(text)
    text = text.translate(str.maketrans(constants.UNICODE_LETTERS_MAPPING))
    text = text.replace("‚ô´", "")
    text = text.replace("‚ô™", "")
    text = text.replace("\xa0", "")
    text = text.replace("\x85", "")
    text = text.replace("\x96", "")
    text = text.replace("\u200a", "")
    text = text.replace("\u2009", "")
    text = text.replace("\u3000", "")
    text = text.replace("\u202f", "")
    text = text.replace("\u2002", "")
    text = text.replace("\u2003", "")
    # delete punctuations
    # text = re.sub(
    #     r"""([.,!?()\/\\ÿå"'\{\}\(\)\[\]ÿü<>¬´¬ª`ÿõ=+\-\*\&\^\%\$\#\@\!:|‚Ä¶;ÿü‚Äì‚àí])""",
    #     r"",
    #     text,
    # )
    # text = text.translate(str.maketrans({key: "" for key in string.punctuation}))
    
    # text = re.sub(rf"{stripped_chars}", "", text)
    text = text.translate(str.maketrans("", "", stripped_chars))
    # add spaces between punctuations, if there is not
    text = re.sub(
        r"""([.,!?()\/\\ÿå"'\{\}\(\)\[\]ÿü<>¬´¬ª`ÿõ=+\-\*\&\^\%\$\#\@\!:|‚Ä¶;ÿü‚Äì‚àí])""",
        r" \1 ",
        text,
    )
    text = text.translate(
        str.maketrans({key: " {0} ".format(key) for key in string.punctuation})
    )
    # normalize punctuations
    text = PUNC_NORMALIZER.normalize(text)
    # delete extra spaces
    text = re.sub("\s{2,}", " ", text).strip()
    text = text.replace("Ÿ°", "1")
    text = text.replace("Ÿ¢", "2")
    text = text.replace("€≤", "2")
    text = text.replace("Ÿ£", "3")
    text = text.replace("Ÿ§", "4")
    text = text.replace("Ÿ•", "5")
    text = text.replace("Ÿ¶", "6")
    text = text.replace("Ÿß", "7")
    text = text.replace("€∑", "7")
    text = text.replace("Ÿ®", "8")
    text = text.replace("Ÿ©", "9")
    return text.replace(' ','‚ñÅ')

In [20]:
clean_pipeline('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿàÿ±ÿ≠ŸÖÿ©. ÿßŸÑŸÑŸá')

'ÿßŸÑÿ≥ŸÑÿßŸÖ‚ñÅÿπŸÑŸäŸÉŸÖ‚ñÅŸàÿ±ÿ≠ŸÖÿ©‚ñÅ.‚ñÅÿßŸÑŸÑŸá'

In [21]:
def prepare(text):
  return clean_pipeline(text)

In [22]:
# test the prepare method
prepare('hello ÿ®ÿßŸÑÿ•ŸÜÿ¨ŸÑŸäÿ≤Ÿäÿ© ÿ™ÿπŸÜŸä ÿ£ŸáŸÑÿßŸã')

'hello‚ñÅÿ®ÿßŸÑÿ•ŸÜÿ¨ŸÑŸäÿ≤Ÿäÿ©‚ñÅÿ™ÿπŸÜŸä‚ñÅÿ£ŸáŸÑÿß'

In [23]:
train_dataset = list(map(prepare,tqdm(train_dataset)))
train_dataset = list(filter(lambda doc:len(doc)>0,tqdm(train_dataset)))
train_dataset[:2]

  0%|          | 0/1300304 [00:00<?, ?it/s]

  0%|          | 0/1300304 [00:00<?, ?it/s]

['ŸÇÿßŸÖ‚ñÅÿ±ŸàÿßÿØ‚ñÅÿßÿÆÿ™ÿ±ÿßÿπ‚ñÅÿßŸÑÿ™ŸÑŸÇŸäÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä‚ñÅÿ®ÿ™ÿ£ÿ≥Ÿäÿ≥‚ñÅÿπŸäÿßÿØÿ©‚ñÅÿ®Ÿàÿ±ŸÜ‚ñÅŸáŸàŸÑ‚ñÅŸÅŸä‚ñÅ1980‚ñÅÿå‚ñÅŸàŸáŸÖ:‚ñÅÿßŸÑÿ≥ŸäÿØ‚ñÅÿ®ÿßÿ™ÿ±ŸäŸÉ‚ñÅÿ≥ÿ™Ÿäÿ®ÿ™Ÿà‚ñÅPatrick‚ñÅSteptoe‚ñÅÿå‚ñÅŸàÿπÿßŸÑŸÖÿ©‚ñÅÿßŸÑÿ£ÿ¨ŸÜÿ©‚ñÅÿ¨ŸäŸÜ‚ñÅÿ®Ÿäÿ±ÿØŸä‚ñÅJean‚ñÅPurdy‚ñÅŸàÿßŸÑÿπÿßŸÑŸÖ‚ñÅÿ±Ÿàÿ®ÿ±ÿ™‚ñÅÿ•ÿØŸàÿßÿ±ÿØÿ≤‚ñÅRobert‚ñÅEdwards‚ñÅÿå‚ñÅŸàÿßŸÑÿ∞ŸäŸÜ‚ñÅŸÉÿßŸÜŸàÿß‚ñÅŸÖÿ≥ÿ§ŸàŸÑŸäŸÜ‚ñÅÿπŸÜ‚ñÅÿ≠ŸÖŸÑ‚ñÅŸÑŸàŸäÿ≤‚ñÅÿ®ÿ±ÿßŸàŸÜ‚ñÅÿå‚ñÅŸàŸáŸä‚ñÅÿ£ŸàŸÑ‚ñÅÿ•ŸÜÿ≥ÿßŸÜ‚ñÅŸàŸÑÿØ‚ñÅÿ®ÿπÿØ‚ñÅÿßŸÑÿ≠ŸÖŸÑ‚ñÅÿπŸÜ‚ñÅÿ∑ÿ±ŸäŸÇ‚ñÅÿßŸÑÿ•ÿÆÿµÿßÿ®‚ñÅŸÅŸä‚ñÅÿßŸÑŸÖÿÆÿ™ÿ®ÿ±‚ñÅÿ£Ÿà‚ñÅÿßŸÑÿ™ŸÑŸÇŸäÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä‚ñÅŸÅŸä‚ñÅ1977ŸÖ',
 'ÿπŸÑÿßŸàÿ©‚ñÅÿπŸÑŸâ‚ñÅÿπŸÜÿµÿ±‚ñÅÿßŸÑÿ≤ŸäŸÜÿ©‚ñÅÿå‚ñÅŸÅŸáŸä‚ñÅÿ≥ŸÑÿßŸÑÿ©‚ñÅÿ∞ÿßÿ™‚ñÅŸÖŸÜÿ∏ÿ±‚ñÅÿ¨ŸÖŸäŸÑ‚ñÅÿå‚ñÅŸÖŸÜ‚ñÅŸÜÿßÿ≠Ÿäÿ©‚ñÅÿßŸÑÿ•ŸÜÿ™ÿßÿ¨‚ñÅÿå‚ñÅÿ™ÿ®ÿØÿ£‚ñÅÿØÿ¨ÿßÿ¨ÿßÿ™‚ñÅÿ≥ŸÑÿßŸÑÿ©‚ñÅŸÖŸäŸÜŸàÿ±ŸÇÿ©‚ñÅÿ®Ÿàÿ∂ÿπ‚ñÅÿßŸÑÿ®Ÿäÿ∂‚ñÅŸÖÿ®ŸÉÿ±ÿß‚ñÅÿå‚ñÅŸÅŸä‚ñÅÿ≠ŸàÿßŸÑŸä‚ñÅ26‚ñÅÿ£ÿ≥ÿ®Ÿàÿπÿß‚ñÅÿõ‚ñÅ6‚ñÅÿ¥ÿπŸàÿ±‚ñÅŸàŸÜÿµŸÅ‚ñÅÿå‚ñÅŸÑÿßŸäÿ™ŸÅŸàŸÇ‚ñÅÿπŸÑŸäŸáÿß‚ñÅŸÅŸä‚ñÅÿ≥ÿ±

In [24]:
val_dataset = list(map(prepare,tqdm(val_dataset)))
val_dataset = list(filter(lambda doc:len(doc)>0,tqdm(val_dataset)))
val_dataset[:2]

  0%|          | 0/68438 [00:00<?, ?it/s]

  0%|          | 0/68438 [00:00<?, ?it/s]

['ŸÑŸÖ‚ñÅŸäÿ®ŸÇ‚ñÅÿ¢ŸÑ‚ñÅÿ≠ÿ±ÿ≤‚ñÅÿ®ÿπÿØ‚ñÅŸàŸÅÿßÿ©‚ñÅŸàÿßŸÑÿØŸá‚ñÅÿ∑ŸàŸäŸÑÿß‚ñÅÿ•ÿ∞‚ñÅÿ™ŸàŸÅŸä‚ñÅÿ¥ÿßÿ®ÿß‚ñÅÿ≥ŸÜÿ©‚ñÅ1340‚ñÅŸá‚ñÅ/‚ñÅ1921‚ñÅŸÖ‚ñÅÿå‚ñÅŸàÿØŸÅŸÜ‚ñÅÿ®ÿ¨Ÿàÿßÿ±‚ñÅŸàÿßŸÑÿØŸá‚ñÅŸÅŸä‚ñÅŸÖŸÇÿ®ÿ±ÿ©‚ñÅÿßŸÑÿ•ŸÖÿßŸÖ‚ñÅÿ®ÿ¨ÿØ‚ñÅÿ≠ŸÅÿµ‚ñÅÿå‚ñÅŸàŸÑŸÖ‚ñÅŸäÿπŸÇÿ®‚ñÅÿ•ŸÑÿß‚ñÅÿ®ŸÜÿ™ÿß‚ñÅŸàÿßÿ≠ÿØÿ©‚ñÅÿ™ÿ≤Ÿàÿ¨Ÿáÿß‚ñÅŸÖÿ≠ŸÖÿØ‚ñÅÿπŸÑŸä‚ñÅÿßŸÑŸÖÿØŸÜŸä',
 'ŸÉÿßŸÜÿ™‚ñÅÿ®ÿØÿßŸäÿ©‚ñÅŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅÿ£ÿÆŸä‚ñÅÿßŸÑÿπÿ≤Ÿäÿ≤‚ñÅÿπÿßŸÖ‚ñÅ1975‚ñÅÿå‚ñÅÿ≠Ÿäÿ´‚ñÅŸÉÿßŸÜÿ™‚ñÅÿπŸÑŸâ‚ñÅÿ¥ŸÉŸÑ‚ñÅŸÇÿµÿ©‚ñÅŸÖÿµŸàÿ±ÿ©‚ñÅŸÉŸÖÿß‚ñÅŸäÿ≥ŸÖŸäŸá‚ñÅÿßŸÑŸäÿßÿ®ÿßŸÜŸäŸàŸÜ‚ñÅŸÖÿßŸÜÿ∫ÿß‚ñÅÿå‚ñÅÿ•ŸÑŸâ‚ñÅÿ£ŸÜ‚ñÅÿ™ÿ≠ŸàŸÑ‚ñÅÿ•ŸÑŸâ‚ñÅŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅÿ™ŸÑŸÅÿ≤ŸäŸàŸÜŸä‚ñÅÿπÿßŸÖ‚ñÅ1990‚ñÅŸàÿπÿ±ÿ∂‚ñÅŸÑÿ£ŸàŸÑ‚ñÅŸÖÿ±ÿ©‚ñÅÿπŸÑŸâ‚ñÅŸÇŸÜÿßÿ©‚ñÅNHK‚ñÅÿßŸÑŸäÿßÿ®ÿßŸÜŸäÿ©‚ñÅŸÖŸÜ‚ñÅ1991‚ñÅ-‚ñÅ1992‚ñÅÿå‚ñÅŸàÿØÿ®ŸÑÿ¨‚ñÅŸáÿ∞ÿß‚ñÅÿßŸÑŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅÿ•ŸÑŸâ‚ñÅÿßŸÑŸÑÿ∫ÿ©‚ñÅÿßŸÑÿ•Ÿäÿ∑ÿßŸÑŸäÿ©‚ñÅŸàÿßŸÑŸÅÿ±ŸÜÿ≥Ÿäÿ©‚ñÅŸàÿßŸÑÿ£ŸÑŸÖÿßŸÜŸäÿ©‚ñÅŸàÿßŸÑÿπÿ±ÿ®Ÿäÿ©‚ñÅÿå‚ñÅÿ•ŸÑÿß‚ñÅÿ£ŸÜŸá‚ñÅŸÑÿßŸÇŸâ‚ñÅÿ≥ÿÆÿ∑ÿß‚ñÅŸÉÿ®Ÿäÿ±ÿß‚ñÅÿ≠Ÿäÿ´‚ñÅÿ£ŸÜŸá‚ñÅŸÖŸÜÿπ‚ñÅŸÅŸä‚ñÅÿ®ÿπÿ∂‚ñÅÿßŸÑÿØ

In [25]:
test_dataset = list(map(prepare,tqdm(test_dataset)))
test_dataset = list(filter(lambda doc:len(doc)>0,tqdm(test_dataset)))
test_dataset[:2]

  0%|          | 0/72040 [00:00<?, ?it/s]

  0%|          | 0/72040 [00:00<?, ?it/s]

['ÿ®ÿ∑ŸàŸÑÿ©‚ñÅŸÉÿ£ÿ≥‚ñÅÿßŸÑÿ≥Ÿàÿ®ÿ±‚ñÅÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä‚ñÅ2002‚ñÅŸáŸä‚ñÅÿßŸÑŸÜÿ≥ÿÆÿ©‚ñÅÿßŸÑÿ™ÿßÿ≥ÿπÿ©‚ñÅŸÖŸÜ‚ñÅÿ®ÿ∑ŸàŸÑÿ©‚ñÅŸÉÿ£ÿ≥‚ñÅÿßŸÑÿ≥Ÿàÿ®ÿ±‚ñÅÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä‚ñÅÿå‚ñÅŸÑÿπÿ®‚ñÅŸäŸàŸÖ‚ñÅ14‚ñÅÿ≥ÿ®ÿ™ŸÖÿ®ÿ±‚ñÅ2002‚ñÅŸÅŸä‚ñÅÿßŸÑÿßÿ≥ÿ™ÿßÿØ‚ñÅÿßŸÑŸàÿ∑ŸÜŸä‚ñÅÿ®ÿ™Ÿäÿ±ÿßŸÜÿß‚ñÅÿå‚ñÅÿ®ŸäŸÜ‚ñÅŸÅÿ±ŸäŸÇ‚ñÅÿ™Ÿäÿ±ÿßŸÜÿß‚ñÅÿßŸÑŸÅÿßÿ¶ÿ≤‚ñÅÿ®ŸÉÿ£ÿ≥‚ñÅÿ£ŸÑÿ®ÿßŸÜŸäÿß‚ñÅŸàŸÅÿ±ŸäŸÇ‚ñÅÿØŸäŸÜÿßŸÖŸà‚ñÅÿ™Ÿäÿ±ÿßŸÜÿß‚ñÅÿßŸÑŸÅÿßÿ¶ÿ≤‚ñÅÿ®ÿßŸÑÿØŸàÿ±Ÿä',
 'ŸàŸáÿ∞ÿß‚ñÅÿßŸÑŸÖÿ±ÿ≥Ÿâ‚ñÅŸÖŸÜ‚ñÅÿ£ÿ≠ÿ≥ŸÜ‚ñÅÿßŸÑŸÖÿ±ÿßÿ≥Ÿä‚ñÅŸàÿ∂ÿπÿß‚ñÅÿå‚ñÅŸàŸáŸà‚ñÅÿ¥ÿ®Ÿá‚ñÅÿÆŸÑŸäÿ¨‚ñÅŸÖŸÜ‚ñÅÿßŸÑÿ®ÿ≠ÿ±‚ñÅŸäÿØÿÆŸÑ‚ñÅŸÅŸä‚ñÅÿßŸÑÿ®ÿ±‚ñÅÿå‚ñÅŸàÿßŸÑÿ®ÿ±‚ñÅŸÖÿ∑ŸäŸÅ‚ñÅÿ®ÿ≠ÿßŸÅÿ™ŸäŸá‚ñÅÿå‚ñÅŸàŸäŸÉŸÜ‚ñÅŸÖŸÜ‚ñÅÿ¨ŸÖŸäÿπ‚ñÅÿßŸÑÿ£ÿ±Ÿàÿßÿ≠‚ñÅÿå‚ñÅŸàÿ®ÿßÿ≥ÿ™ŸÇÿ±ÿßÿ±ŸÜÿß‚ñÅŸÅŸäŸá‚ñÅÿπÿßÿØÿ™‚ñÅŸÑÿ£ÿ¨ÿ≥ÿßÿØŸÜÿß‚ñÅÿßŸÑÿ£ÿ±Ÿàÿßÿ≠‚ñÅÿå‚ñÅŸàÿ£ŸÖŸÜÿß‚ñÅŸÅŸä‚ñÅŸÖÿ±ŸÉÿ®ŸÜÿß‚ñÅŸÖŸÜ‚ñÅÿßÿÆÿ™ŸÑÿßŸÑ‚ñÅÿßŸÑÿØÿ≥ÿ±‚ñÅŸàÿßŸÑÿ£ŸÑŸàÿßÿ≠']

In [26]:
vocabs_dict = {}
for document in tqdm(train_dataset):
  for word in document.split():
    vocabs_dict[word] = vocabs_dict.get(word,0)+1
f'{len(vocabs_dict.keys()):,}',f'{sum(vocabs_dict.values()):,}'

  0%|          | 0/1300304 [00:00<?, ?it/s]

('1,298,835', '1,300,304')

# Helper functions and constants

In [27]:
train_dataset[:10]

['ŸÇÿßŸÖ‚ñÅÿ±ŸàÿßÿØ‚ñÅÿßÿÆÿ™ÿ±ÿßÿπ‚ñÅÿßŸÑÿ™ŸÑŸÇŸäÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä‚ñÅÿ®ÿ™ÿ£ÿ≥Ÿäÿ≥‚ñÅÿπŸäÿßÿØÿ©‚ñÅÿ®Ÿàÿ±ŸÜ‚ñÅŸáŸàŸÑ‚ñÅŸÅŸä‚ñÅ1980‚ñÅÿå‚ñÅŸàŸáŸÖ:‚ñÅÿßŸÑÿ≥ŸäÿØ‚ñÅÿ®ÿßÿ™ÿ±ŸäŸÉ‚ñÅÿ≥ÿ™Ÿäÿ®ÿ™Ÿà‚ñÅPatrick‚ñÅSteptoe‚ñÅÿå‚ñÅŸàÿπÿßŸÑŸÖÿ©‚ñÅÿßŸÑÿ£ÿ¨ŸÜÿ©‚ñÅÿ¨ŸäŸÜ‚ñÅÿ®Ÿäÿ±ÿØŸä‚ñÅJean‚ñÅPurdy‚ñÅŸàÿßŸÑÿπÿßŸÑŸÖ‚ñÅÿ±Ÿàÿ®ÿ±ÿ™‚ñÅÿ•ÿØŸàÿßÿ±ÿØÿ≤‚ñÅRobert‚ñÅEdwards‚ñÅÿå‚ñÅŸàÿßŸÑÿ∞ŸäŸÜ‚ñÅŸÉÿßŸÜŸàÿß‚ñÅŸÖÿ≥ÿ§ŸàŸÑŸäŸÜ‚ñÅÿπŸÜ‚ñÅÿ≠ŸÖŸÑ‚ñÅŸÑŸàŸäÿ≤‚ñÅÿ®ÿ±ÿßŸàŸÜ‚ñÅÿå‚ñÅŸàŸáŸä‚ñÅÿ£ŸàŸÑ‚ñÅÿ•ŸÜÿ≥ÿßŸÜ‚ñÅŸàŸÑÿØ‚ñÅÿ®ÿπÿØ‚ñÅÿßŸÑÿ≠ŸÖŸÑ‚ñÅÿπŸÜ‚ñÅÿ∑ÿ±ŸäŸÇ‚ñÅÿßŸÑÿ•ÿÆÿµÿßÿ®‚ñÅŸÅŸä‚ñÅÿßŸÑŸÖÿÆÿ™ÿ®ÿ±‚ñÅÿ£Ÿà‚ñÅÿßŸÑÿ™ŸÑŸÇŸäÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä‚ñÅŸÅŸä‚ñÅ1977ŸÖ',
 'ÿπŸÑÿßŸàÿ©‚ñÅÿπŸÑŸâ‚ñÅÿπŸÜÿµÿ±‚ñÅÿßŸÑÿ≤ŸäŸÜÿ©‚ñÅÿå‚ñÅŸÅŸáŸä‚ñÅÿ≥ŸÑÿßŸÑÿ©‚ñÅÿ∞ÿßÿ™‚ñÅŸÖŸÜÿ∏ÿ±‚ñÅÿ¨ŸÖŸäŸÑ‚ñÅÿå‚ñÅŸÖŸÜ‚ñÅŸÜÿßÿ≠Ÿäÿ©‚ñÅÿßŸÑÿ•ŸÜÿ™ÿßÿ¨‚ñÅÿå‚ñÅÿ™ÿ®ÿØÿ£‚ñÅÿØÿ¨ÿßÿ¨ÿßÿ™‚ñÅÿ≥ŸÑÿßŸÑÿ©‚ñÅŸÖŸäŸÜŸàÿ±ŸÇÿ©‚ñÅÿ®Ÿàÿ∂ÿπ‚ñÅÿßŸÑÿ®Ÿäÿ∂‚ñÅŸÖÿ®ŸÉÿ±ÿß‚ñÅÿå‚ñÅŸÅŸä‚ñÅÿ≠ŸàÿßŸÑŸä‚ñÅ26‚ñÅÿ£ÿ≥ÿ®Ÿàÿπÿß‚ñÅÿõ‚ñÅ6‚ñÅÿ¥ÿπŸàÿ±‚ñÅŸàŸÜÿµŸÅ‚ñÅÿå‚ñÅŸÑÿßŸäÿ™ŸÅŸàŸÇ‚ñÅÿπŸÑŸäŸáÿß‚ñÅŸÅŸä‚ñÅÿ≥ÿ±

In [28]:
# Find out the max samples token
# sorted_docs_by_length = sorted(tqdm(train_dataset),key=lambda document: len(document.split()),reverse=True)
sorted_docs_by_length = sorted(tqdm(train_dataset),key=len,reverse=True)
len(sorted_docs_by_length[0]),\
len(sorted_docs_by_length[1]),\
len(sorted_docs_by_length[2]),\
len(sorted_docs_by_length[5]),\
len(sorted_docs_by_length[10]),\
len(sorted_docs_by_length[50]),\
len(sorted_docs_by_length[1_000]),\
len(sorted_docs_by_length[2_500]),\
len(sorted_docs_by_length[5_000]),\
len(sorted_docs_by_length[10_000]),\
len(sorted_docs_by_length[20_000])

  0%|          | 0/1300304 [00:00<?, ?it/s]

(32866, 8248, 7814, 6779, 5298, 3082, 1471, 1164, 973, 813, 677)

In [29]:
# Find out the max samples token
# sorted_docs_by_length = sorted(tqdm(test_dataset),key=lambda document: len(document.split()),reverse=True)
sorted_docs_by_length = sorted(tqdm(test_dataset),key=len,reverse=True)
len(sorted_docs_by_length[0]),\
len(sorted_docs_by_length[1]),\
len(sorted_docs_by_length[2]),\
len(sorted_docs_by_length[5]),\
len(sorted_docs_by_length[10]),\
len(sorted_docs_by_length[50]),\
len(sorted_docs_by_length[1_000]),\
len(sorted_docs_by_length[2_500]),\
len(sorted_docs_by_length[5_000]),\
len(sorted_docs_by_length[10_000]),\
len(sorted_docs_by_length[20_000])

  0%|          | 0/72040 [00:00<?, ?it/s]

(5623, 4786, 4665, 2693, 2266, 1578, 700, 533, 433, 349, 278)

In [30]:
# setting seq_len:
seq_len = 500

In [31]:
def create_features_from_text_list(text_list,tokenizer):
  encoded = list()
  for doc in tqdm(text_list):
    encoded_doc = tokenizer.encode(doc)
    encoded_doc = tokenizer.pad(encoded_doc,length=seq_len)
    encoded_doc = encoded_doc[:seq_len]
    encoded.append(np.array(encoded_doc))
  return np.array(encoded)

In [32]:
# define batch size
batch_size = 256

In [33]:
def calculate_text_metrics(predictions,labels,target_tokenizer, print_text=False):
  # drop pads, those pads are not necessary pad tokens!!
  # last_pad = predictions[-1]
  # for i,pad in reversed(list(enumerate(predictions))):
  #   if pad == last_pad:
  #     predictions.pop(i)
  #   else:
  #     break

  true_text = ''.join(target_tokenizer.decode(labels))
  true_text = true_text.replace('<PAD>','').strip().replace('‚ñÅ',' ')
  # true_text = re.sub(' +',' ',true_text)

  predicted_text = ''.join(target_tokenizer.decode(predictions))
  predicted_text = predicted_text.replace('<PAD>','')[:len(true_text)].strip().replace('‚ñÅ',' ')
  # predicted_text = re.sub(' +',' ',predicted_text)

  if print_text:
    print(predicted_text)
    print(true_text)

  wer = word_error_rate(preds=predicted_text, target=true_text)
  cer = char_error_rate(preds=predicted_text, target=true_text)

  return wer,cer

# Undot the dataset

In [34]:
undotted_train_dataset = list(map(
    lambda text: undot(text.replace('‚ñÅ',' ')).replace(' ','‚ñÅ'),
    tqdm(train_dataset))
)
undotted_train_dataset[:2]

  0%|          | 0/1300304 [00:00<?, ?it/s]

['⁄°ÿßŸÖ‚ñÅÿ±ŸàÿßÿØ‚ñÅÿßÿ≠ŸÆÿ±ÿßÿπ‚ñÅÿßŸÑŸÆŸÑ⁄°ŸÆÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÆÿßÿπŸâ‚ñÅŸÆŸÆÿßÿ≥ŸÆÿ≥‚ñÅÿπŸÆÿßÿØŸá‚ñÅŸÆŸàÿ±⁄∫‚ñÅŸáŸàŸÑ‚ñÅ⁄°Ÿâ‚ñÅ1980‚ñÅÿå‚ñÅŸàŸáŸÖ:‚ñÅÿßŸÑÿ≥ŸÆÿØ‚ñÅŸÆÿßŸÆÿ±ŸÆŸÉ‚ñÅÿ≥ŸÆŸÆŸÆŸÆŸà‚ñÅPatrick‚ñÅSteptoe‚ñÅÿå‚ñÅŸàÿπÿßŸÑŸÖŸá‚ñÅÿßŸÑÿßÿ≠ŸÆŸá‚ñÅÿ≠ŸÆ⁄∫‚ñÅŸÆŸÆÿ±ÿØŸâ‚ñÅJean‚ñÅPurdy‚ñÅŸàÿßŸÑÿπÿßŸÑŸÖ‚ñÅÿ±ŸàŸÆÿ±ŸÆ‚ñÅÿßÿØŸàÿßÿ±ÿØÿ±‚ñÅRobert‚ñÅEdwards‚ñÅÿå‚ñÅŸàÿßŸÑÿØŸÆ⁄∫‚ñÅŸÉÿßŸÆŸàÿß‚ñÅŸÖÿ≥ŸàŸàŸÑŸÆ⁄∫‚ñÅÿπ⁄∫‚ñÅÿ≠ŸÖŸÑ‚ñÅŸÑŸàŸÆÿ±‚ñÅŸÆÿ±ÿßŸà⁄∫‚ñÅÿå‚ñÅŸàŸáŸâ‚ñÅÿßŸàŸÑ‚ñÅÿßŸÆÿ≥ÿß⁄∫‚ñÅŸàŸÑÿØ‚ñÅŸÆÿπÿØ‚ñÅÿßŸÑÿ≠ŸÖŸÑ‚ñÅÿπ⁄∫‚ñÅÿ∑ÿ±ŸÆŸØ‚ñÅÿßŸÑÿßÿ≠ÿµÿßŸÆ‚ñÅ⁄°Ÿâ‚ñÅÿßŸÑŸÖÿ≠ŸÆŸÆÿ±‚ñÅÿßŸà‚ñÅÿßŸÑŸÆŸÑ⁄°ŸÆÿ≠‚ñÅÿßŸÑÿßÿµÿ∑ŸÆÿßÿπŸâ‚ñÅ⁄°Ÿâ‚ñÅ1977ŸÖ',
 'ÿπŸÑÿßŸàŸá‚ñÅÿπŸÑŸâ‚ñÅÿπŸÆÿµÿ±‚ñÅÿßŸÑÿ±ŸÆŸÆŸá‚ñÅÿå‚ñÅ⁄°ŸáŸâ‚ñÅÿ≥ŸÑÿßŸÑŸá‚ñÅÿØÿßŸÆ‚ñÅŸÖŸÆÿ∑ÿ±‚ñÅÿ≠ŸÖŸÆŸÑ‚ñÅÿå‚ñÅŸÖ⁄∫‚ñÅŸÆÿßÿ≠ŸÆŸá‚ñÅÿßŸÑÿßŸÆŸÆÿßÿ≠‚ñÅÿå‚ñÅŸÆŸÆÿØÿß‚ñÅÿØÿ≠ÿßÿ≠ÿßŸÆ‚ñÅÿ≥ŸÑÿßŸÑŸá‚ñÅŸÖŸÆŸÆŸàÿ±⁄°Ÿá‚ñÅŸÆŸàÿµÿπ‚ñÅÿßŸÑŸÆŸÆÿµ‚ñÅŸÖŸÆŸÉÿ±ÿß‚ñÅÿå‚ñÅ⁄°Ÿâ‚ñÅÿ≠ŸàÿßŸÑŸâ‚ñÅ26‚ñÅÿßÿ≥ŸÆŸàÿπÿß‚ñÅÿõ‚ñÅ6‚ñÅÿ≥ÿπŸàÿ±‚ñÅŸàŸÆÿµ⁄°‚ñÅÿå‚ñÅŸÑÿßŸÆŸÆ⁄°ŸàŸØ‚ñÅÿπŸÑŸÆŸáÿß‚ñÅ⁄°Ÿâ‚ñÅÿ≥ÿ±

In [35]:
undotted_val_dataset = list(map(
    lambda text: undot(text.replace('‚ñÅ',' ')).replace(' ','‚ñÅ'),
    tqdm(val_dataset))
)
undotted_val_dataset[:2]

  0%|          | 0/68438 [00:00<?, ?it/s]

['ŸÑŸÖ‚ñÅŸÆŸÆŸØ‚ñÅÿ¢ŸÑ‚ñÅÿ≠ÿ±ÿ±‚ñÅŸÆÿπÿØ‚ñÅŸà⁄°ÿßŸá‚ñÅŸàÿßŸÑÿØŸá‚ñÅÿ∑ŸàŸÆŸÑÿß‚ñÅÿßÿØ‚ñÅŸÆŸà⁄°Ÿâ‚ñÅÿ≥ÿßŸÆÿß‚ñÅÿ≥ŸÆŸá‚ñÅ1340‚ñÅŸá‚ñÅ/‚ñÅ1921‚ñÅŸÖ‚ñÅÿå‚ñÅŸàÿØ⁄°⁄∫‚ñÅŸÆÿ≠Ÿàÿßÿ±‚ñÅŸàÿßŸÑÿØŸá‚ñÅ⁄°Ÿâ‚ñÅŸÖ⁄°ŸÆÿ±Ÿá‚ñÅÿßŸÑÿßŸÖÿßŸÖ‚ñÅŸÆÿ≠ÿØ‚ñÅÿ≠⁄°ÿµ‚ñÅÿå‚ñÅŸàŸÑŸÖ‚ñÅŸÆÿπ⁄°ŸÆ‚ñÅÿßŸÑÿß‚ñÅŸÆŸÆŸÆÿß‚ñÅŸàÿßÿ≠ÿØŸá‚ñÅŸÆÿ±Ÿàÿ≠Ÿáÿß‚ñÅŸÖÿ≠ŸÖÿØ‚ñÅÿπŸÑŸâ‚ñÅÿßŸÑŸÖÿØŸÆŸâ',
 'ŸÉÿßŸÆŸÆ‚ñÅŸÆÿØÿßŸÆŸá‚ñÅŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅÿßÿ≠Ÿâ‚ñÅÿßŸÑÿπÿ±ŸÆÿ±‚ñÅÿπÿßŸÖ‚ñÅ1975‚ñÅÿå‚ñÅÿ≠ŸÆŸÆ‚ñÅŸÉÿßŸÆŸÆ‚ñÅÿπŸÑŸâ‚ñÅÿ≥ŸÉŸÑ‚ñÅ⁄°ÿµŸá‚ñÅŸÖÿµŸàÿ±Ÿá‚ñÅŸÉŸÖÿß‚ñÅŸÆÿ≥ŸÖŸÆŸá‚ñÅÿßŸÑŸÆÿßŸÆÿßŸÆŸÆŸà⁄∫‚ñÅŸÖÿßŸÆÿπÿß‚ñÅÿå‚ñÅÿßŸÑŸâ‚ñÅÿß⁄∫‚ñÅŸÆÿ≠ŸàŸÑ‚ñÅÿßŸÑŸâ‚ñÅŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅŸÆŸÑ⁄°ÿ±ŸÆŸàŸÆŸâ‚ñÅÿπÿßŸÖ‚ñÅ1990‚ñÅŸàÿπÿ±ÿµ‚ñÅŸÑÿßŸàŸÑ‚ñÅŸÖÿ±Ÿá‚ñÅÿπŸÑŸâ‚ñÅ⁄°ŸÆÿßŸá‚ñÅNHK‚ñÅÿßŸÑŸÆÿßŸÆÿßŸÆŸÆŸá‚ñÅŸÖ⁄∫‚ñÅ1991‚ñÅ-‚ñÅ1992‚ñÅÿå‚ñÅŸàÿØŸÆŸÑÿ≠‚ñÅŸáÿØÿß‚ñÅÿßŸÑŸÖÿ≥ŸÑÿ≥ŸÑ‚ñÅÿßŸÑŸâ‚ñÅÿßŸÑŸÑÿπŸá‚ñÅÿßŸÑÿßŸÆÿ∑ÿßŸÑŸÆŸá‚ñÅŸàÿßŸÑ⁄°ÿ±ŸÆÿ≥ŸÆŸá‚ñÅŸàÿßŸÑÿßŸÑŸÖÿßŸÆŸÆŸá‚ñÅŸàÿßŸÑÿπÿ±ŸÆŸÆŸá‚ñÅÿå‚ñÅÿßŸÑÿß‚ñÅÿßŸÆŸá‚ñÅŸÑÿß⁄°Ÿâ‚ñÅÿ≥ÿ≠ÿ∑ÿß‚ñÅŸÉŸÆŸÆÿ±ÿß‚ñÅÿ≠ŸÆŸÆ‚ñÅÿßŸÆŸá‚ñÅŸÖŸÆÿπ‚ñÅ⁄°Ÿâ‚ñÅŸÆÿπÿµ‚ñÅÿßŸÑÿØ

In [36]:
undotted_test_dataset = list(map(
    lambda text: undot(text.replace('‚ñÅ',' ')).replace(' ','‚ñÅ'),
    tqdm(test_dataset))
)
undotted_test_dataset[:2]

  0%|          | 0/72040 [00:00<?, ?it/s]

['ŸÆÿ∑ŸàŸÑŸá‚ñÅŸÉÿßÿ≥‚ñÅÿßŸÑÿ≥ŸàŸÆÿ±‚ñÅÿßŸÑÿßŸÑŸÆÿßŸÆŸâ‚ñÅ2002‚ñÅŸáŸâ‚ñÅÿßŸÑŸÆÿ≥ÿ≠Ÿá‚ñÅÿßŸÑŸÆÿßÿ≥ÿπŸá‚ñÅŸÖ⁄∫‚ñÅŸÆÿ∑ŸàŸÑŸá‚ñÅŸÉÿßÿ≥‚ñÅÿßŸÑÿ≥ŸàŸÆÿ±‚ñÅÿßŸÑÿßŸÑŸÆÿßŸÆŸâ‚ñÅÿå‚ñÅŸÑÿπŸÆ‚ñÅŸÆŸàŸÖ‚ñÅ14‚ñÅÿ≥ŸÆŸÆŸÖŸÆÿ±‚ñÅ2002‚ñÅ⁄°Ÿâ‚ñÅÿßŸÑÿßÿ≥ŸÆÿßÿØ‚ñÅÿßŸÑŸàÿ∑ŸÆŸâ‚ñÅŸÆŸÆŸÆÿ±ÿßŸÆÿß‚ñÅÿå‚ñÅŸÆŸÆ⁄∫‚ñÅ⁄°ÿ±ŸÆŸØ‚ñÅŸÆŸÆÿ±ÿßŸÆÿß‚ñÅÿßŸÑ⁄°ÿßŸâÿ±‚ñÅŸÆŸÉÿßÿ≥‚ñÅÿßŸÑŸÆÿßŸÆŸÆÿß‚ñÅŸà⁄°ÿ±ŸÆŸØ‚ñÅÿØŸÆŸÆÿßŸÖŸà‚ñÅŸÆŸÆÿ±ÿßŸÆÿß‚ñÅÿßŸÑ⁄°ÿßŸâÿ±‚ñÅŸÆÿßŸÑÿØŸàÿ±Ÿâ',
 'ŸàŸáÿØÿß‚ñÅÿßŸÑŸÖÿ±ÿ≥Ÿâ‚ñÅŸÖ⁄∫‚ñÅÿßÿ≠ÿ≥⁄∫‚ñÅÿßŸÑŸÖÿ±ÿßÿ≥Ÿâ‚ñÅŸàÿµÿπÿß‚ñÅÿå‚ñÅŸàŸáŸà‚ñÅÿ≥ŸÆŸá‚ñÅÿ≠ŸÑŸÆÿ≠‚ñÅŸÖ⁄∫‚ñÅÿßŸÑŸÆÿ≠ÿ±‚ñÅŸÆÿØÿ≠ŸÑ‚ñÅ⁄°Ÿâ‚ñÅÿßŸÑŸÆÿ±‚ñÅÿå‚ñÅŸàÿßŸÑŸÆÿ±‚ñÅŸÖÿ∑ŸÆ⁄°‚ñÅŸÆÿ≠ÿß⁄°ŸÆŸÆŸá‚ñÅÿå‚ñÅŸàŸÆŸÉ⁄∫‚ñÅŸÖ⁄∫‚ñÅÿ≠ŸÖŸÆÿπ‚ñÅÿßŸÑÿßÿ±Ÿàÿßÿ≠‚ñÅÿå‚ñÅŸàŸÆÿßÿ≥ŸÆ⁄°ÿ±ÿßÿ±ŸÆÿß‚ñÅ⁄°ŸÆŸá‚ñÅÿπÿßÿØŸÆ‚ñÅŸÑÿßÿ≠ÿ≥ÿßÿØŸÆÿß‚ñÅÿßŸÑÿßÿ±Ÿàÿßÿ≠‚ñÅÿå‚ñÅŸàÿßŸÖŸÆÿß‚ñÅ⁄°Ÿâ‚ñÅŸÖÿ±ŸÉŸÆŸÆÿß‚ñÅŸÖ⁄∫‚ñÅÿßÿ≠ŸÆŸÑÿßŸÑ‚ñÅÿßŸÑÿØÿ≥ÿ±‚ñÅŸàÿßŸÑÿßŸÑŸàÿßÿ≠']

In [37]:
def train_model(
    model,
    train_dataloader,
    val_dataloader,
    text_type,
    max_epochs=100,
  ):
  checkpoints_path = Path(f"./DotsRetrieval/{text_type}")
  shutil.rmtree(checkpoints_path, ignore_errors=True)
  checkpoint_callback = ModelCheckpoint(
      mode="min",
      save_top_k=1,
      verbose=False,
      save_last=True,
      monitor="val_loss",
      save_weights_only=False,
      auto_insert_metric_name=True,
      save_on_train_epoch_end=False,
      dirpath=f"{checkpoints_path}/checkpoints",
      filename="{epoch}-{val_loss:.3f}-{step}",
  )
  callbacks = list()
  callbacks.append(checkpoint_callback)
  early_stopping_callback = EarlyStopping(
      monitor="val_loss",
      min_delta=0.0025,
    #   min_delta=0,
      patience=10,
      check_finite=True,
  )
  callbacks.append(early_stopping_callback)
  lr_monitor = LearningRateMonitor(
      logging_interval="step",
      log_momentum=True,
  )
  callbacks.append(lr_monitor)
#   callbacks.append(RichProgressBar())
  devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  trainer = Trainer(
      deterministic=True,
      callbacks=callbacks,
      gradient_clip_val=5,
      fast_dev_run=False,
      max_epochs=max_epochs,
      val_check_interval=0.25,
      accelerator="auto",
      devices=[0],
      # log_every_n_steps=max(len(train_dataloader) // 25, 1),
      log_every_n_steps=max(len(train_dataloader) // 25, 1),
  )
  trainer.validate(
      model=model,
      dataloaders=val_dataloader,
  )
  trainer.fit(
      model,
      train_dataloader,
      val_dataloader,
  )
  return trainer

# Prepare vocab

## source tokenizer

In [38]:
source_tokenizer = CharacterTokenizer(vocab_size=10_000_000)

In [39]:
source_tokenizer.train(text='\n'.join(tqdm(undotted_train_dataset)))

  0%|          | 0/1300304 [00:00<?, ?it/s]

Training CharacterTokenizer ...


In [40]:
source_tokenizer.vocab_size

128

In [41]:
# test the tokenizer
source_tokenizer.tokenize(undot(prepare('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿà ÿ±ÿ≠ŸÖÿ© ÿßŸÑŸÑŸá Ÿà ÿ®ÿ±ŸÉÿßÿ™Ÿá'))),source_tokenizer.encode(undot(prepare('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿà ÿ±ÿ≠ŸÖÿ© ÿßŸÑŸÑŸá Ÿà ÿ®ÿ±ŸÉÿßÿ™Ÿá')))

(['ÿß',
  'ŸÑ',
  'ÿ≥',
  'ŸÑ',
  'ÿß',
  'ŸÖ',
  '‚ñÅ',
  'ÿπ',
  'ŸÑ',
  'ŸÆ',
  'ŸÉ',
  'ŸÖ',
  '‚ñÅ',
  'Ÿà',
  '‚ñÅ',
  'ÿ±',
  'ÿ≠',
  'ŸÖ',
  'Ÿá',
  '‚ñÅ',
  'ÿß',
  'ŸÑ',
  'ŸÑ',
  'Ÿá',
  '‚ñÅ',
  'Ÿà',
  '‚ñÅ',
  'ŸÆ',
  'ÿ±',
  'ŸÉ',
  'ÿß',
  'ŸÆ',
  'Ÿá'],
 [6,
  3,
  9,
  3,
  6,
  10,
  5,
  13,
  3,
  4,
  17,
  10,
  5,
  12,
  5,
  7,
  19,
  10,
  11,
  5,
  6,
  3,
  3,
  11,
  5,
  12,
  5,
  4,
  7,
  17,
  6,
  4,
  11])

In [42]:
''.join(v for v,f in source_tokenizer.vocab.items() if 0<f<100)

''

In [43]:
dict(sorted(source_tokenizer.vocab.items(),key=lambda item:item[1],reverse=True))

{'‚ñÅ': 61697940,
 'ŸÆ': 47162923,
 'ÿß': 46845238,
 'ŸÑ': 30773470,
 'ŸÖ': 17513557,
 'Ÿà': 15423999,
 'Ÿá': 15173174,
 'ÿ±': 14421490,
 '⁄°': 11884217,
 'ÿ≠': 10793500,
 'ÿπ': 10197337,
 'ÿØ': 9923038,
 'ÿ≥': 9426013,
 'Ÿâ': 9422099,
 '⁄∫': 6447329,
 'ŸÉ': 5688727,
 'ÿµ': 4021297,
 'ÿå': 3313176,
 'ÿ∑': 3078676,
 '1': 1083080,
 '0': 880149,
 '"': 847111,
 'ÿ°': 830269,
 'ŸØ': 764891,
 '2': 673451,
 '9': 608305,
 '(': 520736,
 ')': 518732,
 '8': 309705,
 '5': 291400,
 '3': 290173,
 'ÿ¢': 283394,
 '4': 265115,
 '7': 256673,
 '6': 256141,
 'e': 213800,
 '-': 207441,
 ':': 190362,
 'a': 187904,
 'i': 155275,
 'o': 143996,
 'r': 141978,
 'n': 136809,
 't': 122874,
 's': 110810,
 'l': 99194,
 'ÿõ': 90203,
 'u': 68726,
 '/': 64474,
 'c': 62367,
 ',': 62334,
 'd': 59660,
 'h': 56440,
 'm': 55437,
 'A': 49925,
 'S': 48899,
 'C': 47044,
 'g': 40103,
 'p': 38347,
 'M': 33413,
 'T': 33307,
 'P': 32771,
 '%': 32626,
 'y': 32413,
 ']': 29887,
 '[': 29811,
 'I': 29283,
 'D': 27388,
 'f': 26688,
 'B

In [44]:
source_tokenizer.save_model(file_path='dotless_arabic/experiments/dots_retrieval/bin/source_tokenizer.model')

Saving as pickle file ...


## target tokenizer

In [45]:
target_tokenizer = CharacterTokenizer(vocab_size=10_000_000)

In [46]:
target_tokenizer.train(text='\n'.join(tqdm(train_dataset)))

  0%|          | 0/1300304 [00:00<?, ?it/s]

Training CharacterTokenizer ...


In [47]:
target_tokenizer.vocab_size

144

In [48]:
# test the tokenizer
target_tokenizer.tokenize(prepare('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿà ÿ±ÿ≠ŸÖÿ© ÿßŸÑŸÑŸá Ÿà ÿ®ÿ±ŸÉÿßÿ™Ÿá')),target_tokenizer.encode(prepare('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿà ÿ±ÿ≠ŸÖÿ© ÿßŸÑŸÑŸá Ÿà ÿ®ÿ±ŸÉÿßÿ™Ÿá'))

(['ÿß',
  'ŸÑ',
  'ÿ≥',
  'ŸÑ',
  'ÿß',
  'ŸÖ',
  '‚ñÅ',
  'ÿπ',
  'ŸÑ',
  'Ÿä',
  'ŸÉ',
  'ŸÖ',
  '‚ñÅ',
  'Ÿà',
  '‚ñÅ',
  'ÿ±',
  'ÿ≠',
  'ŸÖ',
  'ÿ©',
  '‚ñÅ',
  'ÿß',
  'ŸÑ',
  'ŸÑ',
  'Ÿá',
  '‚ñÅ',
  'Ÿà',
  '‚ñÅ',
  'ÿ®',
  'ÿ±',
  'ŸÉ',
  'ÿß',
  'ÿ™',
  'Ÿá'],
 [8,
  7,
  23,
  7,
  8,
  19,
  5,
  18,
  7,
  4,
  26,
  19,
  5,
  2,
  5,
  11,
  27,
  19,
  25,
  5,
  8,
  7,
  7,
  16,
  5,
  2,
  5,
  12,
  11,
  26,
  8,
  13,
  16])

In [49]:
target_tokenizer.vocab

{'<UNK>': -1,
 '<PAD>': -1,
 'Ÿà': 15168787,
 'ŸÅ': 7190097,
 'Ÿä': 21889917,
 '‚ñÅ': 61697940,
 'ÿ∏': 583639,
 'ŸÑ': 30773470,
 'ÿß': 39288146,
 'ÿ∂': 1604356,
 'ÿ∑': 2495037,
 'ÿ±': 12664818,
 'ÿ®': 9863294,
 'ÿ™': 11974854,
 'Ÿâ': 2341565,
 'ÿ¥': 2611613,
 'Ÿá': 6201069,
 'ÿØ': 8317726,
 'ÿπ': 8904268,
 'ŸÖ': 17513557,
 'ŸÜ': 14111725,
 'ŸÇ': 5459011,
 'ÿå': 3313176,
 'ÿ≥': 6814400,
 'ÿ∫': 1293069,
 'ÿ©': 8972105,
 'ŸÉ': 5688727,
 'ÿ≠': 4722447,
 'ÿ•': 2201556,
 'ÿÆ': 2223514,
 'ÿ£': 5355536,
 'ÿ´': 1825304,
 'ÿ¨': 3847539,
 '"': 847111,
 'ÿ¶': 1025692,
 'ÿ°': 830269,
 ':': 190362,
 '(': 520736,
 ')': 518732,
 'ÿ§': 255212,
 'ÿµ': 2416941,
 'ÿ≤': 1756672,
 '+': 3772,
 '6': 256141,
 '8': 309705,
 '7': 256673,
 '3': 290173,
 '5': 291400,
 '1': 1083080,
 '4': 265115,
 '2': 673451,
 '0': 880149,
 'ÿ¢': 283394,
 'ÿ∞': 1605312,
 '9': 608305,
 '-': 207441,
 '/': 64474,
 'ÿõ': 90203,
 'B': 26579,
 'N': 23315,
 'Ÿ™': 10054,
 'a': 187904,
 'n': 136809,
 't': 122874,
 'i': 155275,
 'g': 40103,

In [50]:
{
    v: f for v, f in target_tokenizer.vocab.items() if 0 < f < 1000
}

{'Œø': 995, '¬∑': 935, '–ª': 933, '–∫': 969}

In [51]:
target_tokenizer.vocab

{'<UNK>': -1,
 '<PAD>': -1,
 'Ÿà': 15168787,
 'ŸÅ': 7190097,
 'Ÿä': 21889917,
 '‚ñÅ': 61697940,
 'ÿ∏': 583639,
 'ŸÑ': 30773470,
 'ÿß': 39288146,
 'ÿ∂': 1604356,
 'ÿ∑': 2495037,
 'ÿ±': 12664818,
 'ÿ®': 9863294,
 'ÿ™': 11974854,
 'Ÿâ': 2341565,
 'ÿ¥': 2611613,
 'Ÿá': 6201069,
 'ÿØ': 8317726,
 'ÿπ': 8904268,
 'ŸÖ': 17513557,
 'ŸÜ': 14111725,
 'ŸÇ': 5459011,
 'ÿå': 3313176,
 'ÿ≥': 6814400,
 'ÿ∫': 1293069,
 'ÿ©': 8972105,
 'ŸÉ': 5688727,
 'ÿ≠': 4722447,
 'ÿ•': 2201556,
 'ÿÆ': 2223514,
 'ÿ£': 5355536,
 'ÿ´': 1825304,
 'ÿ¨': 3847539,
 '"': 847111,
 'ÿ¶': 1025692,
 'ÿ°': 830269,
 ':': 190362,
 '(': 520736,
 ')': 518732,
 'ÿ§': 255212,
 'ÿµ': 2416941,
 'ÿ≤': 1756672,
 '+': 3772,
 '6': 256141,
 '8': 309705,
 '7': 256673,
 '3': 290173,
 '5': 291400,
 '1': 1083080,
 '4': 265115,
 '2': 673451,
 '0': 880149,
 'ÿ¢': 283394,
 'ÿ∞': 1605312,
 '9': 608305,
 '-': 207441,
 '/': 64474,
 'ÿõ': 90203,
 'B': 26579,
 'N': 23315,
 'Ÿ™': 10054,
 'a': 187904,
 'n': 136809,
 't': 122874,
 'i': 155275,
 'g': 40103,

In [52]:
target_tokenizer.detokenize(target_tokenizer.tokenize(prepare('ÿßŸÑÿ≥ŸÑÿßŸÖ ÿπŸÑŸäŸÉŸÖ Ÿà ÿ±ÿ≠ŸÖÿ© ÿßŸÑŸÑŸá Ÿà ÿ®ÿ±ŸÉÿßÿ™Ÿá')))

'ÿßŸÑÿ≥ŸÑÿßŸÖ‚ñÅÿπŸÑŸäŸÉŸÖ‚ñÅŸà‚ñÅÿ±ÿ≠ŸÖÿ©‚ñÅÿßŸÑŸÑŸá‚ñÅŸà‚ñÅÿ®ÿ±ŸÉÿßÿ™Ÿá'

In [53]:
target_tokenizer.save_model(file_path='dotless_arabic/experiments/dots_retrieval/bin/target_tokenizer.model')

Saving as pickle file ...


# Run the experiment

## tokenize and split

In [54]:
encoded_trainset = create_features_from_text_list(text_list=undotted_train_dataset,tokenizer=source_tokenizer)
trainy = create_features_from_text_list(text_list=train_dataset,tokenizer=target_tokenizer)

  0%|          | 0/1300304 [00:00<?, ?it/s]

  0%|          | 0/1300304 [00:00<?, ?it/s]

In [55]:
encoded_valset = create_features_from_text_list(text_list=undotted_val_dataset,tokenizer=source_tokenizer)
valy = create_features_from_text_list(text_list=val_dataset,tokenizer=target_tokenizer)

  0%|          | 0/68438 [00:00<?, ?it/s]

  0%|          | 0/68438 [00:00<?, ?it/s]

In [56]:
encoded_testset = create_features_from_text_list(text_list=undotted_test_dataset,tokenizer=source_tokenizer)
testy = create_features_from_text_list(text_list=test_dataset,tokenizer=target_tokenizer)

  0%|          | 0/72040 [00:00<?, ?it/s]

  0%|          | 0/72040 [00:00<?, ?it/s]

In [57]:
testy_with_dots = create_features_from_text_list(text_list=test_dataset,tokenizer=target_tokenizer)

  0%|          | 0/72040 [00:00<?, ?it/s]

In [58]:
# encoded_trainset, encoded_valset, trainy, valy = train_test_split(
#   encoded_trainset,
#   trainy,
#   test_size=0.01,
#   random_state=seed,
# )
# len(encoded_trainset),len(encoded_valset),len(trainy), len(valy)

In [59]:
encoded_trainset.shape,trainy.shape

((1300304, 500), (1300304, 500))

In [60]:
# create tensor datasets
trainset = TensorDataset(torch.from_numpy(encoded_trainset), torch.from_numpy(trainy))
validset = TensorDataset(torch.from_numpy(encoded_valset), torch.from_numpy(valy))
testset = TensorDataset(torch.from_numpy(encoded_testset), torch.from_numpy(testy))
testset_with_dots = TensorDataset(torch.from_numpy(encoded_testset), torch.from_numpy(testy_with_dots))

In [61]:
# create dataloaders
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size,num_workers=4)
valloader = DataLoader(validset, shuffle=False, batch_size=batch_size,num_workers=4,drop_last=False)
testloader = DataLoader(testset, shuffle=False, batch_size=batch_size,num_workers=4,drop_last=False)
testloader_with_dots = DataLoader(testset_with_dots,shuffle=False,batch_size=batch_size,num_workers=4,drop_last=False)

## build and train the model

In [62]:
model = LitBiLSTMModel(
    seq_len=seq_len,
    vocab_size=source_tokenizer.vocab_size,
    output_size=target_tokenizer.vocab_size,
  )
model

LitBiLSTMModel(
  (train_accuracy): MulticlassAccuracy()
  (val_accuracy): MulticlassAccuracy()
  (test_accuracy): MulticlassAccuracy()
  (embedding): Embedding(128, 512, padding_idx=1)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.33, bidirectional=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (fc): Linear(in_features=512, out_features=144, bias=True)
)

In [63]:
trainer = train_model(
    model,
    train_dataloader=trainloader,
    val_dataloader=valloader,
    text_type='dotless-to-dotted',
  )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A4500') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name           | Type               | Params
------------------------------------------------------
0 | train_accuracy | MulticlassAccuracy | 0     
1 | val_accuracy   | MulticlassAccuracy | 0     
2 | test_accuracy  | MulticlassAccuracy | 0     
3 | embedding      | Embedding          | 65.5 K
4 | lstm           | LSTM               | 10.5 M
5 | dropout        | Dropout            | 0     
6 | fc             | Linear             | 73.9 K
------------------------------------------------------
10.6 M    Trainable params
0         Non-trainable params
10.6 M    Total params
42.566    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [64]:
trainer.test(ckpt_path='best',dataloaders=testloader)

Restoring states from the checkpoint path at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=5-val_loss=0.036-step=29210.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=5-val_loss=0.036-step=29210.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.9885143041610718, 'test_loss': 0.0352499894797802}]

find the text metrics, (wer,cer,ver)

In [65]:
model_predictions = trainer.predict(ckpt_path='best',dataloaders=testloader)

Restoring states from the checkpoint path at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=5-val_loss=0.036-step=29210.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at /home/majed_alshaibani/Experiments/DotlessArabic/DotsRetrieval/dotless-to-dotted/checkpoints/epoch=5-val_loss=0.036-step=29210.ckpt


Predicting: 0it [00:00, ?it/s]

In [66]:
predictions = list()
labels = list()
for (batch_predictions,batch_labels) in model_predictions:
  batch_predictions = batch_predictions.view(-1,seq_len)
  batch_labels = batch_labels.view(-1,seq_len)
  for sample_predictions,sample_labels in zip(batch_predictions,batch_labels):
    predictions.append(sample_predictions)
    labels.append(sample_labels)
print('len predictions and lables of the test set:',len(predictions),len(labels))

len predictions and lables of the test set: 72040 72040


In [67]:
wers,cers,vers = list(),list(),list()
for i,(sample_preds,sample_labels) in tqdm(enumerate(zip(predictions,labels)),total=len(predictions)):
  wer,cer = calculate_text_metrics(
      predictions=sample_preds.tolist(),
      labels=sample_labels.tolist(),
      target_tokenizer=target_tokenizer,
      print_text=True if i < 100 else False,
    )
  wers.append(wer)
  cers.append(cer)

  0%|          | 0/72040 [00:00<?, ?it/s]

ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä 2002 ŸáŸä ÿßŸÑŸÜÿ≥ÿÆÿ© ÿßŸÑÿ™ÿßÿ≥ÿπÿ© ŸÖŸÜ ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä ÿå ŸÑÿπÿ® ŸäŸàŸÖ 14 ÿ≥ÿ®ÿ™ŸÖÿ®ÿ± 2002 ŸÅŸä ÿßŸÑÿßÿ≥ÿ™ÿßÿØ ÿßŸÑŸàÿ∑ŸÜŸä ÿ®ÿ™Ÿäÿ±ÿßŸÜÿß ÿå ÿ®ŸäŸÜ ŸÅÿ±ŸäŸÇ ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ŸÉÿ£ÿ≥ ÿ£ŸÑÿ®ÿßŸÜŸäÿß ŸàŸÅÿ±ŸäŸÇ ÿØŸäŸÜÿßŸÖŸà ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ÿßŸÑÿØŸàÿ±Ÿä
ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä 2002 ŸáŸä ÿßŸÑŸÜÿ≥ÿÆÿ© ÿßŸÑÿ™ÿßÿ≥ÿπÿ© ŸÖŸÜ ÿ®ÿ∑ŸàŸÑÿ© ŸÉÿ£ÿ≥ ÿßŸÑÿ≥Ÿàÿ®ÿ± ÿßŸÑÿ£ŸÑÿ®ÿßŸÜŸä ÿå ŸÑÿπÿ® ŸäŸàŸÖ 14 ÿ≥ÿ®ÿ™ŸÖÿ®ÿ± 2002 ŸÅŸä ÿßŸÑÿßÿ≥ÿ™ÿßÿØ ÿßŸÑŸàÿ∑ŸÜŸä ÿ®ÿ™Ÿäÿ±ÿßŸÜÿß ÿå ÿ®ŸäŸÜ ŸÅÿ±ŸäŸÇ ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ŸÉÿ£ÿ≥ ÿ£ŸÑÿ®ÿßŸÜŸäÿß ŸàŸÅÿ±ŸäŸÇ ÿØŸäŸÜÿßŸÖŸà ÿ™Ÿäÿ±ÿßŸÜÿß ÿßŸÑŸÅÿßÿ¶ÿ≤ ÿ®ÿßŸÑÿØŸàÿ±Ÿä
ŸàŸáÿ∞ÿß ÿßŸÑŸÖÿ±ÿ≥Ÿâ ŸÖŸÜ ÿ£ÿ≠ÿ≥ŸÜ ÿßŸÑŸÖÿ±ÿßÿ≥Ÿä Ÿàÿ∂ÿπÿß ÿå ŸàŸáŸà ÿ¥ÿ®Ÿá ÿÆŸÑŸäÿ¨ ŸÖŸÜ ÿßŸÑÿ®ÿ≠ÿ± ŸäÿØÿÆŸÑ ŸÅŸä ÿßŸÑÿ®ÿ± ÿå ŸàÿßŸÑÿ®ÿ± ŸÖÿ∑ŸäŸÅ ÿ®ÿ≠ÿßŸÅŸäÿ™Ÿá ÿå ŸàŸäŸÉŸÜ ŸÖŸÜ ÿ¨ŸÖŸäÿπ ÿßŸÑÿ£ÿ±Ÿàÿßÿ≠ ÿå Ÿàÿ®ÿßÿ≥ÿ™ŸÅÿ≤ÿßÿ≤ŸÜÿß ŸÅŸäŸá ÿπÿßÿØÿ™ ŸÑÿ£ÿ¨ÿ≥ÿßÿØŸÜÿß ÿßŸÑÿ£ÿ±Ÿàÿßÿ≠ ÿå Ÿ



ÿßÿØÿπÿ™ ÿßŸÑÿπÿØŸäÿØ ŸÖŸÜ ÿßŸÑÿ®ÿπÿ´ÿßÿ™ ÿßŸÑÿ£ÿ´ÿ±Ÿäÿ© ÿ®ÿ£ŸÜŸáÿß ÿ™ŸàÿµŸÑÿ™ ÿ•ŸÑŸâ ŸÖŸÉÿßŸÜ ŸÉŸáŸÅ ÿ£ŸáŸÑ ÿßŸÑŸÉŸáŸÅ ÿå ŸàŸÜÿ∏ÿ±ÿß ŸÑÿ£ŸÜ ÿßŸÑŸÜÿ≥ÿÆ ÿßŸÑÿ£ŸàŸÑŸâ ŸÖŸÜ ÿßŸÑÿ±ŸàÿßŸäÿ© ÿßŸÜÿ™ÿ¥ÿ±ÿ™ ŸÖŸÜ ŸÖÿØŸäŸÜÿ© ÿ£ŸÅÿ≥ÿ≥ ÿå ŸÅŸÇÿØ ÿßÿ±ÿ™ÿ®ÿ∑ÿ™ ÿ®Ÿáÿß ÿ≥ÿ±ÿßÿØŸäÿ® ÿßŸÑŸÖŸàÿ™Ÿâ ÿßŸÑŸÖÿ≥Ÿäÿ≠Ÿäÿ© ÿßŸÑŸÖÿ®ŸÉÿ±ÿ© ÿå ŸÖŸÖÿß ÿ£ÿØŸâ ÿ•ŸÑŸâ ÿ¨ÿ∞ÿ® ÿ£ÿπÿØÿßÿØ ŸÉÿ®Ÿäÿ±ÿ© ŸÖŸÜ ÿßŸÑÿ≠ÿ¨ÿßÿ¨ ÿ•ŸÑŸâ ÿ™ŸÑŸÉ ÿßŸÑŸÖÿØŸäŸÜÿ©
ÿ£ÿØÿπÿ™ ÿßŸÑÿπÿØŸäÿØ ŸÖŸÜ ÿßŸÑÿ®ÿπÿ´ÿßÿ™ ÿßŸÑÿßÿ´ÿ±Ÿäÿ© ÿ®ÿ£ŸÜŸáÿß ÿ™ŸàÿµŸÑÿ™ ÿ•ŸÑŸâ ŸÖŸÉÿßŸÜ ŸÉŸáŸÅ ÿ£ŸáŸÑ ÿßŸÑŸÉŸáŸÅ ÿå ŸàŸÜÿ∏ÿ±ÿß ŸÑÿ£ŸÜ ÿßŸÑŸÜÿ≥ÿÆ ÿßŸÑÿ£ŸàŸÑŸâ ŸÖŸÜ ÿßŸÑÿ±ŸàÿßŸäÿ© ÿßŸÜÿ™ÿ¥ÿ±ÿ™ ŸÖŸÜ ŸÖÿØŸäŸÜÿ© ÿ£ŸÅÿ≥ÿ≥ ÿå ŸÅŸÇÿØ ÿßÿ±ÿ™ÿ®ÿ∑ÿ™ ÿ®Ÿáÿß ÿ≥ÿ±ÿßÿØŸäÿ® ÿßŸÑŸÖŸàÿ™Ÿâ ÿßŸÑŸÖÿ≥Ÿäÿ≠Ÿäÿ© ÿßŸÑŸÖÿ®ŸÉÿ±ÿ© ÿå ŸÖŸÖÿß ÿ£ÿØŸâ ÿ•ŸÑŸâ ÿ¨ÿ∞ÿ® ÿ£ÿπÿØÿßÿØ ŸÉÿ®Ÿäÿ±ÿ© ŸÖŸÜ ÿßŸÑÿ≠ÿ¨ÿßÿ¨ ÿ•ŸÑŸâ ÿ™ŸÑŸÉ ÿßŸÑŸÖÿØŸäŸÜÿ©
ŸäÿØŸäŸÜ ÿ£ÿ∫ŸÑÿ® ÿßŸÑÿ¢ÿ¥Ÿàÿ±ŸäŸäŸÜ ŸÅŸä ÿßŸÑÿØŸäÿßŸÜÿ© ÿßŸÑŸÖÿ≥Ÿäÿ≠Ÿäÿ© ÿπŸÑŸâ ŸÖÿ∞Ÿáÿ® ÿßŸÑÿ≥ÿ±ŸäÿßŸÜŸäÿ© ÿßŸÑÿ£ÿ±ÿ´Ÿàÿ∞ŸÉÿ≥Ÿäÿ© ÿå ŸàŸäŸÇÿ∑ŸÜŸàŸÜ ÿå ÿ™ÿ®ÿπÿß ŸÑÿ∞ŸÑŸÉ ÿå ŸÅŸä

In [68]:
avg_wer = sum(wers)/len(wers)
avg_cer = sum(cers)/len(cers)
print('avg_wer, avg_cer:',avg_wer,avg_cer)

avg_wer, avg_cer: tensor(0.0469) tensor(0.0113)


text metrics with vowel words loader

In [None]:
# labels_with_dots = list()
# for batch in testloader_with_dots:
#   _,batch_labels = batch
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_labels in batch_labels:
#     labels_with_dots.append(sample_labels)
# print('len lables with vowel words of the test set:',len(labels_with_dots))

In [None]:
# wers,cers = list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels_with_dots),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#     )
#   wers.append(wer)
#   cers.append(cer)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer)

test on the best model according to the validation loss

In [None]:
# model = LitBiLSTMModel.load_from_checkpoint(
#     trainer.checkpoint_callback.best_model_path,
#     vocab_size=source_tokenizer.vocab_size,
#     output_size=target_tokenizer.vocab_size,
#   )
# model

In [None]:
# trainer.test(model,testloader)

In [None]:
# model_predictions = trainer.predict(model,testloader)

In [None]:
# predictions = list()
# labels = list()
# for (batch_predictions,batch_labels) in model_predictions:
#   batch_predictions = batch_predictions.view(-1,seq_len)
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_predictions,sample_labels in zip(batch_predictions,batch_labels):
#     predictions.append(sample_predictions)
#     labels.append(sample_labels)
# print('len predictions and lables of the test set:',len(predictions),len(labels))

In [None]:
# wers,cers,vers = list(),list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer
#     )
#   wers.append(wer)
#   cers.append(cer)
#   vers.append(ver)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# avg_ver = sum(vers)/len(vers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer,avg_ver)

text metrics with vowel words

In [None]:
# labels_with_vowel_words = list()
# for batch in testloader_with_vowel_words:
#   _,batch_labels = batch
#   batch_labels = batch_labels.view(-1,seq_len)
#   for sample_labels in batch_labels:
#     labels_with_vowel_words.append(sample_labels)
# print('len lables with vowel words of the test set:',len(labels_with_vowel_words))

In [None]:
# wers,cers,vers = list(),list(),list()
# for sample_preds,sample_labels in tqdm(zip(predictions,labels_with_vowel_words),total=len(predictions)):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#     )
#   wers.append(wer)
#   cers.append(cer)
#   vers.append(ver)

In [None]:
# avg_wer = sum(wers)/len(wers)
# avg_cer = sum(cers)/len(cers)
# avg_ver = sum(vers)/len(vers)
# print('avg_wer, avg_cer, avg_ver:',avg_wer,avg_cer,avg_ver)

In [None]:
# for sample_preds,sample_labels in tqdm(list(zip(predictions,labels_with_vowel_words))[:100],total=100):
#   wer,cer,ver = calculate_text_metrics(
#       predictions=sample_preds.tolist(),
#       labels=sample_labels.tolist(),
#       target_tokenizer=target_tokenizer,
#       print_text=True
#     )

In [None]:
# from google.colab import runtime
# runtime.unassign()