# Prepare

In [1]:
# Move the kernel's working directory four levels up. The recorded output shows
# this lands in the DotlessArabic checkout; presumably required so that the
# `dotless_arabic` package imported in the next cell is importable -- confirm.
%cd ../../../..

/home/majed_alshaibani/Experiments/DotlessArabic


In [2]:

import re
import os
import string

import torch

from pytorch_lightning import seed_everything

import datasets

from dotless_arabic.tokenizers import WordTokenizer
from dotless_arabic.experiments.nlms.src.training_pipeline import training_pipeline

from dotless_arabic.datasets.utils import (
    tokens_frequency,
    calculate_entropy,
    tokenize_dataset_for_statistics,
)


In [3]:
# Process-wide runtime switches, set once before any training is launched.
os.environ.update(
    {
        'WANDB_MODE': 'disabled',
        'CUDA_LAUNCH_BLOCKING': '1',  # to see CUDA errors
    }
)
torch.cuda.empty_cache()  # to free gpu memory
# Last expression of the cell: the seed that was applied is echoed as output.
seed_everything(42, workers=True)

Global seed set to 42


42

# Load the dataset

In [4]:
# Load the raw WikiText-2 configuration; the recorded output shows it returns a
# DatasetDict with `train`, `validation` and `test` splits, each with a `text` column.
dataset = datasets.load_dataset(path='wikitext', name='wikitext-2-raw-v1')

Found cached dataset wikitext (/home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Display the loaded splits with their columns and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [6]:
# The 26 lowercase ASCII letters; process_english() keeps only these characters
# (plus whitespace) when cleaning text.
ENGLISH_LETTERS = string.ascii_lowercase

In [7]:
def process_english(text, allowed_letters=None):
    """Reduce raw text to lowercase English letters separated by whitespace.

    The text is lowercased, the punctuation marks listed in the pattern below
    are padded with spaces, every character that is neither an allowed letter
    nor whitespace is dropped, runs of two or more whitespace characters are
    collapsed into one space, and the result is stripped.

    Args:
        text: The raw string to clean.
        allowed_letters: Container of single characters to keep. When None
            (the default) the module-level ENGLISH_LETTERS is used, which
            preserves the original behaviour.

    Returns:
        The cleaned string.
    """
    if allowed_letters is None:
        allowed_letters = ENGLISH_LETTERS

    text = text.lower()

    # Pad the listed punctuation with spaces so that words glued together by
    # one of these marks ("end.start") stay separate once the marks are removed.
    # Characters that are not in this class (for example ':') are removed
    # later without padding, so their neighbours are joined.
    text = re.sub(
        r"""([.,!?()\/\\،"'\{\}\(\)\[\]؟<>`؛=+\-\*\&\^\%\$\#\@\!])""",
        r" \1 ",
        text,
    )

    # Keep only the allowed (English) letters and whitespace.
    text = "".join(c for c in text if c in allowed_letters or c.isspace())

    # Collapse whitespace runs. The quantifier is deliberately {2,} rather than
    # +: an isolated single newline (or tab) is left untouched, whereas '+'
    # would turn every such character into a space.
    return re.sub(r"\s{2,}", " ", text).strip()

In [8]:
def strip_vowels(text):
    """Return `text` with every a/e/i/o/u removed, regardless of case.

    Only the vowel characters are deleted; whitespace is left as is, so a word
    made up solely of vowels disappears and its surrounding spaces become
    adjacent.
    """
    vowel_pattern = re.compile(r'[AEIOU]', flags=re.IGNORECASE)
    return vowel_pattern.sub('', text)

In [9]:
def prepare_example(example):
    """Add the two derived text columns to a single dataset row.

    `processed_text` is the cleaned form of the row's `text` field and
    `consonants` is that cleaned form with the vowels stripped. The row is
    updated in place and returned.
    """
    cleaned = process_english(example['text'])
    example['processed_text'] = cleaned
    example['consonants'] = strip_vowels(cleaned)
    return example

In [10]:
# For every split: keep only rows whose raw text has more than 50
# whitespace-separated tokens, then attach the derived columns.
for split_name in ('train', 'validation', 'test'):
    long_rows = dataset[split_name].filter(
        lambda example: len(example['text'].split()) > 50
    )
    dataset[split_name] = long_rows.map(prepare_example)
dataset

Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-12e7c27a0fe0238b.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-907ca0784453a0f1.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-79638f707874c45b.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5e57f540b836b9b4.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa73221

DatasetDict({
    test: Dataset({
        features: ['text', 'processed_text', 'consonants'],
        num_rows: 1626
    })
    train: Dataset({
        features: ['text', 'processed_text', 'consonants'],
        num_rows: 13651
    })
    validation: Dataset({
        features: ['text', 'processed_text', 'consonants'],
        num_rows: 1420
    })
})

In [11]:
# Preview the first three cleaned rows. Slicing the rows first avoids reading
# the whole column into memory just to look at three entries.
dataset['train'][:3]['processed_text']

['senj no valkyria unrecorded chronicles japanese lit valkyria of the battlefield commonly referred to as valkyria chronicles iii outside japan is a tactical role playing video game developed by sega and media vision for the playstation portable released in january in japan it is the third game in the valkyria series employing the same fusion of tactical and real time gameplay as its predecessors the story runs parallel to the first game and follows the nameless a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit calamaty raven',
 'the game began development in carrying over a large portion of the work done on valkyria chronicles ii while it retained the standard features of the series it also underwent multiple adjustments such as making the game more forgiving for series newcomers character designer raita honjou and composer hitoshi sakimoto both returned from previous entries al

In [12]:
# Preview the first three vowel-stripped rows. Slicing the rows first avoids
# reading the whole column into memory just to look at three entries.
dataset['train'][:3]['consonants']

['snj n vlkyr nrcrdd chrncls jpns lt vlkyr f th bttlfld cmmnly rfrrd t s vlkyr chrncls  tsd jpn s  tctcl rl plyng vd gm dvlpd by sg nd md vsn fr th plysttn prtbl rlsd n jnry n jpn t s th thrd gm n th vlkyr srs mplyng th sm fsn f tctcl nd rl tm gmply s ts prdcssrs th stry rns prlll t th frst gm nd fllws th nmlss  pnl mltry nt srvng th ntn f gll drng th scnd rpn wr wh prfrm scrt blck prtns nd r pttd gnst th mprl nt clmty rvn',
 'th gm bgn dvlpmnt n crryng vr  lrg prtn f th wrk dn n vlkyr chrncls  whl t rtnd th stndrd ftrs f th srs t ls ndrwnt mltpl djstmnts sch s mkng th gm mr frgvng fr srs nwcmrs chrctr dsgnr rt hnj nd cmpsr htsh skmt bth rtrnd frm prvs ntrs lng wth vlkyr chrncls  drctr tksh zw  lrg tm f wrtrs hndld th scrpt th gm s pnng thm ws sng by my n',
 't mt wth pstv sls n jpn nd ws prsd by bth jpns nd wstrn crtcs ftr rls t rcvd dwnldbl cntnt lng wth n xpndd dtn n nvmbr f tht yr t ws ls dptd nt mng nd n rgnl vd nmtn srs d t lw sls f vlkyr chrncls  vlkyr chrncls  ws nt lclzd bt  f

# Statistics

In [13]:
# Token frequency tables for the three views of the training split: raw text,
# cleaned text and vowel-stripped text. Each column is handed over as a tuple,
# exactly as before (presumably because tokens_frequency needs a hashable
# argument -- confirm against its definition).
train_split = dataset['train']
text_tokens_frequency = tokens_frequency(tuple(train_split['text']))
processed_text_tokens_frequency = tokens_frequency(
    tuple(train_split['processed_text'])
)
consonants_tokens_frequency = tokens_frequency(tuple(train_split['consonants']))

  0%|          | 0/13651 [00:00<?, ?it/s]

  0%|          | 0/13651 [00:00<?, ?it/s]

  0%|          | 0/13651 [00:00<?, ?it/s]

In [14]:
# Number of distinct tokens: (vowel-stripped text, cleaned text).
# The size of a frequency table does not depend on the order of its entries,
# so it is taken directly instead of sorting and rebuilding the table first.
len(consonants_tokens_frequency), len(processed_text_tokens_frequency)

(41037, 60111)

In [15]:
# Entropy of the token distributions: (vowel-stripped text, cleaned text).
# Both tables are ordered by descending count before being handed over; the
# ordering is kept as in the original cell so the reported values do not shift.
consonants_by_count = dict(
    sorted(
        consonants_tokens_frequency.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)
processed_by_count = dict(
    sorted(
        processed_text_tokens_frequency.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)
calculate_entropy(consonants_by_count), calculate_entropy(processed_by_count)

(10.089455279909119, 10.749834530719115)

In [16]:
# Pull each split out of the DatasetDict as a plain Python list of strings,
# once for the cleaned text and once for the vowel-stripped text.
split_names = ('train', 'validation', 'test')

train_dataset, val_dataset, test_dataset = [
    list(dataset[split_name]['processed_text']) for split_name in split_names
]

consonants_train_dataset, consonants_val_dataset, consonants_test_dataset = [
    list(dataset[split_name]['consonants']) for split_name in split_names
]


In [17]:
# Row counts of the (train, validation, test) lists.
tuple(len(part) for part in (train_dataset, val_dataset, test_dataset))

(13651, 1420, 1626)

In [18]:
# Merge the three cleaned splits into one flat list of strings for the
# pipeline (its log reports its own train/val/test sizes, so it appears to
# re-split the data itself -- confirm).
# NOTE: this rebinds `dataset` from the DatasetDict to a plain list; the cells
# above that index dataset['train'] cannot be re-run after this point without
# reloading the data.
dataset = [*train_dataset, *val_dataset, *test_dataset]

In [19]:
# Train the word-level language model on the cleaned text.
training_pipeline(
    # data and run identity
    dataset=dataset,
    dataset_name='wikitext',
    dataset_id='processed_wikitext',
    is_dotted=True,  # do not run the undot() method that is specific for Arabic.
    results_file=None,
    # tokenization
    tokenizer_class=WordTokenizer,
    vocab_coverage=0.98,
    # hardware and loading
    batch_size=64,
    gpu_devices=1,
    cpu_devices=1,
    dataloader_workers=1,
)

Global seed set to 42


####################################################################################################
Train Samples: 14,275
Val Samples: 752
Test Samples: 1,670
####################################################################################################
####################################################################################################
Calculating vocab size using WordTokenizer:
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

####################################################################################################
Considered Vocab (from WordTokenizer): 32,527
All Vocab (WordTokenizer): 61,826
####################################################################################################
Training WordTokenizer ...
####################################################################################################
Tokenizer Vocab Size: 32,527
####################################################################################################
####################################################################################################
Calculating Sequence Length:
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/14275 [00:00<?, ?it/s]

####################################################################################################
Sequence Length: 293
####################################################################################################
####################################################################################################
Building DataLoaders
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/752 [00:00<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

####################################################################################################
Train DataLoader: 223
Val DataLoader: 11
Test DataLoader: 26
####################################################################################################
####################################################################################################
| Name               | Type      | Params
-------------------------------------------------
0 | embedding_layer    | Embedding | 16.7 M
1 | gru_layer          | GRU       | 6.3 M
2 | first_dense_layer  | Linear    | 262 K
3 | dropout_layer      | Dropout   | 0
4 | relu               | ReLU      | 0
5 | second_dense_layer | Linear    | 16.7 M
-------------------------------------------------
23.3 M    Trainable params
0         Non-trainable params
23.3 M    Total params
93.011    Total estimated model params size (MB)
####################################################################################################


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

  rank_zero_warn(


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/223 [00:02<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

  0%|          | 0/13 [00:01<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7f1946b72950>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
    reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", l

  0%|          | 0/13 [01:07<?, ?it/s]

####################################################################################################
Training Perplexity: 245.8436529271258
Perplexity with OOVs: 435.3442655789242
Perplexity without OOVs: 472.1153597538082
####################################################################################################
####################################################################################################
Training OOVs rate: 2.00
Validation OOVs rate: 2.00
Test OOVs rate: 2.00
####################################################################################################
####################################################################################################
Training Time: 755.62 seconds
####################################################################################################
predicting: the
prompt is: <bos> the
predicting: first
prompt is: <bos> the first
predicting: two
prompt is: <bos> the first two
predicting: s
prompt is: <bos> the first

In [20]:
# Merge the three vowel-stripped splits into one flat list of strings for the
# second pipeline run. `dataset` is rebound once more; from here on it holds
# the consonant-only corpus.
dataset = [
    *consonants_train_dataset,
    *consonants_val_dataset,
    *consonants_test_dataset,
]

In [None]:
# Train the word-level language model on the vowel-stripped text, with the
# same settings as the cleaned-text run apart from the dataset identifiers.
training_pipeline(
    # data and run identity
    dataset=dataset,
    dataset_name='consonants_wikitext',
    dataset_id='consonants_wikitext',
    is_dotted=True,  # do not run the undot() method that is specific for Arabic.
    results_file=None,
    # tokenization
    tokenizer_class=WordTokenizer,
    vocab_coverage=0.98,
    # hardware and loading
    batch_size=64,
    gpu_devices=1,
    cpu_devices=1,
    dataloader_workers=1,
)

Global seed set to 42


####################################################################################################
Train Samples: 14,275
Val Samples: 752
Test Samples: 1,670
####################################################################################################
####################################################################################################
Calculating vocab size using WordTokenizer:
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

####################################################################################################
Considered Vocab (from WordTokenizer): 18,949
All Vocab (WordTokenizer): 41,928
####################################################################################################


Exception ignored in: <function _ConnectionBase.__del__ at 0x7f1946b72950>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor


Training WordTokenizer ...
####################################################################################################
Tokenizer Vocab Size: 18,949
####################################################################################################
####################################################################################################
Calculating Sequence Length:
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/14275 [00:00<?, ?it/s]

####################################################################################################
Sequence Length: 287
####################################################################################################
####################################################################################################
Building DataLoaders
####################################################################################################


  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/752 [00:00<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

####################################################################################################
Train DataLoader: 223
Val DataLoader: 11
Test DataLoader: 26
####################################################################################################
####################################################################################################
| Name               | Type      | Params
-------------------------------------------------
0 | embedding_layer    | Embedding | 9.7 M
1 | gru_layer          | GRU       | 6.3 M
2 | first_dense_layer  | Linear    | 262 K
3 | dropout_layer      | Dropout   | 0
4 | relu               | ReLU      | 0
5 | second_dense_layer | Linear    | 9.7 M
-------------------------------------------------
16.3 M    Trainable params
0         Non-trainable params
16.3 M    Total params
65.149    Total estimated model params size (MB)
####################################################################################################


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

  0%|          | 0/14275 [00:00<?, ?it/s]

  0%|          | 0/223 [00:01<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

  0%|          | 0/13 [00:02<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]