In [25]:
import tkseem as tk
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from dotless_arabic.processing import process
from dotless_arabic.experiments.nlms.src import constants
from dotless_arabic.experiments.constants import COLLECT_DATASET_FOR_LANGUAGE_MODELLING
from dotless_arabic.experiments.nlms.src.models import LitRNNLM
from dotless_arabic.experiments.nlms.src.utils import generate_text,get_best_checkpoint,get_tokenizer,get_dataloader,get_vocab_size
from dotless_arabic.tokenizers import WordTokenizer,FarasaMorphologicalTokenizer,DisjointLetterTokenizer,CharacterTokenizer

In [15]:
# Notebook configuration: edit these three values to choose which run to inspect.
# `dataset_name` is the key looked up in COLLECT_DATASET_FOR_LANGUAGE_MODELLING and,
# upper-cased, part of the checkpoint dataset id.
dataset_name = 'quran'
# `dataset_type` is only used (upper-cased) as the prefix of the checkpoint dataset id.
dataset_type = 'dotted'
# Tokenizer class handed to both the checkpoint lookup and the tokenizer builder;
# the other tokenizer classes imported at the top of the notebook are the alternatives.
tokenizer_class = CharacterTokenizer

In [16]:
# Collect the samples for the configured dataset and pass each one through
# `process`, materialising the result as a list so it can be split below.
dataset = [
    process(sample)
    for sample in COLLECT_DATASET_FOR_LANGUAGE_MODELLING[dataset_name]()
]

####################################################################################################
Number of samples:
6236
####################################################################################################


In [17]:
# Two-stage split. Stage 1 carves the held-out test set off the full dataset.
# Both stages shuffle with the same fixed seed, so the partitions are
# reproducible from run to run.
train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=constants.TEST_SIZE,
    random_state=constants.RANDOM_SEED,
    shuffle=True,
)

# Stage 2 carves the validation set out of what remains, so VAL_SIZE is
# relative to the post-test training portion rather than to the full dataset.
train_dataset, val_dataset = train_test_split(
    train_dataset,
    test_size=constants.VAL_SIZE,
    random_state=constants.RANDOM_SEED,
    shuffle=True,
)

In [18]:
# Restore the language model from the checkpoint that `get_best_checkpoint`
# returns for this dataset id / tokenizer class combination.
model = LitRNNLM.load_from_checkpoint(
    get_best_checkpoint(
        dataset_id=f"{dataset_type.upper()}-{dataset_name.upper()}_DATASET",
        tokenizer_class=tokenizer_class,
    )
)
# The cells in view only generate text from the model, so switch it to
# evaluation mode: the network contains dropout layers, which would otherwise
# stay active and perturb the predictions. If `generate_text` already does
# this internally, the call is a harmless no-op.
# `eval()` returns the module itself, so the cell still displays the model repr.
model.eval()

checkpiont epoch=9-val_loss=1.677-step=767-0.95.ckpt found.
Cannot tie wight as embedding size is nto equal hidden size: 512!=256


LitRNNLM(
  (embedding_layer): Embedding(36, 512, padding_idx=1)
  (rnn): GRU(512, 256, num_layers=2, batch_first=True, dropout=0.3333)
  (dropout_layer): Dropout(p=0.3333, inplace=False)
  (relu): LeakyReLU(negative_slope=0.01)
  (first_dense_layer): Linear(in_features=256, out_features=256, bias=True)
  (second_dense_layer): Linear(in_features=256, out_features=36, bias=True)
  (train_ppl): Perplexity()
  (val_ppl): Perplexity()
  (test_ppl): Perplexity()
)

In [26]:
# Derive the tokenizer vocabulary size from the training split at 0.95 vocab
# coverage. (An earlier variant of this cell passed `model.vocab_size` instead.)
tokenizer_vocab_size = get_vocab_size(
    dataset=train_dataset,
    vocab_coverage=0.95,
    return_all_vocab_size=False,
)

# Build the tokenizer of the configured class from the same training split.
tokenizer = get_tokenizer(
    vocab_size=tokenizer_vocab_size,
    train_dataset=train_dataset,
    tokenizer_class=tokenizer_class,
)

  0%|          | 0/5331 [00:00<?, ?it/s]

Training CharacterTokenizer ...


In [41]:
# Generate text with the restored model and the freshly built tokenizer.
# NOTE(review): `num_tokens` / `sequence_length` presumably bound the number of
# generated tokens and the context length fed back to the model -- confirm
# against `generate_text`.
t = generate_text(
    lm_model=model,
    tokenizer=tokenizer,
    sequence_length=20,
    num_tokens=100,
)
# Print rather than display so the text is shown as-is, not as a quoted repr.
print(t)

<bos>  [2]
predicting: ل
prompt is: <bos>  ل
<bos>  ل [2, 7]
predicting: ا
prompt is: <bos>  ل ا
<bos>  ل ا [2, 7, 6]
predicting: <##>
prompt is: <bos>  ل ا <##>
<bos>  ل ا <##> [2, 7, 6, 35]
predicting: ي
prompt is: <bos>  ل ا <##> ي
<bos>  ل ا <##> ي [2, 7, 6, 35, 10]
predicting: و
prompt is: <bos>  ل ا <##> ي و
<bos>  ل ا <##> ي و [2, 7, 6, 35, 10, 14]
predicting: م
prompt is: <bos>  ل ا <##> ي و م
<bos>  ل ا <##> ي و م [2, 7, 6, 35, 10, 14, 4]
predicting: <##>
prompt is: <bos>  ل ا <##> ي و م <##>
<bos>  ل ا <##> ي و م <##> [2, 7, 6, 35, 10, 14, 4, 35]
predicting: ي
prompt is: <bos>  ل ا <##> ي و م <##> ي
<bos>  ل ا <##> ي و م <##> ي [2, 7, 6, 35, 10, 14, 4, 35, 10]
predicting: ك
prompt is: <bos>  ل ا <##> ي و م <##> ي ك
<bos>  ل ا <##> ي و م <##> ي ك [2, 7, 6, 35, 10, 14, 4, 35, 10, 16]
predicting: و
prompt is: <bos>  ل ا <##> ي و م <##> ي ك و
<bos>  ل ا <##> ي و م <##> ي ك و [2, 7, 6, 35, 10, 14, 4, 35, 10, 16, 14]
predicting: ن
prompt is: <bos>  ل ا <##> ي و م <##> ي ك و ن
<bos>