In [1]:
import tkseem as tk
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from dotless_arabic.processing import process
from dotless_arabic.experiments.nlms.src import constants
from dotless_arabic.experiments.constants import COLLECT_DATASET
from dotless_arabic.experiments.nlms.src.models import LitNeuralLanguageModel
from dotless_arabic.experiments.nlms.src.utils import generate_text,get_best_checkpoint,get_tokenizer,get_dataloader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment selection: edit these two values to switch corpus / text variant.
# `dataset_name` is the key looked up in COLLECT_DATASET, and both values are
# combined into the checkpoint identifier when the model is loaded.
dataset_name = "poems"
dataset_type = "dotted"

In [3]:
# Look up the collector registered for the chosen corpus, call it to obtain the
# raw samples, and pass every sample through the project's `process` function.
collect_dataset = COLLECT_DATASET[dataset_name]
dataset = [process(sample) for sample in collect_dataset()]

Using custom data configuration MagedSaeed--ashaar-719bb58a76ea0092
Found cached dataset parquet (/home/majed.alshaibani/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/majed.alshaibani/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9a01dee7fce54c58.arrow
100%|██████████| 249880/249880 [00:01<00:00, 219255.05it/s]
100%|██████████| 3774110/3774110 [00:57<00:00, 65927.60it/s]


In [8]:
# Two-stage split: hold out the test portion first, then carve the validation
# portion out of the remainder. Both stages shuffle with the same seed taken
# from the experiment constants.
split_options = dict(shuffle=True, random_state=constants.RANDOM_SEED)

train_val_dataset, test_dataset = train_test_split(
    dataset,
    test_size=constants.TEST_SIZE,
    **split_options,
)
train_dataset, val_dataset = train_test_split(
    train_val_dataset,
    test_size=constants.VAL_SIZE,
    **split_options,
)

In [9]:
# Identifier passed to the checkpoint helper; with the current settings this
# evaluates to "DOTTED-POEMS_DATASET".
dataset_id = f"{dataset_type.upper()}-{dataset_name.upper()}_DATASET"
best_checkpoint = get_best_checkpoint(dataset_id=dataset_id)

model = LitNeuralLanguageModel.load_from_checkpoint(best_checkpoint)

# The network contains a dropout layer, and this notebook only uses the model
# for generation, so put it in evaluation mode to disable dropout.
# NOTE(review): generate_text may already do this internally — calling eval()
# again is harmless either way.
model.eval()

# Bare last expression so the notebook displays the module summary.
model

Lightning automatically upgraded your loaded checkpoint from v1.8.6 to v1.9.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file NLMs/DOTTED-POEMS_DATASET/checkpoints/epoch=46-val_loss=7.07-step=497968-WordTokenizer-0.95.ckpt`


LitNeuralLanguageModel(
  (embedding_layer): Embedding(216253, 512)
  (gru_layer): GRU(512, 512, num_layers=4, batch_first=True)
  (first_dense_layer): Linear(in_features=512, out_features=512, bias=True)
  (dropout_layer): Dropout(p=0.333, inplace=False)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=512, out_features=216253, bias=True)
)

In [10]:
# Build a word-level tokenizer from the training split, using the vocabulary
# size stored on the loaded model.
# NOTE(review): presumably this keeps token ids aligned with the checkpoint's
# embedding table — confirm against get_tokenizer.
tokenizer = get_tokenizer(
    tokenizer_class=tk.WordTokenizer,
    vocab_size=model.vocab_size,
    train_dataset=train_dataset,
)

Training WordTokenizer ...


In [11]:
# Settings for this sample generation run, kept together so they are easy to tune.
generation_options = dict(num_tokens=100, sequence_length=20)

# Generate text with the loaded language model and its tokenizer, then print
# the resulting text.
t = generate_text(
    lm_model=model,
    tokenizer=tokenizer,
    **generation_options,
)
print(t)

predicting: يا
prompt is: <bos> يا
predicting: عمرو
prompt is: <bos> يا عمرو
predicting: على
prompt is: <bos> يا عمرو على
predicting: من
prompt is: <bos> يا عمرو على من
predicting: قال
prompt is: <bos> يا عمرو على من قال
predicting: هذا
prompt is: <bos> يا عمرو على من قال هذا
predicting: من
prompt is: <bos> يا عمرو على من قال هذا من
predicting: قريب
prompt is: <bos> يا عمرو على من قال هذا من قريب
predicting: وانت
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت
predicting: كله
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت كله
predicting: يوم
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت كله يوم
predicting: تمام
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت كله يوم تمام
predicting: ود
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت كله يوم تمام ود
predicting: <PAD>
prompt is: <bos> يا عمرو على من قال هذا من قريب وانت كله يوم تمام ود <PAD>
predicting: <eos>
prompt is: <bos> 
predicting: و
prompt is: <bos> و
predicting: لا
prompt is: <bos> و لا
predicting: اب