In [1]:
import tkseem as tk
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from dotless_arabic.processing import process,undot
from dotless_arabic.experiments.nlms.src import constants
from dotless_arabic.experiments.constants import COLLECT_DATASET
from dotless_arabic.experiments.nlms.src.models import LitNeuralLanguageModel
from dotless_arabic.experiments.nlms.src.utils import generate_text,get_best_checkpoint,get_tokenizer,get_dataloader,calculate_perplexity

In [2]:
# Experiment configuration -- edit these two values to pick the run.
# `dataset_name` is used as a key into COLLECT_DATASET, and both values are
# upper-cased to build the checkpoint dataset id further down.
dataset_type = 'undotted'
dataset_name = 'wikipedia'

In [3]:
# COLLECT_DATASET maps a dataset name to a zero-argument callable; look the
# callable up first, then invoke it to obtain the raw dataset.
collect_dataset = COLLECT_DATASET[dataset_name]
dataset = collect_dataset()

  0%|          | 0/5609905 [00:00<?, ?it/s]

  0%|          | 0/5578668 [00:00<?, ?it/s]

  0%|          | 0/11872890 [00:00<?, ?it/s]

In [4]:
# Run every item of the collected dataset through `process`, with a tqdm
# progress bar, and materialise the results as a list.
dataset = [process(item) for item in tqdm(dataset)]

  0%|          | 0/1469112 [00:00<?, ?it/s]

In [7]:
# Apply `undot` to every item when the undotted variant is requested.
#
# The checkpoint id built later in this notebook upper-cases `dataset_type`,
# so checkpoint selection is case-insensitive. Compare case-insensitively here
# as well: otherwise a value such as 'Undotted' would select the UNDOTTED
# checkpoint while the text below silently stays dotted.
if dataset_type.lower() == 'undotted':
    dataset = list(
        map(
            undot,
            tqdm(dataset),
        )
    )

In [None]:
# Both splits shuffle and use the same seed (constants.RANDOM_SEED).
split_options = dict(shuffle=True, random_state=constants.RANDOM_SEED)

# First split: hold out constants.TEST_SIZE of the full dataset for testing.
train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=constants.TEST_SIZE,
    **split_options,
)

# Second split: take constants.VAL_SIZE of the remaining training portion
# (not of the full dataset) as the validation set.
train_dataset, val_dataset = train_test_split(
    train_dataset,
    test_size=constants.VAL_SIZE,
    **split_options,
)

In [None]:
# The checkpoint is looked up by an id of the form "<TYPE>-<NAME>_DATASET",
# built from the upper-cased configuration values.
checkpoint_dataset_id = f"{dataset_type.upper()}-{dataset_name.upper()}_DATASET"
best_checkpoint = get_best_checkpoint(dataset_id=checkpoint_dataset_id)

# Restore the language model from that checkpoint and display its layers.
model = LitNeuralLanguageModel.load_from_checkpoint(best_checkpoint)
model

Lightning automatically upgraded your loaded checkpoint from v1.8.6 to v1.9.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file NLMs/DOTTED-WIKIPEDIA_DATASET/checkpoints/epoch=37-val_loss=5.73-step=372741-WordTokenizer-0.95.ckpt`


LitNeuralLanguageModel(
  (embedding_layer): Embedding(103558, 512)
  (gru_layer): GRU(512, 512, num_layers=4, batch_first=True)
  (first_dense_layer): Linear(in_features=512, out_features=512, bias=True)
  (dropout_layer): Dropout(p=0.333, inplace=False)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=512, out_features=103558, bias=True)
)

In [None]:
# Build a word-level tokenizer from the training split, using the vocabulary
# size stored on the loaded model.
model_vocab_size = model.vocab_size
tokenizer = get_tokenizer(
    tokenizer_class=tk.WordTokenizer,
    vocab_size=model_vocab_size,
    train_dataset=train_dataset,
)

Training WordTokenizer ...


In [None]:
# Value passed as `sequence_length` to every calculate_perplexity call below.
sequence_length = 128

In [None]:
# Keyword arguments shared by all three perplexity evaluations.
shared_eval_args = dict(
    lm_model=model,
    tokenizer=tokenizer,
    sequence_length=sequence_length,
)

# Perplexity on the training split; this is the only call that passes an
# explicit batch size.
training_perplexity = calculate_perplexity(
    dataset=train_dataset,
    batch_size=constants.DEFAULT_BATCH_SIZE,
    **shared_eval_args,
)

# Perplexity on the test split without the ignore_oovs flag.
perplexity_with_oovs = calculate_perplexity(
    dataset=test_dataset,
    **shared_eval_args,
)

# Perplexity on the same test split with ignore_oovs=True.
perplexity_without_oovs = calculate_perplexity(
    dataset=test_dataset,
    ignore_oovs=True,
    **shared_eval_args,
)