In [6]:
import os
import wandb
import torch
import random
import numpy as np
import pandas as pd
from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import transformers
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
sns.set_theme(context='notebook', style='dark')
tqdm.pandas()

In [8]:
seed = 42

random.seed(seed)

np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

transformers.set_seed(seed)

In [9]:
dataset_path = Path('./sentences.txt')
tokenizer_path = Path('./models/nano-gpt-tokenizer/')
model_ckpt = 'openai-community/gpt2'
model_name = 'ArabMindGPT'
model_path = Path(f'./{model_name}')

In [10]:
PROJECT_NAME = 'ArabMindGPT'
JOB_TYPE = 'LM-Modeling'
TAGS = ['MOdeling', 'Transformers', 'GPT2', 'Language-Modeling', 'Arabic']
NOTES = 'LM Training on Arabic Data using GPT2 Model Architecture'
RUN_NAME = 'Dummy-data-and-model-LM-Moldeing'
config = defaultdict(dict)

In [11]:
load_dotenv()
wandb.login()
login(token=os.getenv('HF_TOKEN'),
      add_to_git_credential=True, write_permission=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmazin-sherif100[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin

Fine-grained tokens added complexity to the permissions, making it irrelevant to check if a token has 'write' access.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# load the dataset
with open(dataset_path, 'r') as f:
    dataset = f.readlines()[:100_000]

In [13]:
train, valid = train_test_split(dataset, train_size=0.8, random_state=seed)
valid, test = train_test_split(valid, train_size=0.5, random_state=seed)

In [14]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_dict({'text': train})
dataset['valid'] = Dataset.from_dict({'text': valid})
dataset['test'] = Dataset.from_dict({'text': test})
print(dataset)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
print(tokenizer)

In [16]:
def tokenize(batch):
    outputs = tokenizer(
        batch['text'],
        truncation=True,
        return_overflowing_tokens=True,
    )
    return {'input_ids': outputs['input_ids']}


tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
print(tokenized_dataset)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [17]:
model_config = AutoConfig.from_pretrained(
    model_ckpt, vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_positions=tokenizer.model_max_length,
    n_ctx=tokenizer.model_max_length,
    n_embd=128, n_head=2, n_layer=2,
)
print(model_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [18]:
model = AutoModelForCausalLM.from_config(model_config)
print(model)

In [19]:
print('Size in MBs:', model.get_memory_footprint() / 1_000_000)

In [20]:
print('Num Params:', model.num_parameters() / 1_000_000, 'M')

In [21]:
num_epochs = 5
batch_size = 16 * 32
total_steps = len(tokenized_dataset['train']) * num_epochs // batch_size
total_steps

781

In [22]:
training_args = TrainingArguments(
    output_dir=model_path,
    run_name=RUN_NAME,
    report_to='wandb',
    save_strategy='no',
    eval_strategy='steps',
    gradient_accumulation_steps=32,
    overwrite_output_dir=True,
    data_seed=seed, seed=seed,
    learning_rate=1e-3,
    weight_decay=0.001,
    warmup_ratio=0.0,
    eval_steps=500,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [23]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [25]:
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE,
                 name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [26]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=20, training_loss=246.9587890625, metrics={'train_runtime': 8010.9616, 'train_samples_per_second': 49.972, 'train_steps_per_second': 0.002, 'total_flos': 88035203481600.0, 'train_loss': 246.9587890625, 'epoch': 4.8152866242038215})

In [27]:
trainer.evaluate(tokenized_dataset['test'], metric_key_prefix='test')

{'test_loss': 7.354394912719727,
 'test_runtime': 74.7658,
 'test_samples_per_second': 133.818,
 'test_steps_per_second': 0.268,
 'epoch': 4.8152866242038215}

In [28]:
wandb.finish()

0,1
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
test/loss,7.35439
test/runtime,74.7658
test/samples_per_second,133.818
test/steps_per_second,0.268
total_flos,88035203481600.0
train/epoch,4.81529
train/global_step,20.0
train_loss,246.95879
train_runtime,8010.9616
train_samples_per_second,49.972


In [29]:
trainer.save_model(model_path)

In [30]:
_ = trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/3.75M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]