In [1]:
%%capture
!pip install huggingface_hub

In [2]:
%%capture
! pip install datasets transformers

In [3]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the Hub upload later in this notebook authenticates with it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Make sure git-lfs is installed on the machine (shell command run from the notebook).
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [5]:
from transformers.utils import send_example_telemetry

# Send example-usage telemetry, identifying this notebook by name and the framework as PyTorch.
send_example_telemetry("language_modeling_notebook", framework="pytorch")

# Fine-tuning a language model

## Preparing the dataset

In [7]:
from datasets import load_dataset

# Load the ads CSV with the `datasets` CSV builder. This returns a `DatasetDict` with a
# single "train" split, which is what the rest of the notebook indexes and maps over.
dataset = load_dataset("csv", data_files="../data/ads.csv")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a4e37bd88c71d800/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a4e37bd88c71d800/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

To get a sense of what the data looks like, the following function will show some examples picked randomly in the dataset.

In [8]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of integer
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset)
    # Draw distinct row positions in a single call instead of re-rolling on collisions.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names rather than integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Preview random rows of the train split (the function's default is 10 rows).
show_random_elements(dataset["train"])

## Causal Language modeling

In [9]:
# Hub id of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_checkpoint = "aubmindlab/aragpt2-base"

To tokenize all our texts with the same vocabulary that was used when training the model, we have to download a pretrained tokenizer. This is all done by the `AutoTokenizer` class:

In [10]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

We can now call the tokenizer on all our texts. This is very simple, using the [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) method from the Datasets library. First we define a function that calls the tokenizer on our texts:

In [13]:
# Inspect the available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ads_clean'],
        num_rows: 297
    })
})

In [11]:
def tokenize_function(examples):
    """Run the notebook's tokenizer over the ``ads_clean`` column of a batch."""
    ad_texts = examples["ads_clean"]
    return tokenizer(ad_texts)

Then we apply it to all the splits in our `dataset` object, using `batched=True` and 4 processes to speed up the preprocessing. We won't need the `ads_clean` text column afterward, so we discard it.

In [12]:
# Tokenize in batches across 4 worker processes and drop the raw text column,
# leaving only the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["ads_clean"],
)

Map (num_proc=4):   0%|          | 0/297 [00:00<?, ? examples/s]

In [16]:
# Confirm that the text column was replaced by the tokenizer output columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 297
    })
})

If we now look at an element of our datasets, we will see the text has been replaced by the `input_ids` the model will need:

In [None]:
# Look at one tokenized training example (the row at index 1).
tokenized_dataset["train"][1]

Now for the harder part: we need to concatenate all our texts together then split the result in small chunks of a certain `block_size`. To do this, we will use the `map` method again, with the option `batched=True`. This option actually lets us change the number of examples in the datasets by returning a different number of examples than we got. This way, we can create our new samples from a batch of examples.

First, we grab the maximum length our model was pretrained with. This might be a bit too big to fit in your GPU RAM, so here we take a bit less at just 128.

In [13]:
# Number of tokens per training chunk. The commented-out line is the alternative of
# using the tokenizer's maximum length instead of a fixed 128.
# block_size = tokenizer.model_max_length
block_size = 128

Then we write the preprocessing function that will group our texts:

In [14]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-example lists. It must
            contain ``"input_ids"``. The length of the first column determines
            the chunk boundaries used for every column.
        chunk_size: Number of tokens per output chunk. When ``None`` (the
            default) the notebook-level ``block_size`` is used.

    Returns:
        A dict with the same columns, each a list of ``chunk_size``-long lists,
        plus a ``"labels"`` column that is a copy of the ``"input_ids"`` list.
        Trailing tokens that do not fill a whole chunk are dropped.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size
    # Flatten each column in linear time; `sum(lists, [])` re-copies the growing
    # accumulator on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // size) * size
    # Split into chunks of `size` tokens.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

First note that we duplicate the inputs for our labels. This is because the models of the 🤗 Transformers library apply the shift to the right internally, so we don't need to do it manually.


In [15]:
# Regroup the tokenized examples into fixed-size chunks. `group_texts` receives
# batches of up to 1000 examples, and the work is split across 4 processes.
lm_datasets = tokenized_dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/297 [00:00<?, ? examples/s]

And we can check our datasets have changed: now the samples contain chunks of `block_size` contiguous tokens, potentially spanning over several of our original texts.

In [17]:
# Decode one chunk (index 1) back to text to sanity-check the grouping.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'لتك ولا فصحابك لي ديما كيسولك نفس سؤالحيت عندنا ديما نتا لول، تبرع بالماكس ديال السخاوة هاد لعواشر مع أورنج بهاد المناسبة بارك و عاود و استافد من الروشارج المضاعفة إبتداءً من درهم عرض صالح إلى غاية أبريلاتبعوا معنا تجربة المقاول عمران سينو رئيس تعاونية الأندلس للنحت على الحجر والرخام، وقدر يزيد به القدام عن طريق الدورات التكوينية في إطار برنامج بشراكة مع اورنجماكين غير فورفي تبرع باللامحدود، تمزك، تفرج و دير لي بغيتي و خليك ديما كونيكطي هضرة و أنترنيت بالإضافة لمواقع'

Now that the data has been cleaned, we're ready to instantiate our `Trainer`. We will need a model:

In [18]:
from transformers import AutoModelForCausalLM
# Load the pretrained causal language model from the same checkpoint as the tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/553M [00:00<?, ?B/s]

And some `TrainingArguments`:

In [19]:
from transformers import Trainer, TrainingArguments

In [32]:
# Name the output directory after the checkpoint (the part after the last "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]
training_args = TrainingArguments(
    output_dir=model_name,
    # evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True,
)

We pass along all of those to the `Trainer` class:

In [22]:
# Show the derived name that is used as the training output directory.
model_name

'aragpt2-base'

In [33]:
# Build the Trainer from the model, the training arguments and the chunked train split.
# No evaluation dataset is passed; the dataset in this notebook only has a "train" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    #eval_dataset=lm_datasets["validation"],
)

And we can train our model:

In [34]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss


TrainOutput(global_step=24, training_loss=7.309241612752278, metrics={'train_runtime': 304.7675, 'train_samples_per_second': 0.591, 'train_steps_per_second': 0.079, 'total_flos': 11758141440000.0, 'train_loss': 7.309241612752278, 'epoch': 3.0})

In [24]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget again before uploading to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# Upload `model` to the given Hub repository.
# `use_auth_token=True` presumably reuses the token saved by `notebook_login()` — confirm.
# NOTE(review): only the model is pushed here; no tokenizer upload is visible in this
# notebook — confirm whether the repo should also contain the tokenizer files.
model.push_to_hub("issamaaaaa/aragpt2-base", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/553M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/issamaaaaa/aragpt2-base/commit/ffc21e21ea21d7923484ce06730b4dc0a27caadc', commit_message='Upload model', commit_description='', oid='ffc21e21ea21d7923484ce06730b4dc0a27caadc', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
from transformers import AutoModelForCausalLM

# Load the model back from the Hub repository it was pushed to.
model = AutoModelForCausalLM.from_pretrained("issamaaaaa/aragpt2-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/553M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [56]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "issamaaaaa/aragpt2-base"
model = AutoModelForCausalLM.from_pretrained(model_name)
# The `tokenizer` object created earlier from the base checkpoint is reused below.
# Loading one from `model_name` is left disabled: this notebook only pushes the model
# to that repository.
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Set the generation parameters
prompt = ""
max_length = 100
temperature = 1.0
top_k = 0
top_p = 0.9
num_return_sequences = 1

# NOTE(review): with `prompt = ""` the encoded input is presumably empty (GPT-2 style
# tokenizers add no special tokens by default), which is the likely cause of the error
# recorded for this cell — confirm against the full traceback.
# For unconditional generation, start from the tokenizer's BOS (or EOS) token instead.
seed_text = prompt or tokenizer.bos_token or tokenizer.eos_token
if not seed_text:
    raise ValueError("prompt is empty and the tokenizer defines no BOS/EOS token to start from")

# Generate text from the model
encoded = tokenizer(seed_text, return_tensors="pt")
input_ids = encoded["input_ids"]
output_sequences = model.generate(
    input_ids=input_ids,
    # Pass the mask explicitly so generation does not have to infer it.
    attention_mask=encoded["attention_mask"],
    max_length=max_length,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    do_sample=True,
    num_return_sequences=num_return_sequences,
)
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print(generated_text)


AttributeError: ignored

In [29]:
transformers-cli cache clear


SyntaxError: ignored