In [1]:
# Show NVIDIA GPU/driver status; on a machine without the NVIDIA tools the shell reports "command not found".
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [None]:
# Use the `%pip` magic so packages are installed into the environment of the
# running kernel; `!pip` runs whichever pip happens to be first on PATH.
# NOTE: transformers is installed from the unpinned main branch, so results
# may vary between runs — pin a release/commit for reproducibility.
%pip uninstall -y tensorflow
%pip install git+https://github.com/huggingface/transformers
%pip list | grep -E 'transformers|tokenizers'


In [None]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Plain-text corpus file(s) the vocabulary is learned from.
paths = ["macrons.txt"]

# Start from an empty byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Train on the corpus with the requested vocabulary size, minimum frequency
# and special tokens.
tokenizer.train(
    files=paths,
    vocab_size=52000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
from pathlib import Path

# The notebook never creates ./PROUT, and `save_model` writes its files into
# an existing directory, so make sure the target exists before saving.
Path("./PROUT").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("./PROUT")

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary and merges files stored under ./PROUT.
tokenizer = ByteLevelBPETokenizer("./PROUT/vocab.json", "./PROUT/merges.txt")

In [None]:
# Attach a BERT-style post-processor built from the ids that the loaded
# vocabulary assigns to the "</s>" and "<s>" tokens.
# NOTE(review): presumably the first pair is the separator token and the second
# the start/classifier token — confirm against the tokenizers BertProcessing docs.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Enable truncation with a maximum length of 512.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Check whether PyTorch can see a CUDA device; the result is shown as the cell output.
import torch
torch.cuda.is_available()

### We'll define the following config for the model

In [14]:
from transformers import RobertaConfig

# Model hyper-parameters. vocab_size is 52000, the same size requested when
# the tokenizer was trained.
config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=514,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_act="gelu",
    type_vocab_size=1,
)

Now let's re-create our tokenizer in `transformers`.

In [16]:
from transformers import RobertaTokenizerFast

# Load the tokenizer files saved under ./PROUT. `model_max_length` is the
# documented keyword for the maximum input length; the older `max_len`
# keyword is only a deprecated alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("./PROUT", model_max_length=512)

Some weights of XLMRobertaForMaskedLM were not initialized from the model checkpoint at ./TweetERT and are newly initialized: ['lm_head.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Finally let's initialize our model.

**Important:**

As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint.

In [17]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from the config object only; no pretrained
# checkpoint is loaded because training starts from scratch.
model = RobertaForMaskedLM(config=config)

In [18]:
# Display the parameter count of the freshly initialised model.
model.num_parameters()

126031648

### Now let's build our training Dataset

We'll build our dataset by applying our tokenizer to our text file.

Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineTextDataset` out-of-the-box.

In [None]:
%%time
from transformers import LineByLineTextDataset

# Build the training dataset by applying the tokenizer to the text file,
# with block_size=128.
# NOTE(review): the tokenizer vocabulary was trained on "macrons.txt" while the
# model is trained on "french_tweets.txt" — confirm this difference is intended.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="french_tweets.txt",
    block_size=128,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model batches, configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### Finally, we are all set to initialize our Trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters. `per_device_train_batch_size` replaces the
# deprecated `per_gpu_train_batch_size` argument.
training_args = TrainingArguments(
    output_dir="./PROUT",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Wire the model, data collator and dataset built in the earlier cells into
# the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

### Start training

In [None]:
%%time
# Run training with the Trainer configured above; the %%time cell magic reports how long it takes.
trainer.train()

#### 🎉 Save final model (+ tokenizer + config) to disk

In [None]:
# Save the trained model to ./PROUT. The Trainer was not given the tokenizer,
# so save the tokenizer explicitly to the same directory; the trailing `;`
# keeps the returned file list out of the cell output.
trainer.save_model("./PROUT")
tokenizer.save_pretrained("./PROUT");

## 4. Check that the LM actually trained

In [5]:
from transformers import pipeline

# Build a fill-mask pipeline that loads both the model and the tokenizer from ./PROUT.
fill_mask = pipeline("fill-mask", model="./PROUT", tokenizer="./PROUT")

Some weights of RobertaModel were not initialized from the model checkpoint at ./TweetERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ./TweetERT and are newly initialized: ['lm_head.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Ask the pipeline for candidate completions of the masked last word.
fill_mask("le gouvernement est une bande de grosse <mask>")


[{'sequence': 'le gouvernement est une bande de grosse chose',
  'score': 0.044854775071144104,
  'token': 680,
  'token_str': ' chose'},
 {'sequence': 'le gouvernement est une bande de grosse affaire',
  'score': 0.040861308574676514,
  'token': 5746,
  'token_str': ' affaire'},
 {'sequence': 'le gouvernement est une bande de grosse erreur',
  'score': 0.03720996528863907,
  'token': 4568,
  'token_str': ' erreur'},
 {'sequence': 'le gouvernement est une bande de grosse qualité',
  'score': 0.02438104897737503,
  'token': 4676,
  'token_str': ' qualité'},
 {'sequence': 'le gouvernement est une bande de grosse déception',
  'score': 0.02104603871703148,
  'token': 8022,
  'token_str': ' déception'}]

In [7]:
def inf(num_element: int, sentence, mask_token: str = "<mask>", predictor=None):
    """Extend ``sentence`` by repeatedly filling a mask appended to its end.

    On each step the mask token is appended to the current text, the
    fill-mask predictor is called on the result, and the text is replaced by
    the ``"sequence"`` field of the first candidate returned.

    Args:
        num_element: Number of fill-mask steps to run; with a value <= 0 the
            loop does not execute and ``sentence`` is returned unchanged.
        sentence: Starting text.
        mask_token: Placeholder appended before each prediction.
        predictor: Callable that takes the masked text and returns a list of
            candidate dicts, each with a ``"sequence"`` key. When omitted,
            the notebook-level ``fill_mask`` pipeline is used.

    Returns:
        The text after ``num_element`` fill-mask steps.
    """
    if predictor is None:
        # Fall back to the pipeline built earlier in the notebook.
        predictor = fill_mask

    for _ in range(num_element):
        candidates = predictor(sentence + mask_token)
        # Continue from the first candidate's full text.
        sentence = candidates[0]["sequence"]
    return sentence



In [11]:
# Run a single fill-mask step on a sentence that ends with a trailing space.
inf(1,"Le gouvernement est une bande de grosse ")

'Le gouvernement est une bande de grosse chose'