# Clean Wiki

In [None]:
import datasets
from datasets import load_dataset, load_from_disk
from tqdm.autonotebook import tqdm

In [None]:
wikipedia_dataset = load_dataset("wikipedia", "20220301.en", split="train")

In [None]:
wikipedia_dataset = wikipedia_dataset.remove_columns(
    [col for col in wikipedia_dataset.column_names if col != "text"]
)  # only keep the 'text' column

In [None]:
wikipedia_dataset = wikipedia_dataset.map(lambda x: {"len": len(x["text"])}, num_proc=24)

In [None]:
sum(wikipedia_dataset["len"])

In [None]:
# wikipedia_dataset = wikipedia_dataset.select(range(10))

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def process_see_also(sents):
    start_pos = -1
    end_pos = -1
    for i, s in enumerate(sents):
        if len(s) < 9 and "See also" in s:
            start_pos = i
            continue
        if start_pos > 0:
            if s[0] == " ":
                end_pos = i
            else:
                break
    if start_pos < 0:
        return sents
    if end_pos - start_pos < 1:
        return sents
    sents[start_pos] = sents[start_pos] + ": " + ", ".join(sents[start_pos+1:end_pos]) + "."
    sents = sents[: start_pos + 1]
    return sents

In [None]:
def remove_references(sents):
    start_pos = -1
    for i, s in enumerate(sents):
        if len(s) < 12 and 'References' in s:
            start_pos = i
            break
    return sents[:start_pos]

In [None]:
def filter_empty(sents, th=3):
    return [s for s in sents if len(s) > th]

In [None]:
def clean_wiki(examples):
    sents_merged = []
    for text in examples["text"]:
        paragraphs = text.split("\n")
        sents = [sent.text for p in paragraphs for sent in nlp(p).sents]
        sents = filter_empty(sents)
        sents = process_see_also(sents)
        sents = remove_references(sents)
        sents_merged.extend(sents)
    return {"text": sents_merged}

In [None]:
wikipedia_dataset = wikipedia_dataset.map(
    lambda x: clean_wiki(x),
    batched=True,
    remove_columns=wikipedia_dataset.column_names
)

In [None]:
wikipedia_dataset.save_to_disk("wikipedia_dataset_cleaned.hf")

# bookcorpusopen dataset cleaning

In [None]:
import datasets
from datasets import load_dataset, load_from_disk
from tqdm.autonotebook import tqdm

In [None]:
bookcorpus_dataset = load_dataset("bookcorpusopen", "plain_text", split="train")

In [None]:
bookcorpus_dataset = bookcorpus_dataset.remove_columns(
    [col for col in bookcorpus_dataset.column_names if col != "text"]
)  # only keep the 'text' column

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def filter_empty(sents, th=3):
    return [s for s in sents if len(s) > th]

In [None]:
def clean_bookcorpus(examples):
    sents_merged = []
    for text in examples["text"]:
        paragraphs = text.split("\n")
        sents = [sent.text for p in paragraphs for sent in nlp(p).sents]
        sents = filter_empty(sents)
        sents_merged.extend(sents)
    return {"text": sents_merged}

In [None]:
bookcorpus_dataset = bookcorpus_dataset.map(
    clean_bookcorpus,
    batched=True,
    remove_columns=bookcorpus_dataset.column_names,
    batch_size=1,
    num_proc=24
)

In [None]:
bookcorpus_dataset.save_to_disk("bookcorpus_dataset_cleaned.hf")

Notebook is based on https://huggingface.co/blog/how-to-train

## Prepare tokenized dataset

In [1]:
import datasets
from datasets import load_dataset, load_from_disk
from tqdm.autonotebook import tqdm

In [2]:
wikipedia_dataset = load_from_disk("wikipedia_dataset_cleaned.hf")

In [3]:
bookcorpus_dataset = load_from_disk("bookcorpus_dataset_cleaned.hf")

In [4]:
wikipedia_dataset

Dataset({
    features: ['text'],
    num_rows: 205447996
})

In [5]:
bookcorpus_dataset

Dataset({
    features: ['text'],
    num_rows: 97892049
})

In [6]:
# wikipedia_dataset = wikipedia_dataset.map(lambda x: {"len": len(x["text"])}, num_proc=24)
# sum(wikipedia_dataset["len"])

In [7]:
assert bookcorpus_dataset.features.type == wikipedia_dataset.features.type

In [8]:
raw_datasets = datasets.combine.concatenate_datasets([wikipedia_dataset, bookcorpus_dataset])

In [9]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", use_fast=True, add_prefix_space=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [10]:
from tokenizers import normalizers
from tokenizers.normalizers import BertNormalizer

In [11]:
tokenizer.backend_tokenizer.normalizer = normalizers.Sequence([BertNormalizer()])

In [None]:
def get_training_corpus():
    return (
        raw_datasets[i : i + 1000]["text"]
        for i in tqdm(range(0, len(raw_datasets), 1000))
    )

In [None]:
tokenizer.tokenize("Hello world")

In [None]:
training_corpus = get_training_corpus()
tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=32_768)

In [None]:
tokenizer.tokenize("Hello world")

In [None]:
tokenizer.save_pretrained("bergman-tokenizer_32k")

In [14]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./bergman-tokenizer_32k/", max_len=512)

In [15]:
raw_datasets = raw_datasets.map(
    lambda examples: tokenizer(examples["text"], return_special_tokens_mask=True),
    batched=True,
    remove_columns=raw_datasets.column_names,
    num_proc=24,
)

Map (num_proc=12):   0%|          | 0/303340045 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (703 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [16]:
raw_datasets.save_to_disk("raw_dataset_tokenized_32k.hf")

Saving the dataset (0/81 shards):   0%|          | 0/303340045 [00:00<?, ? examples/s]

# Train a language model from scratch

In [1]:
import datasets
from datasets import load_dataset, load_from_disk
from tqdm.autonotebook import tqdm

In [2]:
raw_datasets = load_from_disk("raw_dataset_tokenized_32k.hf")

### We'll define the following config for the model

In [3]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("./bergman-tokenizer_32k/", max_len=512)

from tokenizers import normalizers
from tokenizers.normalizers import BertNormalizer
tokenizer.backend_tokenizer.normalizer = normalizers.Sequence([BertNormalizer()])

In [4]:
from bergman import BergmanConfig

# # Bergman_Apr02_06-14-51_raven_200_000
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=4,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=4,
#     num_matrix_heads=32,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
# )

# # Apr06_23-21-44_raven
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=1,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=8,
#     num_matrix_heads=96,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
# )

# # Apr07_03-18-33_raven
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=2,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=8,
#     num_matrix_heads=48,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
# )

# # Apr07_15-21-03_raven
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=1,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=4,
#     num_matrix_heads=48,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
# )

# # Apr07_16-39-44_raven
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=2,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=4,
#     num_matrix_heads=48,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
# )

# # Apr11_20-08-57_raven
# config = BergmanConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=512,
#     num_hidden_layers=6,
#     type_vocab_size=1,
#     hidden_size=768,
#     position_embedding_type="none",
#     matrix_norm_alg=None,
#     matrix_dim=4,
#     num_matrix_heads=64,
#     vector_init_direction="one",
#     use_for_context=["lr_excl", "rl_excl"],
#     networks_for_heads="common",
#     matrix_encoder_two_layers=True,
#     #
#     matrix_norm_loss_type=None,
#     matrix_norm_loss_k=0.0,
#     matrix_unitary_loss=None,
#     matrix_unitary_loss_k = 0.0,
#     norm_vectors=True,
#     complex_matrix=True,
#     complex_matrix_abs=True,
#     rl_lr_matrix_different=True,
# )

#
config = BergmanConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    num_hidden_layers=3,
    type_vocab_size=1,
    hidden_size=768,
    position_embedding_type="none",
    matrix_norm_alg=None,
    matrix_dim=4,
    num_matrix_heads=96,
    vector_init_direction="one",
    use_for_context=["lr_excl", "rl_excl"],
    networks_for_heads="common",
    matrix_encoder_two_layers=True,
    #
    matrix_norm_loss_type=None,
    matrix_norm_loss_k=0.0,
    matrix_unitary_loss=None,
    matrix_unitary_loss_k = 0.0,
    norm_vectors=True,
    complex_matrix=True,
    complex_matrix_abs=True,
    rl_lr_matrix_different=False,
)

Now let's re-create our tokenizer in transformers

Finally let's initialize our model.

**Important:**

As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint.

In [5]:
from bergman import BergmanForMaskedLM

model = BergmanForMaskedLM(config=config)

# model = BergmanForMaskedLM.from_pretrained("./Bergman_Apr02_06-14-51_raven_200_000/")

In [6]:
model.num_parameters()

52769792

In [7]:
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name, param.size(), param.numel())

### Now let's build our training Dataset

We'll build our dataset by applying our tokenizer to our text file.

Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineDataset` out-of-the-box.

In [8]:
from datasets import load_dataset

In [9]:
import multiprocessing
num_proc = multiprocessing.cpu_count()

In [10]:
dataset = raw_datasets

In [11]:
max_seq_length = tokenizer.model_max_length
max_seq_length = 150

In [12]:
merge_texts = True

In [13]:
def group_texts(examples, max_seq_length, merge_texts):
    """
    >>> group_texts({"a": [list(range(5))]}, 4, True)
    {'a': [[0, 1, 2, 4], [0, 3, 4]]}
    >>> group_texts({"a": [list(range(3)), list(range(4))]}, 5, True)
    {'a': [[0, 1, 1, 2, 3]]}
    >>> group_texts({"a": [list(range(3)), list(range(4))]}, 5, False)
    {'a': [[0, 1, 2], [0, 1, 2, 3]]}
    >>> group_texts({"a": [list(range(4)), list(range(4))]}, 5, True)
    {'a': [[0, 1, 2, 3], [0, 1, 2, 3]]}
    """
    # Concatenate all texts.
    result = {}
    for k, v in examples.items():
        acc = []
        for text in v:
            if (
                len(acc) > 0
                and len(acc[-1]) + len(text) - 2 <= max_seq_length
                and merge_texts
            ):
                acc[-1].pop()  # remove </s>
                acc[-1].extend(text[1:])  # remove <s>
            else:
                b = text[0]
                e = text[-1]
                content = text[1:-1]
                for i in range((len(content)) // (max_seq_length - 2) + 1):
                    body = content[
                        (max_seq_length - 2) * i : (i + 1) * (max_seq_length - 2)
                    ]
                    if len(body) > 0:
                        acc.append(
                            [b]  # <s> or corresponding mask
                            + body
                            + [e]  # </s> or corresponding mask
                        )
        result[k] = acc

    return result

In [14]:
dataset = dataset.map(
    lambda x: group_texts(x, max_seq_length, merge_texts),
    batched=True,
    num_proc=num_proc,
)

Loading cached processed dataset at /home/eugene/Projects/matrix_network/raw_dataset_tokenized_32k.hf/cache-7a818b2d5092987a_*_of_00024.arrow


In [15]:
dataset.set_format(type="torch", columns=["input_ids"])  # , 'special_tokens_mask'])

In [16]:
tokenizer.decode(dataset[0]["input_ids"])

'<s> anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. as a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism. humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. with the rise of organised hierarchical bodies, scepticism toward authority also rose.</s>'

In [17]:
dataset = dataset.shuffle()

Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Finally, we are all set to initialize our Trainer

In [19]:
from transformers import Trainer, TrainingArguments

In [20]:
training_args = TrainingArguments(
    output_dir="./Bergman",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=62,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    learning_rate=5E-4,
    weight_decay=0.01,
)

In [21]:
from transformers.trainer import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    is_torch_tpu_available,
)
import torch


class BergmanTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        metrics = outputs["metrics"] if isinstance(outputs, dict) else outputs[-1]
        self.metrics = {
            m: v if isinstance(v, float) else v.detach() for m, v in metrics.items()
        }

        if labels is not None:
            if (
                unwrap_model(model)._get_name()
                in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()
            ):
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def _maybe_log_save_evaluate(
        self, tr_loss, model, trial, epoch, ignore_keys_for_eval
    ):
        if not hasattr(self, "metrics_acc"):
            self.metrics_acc: Dict[str, torch.Tensor] = {}

        for m, v in self.metrics.items():
            if v is None:
                continue
            if m not in self.metrics_acc:
                self.metrics_acc[m] = torch.tensor(0.0).to(model.device)
            self.metrics_acc[m] += v

        if self.control.should_log:
            if is_torch_tpu_available():
                xm.mark_step()

            metrics = {
                m: self._nested_gather(v).mean().item()
                for m, v in self.metrics_acc.items()
            }
            # reset counters
            self.metrics_acc = {}

            logs = {
                m: round(
                    v / (self.state.global_step - self._globalstep_last_logged),
                    4,
                )
                for m, v in metrics.items()
            }

            # all_gather + mean() to get average loss over all processes
            tr_loss_scalar = self._nested_gather(tr_loss).mean().item()

            # reset tr_loss to zero
            tr_loss -= tr_loss

            logs["loss"] = round(
                tr_loss_scalar
                / (self.state.global_step - self._globalstep_last_logged),
                4,
            )
            logs["learning_rate"] = self._get_learning_rate()

            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
            self.store_flos()

            self.log(logs)

        metrics = None
        if self.control.should_evaluate:
            if isinstance(self.eval_dataset, dict):
                for eval_dataset_name, eval_dataset in self.eval_dataset.items():
                    metrics = self.evaluate(
                        eval_dataset=eval_dataset,
                        ignore_keys=ignore_keys_for_eval,
                        metric_key_prefix=f"eval_{eval_dataset_name}",
                    )
            else:
                metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
            self._report_to_hp_search(trial, self.state.global_step, metrics)

        if self.control.should_save:
            self._save_checkpoint(model, trial, metrics=metrics)
            self.control = self.callback_handler.on_save(
                self.args, self.state, self.control
            )

In [22]:
trainer = BergmanTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

### Start training

In [None]:
# %#%time
# with torch.autograd.detect_anomaly(True):
trainer.train()

The following columns in the training set don't have a corresponding argument in `BergmanForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BergmanForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 41096777
  Num Epochs = 1
  Instantaneous batch size per device = 62
  Total train batch size (w. parallel, distributed & accumulation) = 62
  Gradient Accumulation steps = 1
  Total optimization steps = 662852
  Number of trainable parameters = 52769792
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,7.6822
200,6.8881
300,6.5162
400,6.232
500,6.042
600,5.8899
700,5.7415
800,5.66
900,5.5936
1000,5.5068


#### 🎉 Save final model (+ tokenizer + config) to disk

In [None]:
trainer.save_model("./Bergman")

In [None]:
raise Exception()

## 4. Check that the LM actually trained

Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.

Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, `<mask>`) and return a list of the most probable filled sequences, with their probabilities.



In [None]:
model = model.from_pretrained("Bergman_Mar31_05-04-15_raven_70000/")

In [None]:
model = model.to("cpu")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

In [None]:
# The sun <mask>.
# =>

fill_mask("while the term <mask> has been largely synonymous with anarchism")

In [None]:
fill_mask("Jen la komenco de bela <mask>.")

# Save graph

In [None]:
torch.onnx.export(model, torch.LongTensor([[0,0,0,0,0]]), 'Bergman.onnx')

Ok, simple syntax/grammar works. Let’s try a slightly more interesting prompt:



## 5. Share your model 🎉

Finally, when you have a nice model, please think about sharing it with the community:

- upload your model using the CLI: `transformers-cli upload`
- write a README.md model card and add it to the repository under `model_cards/`. Your model card should ideally include:
    - a model description,
    - training params (dataset, preprocessing, hyperparameters), 
    - evaluation results,
    - intended uses & limitations
    - whatever else is helpful! 🤓

### **TADA!**

➡️ Your model has a page on http://huggingface.co/models and everyone can load it using `AutoModel.from_pretrained("username/model_name")`.

[![tb](https://huggingface.co/blog/assets/01_how-to-train/model_page.png)](https://huggingface.co/julien-c/EsperBERTo-small)


If you want to take a look at models in different languages, check https://huggingface.co/models

[![all models](https://huggingface.co/front/thumbnails/models.png)](https://huggingface.co/models)
