<a href="https://colab.research.google.com/github/NastasiaMazur/Exercise-2-Procedural-to-OOP-Class-customization/blob/main/EN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install transformers
!pip install datasets
!pip install bertviz transformers
!pip install transformers[torch]



In [25]:
from transformers import AutoTokenizer,  AutoModelForMaskedLM

# The masked-LM model and its tokenizer are loaded from the same hub
# checkpoint, so the identifier is spelled once and shared by both calls.
MODEL_CHECKPOINT = "xlm-roberta-base"

model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
from datasets import load_dataset

# Load CNN/DailyMail in its "3.0.0" configuration, then display the resulting
# DatasetDict (split names, columns, row counts) as the cell output.
dataset_en = load_dataset("cnn_dailymail", name="3.0.0")
dataset_en

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [28]:
from datasets import DatasetDict

def truncate(example, max_words=50):
    """Shorten an example's article to its first ``max_words`` words.

    Words are whatever ``str.split()`` yields (runs of whitespace are the
    separators), and the kept words are re-joined with single spaces.

    Args:
        example: Mapping with at least the string entries ``'article'`` and
            ``'highlights'``.
        max_words: Maximum number of words kept from the article. Defaults to
            50, the limit that used to be hard-coded, so ``.map(truncate)``
            behaves exactly as before.

    Returns:
        A dict with the shortened ``'article'`` and the unchanged
        ``'highlights'``.

    Raises:
        ValueError: If ``max_words`` is negative. A negative slice bound would
            silently drop words from the *end* instead of limiting the length.
    """
    if max_words < 0:
        raise ValueError(f"max_words must be non-negative, got {max_words}")

    kept_words = example['article'].split()[:max_words]
    return {
        'article': " ".join(kept_words),
        'highlights': example['highlights']
    }

# Build a small working subset: shuffle each split with a fixed seed, keep the
# first `size` shuffled rows, and shorten every article with `truncate`.
split_sizes = {"train": 128, "validation": 32}

small_en_dataset = DatasetDict({
    split: dataset_en[split].shuffle(seed=24).select(range(size)).map(truncate)
    for split, size in split_sizes.items()
})

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [30]:
# Show the split names, columns and row counts of the reduced dataset.
small_en_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 128
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 32
    })
})

In [31]:
# Peek at the first ten training rows (shown as a dict of per-column lists).
small_en_dataset['train'][:10]

{'article': ["By . John Hall . PUBLISHED: . 12:17 EST, 6 March 2014 . | . UPDATED: . 15:03 EST, 6 March 2014 . Predator: Ouldkount Belabbas sexually assaulted an eight-year-old girl in Madame Tussauds' gift shop . An Algerian illegal immigrant who sexually assaulted an eight-year-old girl while she shopped",
  'Former Defense Secretary Leon Panetta is the latest ex-Cabinet official in the Obama administration to publicly come out against President Barack Obama\'s handling of Syria and Iraq. In an interview on CBS News\' "60 Minutes" that aired Sunday evening, Panetta told Scott Pelley that he "really thought that it was',
  'A former Delaware middle school teacher has avoided jail after exchanging explicit photos and having sex with one of her 13-year-old students. Stephanie Seabury, then 22, was captured on surveillance video climbing into her car outside Fred Fifer III Middle School in Camden with the unidentified boy on February 26.',
  'With Lent just a few weeks away, chocolate ma

In [32]:
def tokenize_function(examples):
    """Run the shared tokenizer over the ``"article"`` texts of a batch.

    Args:
        examples: Batch mapping whose ``"article"`` entry holds the texts to
            encode.

    Returns:
        Whatever the tokenizer call returns for those texts, with padding and
        truncation both switched on.
    """
    articles = examples["article"]
    return tokenizer(articles, truncation=True, padding=True)

# Tokenize both splits, handing `tokenize_function` 8 examples per call.
small_tokenized_en_dataset = small_en_dataset.map(
    tokenize_function, batched=True, batch_size=8
)

# Drop the original columns in a single call; only the columns added by the
# tokenizer remain.
small_tokenized_en_dataset = small_tokenized_en_dataset.remove_columns(
    ["article", "highlights", "id"]
)

# Return PyTorch tensors when rows are accessed.
small_tokenized_en_dataset.set_format("torch")

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [33]:
# Inspect the first two tokenized training rows.
small_tokenized_en_dataset['train'][0:2]

{'input_ids': tensor([[     0,   3311,      6,      5,   4939,  19449,      6,      5,      6,
          211232,  79909,  17610,     12,      6,      5,    427,  22950,  50149,
               4,    305,  11994,   1049,      6,      5,      6,  58745,      6,
               5, 161521,    397,     12,      6,      5,    423,  28724,  50149,
               4,    305,  11994,   1049,      6,      5,   7145,   4597,     12,
             180,  52366,    265,   9109,  12628,  10545,    162,  17688,    538,
              10,   1192,    202,   3674,    142, 136659,      9,  46799,      9,
           18345,  23040,     23,  43706,    282,   1371,   1192,   1674,      7,
              25,  18466,  19927,      6,      5,    893,    884,  27811,     66,
           85325, 177705,   2750,  17688,    538,     10,   1192,    202,   3674,
             142, 136659,      9,  46799,      9,  18345,  23040,  12960,   2412,
           19927,  20051,      2],
         [     0,  15236,     56, 207068, 196212, 

In [34]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the DataLoaders as `collate_fn` to assemble batches.
# NOTE(review): `model` was loaded with AutoModelForMaskedLM and the tokenized
# dataset keeps no label column, yet this is the seq2seq collator, so batches
# will presumably carry no `labels`. If the training loop expects a masked-LM
# loss, DataCollatorForLanguageModeling is likely the intended collator.
# TODO confirm against the training code.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [36]:
from torch.utils.data import DataLoader

# Both loaders share the batch size and collator; only the training loader
# is asked to shuffle.
loader_options = dict(batch_size=8, collate_fn=data_collator)

train_dataloader_en = DataLoader(
    small_tokenized_en_dataset['train'], shuffle=True, **loader_options
)
eval_dataloader_en = DataLoader(
    small_tokenized_en_dataset['validation'], **loader_options
)


### **Training**