### Exercise 3: Train a Sequence to Sequence Model with Attention on the ROC Story Dataset
**Using the ROC Story Dataset of day #3 (lien [ici](https://drive.google.com/file/d/1eJINcSbC3JLl0hTNbhh5G94zTuXinpC-/view?usp=sharing)), build a sequence to sequence model with attention that takes as input sentence the sentence #1 (input of the encoder), and takes as target sentence (input of the decoder), the sentence #2**.   

This creates an encoder-decoder model for story continuation.
For the model, you can use either: 
1. The Seq2Seq RNN Model with attention of [notebook of day #4](https://colab.research.google.com/drive/1GPnhw6iQzVSMPvr8-dU8n3gJD67b9Upr?usp=sharing)
> You can tweak the model, for example using a Multiplicative Attention instead of an Additive Attention
2. A Transformer model. There is a great tutorial and transformer implementation in tensorflow here: https://www.tensorflow.org/text/tutorials/transformer

In [2]:
!pip install tensorflow
!pip install transformers
from transformers import pipeline

You should consider upgrading via the '/Users/mohamedaminechafik/opt/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/mohamedaminechafik/opt/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
import pandas as pd 
data = pd.read_csv("/Users/mohamedaminechafik/Downloads/ROCStories_winter2017.csv")
data_train, data_test= train_test_split(data[['sentence1',"sentence2"]], test_size=0.2, random_state=1)
data.sentence1

NameError: name 'train_test_split' is not defined

In [None]:
from datasets import Dataset
import datasets

train_dataset = Dataset.from_pandas(data_train)
test_dataset = Dataset.from_pandas(data_test)

In [None]:
dataset = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

# Let's create a Transformer

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2Model.from_pretrained("distilgpt2")


outputs = model(**inputs)


last_hidden_states = outputs.last_hidden_state

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_dataset = tokenizer(
    dataset["train"]["sentence1"],
    dataset["train"]["sentence2"],
    padding=True,
    truncation=True,
)

Assigning [PAD] to the pad_token key of the tokenizer


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', '__index_level_0__'],
        num_rows: 42132
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', '__index_level_0__'],
        num_rows: 10533
    })
})

In [None]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [None]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

100%|██████████| 43/43 [00:06<00:00,  6.49ba/s]
100%|██████████| 11/11 [00:01<00:00, 10.09ba/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 42132
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 10533
    })
})

In [None]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[27, 32, 25, 24, 14, 14, 24, 13]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")


In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [1]:
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
# Evaluation

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
from datasets import load_metric

preds = np.argmax(predictions.predictions, axis=-1)
metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

In [96]:
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [23]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['attention_mask', 'input_ids', 'labels', 'token_type_ids']

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
#progress bar 

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluation loop:

from datasets import load_metric

metric = load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()