<a href="https://colab.research.google.com/github/amaydle/MergeX/blob/model-train/Models/MergeX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import huggingface_hub

huggingface_hub.login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Create the dataset

In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='dataset.json')
dataset = dataset['train'].train_test_split(test_size=0.1)



  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset.push_to_hub("amaydle/npc-dialogue")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Name', 'Biography', 'Query', 'Response', 'Emotion'],
        num_rows: 1723
    })
    test: Dataset({
        features: ['Name', 'Biography', 'Query', 'Response', 'Emotion'],
        num_rows: 192
    })
})

# Preprocessing 

In [7]:
from datasets import load_dataset

dataset = load_dataset("amaydle/npc-dialogue")

Downloading readme:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/amaydle___parquet/amaydle--npc-dialogue-c7c64815e2cb7b9d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/37.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/166k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/192 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1723 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/amaydle___parquet/amaydle--npc-dialogue-c7c64815e2cb7b9d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
!pip install torch pytesseract transformers datasets nltk tensorboard py7zr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
def format_dataset(example):
    input_text = f"Biography: {example['Biography']}\n\nQuestion: {example['Query']}"
    target_text = f"Emotion: {example['Emotion']}\n\nAnswer: {example['Response']}"
    return {"input_text": input_text, "target_text": target_text}

In [11]:
dataset["train"] = dataset["train"].map(
    format_dataset,
    batched=False,
    num_proc=4,
    remove_columns=["Name", "Biography", "Query", "Response", "Emotion"]
)

dataset["test"] = dataset["test"].map(
    format_dataset,
    batched=False,
    num_proc=4,
    remove_columns=["Name", "Biography", "Query", "Response", "Emotion"]
)

Map (num_proc=4):   0%|          | 0/1723 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/192 [00:00<?, ? examples/s]

In [12]:
max_source_length = 256
max_target_length = 128

In [13]:
def preprocess_function(sample,padding="max_length"):
    model_inputs = tokenizer(sample["input_text"], max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["target_text"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input_text", "target_text"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Map:   0%|          | 0/1723 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


# Train

In [14]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [16]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
repository_id = f"amaydle/mergex"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/amaydle/mergex into local empty directory.


In [None]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.689053
