In [6]:
from datasets import load_dataset

dataset = load_dataset("jhu-clsp/jfleg")

In [7]:
from datasets import concatenate_datasets

In [8]:
print(dataset)

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})


In [9]:
combined_dataset = dataset["validation"].train_test_split(test_size=0.5)

train_dataset = concatenate_datasets([combined_dataset["train"], dataset['test']])
test_dataset = combined_dataset["test"]

In [10]:
print(train_dataset)

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 1125
})


In [11]:
print(test_dataset)

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 378
})


In [12]:
def preprocess_data(example):
    inputs = "grammar: " + example["sentence"] 
    targets = example["corrections"][0] 
    return {"input_text": inputs, "target_text": targets}

train_dataset = train_dataset.map(preprocess_data)
test_dataset = test_dataset.map(preprocess_data)

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")

def tokenize_data(example):
    inputs = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=512)
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]  
    }

tokenized_train = train_dataset.map(tokenize_data)
tokenized_test = test_dataset.map(tokenize_data)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

In [42]:
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [43]:
training_args = TrainingArguments(
    output_dir="./t5-grammar-correction",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0379,0.03076
2,0.0334,0.028743
3,0.023,0.028721


TrainOutput(global_step=846, training_loss=0.22032286635386464, metrics={'train_runtime': 6456.0127, 'train_samples_per_second': 0.523, 'train_steps_per_second': 0.131, 'total_flos': 2055232880640000.0, 'train_loss': 0.22032286635386464, 'epoch': 3.0})

In [45]:
model.save_pretrained('./t5_base_emotion_model')
tokenizer.save_pretrained('./t5_base_emotion_model')

('./t5_base_emotion_model\\tokenizer_config.json',
 './t5_base_emotion_model\\special_tokens_map.json',
 './t5_base_emotion_model\\spiece.model',
 './t5_base_emotion_model\\added_tokens.json',
 './t5_base_emotion_model\\tokenizer.json')