In [1]:
# Install the notebook's third-party dependencies into the running environment.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
!pip install datasets tqdm pandas
!pip install -U accelerate
!pip install sentencepiece
!pip install transformers
!pip install rouge_score
!pip install wandb
!pip install torch



In [2]:
import nltk
import torch
import datasets
nltk.download('punkt')
from datasets import load_metric
from transformers import (T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq)
from transformers import (AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup)
from sklearn.model_selection import train_test_split
from nltk.translate.gleu_score import sentence_gleu
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize
rouge_metric = load_metric("rouge")
from datasets import load_dataset
from string import punctuation
from google.colab import drive
from datasets import Dataset
from itertools import chain
from tqdm import tqdm
import pandas as pd
import transformers
import numpy as np
import argparse
import logging
import random
import json
import time
import glob
import os
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  rouge_metric = load_metric("rouge")


In [3]:
# Mount Google Drive at /content/drive; the CSV paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the grammar-error-correction CSV from the mounted Drive into a DataFrame.
gec_dataset = pd.read_csv('/content/drive/MyDrive/GecDataset/gec_dataset.csv')

In [5]:
# Keep only the two text columns used below; .copy() detaches the result from the full frame.
gec_dataset = gec_dataset[['correct', 'incorrect']].copy()

In [6]:
# Discard rows where either 'correct' or 'incorrect' is missing.
gec_dataset = gec_dataset.dropna()

In [7]:
# Load the pretrained 't5-base' checkpoint: the tokenizer and the seq2seq model
# that the training cells below fine-tune.
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
def calculate_token_length(example):
    """Return how many token ids the notebook-level ``tokenizer`` yields for ``example``.

    ``example`` is passed to the tokenizer unchanged; the count is the length
    of the resulting ``input_ids``.
    """
    encoded = tokenizer(example)
    token_ids = encoded.input_ids
    return len(token_ids)

In [9]:
# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# shuffled split identical on every run, so metrics are comparable across runs.
train_dataset, test_dataset = train_test_split(gec_dataset, test_size=0.10, shuffle=True, random_state=42)

In [10]:
# Record the tokenized length of every 'incorrect' sentence in the held-out split.
# Diagnostic only: no later cell visible here reads 'input_token_len'.
test_dataset['input_token_len'] = test_dataset['incorrect'].apply(calculate_token_length)

Token indices sequence length is longer than the specified maximum sequence length for this model (626 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Convert both pandas splits with Dataset.from_pandas.
# `Dataset` is imported twice in the import cell (torch.utils.data, then datasets);
# the later `from datasets import Dataset` is the one in effect here.
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [12]:
class Gec(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes grammar-correction pairs on access.

    Every row of ``dataset`` must expose the keys ``'incorrect'`` (model input)
    and ``'correct'`` (target). Rows are tokenized lazily in ``__getitem__``.

    The base class is written out as ``torch.utils.data.Dataset`` on purpose:
    the bare name ``Dataset`` is imported twice in the import cell and the later
    ``from datasets import Dataset`` wins. This wrapper never builds the Arrow
    table that class is backed by, so it must not subclass it.

    Args:
        dataset: Indexable collection of rows supporting ``len()`` and
            ``dataset[i]``, each row being a mapping with the two keys above.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and ``'attention_mask'``.
        print_text: When true, print the length of every returned field each
            time an item is fetched (debugging aid).
        max_len: Maximum number of tokens kept for inputs and for targets.
    """

    def __init__(self, dataset, tokenizer, print_text=False, max_len=64):
        self.dataset = dataset
        # Padding is left to the batch collator, so items stay unpadded.
        self.pad_to_max_length = False
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one string with this instance's tokenizer and length limit."""
        # `padding=` replaces the deprecated `pad_to_max_length=` keyword, and
        # truncation is requested explicitly instead of being implied by max_length.
        padding = 'max_length' if self.pad_to_max_length else False
        return self.tokenizer(text,
                              padding=padding,
                              truncation=True,
                              max_length=self.max_len,
                              return_attention_mask=True)

    def tokenize_data(self, example):
        """Turn one row into ``input_ids`` / ``attention_mask`` / ``labels``.

        Inputs come from ``example['incorrect']`` and labels from
        ``example['correct']``.
        """
        target_, input_ = example['correct'], example['incorrect']
        tokenized_inputs = self._encode(input_)
        tokenized_targets = self._encode(target_)

        return {"input_ids": tokenized_inputs['input_ids'],
                "attention_mask": tokenized_inputs['attention_mask'],
                "labels": tokenized_targets['input_ids']}

    def __getitem__(self, index):
        """Return the tokenized features for the row at ``index``."""
        inputs = self.tokenize_data(self.dataset[index])
        if self.print_text:
            for k in inputs.keys():
                print(k, len(inputs[k]))
        return inputs

In [13]:
# Batch collator: pads each batch to its longest example and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

In [14]:
# Training configuration for the Seq2SeqTrainer built below.
# Each device sees 64 examples per step and gradients are accumulated over
# 6 steps, i.e. 64 * 6 = 384 examples per optimizer update per device.
batch_size = 64
args = Seq2SeqTrainingArguments(output_dir="/t5/weights",
                                evaluation_strategy="steps",
                                per_device_train_batch_size=batch_size,
                                per_device_eval_batch_size=batch_size,
                                learning_rate=2e-5,
                                num_train_epochs=1,
                                weight_decay=0.01,
                                save_total_limit=2,
                                # Evaluate with generate() so compute_metrics receives generated token ids.
                                predict_with_generate=True,
                                fp16 = True,
                                gradient_accumulation_steps = 6,
                                # Evaluation and checkpointing use the same 2000-step interval.
                                eval_steps = 2000,
                                save_steps = 2000,
                                load_best_model_at_end=True,
                                logging_dir="/logs",
                                report_to="wandb")

In [15]:
def compute_metrics(evaluation_prediction):
    """Compute ROUGE F-measures (in percent) and mean generated length.

    Args:
        evaluation_prediction: Pair ``(predictions, labels)`` of token-id
            arrays. ``predictions`` may also be a tuple whose first element
            holds the generated ids.

    Returns:
        dict mapping each ROUGE key reported by ``rouge_metric`` to its mid
        F-measure times 100, plus ``"gen_len"`` (mean count of non-pad tokens
        per prediction). All values are rounded to 4 decimals.
    """
    predictions, labels = evaluation_prediction
    # Keep only the generated ids if extra outputs were returned alongside them.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index used for padding and cannot be decoded. Labels
    # always need this replacement; predictions may contain it too when
    # batches of different lengths are padded together, so clean both.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line, the layout the ROUGE-Lsum variant expects.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
# Assemble the trainer: both splits are wrapped in Gec so rows are tokenized
# on access, and data_collator pads each batch.
trainer = Seq2SeqTrainer(model=model,
                         args=args,
                         train_dataset= Gec(train_dataset, tokenizer),
                         eval_dataset=Gec(test_dataset, tokenizer),
                         tokenizer=tokenizer,
                         data_collator=data_collator,
                         compute_metrics=compute_metrics)

In [None]:
# Run fine-tuning with the configuration above.
# NOTE(review): this cell has no execution count in the saved notebook.
trainer.train()

In [None]:
# Save the trainer's model under the relative path 't5_gec_model'.
trainer.save_model('t5_gec_model')

In [18]:
# Inference setup. This rebinds `model_name`, `tokenizer` and `model` to the
# 'deep-learning-analytics/GrammarCorrector' checkpoint, not to the
# 't5_gec_model' directory saved above, and moves the model to the GPU when
# one is available.
model_name = r'deep-learning-analytics/GrammarCorrector'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [19]:
def correct(input_text, num_return_sequences=1):
    """Generate corrected versions of ``input_text`` with beam search.

    Args:
        input_text: Sentence to correct; truncated to 64 tokens.
        num_return_sequences: Number of candidate corrections to return.

    Returns:
        list[str]: ``num_return_sequences`` decoded candidates.
    """
    batch = tokenizer([input_text], truncation=True, padding='max_length', max_length=64,
                      return_tensors="pt").to(torch_device)
    # Beam search cannot return more sequences than it has beams, so widen the
    # beam when more than the default 4 candidates are requested.
    num_beams = max(4, num_return_sequences)
    # No `temperature` is passed: it only applies to sampling, and this call
    # decodes deterministically with beam search.
    translated = model.generate(**batch, max_length=64, num_beams=num_beams,
                                num_return_sequences=num_return_sequences)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [22]:
# Rebind `test_dataset` to a DataFrame read from the separate gec_test.csv
# file; it no longer refers to the held-out split created earlier.
test_dataset = pd.read_csv(r"/content/drive/MyDrive/GecDataset/gec_test.csv")

In [23]:
def gleu_score(gec_dataset, num_samples=500):
    """Return the mean sentence-level GLEU of ``correct`` over leading rows.

    Args:
        gec_dataset: DataFrame with ``'correct'`` (reference) and
            ``'incorrect'`` (model input) text columns.
        num_samples: Maximum number of leading rows to score. Fewer rows are
            used when the frame is shorter, instead of failing on a missing row.

    Returns:
        float: Mean GLEU over the scored rows, or NaN when no row is scored.
    """
    n_rows = min(num_samples, len(gec_dataset))
    gleu_scores = []
    for i in tqdm(range(n_rows)):
        # sentence_gleu takes a list of tokenized references and one tokenized hypothesis.
        reference = [gec_dataset['correct'].iloc[i].split()]
        pred = correct(gec_dataset['incorrect'].iloc[i], num_return_sequences=1)[0]
        candidate = pred.split()
        gleu_scores.append(sentence_gleu(reference, candidate))
    if not gleu_scores:
        return float('nan')
    return np.mean(gleu_scores)

In [24]:
# Report the mean GLEU returned by gleu_score for the loaded test frame.
print(f"Glue score:", gleu_score(test_dataset))

100%|██████████| 500/500 [03:44<00:00,  2.22it/s]

Glue score: 0.41865092272657295



