In [1]:
import logging
import warnings

In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
warnings.filterwarnings('ignore')

In [3]:
logger.info("start")

INFO:__main__:start


In [4]:
from datasets import load_dataset

INFO:datasets:PyTorch version 2.7.1 available.


In [5]:
dataset = load_dataset('jfleg')

In [6]:
dataset["validation"]["sentence"][0]

'So I think we can not live if old people could not find siences and tecnologies and they did not developped . '

In [7]:
dataset["validation"]["corrections"][0]

['So I think we would not be alive if our ancestors did not develop sciences and technologies . ',
 'So I think we could not live if older people did not develop science and technologies . ',
 'So I think we can not live if old people could not find science and technologies and they did not develop . ',
 'So I think we can not live if old people can not find the science and technology that has not been developed . ']

In [8]:
len(dataset["validation"])

755

In [9]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})

In [115]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# MODELNAME = "vennify/t5-base-grammar-correction"
MODELNAME = "t5-small"
PREFIX = "grammar: "
tokenizer = AutoTokenizer.from_pretrained(MODELNAME)
model = T5ForConditionalGeneration.from_pretrained(MODELNAME)

In [116]:
# Define the input text with a task prefix
input_text = PREFIX + "he go to school yesterday."

# Tokenize the input
input_ids = tokenizer.encode(
    input_text, return_tensors="pt", max_length=128, truncation=True)

# Generate output (corrected text)
output = model.generate(
    input_ids,
    max_length=128,
    num_beams=5,  # Beam search for better quality
    early_stopping=True,
    repetition_penalty=2.5  # Penalize repetitive output
)

# Decode the generated text
corrected_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Original: he go to school yesterday.")
print(f"Corrected: {corrected_text}")

Original: he go to school yesterday.
Corrected: grammar: he go to school yesterday.


In [12]:
tokenizer.decode(output[0], skip_special_tokens=True)

'grammar: he go to school yesterday.'

In [13]:
ds = load_dataset("dim/grammarly_coedit")

In [14]:
input_ids

tensor([[19519,    10,     3,    88,   281,    12,   496,  4981,     5,     1]])

In [15]:
type(dataset["validation"]["sentence"])

list

In [16]:
from transformers import T5Config

In [17]:
print(T5Config())

T5Config {
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 32128
}



In [18]:
from torch.utils.data import DataLoader, Dataset
import torch

In [117]:
class GrammarCorrectionDataset(Dataset):
    """A PyTorch Dataset for grammar correction tasks.

    This dataset takes input text with grammatical errors and their corrected target text,
    tokenizes them using the provided tokenizer, and prepares them for training a grammar
    correction model.

    Args:
            tokenizer: The tokenizer to use for encoding the text
            input_text (list): List of input texts containing grammatical errors
            target_text (list): List of corresponding corrected texts
            max_length (int, optional): Maximum sequence length for tokenization. Defaults to 256.

    Returns:
            dict: Dictionary containing:
                    - input_ids: Tokenized and padded input text
                    - attention_mask: Attention mask for input text
                    - labels: Target labels for training (-100 for padding tokens)
    """

    def __init__(self, tokenizer, input_text, target_text, max_length=256):
        self.tokenizer = tokenizer
        self.input_text = input_text
        self.target_text = target_text
        self.max_length = max_length

    def __len__(self):
        return len(self.input_text)

    def __getitem__(self, id):
        source = str("grammar: " + self.input_text[id])
        target = str(self.target_text[id])

        source_tokens = self.tokenizer(
            source, return_tensors="pt", max_length=self.max_length,
            padding="max_length", truncation=True, return_attention_mask=True)

        target_tokens = self.tokenizer(
            target, return_tensors="pt", max_length=self.max_length,
            padding="max_length", truncation=True, return_attention_mask=True)

        labels = target_tokens["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        return {
            "input_ids": source_tokens["input_ids"].flatten(),
            "attention_mask": source_tokens["attention_mask"].flatten(),
            "labels": labels.flatten(),
        }

In [118]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})

In [119]:
train_source = dataset["validation"]["sentence"]
train_labels = dataset["validation"]["corrections"]

val_source = dataset["test"]["sentence"]
val_labels = dataset["test"]["corrections"]

In [120]:
train_dataset = GrammarCorrectionDataset(tokenizer, train_source, train_labels)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [121]:
val_dataset = GrammarCorrectionDataset(tokenizer, val_source, val_labels)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

In [122]:
MODELNAME

't5-small'

In [123]:
from transformers import TrainingArguments, Trainer

In [23]:

train_loader

<torch.utils.data.dataloader.DataLoader at 0x2709076b620>

In [None]:
for i, data in enumerate(train_loader):
    print(data)
    break

{'input_ids': tensor([[19519,    10,   216,    87,     7,    88,  2746,    12,   214,   762,
            81,     8,  8084,     3,     6,  2746,    12,   214,   762,    81,
             8,   601,     3,     6,     8,  1687,   672,     5,   175,   128,
          6085,    24,   921,    54,  2454,     3,     9,   418,    13,   912,
            68,     8,  6688,    33,    59,   435,   780,     3,     5,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [25]:
dataset["validation"]["sentence"][0]  # type: ignore

'So I think we can not live if old people could not find siences and tecnologies and they did not developped . '

In [None]:
tokenizer(
    # type: ignore
    str("grammar: " + dataset["validation"]["sentence"][0]),
    max_length=128, padding="max_length", truncation=True, return_tensors="pt",
    return_attention_mask=True)

{'input_ids': tensor([[19519,    10,   264,    27,   317,    62,    54,    59,   619,     3,
            99,   625,   151,   228,    59,   253,   108,  1433,     7,    11,
             3,  5822,    29,  4137,     7,    11,    79,   410,    59,  1344,
          3138,     3,     5,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [27]:
text = "ArithmeticError"

In [28]:
# Define the input text with a task prefix
input_text = PREFIX + "he go to school yesterday."

# Tokenize the input
input_ids = tokenizer.encode(
    input_text, return_tensors="pt", max_length=128, truncation=True)

# Generate output (corrected text)
output = model.generate(
    input_ids,
    max_length=128,
    num_beams=5,  # Beam search for better quality
    early_stopping=True,
    repetition_penalty=2.5  # Penalize repetitive output
)

# Decode the generated text
corrected_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Original: he go to school yesterday.")
print(f"Corrected: {corrected_text}")

Original: he go to school yesterday.
Corrected: grammar: he go to school yesterday.


In [29]:
wi_dataset = load_dataset("wi_locness", "wi", trust_remote_code=True)

In [30]:
wi_dataset["train"]

Dataset({
    features: ['id', 'userid', 'cefr', 'text', 'edits'],
    num_rows: 3000
})

In [31]:
wi_dataset["train"]["text"][0]

'My town is a medium size city with eighty thousand inhabitants. It has a high density population because its small territory. Despite of it is an industrial city, there are many shops and department stores.  I recommend visiting the artificial lake in the certer of the city which is surrounded by a park. Pasteries are very common and most of them offer the special dessert from the city. There are a comercial zone along the widest street of the city where you can find all kind of establishments: banks, bars, chemists, cinemas, pet shops, restaurants, fast food restaurants, groceries, travel agencies, supermarkets and others. Most of the shops have sales and offers at least three months of the year: January, June and August. The quality of the products and services are quite good, because there are a huge competition, however I suggest you taking care about some fakes or cheats.'

In [32]:
wi_dataset["train"]["edits"][0].keys()

dict_keys(['start', 'end', 'text'])

In [33]:
starts = wi_dataset["train"]["edits"][0]["start"]

In [34]:
ends = wi_dataset["train"]["edits"][0]["end"]

In [35]:
wi_dataset["train"]["text"][0][starts[0]:ends[0]]

'medium size'

In [36]:
wi_dataset["train"]["edits"][0]["text"][0]

'medium-sized'

In [37]:
text = wi_dataset["train"]["text"][0]
edits = wi_dataset["train"]["edits"][0]

In [38]:
edits

{'start': [13,
  77,
  104,
  126,
  134,
  256,
  306,
  375,
  396,
  402,
  476,
  484,
  579,
  671,
  774,
  804,
  808,
  826,
  838,
  850,
  857,
  862,
  868],
 'end': [24,
  78,
  104,
  133,
  136,
  262,
  315,
  379,
  399,
  411,
  480,
  498,
  588,
  671,
  777,
  807,
  810,
  835,
  845,
  856,
  861,
  867,
  873],
 'text': ['medium-sized',
  '-',
  ' of',
  'Although',
  '',
  'center',
  None,
  'of',
  'is',
  'commercial',
  'kinds',
  'businesses',
  'grocers',
  ' in',
  'is',
  'is',
  '',
  '. However,',
  'recommend',
  'be',
  'careful',
  'of',
  '']}

In [None]:
edits_list = list(zip(edits["start"], edits["end"], edits["text"]))
edits_list.sort(key=lambda x: x[0], reverse=True)

str = text
for start, end, replacement in edits_list:
    print(replacement)
    if replacement == None:
        replacement = ""
    str = str[:start] + replacement + str[end:]


of
careful
be
recommend
. However,

is
is
 in
grocers
businesses
kinds
commercial
is
of
None
center

Although
 of
-
medium-sized


In [40]:
str

'My town is a medium-sized city with eighty thousand inhabitants. It has a high-density population because of its small territory. Although  it is an industrial city, there are many shops and department stores.  I recommend visiting the artificial lake in the center of the city which is surrounded by a park.  are very common and most of them offer the special dessert of the city. There is a commercial zone along the widest street of the city where you can find all kinds of businesses: banks, bars, chemists, cinemas, pet shops, restaurants, fast food restaurants, grocers, travel agencies, supermarkets and others. Most of the shops have sales and offers in at least three months of the year: January, June and August. The quality of the products and services is quite good, because there is huge competition. However, I recommend you be careful of fakes or cheats.'

In [41]:
edits_list.sort(key=lambda x: x[0], reverse=True)
edits_list

[(868, 873, ''),
 (862, 867, 'of'),
 (857, 861, 'careful'),
 (850, 856, 'be'),
 (838, 845, 'recommend'),
 (826, 835, '. However,'),
 (808, 810, ''),
 (804, 807, 'is'),
 (774, 777, 'is'),
 (671, 671, ' in'),
 (579, 588, 'grocers'),
 (484, 498, 'businesses'),
 (476, 480, 'kinds'),
 (402, 411, 'commercial'),
 (396, 399, 'is'),
 (375, 379, 'of'),
 (306, 315, None),
 (256, 262, 'center'),
 (134, 136, ''),
 (126, 133, 'Although'),
 (104, 104, ' of'),
 (77, 78, '-'),
 (13, 24, 'medium-sized')]

In [43]:
wi_dataset.keys()

dict_keys(['train', 'validation'])

In [74]:
wi_ds = {}
for sets in wi_dataset.keys():
    for features in wi_dataset[sets]:
        incorrect_text = ""
        correct_text = ""
        if "text" in features and "edits" in features:
            incorrect_text = features["text"]  # type: ignore
            # print(incorrect_text)
            correct_text = features["text"]

            correct_list = features["edits"]

            edits_list = list(zip(correct_list["start"],
                                  correct_list["end"], correct_list["text"]))
            edits_list.sort(key=lambda x: x[0], reverse=True)

            for start, end, replacement in edits_list:
                if replacement == None:
                    replacement = ""
                correct_text = correct_text[:start] + \
                    replacement + correct_text[end:]

        if sets not in wi_ds:
            wi_ds[sets] = {"incorrect_text": [], "correct_text": []}

        if incorrect_text != correct_text:
            wi_ds[sets]["incorrect_text"].append(incorrect_text)
            wi_ds[sets]["correct_text"].append(correct_text)        # break

In [80]:
wi_ds["train"]["correct_text"][0]

'My town is a medium-sized city with eighty thousand inhabitants. It has a high-density population because of its small territory. Although  it is an industrial city, there are many shops and department stores.  I recommend visiting the artificial lake in the center of the city which is surrounded by a park.  are very common and most of them offer the special dessert of the city. There is a commercial zone along the widest street of the city where you can find all kinds of businesses: banks, bars, chemists, cinemas, pet shops, restaurants, fast food restaurants, grocers, travel agencies, supermarkets and others. Most of the shops have sales and offers in at least three months of the year: January, June and August. The quality of the products and services is quite good, because there is huge competition. However, I recommend you be careful of fakes or cheats.'

In [91]:
import numpy as np

In [93]:
np.max([len(wi_ds["train"]["incorrect_text"][i].split())
        for i in range(len(wi_ds["train"]["incorrect_text"]))])

np.int64(1551)

In [82]:
wi_ds.keys()

dict_keys(['train', 'validation'])

In [96]:
paws = load_dataset("paws", "labeled_final")

In [102]:
paraphrases = paws.filter(lambda x: x['label'] == 1)

Filter: 100%|██████████| 49401/49401 [00:00<00:00, 177511.22 examples/s]
Filter: 100%|██████████| 8000/8000 [00:00<00:00, 128974.15 examples/s]
Filter: 100%|██████████| 8000/8000 [00:00<00:00, 137822.05 examples/s]


In [103]:
paraphrases

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 21829
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 3536
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 3539
    })
})

In [None]:
# Setup comprehensive logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [106]:
def verify_environment():
    """Verify computational environment and requirements"""
    logger.info("🔍 ENVIRONMENT VERIFICATION")
    logger.info("=" * 50)
    logger.info(f"   PyTorch: {torch.__version__}")
    logger.info(f"   CUDA Available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        gpu_props = torch.cuda.get_device_properties(0)
        logger.info(f"   GPU: {torch.cuda.get_device_name()}")
        logger.info(f"   GPU Memory: {gpu_props.total_memory / 1e9:.1f} GB")
        logger.info(f"   CUDA Version: {torch.version.cuda}")
    else:
        logger.warning("   ⚠️  No GPU detected - training will be slower")

    # Check available memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        memory_allocated = torch.cuda.memory_allocated() / 1e9
        memory_reserved = torch.cuda.memory_reserved() / 1e9
        logger.info(f"   GPU Memory Used: {memory_allocated:.1f} GB")
        logger.info(f"   GPU Memory Reserved: {memory_reserved:.1f} GB")

    logger.info("✅ Environment verification complete!")
    logger.info("=" * 50)


verify_environment()

INFO:__main__:🔍 ENVIRONMENT VERIFICATION
INFO:__main__:   PyTorch: 2.7.1+cpu
INFO:__main__:   CUDA Available: False
INFO:__main__:✅ Environment verification complete!


In [144]:
train_source = ["grammar: " +
                sentence for sentence in dataset["validation"]["sentence"]]
train_target = [correction[0]
                for correction in dataset["validation"]["corrections"]]

In [145]:
source_tokens = tokenizer(train_source, max_length=256,
                          truncation=True, padding=False)
target_tokens = tokenizer(train_target, max_length=256,
                          truncation=True, padding=False)

In [151]:
token_dataset = source_tokens
token_dataset["labels"] = target_tokens["input_ids"]

In [153]:
token_dataset.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [154]:
dataset.column_names

{'validation': ['sentence', 'corrections'],
 'test': ['sentence', 'corrections']}

In [192]:
def preprocess(sources, target, max_length=256):
    dataset = {}
    source = [f"grammar: {sentence}" for sentence in sources]
    targets = [correction[0]
               for correction in target]

    source_tokens = tokenizer(source, max_length=max_length,
                              truncation=True, padding=False,
                              return_tensors=None)
    target_tokens = tokenizer(targets, max_length=max_length,
                              truncation=True, padding=False,
                              return_tensors=None)
    # print(target_tokens)

    dataset["input_ids"] = source_tokens["input_ids"]
    dataset["attention_mask"] = source_tokens["attention_mask"]
    dataset["labels"] = target_tokens["input_ids"]

    return dataset

In [193]:
train_ds = dataset["validation"].map(lambda ds: preprocess(dataset["validation"]["sentence"],
                                                           dataset["validation"]["corrections"]),
                                     batched=True,
                                     remove_columns=dataset["validation"].column_names)

Map: 100%|██████████| 755/755 [00:00<00:00, 7211.80 examples/s]


In [194]:
val_ds = dataset["test"].map(lambda ds: preprocess(dataset["test"]["sentence"],
                                                   dataset["test"]["corrections"]),
                             batched=True,
                             remove_columns=dataset["test"].column_names)

Map: 100%|██████████| 748/748 [00:00<00:00, 5310.24 examples/s]


In [195]:
val_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 748
})

In [196]:
training_arg = TrainingArguments(
    output_dir=r"D:\MScDataScience\9.Research_Methods\Assignment\Assignment2\Checkpoints",
    # Basic setup
    overwrite_output_dir=False,          # Overwrite output directory if exists
    do_train=True,                       # Whether to run training
    do_eval=False,                       # Whether to run evaluation
    do_predict=False,                    # Whether to run predictions

    # Training hyperparameters
    num_train_epochs=1.0,                # Number of training epochs
    # Max training steps (overrides epochs if set)
    max_steps=-1,
    per_device_train_batch_size=8,       # Batch size per device during training
    per_device_eval_batch_size=8,        # Batch size per device during evaluation
    gradient_accumulation_steps=1,        # Steps to accumulate gradients

    # Learning rate and optimization
    learning_rate=5e-5,                  # Initial learning rate
    weight_decay=0.0,                    # Weight decay coefficient
    adam_beta1=0.9,                      # Beta1 for Adam optimizer
    adam_beta2=0.999,                    # Beta2 for Adam optimizer
    adam_epsilon=1e-8,                   # Epsilon for Adam optimizer
    max_grad_norm=1.0,                   # Max gradient norm for clipping

    # Learning rate scheduling
    lr_scheduler_type="linear",          # Type of LR scheduler
    warmup_ratio=0.0,                    # Ratio of warmup steps
    warmup_steps=0,
)

In [197]:
from transformers import DataCollatorForSeq2Seq

In [198]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

In [201]:
trainer = Trainer(
    model=model,                        # The model to train/evaluate/predict
    args=training_arg,                         # TrainingArguments instance
    data_collator=data_collator,                # Function to collate batch data
    train_dataset=train_ds,                # Training dataset
    eval_dataset=val_ds,                 # Evaluation dataset
    tokenizer=tokenizer,                    # Tokenizer for the model
)

In [202]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=95, training_loss=1.1725575497275904, metrics={'train_runtime': 219.018, 'train_samples_per_second': 3.447, 'train_steps_per_second': 0.434, 'total_flos': 9238399647744.0, 'train_loss': 1.1725575497275904, 'epoch': 1.0})

In [205]:
dataset["validation"][0]

{'sentence': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ',
 'corrections': ['So I think we would not be alive if our ancestors did not develop sciences and technologies . ',
  'So I think we could not live if older people did not develop science and technologies . ',
  'So I think we can not live if old people could not find science and technologies and they did not develop . ',
  'So I think we can not live if old people can not find the science and technology that has not been developed . ']}

In [232]:
import re

In [238]:
augmented_data = []
punc = re.compile(r'\s+([.!?,:;])')
for items in dataset["validation"]:
    print(type(items))
    source = punc.sub(r"\1", items["sentence"])
    print(source)
    targets = items["corrections"]
    for correction in targets:
        print(re.sub(punc, r"\1", correction))
        # augmented_data.append[{
        #     "sentence": f"grammar: {source}",
        #     "correction": correction,
        #     "original_sentence": source,
        #     "all_corrections": targets
        # }]
    break

<class 'dict'>
So I think we can not live if old people could not find siences and tecnologies and they did not developped. 
So I think we would not be alive if our ancestors did not develop sciences and technologies. 
So I think we could not live if older people did not develop science and technologies. 
So I think we can not live if old people could not find science and technologies and they did not develop. 
So I think we can not live if old people can not find the science and technology that has not been developed. 


In [227]:
from typing import Dict, List, Tuple, Optional
import re

In [239]:
split_number = r'\b(\d+(?:\s+\d+)+)\b'

In [None]:
class JFLEGDataset:
    def __init__(self, tokenizer, augment=True):
        self.tokenizer = tokenizer
        self.augment = augment
        # loaading the datasets
        self.train_data = load_dataset("jfleg", split="validation")
        self.validation_data = load_dataset("jfleg", split="test")

    def _preprocess(self, text):
        """
        Preprocess and normalize text by fixing common formatting issues.

        This method performs comprehensive text cleaning to handle poorly formatted
        text, such as OCR output or text with inconsistent spacing. It fixes issues
        with numbers, punctuation, quotes, and whitespace normalization.

        Args:
            text (str): The input text to preprocess. Can be None or empty string.

        Returns:
            str: The preprocessed and normalized text, or the original input if
                it's not a valid string.

        Transformations performed:
            - Removes multiple consecutive dashes (-- → "")
            - Fixes decimal formatting (0 . 1 → 0.1)
            - Fixes fraction formatting (1 / 2 → 1/2)
            - Removes leading zeros in decimals (00.5 → 0.5)
            - Joins split numbers (1 2 3 4 → 1234)
            - Fixes punctuation spacing (word , → word,)
            - Normalizes quote spacing (" word " → "word")
            - Collapses multiple spaces to single spaces
            - Strips leading and trailing whitespace
        """
        
        if not text or not isinstance(text, str):
            return text

        # Step 1: Remove unwanted characters (double dashes, etc.)
        text = re.sub(r"-{2,}", "", text)

        # Step 2: Fix decimal numbers (0 . 1 → 0.1)
        text = re.sub(r"(\d+)\s+\.\s+(\d+)", r"\1.\2", text)

        # Step 3: Fix fractions (1 / 2 → 1/2)
        text = re.sub(r"(\d+)\s+/\s+(\d+)", r"\1/\2", text)

        # Step 4: Fix leading zeros in decimals (00 . 5 → 0.5)
        text = re.sub(r"\b0+(\d+)\.(\d+)", r"\1.\2", text)

        # Step 5: Split number handling (any length)
        text = re.sub(r"\b(\d+(?:\s+\d+)+)\b",
                      lambda m: m.group(1).replace(" ", ""), text)

        # Step 6: Fix punctuation spacing (, . ! ? : ;)
        text = re.sub(r"\s+([,.!?:;])", r"\1", text)

        # Step 7: Fix double quote spacing
        text = re.sub(r'\s+"', '"', text)  # Remove space before quote
        text = re.sub(r'"\s+', '"', text)  # Remove space after quote

        # Step 8: Normalize multiple spaces to single space
        text = re.sub(r"\s{2,}", " ", text)

        # Step 9: Remove leading/trailing spaces
        text = text.strip()

        return text
    
    def _apply_agument(self, data)

In [245]:
load_dataset('jfleg', split="validation")

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})