#### **7PAM2015-0509-2024 -- Research Methods in Data Science**
##### Mult-Task -- Grammar Correctiona and Paraphrasing -- Implementation on T5 through LoRA.
---
**Mohit Agarwal (Student ID-22031257)**

This notebook trains and evaluate the model.


##### T5

Importing Required Libraries

In [1]:
# importing required libraries
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datasets import load_dataset

# Core T5 training libraries
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq,
    EarlyStoppingCallback, get_linear_schedule_with_warmup
)

Changing the default styles and palettes

In [2]:
# setting style
sns.set_style("darkgrid")
# setting context
sns.set_context("paper")
# setting palette
sns.set_palette("deep", color_codes=True)

### JFLEG Dataset Class for Preparation and Augmentation Strategy

In [None]:
class JFLEGDataset:
    """
    A comprehensive dataset processor for JFLEG (JHU FLuency-Extended GUG) grammar correction data.

    This class handles the complete pipeline for preparing JFLEG data for T5-based grammar correction
    training, including data loading, preprocessing, augmentation, tokenization, and train/validation/test
    splitting. The JFLEG dataset contains 1,511 examples with 4 human-written corrections each, focusing
    on fluency improvements rather than minimal edits.

    Key Features:
        - Comprehensive text preprocessing to handle formatting issues
        - Data augmentation using all 4 JFLEG corrections per sentence
        - Proper tokenization for T5 sequence-to-sequence training
        - Train/validation/test splitting with preserved evaluation metadata
        - Temperature-scaled mixing support for multi-task learning

    Dataset Sources:
        - Training: JFLEG validation split with 4x augmentation (~6,044 examples)
        - Validation/Test: JFLEG test split without augmentation, then split 90%/10%

    Attributes:
        tokenizer (T5Tokenizer): T5 tokenizer for text processing
        max_length (int): Maximum sequence length for tokenization
        test_split_ratio (float): Proportion of validation data to use for testing
        train_data (Dataset): JFLEG validation split used for training
        validation_data (Dataset): JFLEG test split used for validation/testing

    Example:
        >>> from transformers import T5Tokenizer
        >>> tokenizer = T5Tokenizer.from_pretrained("t5-base")
        >>> dataset = JFLEGDataset(tokenizer, max_length=256, test_split_ratio=0.10)
        >>> train_data, val_data, test_data = dataset.create_train_val_test_datasets()

    References:
        - JFLEG Paper: Napoles et al. (2017) "JFLEG: A Fluency Corpus and Benchmark 
          for Grammatical Error Correction"
        - Dataset: https://huggingface.co/datasets/jhu-clsp/jfleg
    """

    def __init__(self, tokenizer, max_length=256, test_split_ratio=0.10):
        """
        Initialize the JFLEG dataset processor with specified configuration.

        Sets up the dataset processor with the provided tokenizer and configuration
        parameters, then loads the raw JFLEG datasets for subsequent processing.

        Args:
            tokenizer (T5Tokenizer): HuggingFace T5 tokenizer instance for text processing.
                Must be a properly initialized T5 tokenizer (e.g., from t5-base).
            max_length (int, optional): Maximum sequence length for tokenization. 
                Sequences longer than this will be truncated. Defaults to 256.
                Recommended range: 128-512 depending on GPU memory constraints.
            test_split_ratio (float, optional): Proportion of validation data to reserve 
                for final testing. Must be between 0.0 and 1.0. Defaults to 0.10 (10%).
                The remaining validation data will be used for model validation during training.

        Raises:
            ValueError: If test_split_ratio is not between 0.0 and 1.0
            TypeError: If tokenizer is not a valid T5Tokenizer instance

        Note:
            The JFLEG dataset splits are used as follows:
            - JFLEG 'validation' split → Training data (with augmentation)
            - JFLEG 'test' split → Validation and test data (split according to test_split_ratio)

            This approach follows standard practice since JFLEG's validation split is larger
            and more suitable for training, while the test split is reserved for evaluation.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.test_split_ratio = test_split_ratio

        # Validate test_split_ratio
        if not 0.0 <= test_split_ratio <= 1.0:
            raise ValueError(
                f"test_split_ratio must be between 0.0 and 1.0, got {test_split_ratio}")

        # Load the JFLEG datasets
        print(f"[INFO] Initializing JFLEG Dataset Processor...")
        print(
            f"[INFO] Max length: {max_length}, Test split ratio: {test_split_ratio:.1%}")

        self.train_data = load_dataset("jfleg", split="validation")
        self.validation_data = load_dataset("jfleg", split="test")

        print(
            f"[INFO] Loaded JFLEG validation split: {len(self.train_data)} examples")
        print(
            f"[INFO] Loaded JFLEG test split: {len(self.validation_data)} examples")

    def _preprocess(self, text):
        """
        Preprocess and normalize text by fixing common formatting issues.

        This method performs comprehensive text cleaning to handle poorly formatted
        text, such as OCR output or text with inconsistent spacing. It fixes issues
        with numbers, punctuation, quotes, and whitespace normalization.

        Args:
                text (str): The input text to preprocess. Can be None or empty string.

        Returns:
                str: The preprocessed and normalized text, or the original input if
                        it's not a valid string.

        Transformations performed:
                - Removes multiple consecutive dashes (-- → "")
                - Fixes decimal formatting (0 . 1 → 0.1)
                - Fixes fraction formatting (1 / 2 → 1/2)
                - Removes leading zeros in decimals (00.5 → 0.5)
                - Joins split numbers (1 2 3 4 → 1234)
                - Fixes punctuation spacing (word , → word,)
                - Normalizes quote spacing (" word " → "word")
                - Collapses multiple spaces to single spaces
                - Strips leading and trailing whitespace
        """

        # if not text or not isinstance(text, str):
        #     return text

        # Step 1: Remove unwanted characters (double dashes, etc.)
        text = re.sub(r"-{2,}", "", text)

        # Step 2: Fix decimal numbers (0 . 1 → 0.1)
        text = re.sub(r"(\d+)\s+\.\s+(\d+)", r"\1.\2", text)

        # Step 3: Fix fractions (1 / 2 → 1/2)
        text = re.sub(r"(\d+)\s+/\s+(\d+)", r"\1/\2", text)

        # Step 4: Fix leading zeros in decimals (00 . 5 → 0.5)
        text = re.sub(r"\b0+(\d+)\.(\d+)", r"\1.\2", text)

        # Step 5: Split number handling (any length)
        text = re.sub(r"\b(\d+(?:\s+\d+)+)\b",
                      lambda m: m.group(1).replace(" ", ""), text)

        # Step 6: Fix punctuation spacing (, . ! ? : ;)
        text = re.sub(r"\s+([,.!?:;])", r"\1", text)

        # Step 7: Fix double quote spacing
        text = re.sub(r'\s+"', '"', text)  # Remove space before quote
        text = re.sub(r'"\s+', '"', text)  # Remove space after quote

        # Step 8: Normalize multiple spaces to single space
        text = re.sub(r"\s{2,}", " ", text)

        # Step 9: Remove leading/trailing spaces
        text = text.strip()

        return text

    def _apply_augmentation(self, data, augment=True):
        """
        Apply data augmentation to JFLEG dataset using all available corrections.

        This function processes JFLEG examples to create augmented data by utilizing
        all 4 human-written corrections per sentence. Each original sentence is paired with
        each of its corrections to create multiple training examples, significantly increasing
        the dataset size and providing the model with diverse correction targets.

        Args:
                data (List[Dict]): List of JFLEG dataset examples, where each example contains:
                        - 'sentence' (str): Original grammatically incorrect sentence
                        - 'corrections' (List[str]): List of 4 human-written corrections
                augment (bool, optional): Whether to use all corrections for augmentation.
                        - If True: Creates 4 examples per input (uses all corrections)
                        - If False: Creates 1 example per input (uses only first correction)
                        Default is True.

        Returns:
                List[Dict]: Augmented dataset where each dictionary contains:
                        - 'input' (str): Preprocessed input with "grammar: " prefix
                        - 'target' (str): Preprocessed target correction
                        - 'processed_sentence' (str): Preprocessed original sentence
                        - 'processed_corrections' (List[str]): All 4 preprocessed corrections for evaluation
                        - 'raw_original' (str): Unprocessed original sentence (for debugging)
                        - 'raw_corrections' (List[str]): Unprocessed corrections (for debugging)
        """
        # storage for augmented data
        augmented_data = []
        for items in data:
            # getting original sentence -- incorrect
            original_sentence = items["sentence"]
            # formatting the incorrect sentence
            processed_sentence = self._preprocess(original_sentence)

            # getting all the original corrected sentences
            corrections = items["corrections"]

            # formatting all the corrected sentences -- evaluation
            processed_corrections = []
            # looping over all 4 corrections
            for correction in corrections:
                if correction.strip():  # Skip empty corrections
                    # storing all the processed corrections
                    processed_corrections.append(self._preprocess(correction))

            # looping over processed corrections
            for processed_correction in processed_corrections:
                # creating a dataset
                augmented_data.append({
                    "input": f"grammar: {processed_sentence}",
                    "target": processed_correction,
                    "processed_sentence": processed_sentence,
                    "processed_corrections": processed_corrections,
                    "raw_original": original_sentence,
                    "raw_corrections": corrections
                })
                # checking if to augment or not
                if not augment:
                    break
        # displaying the length of data
        print("\t[INFO] Length of Dataset is: ", len(augmented_data))
        return augmented_data

    def _apply_tokenization(self, data):
        """
        Apply tokenization to preprocessed JFLEG dataset examples for T5 model training.

        This function converts text data (input sentences and target corrections) into 
        tokenized format suitable for T5 model training. It processes both the input 
        grammar correction task and the target correction, creating the necessary 
        input_ids, attention_mask, and labels required by the HuggingFace Trainer.

        Args:
                data (Dict): A single preprocessed example containing:
                - 'input' (str): Preprocessed input text with "grammar: " prefix
                - 'target' (str): Preprocessed target correction text
                - 'processed_sentence' (str): Preprocessed original sentence (preserved but not tokenized)
                - 'processed_corrections' (List[str]): All preprocessed corrections (preserved but not tokenized)
                - 'raw_original' (str): Raw original sentence (preserved but not tokenized)
                - 'raw_corrections' (List[str]): Raw corrections (preserved but not tokenized)


        Returns:
                Dict: Tokenized example ready for model training containing:
                        - 'input_ids' (List[int]): Token IDs for the input sequence
                        - 'attention_mask' (List[int]): Attention mask for input (1 for real tokens, 0 for padding)
                        - 'labels' (List[int]): Token IDs for the target sequence (used for loss computation)

        Tokenization Settings:
                - max_length (int): Maximum sequence length (defined by self.max_length)
                - truncation (bool): True - truncates sequences longer than max_length
                - padding (bool): False - no padding applied (Trainer handles dynamic padding)
                - return_tensors: None - returns Python lists instead of PyTorch tensors
        """
        # tokenizing the input of the dataset
        input_encodings = self.tokenizer(data["input"],
                                         max_length=self.max_length,
                                         truncation=True,
                                         padding=False,  # trainer handles the dynamic padding
                                         return_tensors=None)  # returns lists not tensor
        # tokenizing the target of the dataset
        target_encodings = self.tokenizer(data["target"],
                                          max_length=self.max_length,
                                          truncation=True,
                                          padding=False,  # trainer handles the dynamic padding
                                          return_tensors=None)  # returns lists not tensor

        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]
        }

    def create_train_val_test_datasets(self):
        """
        Create training, validation, and test datasets with proper augmentation and tokenization.

        This function orchestrates the complete data processing pipeline for JFLEG grammar 
        correction training. It applies data augmentation, converts to HuggingFace datasets,
        applies tokenization, and splits the data into appropriate train/validation/test sets
        while preserving essential evaluation metadata.

        Processing Pipeline:
                1. Apply augmentation to training data (4x expansion using all corrections)
                2. Apply augmentation to validation data (no expansion, uses first correction only)
                3. Convert Python lists to HuggingFace Datasets
                4. Apply tokenization using .map() for efficiency
                5. Split validation data into validation and test sets (90%/10%)
                6. Preserve evaluation metadata for proper GLEU scoring

        Data Sources:
                - Training: JFLEG validation split with 4x augmentation (~6,044 examples)
                - Validation/Test: JFLEG test split without augmentation, then split 90%/10%

        Returns:
                Tuple[Dataset, List[Dict], List[Dict]]: A tuple containing:
                        - train_dataset (Dataset): HuggingFace Dataset with tokenized training examples
                        - val_data (List[Dict]): List of tokenized validation examples with metadata
                        - test_data (List[Dict]): List of tokenized test examples with metadata

        Data Augmentation Strategy:
                - Training: augment=True (uses all 4 JFLEG corrections per sentence)
                - Validation: augment=False (uses only first correction per sentence)
        """

        from datasets import Dataset
        from sklearn.model_selection import train_test_split

        print("[INFO] Creating datasets with augmentation and tokenization...")

        # Step 1: Apply augmentation (returns Python lists)
        print("\n[INFO] Applying augmentation to training data...")
        train_augmented_list = self._apply_augmentation(
            self.train_data, augment=True)

        print("[INFO] Applying augmentation to validation data...")
        val_augmented_list = self._apply_augmentation(
            self.validation_data, augment=False)

        # Step 2: Convert Python lists to HuggingFace Datasets
        train_augmented_data = Dataset.from_list(train_augmented_list)
        val_augmented_data = Dataset.from_list(val_augmented_list)

        # Step 3: Apply tokenization using map
        print("\n[INFO] Tokenizing training data...")
        train_augmented_map_data = train_augmented_data.map(
            lambda example: self._apply_tokenization(example),
            batched=False,
            remove_columns=["input", "target"],
            desc="Tokenizing Training Data"
        )

        print("[INFO] Tokenizing validation data...")
        val_augmented_map_data = val_augmented_data.map(
            lambda example: self._apply_tokenization(example),
            batched=False,
            remove_columns=["input", "target"],
            desc="Tokenizing Validation Data"
        )

        # Step 4: Split validation dataset into validation and test sets
        print(
            f"\n[INFO] Splitting Validation Data ({100-self.test_split_ratio*100:.0f}%/{self.test_split_ratio*100:.0f}%)...")
        val_data, test_data = train_test_split(
            list(val_augmented_map_data),
            test_size=self.test_split_ratio,
            random_state=42
        )

        # Convert Python validation and test lists to HuggingFace Datasets
        val_data = Dataset.from_list(val_data)
        test_data = Dataset.from_list(test_data)

        # Summary
        print(f"\nDataset Creation Complete:")
        print(f"\t[INFO] Training Dataset:   {len(train_augmented_map_data)}")
        print(f"\t[INFO] Validation Dataset: {len(val_data)}")
        print(f"\t[INFO] Test Dataset:       {len(test_data)}")

        return train_augmented_map_data, val_data, test_data

In [4]:
dataset = JFLEGDataset(tokenizer)

NameError: name 'tokenizer' is not defined