# 📚 Grammar Correction Model ✍️

This notebook demonstrates how to fine-tune a T5 model for grammar correction using the C4_200M dataset.

**Steps:**

1.  **Setup and Installation:** Install necessary libraries (Hugging Face Transformers, PyTorch, nltk, kagglehub, pandas).
2.  **Data Download:** Download the C4_200M dataset using `kagglehub`.
3.  **Data Loading and Preprocessing:** Load a subset of the dataset, handle missing values, and format the input and output columns.
4.  **Model and Tokenizer Initialization:** Load a pre-trained T5 model and tokenizer.
5.  **Dataset Preparation:** Create a custom PyTorch `Dataset` for the grammar correction task and split the data into training and validation sets.
6.  **Training:** Configure training arguments and train the T5 model on the prepared dataset.
7.  **Saving the Model:** Save the fine-tuned model and tokenizer to a local directory.
8.  **Testing:** Test the fine-tuned model with example sentences to see its grammar correction capabilities.
9.  **Download Model:** Zip and download the trained model for future use.

Let's get started! ✨

## ⚙️ Setup and Installation

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# scikit-learn is included because train_test_split is imported from it later on.
%pip install torch transformers nltk pandas kagglehub scikit-learn

## ✨ Upgrade Transformers and Accelerate

In [None]:
# Upgrade transformers to the newest release and install/upgrade accelerate and datasets.
!pip install --upgrade transformers accelerate datasets

## ⬇️ Download NLTK Data and Check CUDA Availability

In [None]:
import nltk
import torch

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Report whether PyTorch can see a CUDA device (True/False).
print(torch.cuda.is_available())

## ⬇️ Download Additional NLTK Data

In [None]:
 # Fetch the additional 'punkt_tab' NLTK resource.
 # NOTE(review): the single leading space on these lines looks accidental; IPython
 # appears to tolerate uniformly indented cells, but plain Python would reject it.
 import nltk
 nltk.download('punkt_tab')

## 💾 Download C4_200M Dataset

In [None]:
import os

import kagglehub

# Fetch the C4_200M dataset through kagglehub; `path` is the location it reports.
path = kagglehub.dataset_download("dariocioni/c4200m")
print("Path to dataset files:", path)

# Sanity check: show what is inside the downloaded directory.
files = os.listdir(path)
print("Downloaded files:", files)

## 📊 Load and Preprocess Data

In [None]:
# Imported here because this cell runs before the notebook's main import cell.
import os

import pandas as pd

print("--- Loading and Preprocessing Data ---")

# The configuration cell is further down the notebook, so on a fresh
# "Restart & Run All" NUM_ROWS_TO_LOAD does not exist yet. Reuse it when it is
# already defined, otherwise fall back to the same default (100000 rows).
NUM_ROWS_TO_LOAD = globals().get("NUM_ROWS_TO_LOAD", 100000)

# Use the directory reported by the kagglehub download cell (`path`) rather than
# a hard-coded cache location, which breaks when the dataset version changes.
data_path = path
tsv_file_name = "C4_200M.tsv-00000-of-00010"
tsv_path = os.path.join(data_path, tsv_file_name)

try:
    # Tab-separated file without a header row: column 0 is the ungrammatical
    # sentence, column 1 the corrected one. Malformed lines are skipped.
    raw_df = pd.read_csv(
        tsv_path,
        sep='\t',
        nrows=NUM_ROWS_TO_LOAD,
        names=['input', 'output'],
        header=None,
        on_bad_lines='skip',
    )
except FileNotFoundError as err:
    # Raise instead of calling exit(): a notebook should stop at the failing
    # cell with a clear error instead of asking the kernel to shut down.
    raise FileNotFoundError(
        f"Could not find the dataset file at {tsv_path}. Please check the path."
    ) from err

# Drop incomplete pairs, then build the model-ready columns without mutating
# the raw frame: inputs get the "grammar: " task prefix, both columns become str.
df = raw_df.dropna().assign(
    input=lambda frame: "grammar: " + frame['input'].astype(str),
    output=lambda frame: frame['output'].astype(str),
)
print(f"Dataset loaded successfully. Size: {len(df)}")

## ⚙️ Imports and Configuration Parameters

In [None]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Trainer,
    TrainingArguments,
)

# --- Tunable settings for the whole notebook ---

# Checkpoint name handed to from_pretrained, and where the fine-tuned model is written.
MODEL_NAME = "t5-small"
OUTPUT_DIR = "./t5-grammar-correction-model"

# Data volume and sequence-length caps (in tokens) for inputs and targets.
NUM_ROWS_TO_LOAD = 100_000
MAX_INPUT_LENGTH = MAX_TARGET_LENGTH = 128

# Optimisation hyperparameters.
BATCH_SIZE = 16
LEARNING_RATE = 0.0003
EPOCHS = 2

## 🏗️ Define Grammar Dataset Class

In [None]:
class GrammarDataset(Dataset):
    """Dataset that tokenizes (input, output) sentence pairs on demand.

    Args:
        dataframe: Frame with an 'input' column (source sentences) and an
            'output' column (target sentences).
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is invoked
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors="pt"``.
        max_input_len: Length every tokenized source is padded/truncated to.
        max_target_len: Length every tokenized target is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_input_len, max_target_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        # Plain lists give positional access regardless of the frame's index
        # (which is no longer 0..n-1 after filtering or splitting).
        self.inputs = self.data['input'].tolist()
        self.targets = self.data['output'].tolist()

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text, max_len):
        """Tokenize one string to a fixed-length, batch-of-one encoding."""
        return self.tokenizer(
            text,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one pair.

        Each value is a 1-D tensor; padded label positions are set to -100.
        """
        source = self._encode(self.inputs[index], self.max_input_len)
        target = self._encode(self.targets[index], self.max_target_len)

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when the max length is 1.
        input_ids = source["input_ids"].squeeze(0)
        attention_mask = source["attention_mask"].squeeze(0)
        labels = target["input_ids"].squeeze(0)

        # Mask padding in the labels (-100 is the index PyTorch's
        # cross-entropy loss ignores by default).
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

## 🤖 Initialize Model and Prepare Datasets

In [None]:
print("\n--- Initializing Model and Preparing Datasets ---")

# Load the pre-trained checkpoint named by MODEL_NAME and its matching tokenizer.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Wrap both splits in the same dataset type with identical length limits.
train_dataset, val_dataset = (
    GrammarDataset(split_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
    for split_df in (train_df, val_df)
)
print(f"Training set size: {len(train_dataset)}, Validation set size: {len(val_dataset)}")

## 🏋️ Configure and Start Training

In [None]:
print("\n--- Configuring and Starting Training ---")

training_args = TrainingArguments(
    # Output locations and logging
    output_dir=OUTPUT_DIR,
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    # Optimisation settings taken from the configuration constants
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint on the same per-epoch schedule so the best
    # checkpoint can be restored once training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning loop.
trainer.train()
print("Training complete.")

## 💾 Save the Final Model

In [None]:
# --- Save the Final Model ---
# Write the trainer's model and the tokenizer into the same directory (OUTPUT_DIR)
# so both can later be reloaded from that single path.
print(f"\n--- Saving model to {OUTPUT_DIR} ---")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved successfully. ✅")

## ✅ Test the Fine-Tuned Model

In [None]:
# --- Test the Fine-Tuned Model ---
print("\n--- Testing the fine-tuned model ---")

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the tokenizer and model from the directory they were saved to.
trained_tokenizer = T5TokenizerFast.from_pretrained(OUTPUT_DIR)
trained_model = T5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)

# Move the model's weights onto the chosen device.
trained_model.to(device)

def correct_grammar(text: str):
    """Return the fine-tuned model's corrected version of ``text``.

    The sentence is prefixed with "grammar: " (the same prefix applied to the
    training inputs), tokenized to MAX_INPUT_LENGTH, and decoded from the first
    sequence returned by beam search.
    """
    encoded = trained_tokenizer(
        "grammar: " + text,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tensors must live on the same device as the model before generation.
    generated = trained_model.generate(
        input_ids=encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,
        early_stopping=True,
    )

    # Take the first returned sequence and strip special tokens while decoding.
    best_sequence = generated[0]
    return trained_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Sentences with deliberate grammar mistakes to probe the model.
test_sentences = [
    "The cats plays in the garden.",
    "He is a honest man.",
    "I will went to the store tomorrow.",
    "This book is more better than that one.",
    "Me and him are going to the movies.",
    "She is good in math.",
    "There is many reasons to be happy.",
    "He don't know the answer.",
]

# Pair each sentence with its correction (computed lazily, one at a time) and show both.
for sentence, correction in zip(test_sentences, map(correct_grammar, test_sentences)):
    print(f"Original:   '{sentence}'")
    print(f"Corrected:  '{correction}'\n")

In [None]:
# Second pass over five of the example sentences.
test_sentences = [
    "He is a honest man.",
    "I will went to the store tomorrow.",
    "This book is more better than that one.",
    "There is many reasons to be happy.",
    "He don't know the answer.",
]

# Correct each sentence and print it next to the model's output.
for sentence in test_sentences:
    correction = correct_grammar(sentence)
    print(f"Original:   '{sentence}'", f"Corrected:  '{correction}'\n", sep="\n")

## 📦 Download the Trained Model

In [None]:
# Colab-only helper for pushing files to the browser.
from google.colab import files

# Zip the directory the model was actually saved to. Interpolating OUTPUT_DIR
# keeps this step in sync with the configuration instead of repeating a
# hard-coded /content/... path that silently goes stale when OUTPUT_DIR changes.
!zip -r /content/model.zip "{OUTPUT_DIR}"

# Download the zip file
files.download('/content/model.zip')