In [None]:
!pip install transformers datasets torch


Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch)
  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch)
  Using cached nvidia_cusparse_cu12-12.1.0.

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch

# Load the JFLEG dataset through the `datasets` library. The result is indexed
# by split name below ('validation' and 'test').
dataset = load_dataset("jfleg")

# binary label: 0 = the sentence already equals one of its reference corrections, 1 = it does not
def add_labels(example):
    """Attach a binary ``label`` field to a single example.

    The label is 0 when ``example['sentence']`` is found in
    ``example['corrections']`` and 1 otherwise. The example is updated in
    place and the same object is returned.
    """
    needs_fix = example['sentence'] not in example['corrections']
    example['label'] = 1 if needs_fix else 0
    return example

# Run add_labels over every example so each one gains a `label` column.
dataset = dataset.map(add_labels)

# Pretrained BERT tokenizer plus a BERT sequence classifier with two output classes.
# The recorded cell output warns that the classifier head weights are newly
# initialized, so predictions are meaningless until the model is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def tokenize_function(example):
    """Tokenize the ``sentence`` field with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        example['sentence'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize the labelled dataset in batches using tokenize_function.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors.
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Batch collator that pads through the tokenizer. tokenize_function already pads
# every sequence to max_length=128.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# The 'validation' split is used as training data and 'test' as evaluation data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['validation'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the classifier.
trainer.train()

# Persist the fine-tuned weights and the tokenizer.
# NOTE(review): the T5 cell later in this notebook saves to these same two
# directories, so running it replaces the files written here.
model.save_pretrained('./grammar-corrector-model')
tokenizer.save_pretrained('./grammar-corrector-tokenizer')


# Reload both artifacts from disk for inference.
corrector_model = BertForSequenceClassification.from_pretrained('./grammar-corrector-model')
corrector_tokenizer = BertTokenizer.from_pretrained('./grammar-corrector-tokenizer')

# Classify a sentence as grammatical or not (this model does not rewrite text)
def correct_grammar(sentence):
    """Classify ``sentence`` with the fine-tuned BERT classifier.

    Args:
        sentence: A single sentence as a string.

    Returns:
        ``sentence`` unchanged when the predicted class is 0 (treated as
        'correct'); otherwise the fixed message
        "The sentence is grammatically incorrect.". No corrected text is
        produced by this classifier.
    """
    inputs = corrector_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        outputs = corrector_model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    if predictions.item() == 0:
        return sentence  # Assuming label 0 is 'correct'
    # Label 1 is 'incorrect'; a classifier can only flag it, not fix it.
    return "The sentence is grammatically incorrect."

# Smoke test: classify one ungrammatical sentence and print what correct_grammar returns.
test_sentence = "She go to school every day."
corrected_sentence = correct_grammar(test_sentence)
print(corrected_sentence)


Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.455463
2,No log,0.422754
3,No log,0.409393


The sentence is grammatically incorrect.


In [None]:
from datasets import load_dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch


# Reload the raw JFLEG data; the earlier cell rebound `dataset` to a version with an added `label` column.
dataset = load_dataset("jfleg")

def flatten_corrections(examples):
    """Expand a batch so that each (sentence, correction) pair becomes one row.

    Args:
        examples: A batch mapping with parallel lists under ``'sentence'``
            (source strings) and ``'corrections'`` (a list of reference
            corrections per source sentence).

    Returns:
        A dict with ``'sentence'`` and ``'corrections'`` lists of equal
        length, where each source sentence is repeated once per non-blank
        correction. Blank or whitespace-only corrections are skipped.
    """
    sentences = []
    corrections = []
    for sentence, correction_list in zip(examples['sentence'], examples['corrections']):
        for correction in correction_list or []:
            # A blank reference would train the seq2seq model to emit an empty
            # string for this input, so it is not a usable target.
            if not correction or not correction.strip():
                continue
            sentences.append(sentence)
            corrections.append(correction)
    return {'sentence': sentences, 'corrections': corrections}


# Replace each split with its flattened (sentence, correction) pairs. The original
# 'sentence' and 'corrections' columns are dropped and the lists returned by
# flatten_corrections take their place (the recorded output shows the row count
# growing from 755/748 to 3020/2992).
dataset = dataset.map(flatten_corrections, batched=True, remove_columns=['sentence', 'corrections'])


# Use the 'validation' split as training data and 'test' as evaluation data.
train_test_dataset = DatasetDict({
    'train': dataset['validation'],
    'test': dataset['test']
})

def preprocess_function(examples):
    """Tokenize a batch of (sentence, correction) pairs for T5 fine-tuning.

    Reads the module-level ``tokenizer``, which must be bound before this
    function is called.

    Args:
        examples: A batch mapping with parallel lists of strings under
            ``'sentence'`` (sources) and ``'corrections'`` (targets).

    Returns:
        The tokenizer output for the prefixed sources, with an extra
        ``"labels"`` entry holding the target token ids. Both sides are
        truncated and padded to 128 tokens; padding positions in the labels
        are set to -100.
    """
    inputs = ["correct: " + ex for ex in examples['sentence']]
    targets = list(examples['corrections'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")

    # Mask padding with -100 so the cross-entropy loss ignores those positions.
    # Left as pad ids, most of each 128-token label row would be padding and
    # the loss would mainly reward predicting it.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Pretrained T5-small tokenizer and seq2seq model. preprocess_function reads this
# module-level `tokenizer`, so it has to be bound before the map call below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


# Tokenize sources and targets for both splits in batches.
tokenized_datasets = train_test_dataset.map(preprocess_function, batched=True)


# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_steps=500,
    logging_steps=500,
    report_to="none"
)


# 'train' here is the flattened JFLEG 'validation' split; 'test' is the flattened 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)


# Fine-tune T5 on the (sentence -> correction) pairs.
trainer.train()

# Persist the fine-tuned model and tokenizer.
# NOTE(review): these are the same directories the BERT cell saved to, so the
# BERT artifacts are replaced by this call.
model.save_pretrained('./grammar-corrector-model')
tokenizer.save_pretrained('./grammar-corrector-tokenizer')

# Load the model for inference
corrector_model = T5ForConditionalGeneration.from_pretrained('./grammar-corrector-model')
corrector_tokenizer = T5Tokenizer.from_pretrained('./grammar-corrector-tokenizer')

# Function to predict and correct grammar
def correct_grammar(sentence):
    """Generate a corrected version of ``sentence`` with the fine-tuned T5 model.

    The sentence is prefixed with "correct: " (the same prefix used when
    building the training inputs), encoded with truncation at 128 tokens,
    and decoded from the first sequence returned by a 4-beam search.

    Args:
        sentence: The input sentence as a string.

    Returns:
        The decoded model output with special tokens stripped.
    """
    prompt = "correct: " + sentence
    input_ids = corrector_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    )
    generated = corrector_model.generate(
        input_ids,
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return corrector_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke test: run the T5 corrector on one ungrammatical sentence and print input and output side by side.
test_sentence = "She go to school every day."
corrected_sentence = correct_grammar(test_sentence)
print("Original:", test_sentence)
print("Corrected:", corrected_sentence)


Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3020 [00:00<?, ? examples/s]

Map:   0%|          | 0/2992 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.163901
2,1.032600,0.1268
3,0.162800,0.124341


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Original: She go to school every day.
Corrected: She goes to school every day.


In [None]:
# Hand-written ungrammatical sentences (mostly subject-verb agreement errors)
# used as a qualitative check of the corrector.
test_sentences = [
    "She go to school every day.",
    "He don't like to play football.",
    "They is going to the market.",
    "I has a pen.",
    "The cat eat its food.",
    "He run fastly.",
    "She were very tired.",
    "We was happy to see him.",
    "This are my friends.",
    "She have a beautiful dress."
]

# Uses whichever `correct_grammar` was defined last in the kernel; when the
# notebook is run top to bottom that is the T5-based version.
for sentence in test_sentences:
    corrected_sentence = correct_grammar(sentence)
    print("Original:", sentence)
    print("Corrected:", corrected_sentence)
    print()


Original: She go to school every day.
Corrected: She goes to school every day.

Original: He don't like to play football.
Corrected: He doesn't like to play football.

Original: They is going to the market.
Corrected: They are going to the market.

Original: I has a pen.
Corrected: I have a pen.

Original: The cat eat its food.
Corrected: The cat eats its food.

Original: He run fastly.
Corrected: He runs fast.

Original: She were very tired.
Corrected: She was very tired.

Original: We was happy to see him.
Corrected: We were happy to see him.

Original: This are my friends.
Corrected: This are my friends.

Original: She have a beautiful dress.
Corrected: She has a beautiful dress.

