## Dataset Load

In [12]:
import pandas as pd

# Load the dataset
dataset_path = "Dataset/dataset2.csv"  # Path to the dataset
data = pd.read_csv(dataset_path)

# Display the dataset structure
print(data[['Grammar Error', 'Original Sentence']].head())


                  Grammar Error            Original Sentence
0                  මම ගමට යනවා.                  මම ගමට යමි.
1                ඔහු පාසලට ගිය.             ඔහු පාසලට ගියේය.
2            අපි ගඟේ දිය නෑවෙම.          අපි ගඟේ දිය නෑවෙමු.
3  ඔවුන් විභාගය ජයග්‍රහණය කළමු.  ඔවුන් විභාගය ජයග්‍රහණය කළහ.
4              ඔහු ගෙදරට ආවායය.              ඔහු ගෙදරට ආවේය.


## Model - Google/mt5-small

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained mT5 model
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Fine-tune the model (example training loop)
def fine_tune_model(data, tokenizer, model, epochs=3):
    from torch.utils.data import DataLoader
    import torch

    # Prepare dataset for training
    inputs = tokenizer(list(data['Grammar Error']), padding=True, truncation=True, max_length=128, return_tensors="pt")
    labels = tokenizer(list(data['Original Sentence']), padding=True, truncation=True, max_length=128, return_tensors="pt")
    dataset = torch.utils.data.TensorDataset(inputs.input_ids, labels.input_ids)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Define optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # Training loop
    model.train()
    for epoch in range(epochs):
        for batch in dataloader:
            input_ids, label_ids = batch
            outputs = model(input_ids=input_ids, labels=label_ids)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} completed. Loss: {loss.item()}")
    return model

# Fine-tune the model
fine_tuned_model = fine_tune_model(data, tokenizer, model)

# Save the fine-tuned model
fine_tuned_model.save_pretrained("Models/grammar_correction_model")
tokenizer.save_pretrained("Tokenizer/grammar_correction_model")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


Epoch 1 completed. Loss: 49.62501907348633
Epoch 2 completed. Loss: 30.58425521850586
Epoch 3 completed. Loss: 47.42890930175781


('grammar_correction_model\\tokenizer_config.json',
 'grammar_correction_model\\special_tokens_map.json',
 'grammar_correction_model\\spiece.model',
 'grammar_correction_model\\added_tokens.json')

In [14]:
# Correct a sentence using the fine-tuned model
def correct_grammar(sentence, tokenizer, model):
    inputs = tokenizer(sentence, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(inputs.input_ids, max_length=128, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the grammar checker
test_sentence = data['Grammar Error'][0]
corrected_sentence = correct_grammar(test_sentence, tokenizer, fine_tuned_model)
print(f"Original: {test_sentence}")
print(f"Corrected: {corrected_sentence}")


Original: මම ගමට යනවා.
Corrected: <extra_id_0>voltaоизоляци跬 <extra_id_11>illionramaiyalatanbegrepクロノグラフ~$(ayev


In [16]:
# Define the accuracy calculation function
def calculate_sentence_accuracy(original_sentences, corrected_sentences):
	correct_count = 0
	for original, corrected in zip(original_sentences, corrected_sentences):
		if original == corrected:
			correct_count += 1
	return (correct_count / len(original_sentences)) * 100

# Evaluate accuracy for the fine-tuned model
corrected_sentences = [correct_grammar(sentence, tokenizer, fine_tuned_model) for sentence in data['Grammar Error']]
transformer_accuracy = calculate_sentence_accuracy(data['Original Sentence'], corrected_sentences)
print(f"Transformer-Based Grammar Checker Accuracy: {transformer_accuracy:.2f}%")


Transformer-Based Grammar Checker Accuracy: 0.00%


## Model - Facebook/bart-base

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Load dataset
dataset_path = "Dataset/dataset2.csv"  # Update with your dataset path
data = pd.read_csv(dataset_path)

# Define Dataset class
class GrammarDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer, max_length=128):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]

        inputs = self.tokenizer(
            input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }

# Initialize the tokenizer and model
tokenizer.src_lang = "si_LK"
model_name = "facebook/bart-base"  # Use "bart-large" for better performance
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Prepare data for training
input_texts = list(data["Grammar Error"])
target_texts = list(data["Original Sentence"])
dataset = GrammarDataset(input_texts, target_texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(dataloader):.4f}")

# Save the fine-tuned model
model.save_pretrained("Models/bart_grammar_checker")
tokenizer.save_pretrained("Tokenizer/bart_grammar_checker")


Epoch 1/3, Loss: 5.7914
Epoch 2/3, Loss: 4.5640
Epoch 3/3, Loss: 2.9543




('Tokenizer/bart_grammar_checker\\tokenizer_config.json',
 'Tokenizer/bart_grammar_checker\\special_tokens_map.json',
 'Tokenizer/bart_grammar_checker\\vocab.json',
 'Tokenizer/bart_grammar_checker\\merges.txt',
 'Tokenizer/bart_grammar_checker\\added_tokens.json')

In [8]:
# Load the fine-tuned BART model
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("Models/bart_grammar_checker")
tokenizer = BartTokenizer.from_pretrained("Tokenizer/bart_grammar_checker")

# Grammar correction function
def correct_grammar(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(inputs["input_ids"].to(device), max_length=128, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
test_sentence = "අපි ගමට යනවා."
corrected_sentence = correct_grammar(test_sentence)
print("Original Sentence:", test_sentence)
print("Corrected Sentence:", corrected_sentence)


Original Sentence: අපි ගමට යනවා.
Corrected Sentence: ගමට යනවා.


## Test print the result

In [17]:
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd

# Load the fine-tuned grammar correction model and tokenizer
model_path = "Models/bart_grammar_checker"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)

# Define a function to generate corrections and compare tokens
def correct_grammar(sentence):
    # Tokenize input and generate corrected output
    inputs = tokenizer(sentence, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenize the original and corrected sentences
    original_tokens = sentence.split()
    corrected_tokens = corrected_sentence.split()

    # Identify mismatches (spelling/grammar corrections)
    mistakes = [(original, corrected) for original, corrected in zip(original_tokens, corrected_tokens) if original != corrected]

    return corrected_sentence, mistakes

# Load your dataset with grammar errors
dataset_path = "Dataset/dataset2.csv"  # Path to your dataset
data = pd.read_csv(dataset_path)

# Generate corrections and collect results
results = []
for idx, row in data.iterrows():
    grammar_error = row["Grammar Error"]
    corrected_sentence, mistakes = correct_grammar(grammar_error)
    results.append((idx + 1, mistakes, corrected_sentence))

# Print results in the required format
for result in results:
    sentence_id, mistakes, corrected_sentence = result
    print(f"Sentence {sentence_id}:")
    print(f"  Spelling Mistakes and Corrections: {mistakes}")
    print(f"  Corrected Sentence: {corrected_sentence}")
    print()


Sentence 1:
  Spelling Mistakes and Corrections: [('මම', 'ම\u0df8'), ('ගමට', 'ගයනවා.')]
  Corrected Sentence: ම෸ ගයනවා.

Sentence 2:
  Spelling Mistakes and Corrections: [('ගිය.', 'ොිය.')]
  Corrected Sentence: ඔහු පාසලට ොිය.

Sentence 3:
  Spelling Mistakes and Corrections: [('ගඟේ', 'ගඟේය'), ('දිය', '\u0df1ෑවෙම.')]
  Corrected Sentence: අපි ගඟේය ෱ෑවෙම.

Sentence 4:
  Spelling Mistakes and Corrections: [('විභාගය', 'සිභාගය'), ('ජයග්\u200dරහණය', 'ජක\u0d98\u0dc9\u200dරහණම\u0dce.')]
  Corrected Sentence: ඔවුන් සිභාගය ජක඘෉‍රහණම෎.

Sentence 5:
  Spelling Mistakes and Corrections: [('ආවායය.', 'ෆවාය\u0dfa.')]
  Corrected Sentence: ඔහු ගෙදරට ෆවාය෺.

Sentence 6:
  Spelling Mistakes and Corrections: [('ගුරුවරයා', 'ගුර්ව\u0d98යා'), ('පාඩම', 'පිඩම'), ('ඉගැන්වීමුමු.', '\u0dc9ජැන\u0dccසීභ����\u0df7�.')]
  Corrected Sentence: ගුර්ව඘යා පිඩම ෉ජැන෌සීභ����෷�.

Sentence 7:
  Spelling Mistakes and Corrections: [('වර්ධනය', 'ව\u0dfb්ධ\u0df1\u0dfa'), ('වේ.', 'Â�����.')]
  Corrected Sentence: මේ නගරය ව෻්ධ෱෺ Â��