# BERT model grammar error correction training adopted from Sunil Chomal(@sunilchomal on Github)

In [1]:
!pip install --upgrade transformers datasets tensorboard tensorflow

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess

In [2]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# loading dummy data
data = pd.read_csv("/content/drive/MyDrive/DeepLearningFinalProject/asante_twi_grammar_dataset.csv")

In [5]:
data

Unnamed: 0,input_text,target_text
0,Me kɔ adwuma no,Mekɔ adwuma no
1,Me yɛ papa,Meyɛ papa
2,Woara yɛ m'ani sɔ,Woara yɛ m'ani sɔ
3,Kɔɔ dua be akyi,Kɔɔ dua bi akyi
4,Me kɔɔ sukuu nnora,Mekɔɔ sukuu nnora
5,Ma wɔn ho yɛ huan,Maa wɔn ho yɛɛ huam
6,Pi a no beo,Pia no bio
7,Emu beiaara,Emu biara
8,Ne nea Ɔsɛe de yɛ buburo way,Ne nea Ɔsɛe de yɛɛ buburoo wei
9,Kɔ dua bi akye,Kɔɔ dua bi akyi


In [6]:
class TwiDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_text = str(self.data.iloc[index]['input_text'])
        corrected_text = str(self.data.iloc[index]['target_text'])

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer(
            corrected_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }


In [7]:
# ## Model Initialization
model_name = "bert-base-uncased"  # Replace with a fine-tuned Twi-English BERT model if available.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name,  num_labels=tokenizer.vocab_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# ## Train-Test Split
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = TwiDataset(train_data, tokenizer, max_len=128)
val_dataset = TwiDataset(val_data, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

In [9]:
# ## Training Loop
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:
# Training setup
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Epoch 1, Loss: 10.090776443481445
Epoch 2, Loss: 8.480736017227173
Epoch 3, Loss: 6.482422828674316


In [11]:
def generate_sentence_variants(input_sentence):
    """
    Generate potential corrected variants of the input sentence for Twi-English.
    """
    variants = [input_sentence]  # Start with the original sentence

    # Common transformations for Twi-English
    variants.append(input_sentence.replace("eti s3n", "ɛtɛ sɛn"))
    variants.append(input_sentence.replace("Fa ma m3", "Fa ma me"))

    # Remove punctuation
    variants.append("".join(char for char in input_sentence if char.isalnum() or char.isspace()))

    # Split into words and shuffle (simulate alternate word orders)
    words = input_sentence.split()
    if len(words) > 1:
        variants.append(" ".join(reversed(words)))  # Example: reverse order

    # Deduplicate variants
    variants = list(set(variants))

    return variants


In [12]:
def select_best_sentence(input_sentence):
    """
    Takes a transcription and selects the most probable corrected version.
    """
    model.eval()

    # Generate variants
    sentence_variants = generate_sentence_variants(input_sentence)
    scores = []

    for sentence in sentence_variants:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            scores.append(torch.softmax(logits, dim=-1)[0][1].item())

    # Find the variant with the highest probability
    best_index = np.argmax(scores)
    best_sentence = sentence_variants[best_index]

    return best_sentence, scores

In [13]:
def select_best_sentence(input_sentence):
    """
    Takes a transcription and selects the most probable corrected version.
    """
    model.eval()

    # Generate variants
    sentence_variants = generate_sentence_variants(input_sentence)
    scores = []

    for sentence in sentence_variants:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

            # Calculate the average log-probability across all tokens
            # as a proxy for sentence probability
            probs = torch.softmax(logits, dim=-1)
            # Get the average log probability for the first sequence in the batch
            # using probs[0, :]. This selects all token probabilities for the first sequence
            avg_log_prob = torch.log(probs[0, :]).mean().item()  # Use .mean() to get a single value
            scores.append(avg_log_prob)

    # Find the variant with the highest probability
    best_index = np.argmax(scores)
    best_sentence = sentence_variants[best_index]

    return best_sentence, scores

In [14]:
# ## Example Usage

sentence = "Good morning Kojo, iti s3n?"  # Example transcription
best_sentence, probabilities = select_best_sentence(sentence)

In [15]:
print("Input Sentence >>>", sentence)
print("Best Suggestion >>>", best_sentence)
print("Probabilities(avg. logits) >>>", probabilities)

Input Sentence >>> Good morning Kojo, iti s3n?
Best Suggestion >>> Good morning Kojo, iti s3n?
Probabilities(avg. logits) >>> [-10.384939193725586, -10.383551597595215, -10.384991645812988]
