**Enhanced Transformer Method,Enhanced LSTM checker and Enhanced Sinhala Text Corrector**

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, Input, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
import torch
import json
import re
from collections import defaultdict

class EnhancedTransformerChecker:
    def __init__(self, model_name='xlm-roberta-base'):
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForMaskedLM.from_pretrained(model_name)
            self.model.eval()
            # Fine-tune on Sinhala data if available
            self.context_window = 5  # Words before and after for context
        except Exception as e:
            print(f"Warning: Transformer initialization failed: {e}")
            self.tokenizer = None
            self.model = None

        self.cache = {}  # Cache for frequently corrected words
        self.confidence_threshold = 0.75

    def build_context(self, words, current_idx):
        start = max(0, current_idx - self.context_window)
        end = min(len(words), current_idx + self.context_window + 1)
        return ' '.join(words[start:end])

    def correct_word(self, word, context=''):
        if self.model is None:
            return word

        # Check cache first
        if word in self.cache:
            return self.cache[word]

        try:
            # Combine word with context
            text_to_check = f"{context} {word}" if context else word
            inputs = self.tokenizer(text_to_check, return_tensors="pt", truncation=True, max_length=512)

            # Get word position
            word_tokens = self.tokenizer.encode(word, add_special_tokens=False)

            with torch.no_grad():
                outputs = self.model(**inputs)

            predictions = []
            for token_idx in range(len(word_tokens)):
                logits = outputs.logits[0, token_idx]
                probs = torch.softmax(logits, dim=-1)
                pred_token = torch.argmax(probs).item()
                confidence = probs[pred_token].item()

                if confidence > self.confidence_threshold:
                    predictions.append(self.tokenizer.decode([pred_token]))
                else:
                    predictions.append(self.tokenizer.decode([word_tokens[token_idx]]))

            corrected = ''.join(predictions).strip()

            # Cache the result
            self.cache[word] = corrected
            return corrected
        except Exception as e:
            print(f"Transformer correction failed: {e}")
            return word

class EnhancedLSTMChecker:
    def __init__(self, max_sequence_length=50):
        self.max_sequence_length = max_sequence_length
        self.char_tokenizer = Tokenizer(char_level=True, filters='', lower=False)
        self.word_tokenizer = Tokenizer(filters='', lower=False)
        self.model = None
        self.char_patterns = defaultdict(int)
        self.word_patterns = defaultdict(int)

    def build_advanced_model(self, char_vocab_size, word_vocab_size):
        # Character input branch
        char_input = Input(shape=(self.max_sequence_length,))
        char_emb = Embedding(char_vocab_size, 100)(char_input)

        # CNN layers for character patterns
        conv1 = Conv1D(64, 3, activation='relu')(char_emb)
        pool1 = MaxPooling1D(2)(conv1)
        conv2 = Conv1D(128, 3, activation='relu')(pool1)
        pool2 = MaxPooling1D(2)(conv2)

        # Bidirectional LSTM layers
        bilstm1 = Bidirectional(LSTM(128, return_sequences=True))(pool2)
        bilstm2 = Bidirectional(LSTM(64))(bilstm1)

        # Dense layers with dropout
        dense1 = Dense(256, activation='relu')(bilstm2)
        dropout1 = Dropout(0.3)(dense1)
        dense2 = Dense(128, activation='relu')(dropout1)
        dropout2 = Dropout(0.2)(dense2)

        # Output layer
        output = Dense(char_vocab_size, activation='softmax')(dropout2)

        model = Model(inputs=char_input, outputs=output)
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

    def prepare_training_data(self, text_data):
        if isinstance(text_data, dict):
            words = list(text_data.keys())
        else:
            words = text_data

        # Prepare character sequences
        char_sequences = []
        next_chars = []

        for word in words:
            for i in range(len(word) - 1):
                char_sequences.append(word[i:i+self.max_sequence_length])
                next_chars.append(word[i+1])
                # Store character patterns
                self.char_patterns[word[i:i+3]] += 1

        # Fit tokenizers
        self.char_tokenizer.fit_on_texts(char_sequences)
        self.word_tokenizer.fit_on_texts(words)

        # Convert to sequences
        X = self.char_tokenizer.texts_to_sequences(char_sequences)
        y = self.char_tokenizer.texts_to_sequences(next_chars)

        # Pad sequences
        X = pad_sequences(X, maxlen=self.max_sequence_length)
        y = to_categorical(y, num_classes=len(self.char_tokenizer.word_index) + 1)

        return X, y

    def train(self, text_data):
        try:
            X, y = self.prepare_training_data(text_data)

            # Split data
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

            # Build model
            self.model = self.build_advanced_model(
                len(self.char_tokenizer.word_index) + 1,
                len(self.word_tokenizer.word_index) + 1
            )

            # Callbacks
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
                ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)
            ]

            # Train model
            self.model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=20,
                batch_size=64,
                callbacks=callbacks
            )

        except Exception as e:
            print(f"LSTM training failed: {e}")

    def correct_word(self, word):
        if self.model is None:
            return word

        try:
            # Check character patterns
            for i in range(len(word) - 2):
                pattern = word[i:i+3]
                if self.char_patterns[pattern] == 0:
                    # Potential error found, generate correction
                    char_seq = self.char_tokenizer.texts_to_sequences([word[i:i+self.max_sequence_length]])
                    padded_seq = pad_sequences(char_seq, maxlen=self.max_sequence_length)
                    pred = self.model.predict(padded_seq, verbose=0)[0]
                    predicted_char = self.char_tokenizer.index_word[np.argmax(pred)]
                    word = word[:i+1] + predicted_char + word[i+2:]

            return word
        except Exception as e:
            print(f"LSTM correction failed: {e}")
            return word

class EnhancedSinhalaTextCorrector:
    def __init__(self, dictionary_path):
        self.dictionary = self.load_dictionary(dictionary_path)
        self.transformer_checker = EnhancedTransformerChecker()
        self.lstm_checker = EnhancedLSTMChecker()

    def load_dictionary(self, file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                return json.load(file)
        except Exception as e:
            print(f"Error loading dictionary: {e}")
            return {}

    def initialize_models(self):
        print("Training models...")

        sample_incorrect = [
            "මල්", "සුගිය", "අත්න්ු", "කල", "ගිය", "කරන්",
            "යන්න", "එන්න", "බලන්", "කියන්", "දෙන්න"
        ]

        self.lstm_checker.train(self.dictionary)

    def correct_text(self, text):
        words = text.split()
        corrected_words = []

        for idx, word in enumerate(words):
            context = self.transformer_checker.build_context(words, idx)
            corrected_word = self.transformer_checker.correct_word(word, context)
            corrected_words.append(corrected_word)

        return " ".join(corrected_words)

if __name__ == "__main__":
    dictionary_path = "/content/drive/MyDrive/cleaned_sinhala_words.json"
    corrector = EnhancedSinhalaTextCorrector(dictionary_path)

    test_text = "මම පාසල් නිවාඩු කාලයේදී දෙමව්පියන් සමග අනුරාධපුර වන්දනා චාරිකාවක ගියෙය."
    print(f"Original Text: {test_text}")

    corrected_text = corrector.correct_text(test_text)
    print(f"Corrected Text: {corrected_text}")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original Text: මම පාසල් නිවාඩු කාලයේදී දෙමව්පියන් සමග අනුරාධපුර වන්දනා චාරිකාවක ගියෙය.
Corrected Text: <s> <s> <s> <s>මම <s>මමපාසල්නිවාඩුකාලය <s> <s> <s>නිවාඩුකාලය <s>කාලයේදීදෙ <s>දෙමව්


In [4]:
class EnhancedSinhalaTextCorrector:
    def __init__(self, dictionary_path):
        self.dictionary = self.load_dictionary(dictionary_path)
        self.transformer_checker = EnhancedTransformerChecker()
        self.lstm_checker = EnhancedLSTMChecker()
        self.rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
        self.gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
        self.xgb_classifier = xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss')

    def load_dictionary(self, file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                return json.load(file)
        except Exception as e:
            print(f"Error loading dictionary: {e}")
            return {}

    def train_ensemble_models(self, correct_words, incorrect_words):
        try:
            # Combine correct and incorrect words
            words = correct_words + incorrect_words
            labels = [1] * len(correct_words) + [0] * len(incorrect_words)

            # Use TF-IDF for feature extraction
            vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5))
            features = vectorizer.fit_transform(words)

            # Train each model
            self.rf_classifier.fit(features, labels)
            self.gb_classifier.fit(features, labels)
            self.xgb_classifier.fit(features, labels)

            # Store the vectorizer for future use
            self.vectorizer = vectorizer
        except Exception as e:
            print(f"Ensemble model training failed: {e}")

    def ensemble_correct_word(self, word):
        try:
            if not hasattr(self, 'vectorizer'):
                return word

            # Extract features for the input word
            features = self.vectorizer.transform([word])

            # Get predictions from each model
            rf_pred = self.rf_classifier.predict_proba(features)[0][1]
            gb_pred = self.gb_classifier.predict_proba(features)[0][1]
            xgb_pred = self.xgb_classifier.predict_proba(features)[0][1]

            # Weighted ensemble prediction
            ensemble_score = (0.4 * rf_pred + 0.3 * gb_pred + 0.3 * xgb_pred)

            # Determine if the word is likely incorrect
            if ensemble_score < 0.5:  # Threshold for correction
                # Use transformer or LSTM for final correction
                return self.lstm_checker.correct_word(word)
            return word
        except Exception as e:
            print(f"Ensemble correction failed: {e}")
            return word

    def correct_text(self, text, use_ensemble=False):
        words = text.split()
        corrected_words = []

        for idx, word in enumerate(words):
            if use_ensemble:
                corrected_word = self.ensemble_correct_word(word)
            else:
                context = self.transformer_checker.build_context(words, idx)
                corrected_word = self.transformer_checker.correct_word(word, context)
            corrected_words.append(corrected_word)

        return " ".join(corrected_words)

if __name__ == "__main__":
    dictionary_path = "/content/drive/MyDrive/cleaned_sinhala_words.json"
    corrector = EnhancedSinhalaTextCorrector(dictionary_path)

    # Example correct and incorrect words for training
    correct_words = ["මම", "පාසල්", "නිවාඩු", "අනුරාධපුර"]
    incorrect_words = ["මමම", "පසල්", "නවාඩු", "අනුරාධපුර"]

    # Train ensemble models
    corrector.train_ensemble_models(correct_words, incorrect_words)

    test_text = "මමම පසල් නවාඩු කාලයේදී දෙමව්පියන් සමග අනුරාධපුර වන්දනා චාරිකාවක ගියෙය."
    print(f"Original Text: {test_text}")

    # Test with transformer-based correction
    corrected_text = corrector.correct_text(test_text, use_ensemble=False)
    print(f"Transformer Corrected Text: {corrected_text}")

    # Test with ensemble-based correction
    corrected_text_ensemble = corrector.correct_text(test_text, use_ensemble=True)
    print(f"Ensemble Corrected Text: {corrected_text_ensemble}")


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Parameters: { "use_label_encoder" } are not used.



Original Text: මමම පසල් නවාඩු කාලයේදී දෙමව්පියන් සමග අනුරාධපුර වන්දනා චාරිකාවක ගියෙය.
Transformer Corrected Text: <s>මම <s>ල් <s>නවාම <s>ේදී <s>මමමපසල් <s> <s> <s>නවා <s>කාලයේදීදෙ <s>දෙමව්
Ensemble Corrected Text: මමම පසල් නවාඩු කාලයේදී දෙමව්පියන් සමග අනුරාධපුර වන්දනා චාරිකාවක ගියෙය.


In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Dataset Class for PyTorch
class CorrectionDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer, max_length=50):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]

        # Tokenize the input and target text
        input_encoding = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            target_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze(),
        }

# Load Dataset
file_path = "/content/drive/MyDrive/correct incorrect sentences.csv"  # Update this with your dataset path
data = pd.read_csv(file_path)

# Prepare the data
data_long = pd.melt(
    data,
    id_vars=["Correct Sentences"],
    value_vars=["Incorrect Sentences", "Incorrect Sentences.1"],
    var_name="Variant Type",
    value_name="Incorrect Sentence"
)
data_long = data_long.dropna()
data_long["Correct Sentences"] = data_long["Correct Sentences"].str.strip()
data_long["Incorrect Sentence"] = data_long["Incorrect Sentence"].str.strip()

# Split data into training and validation sets
train_data, val_data = train_test_split(data_long, test_size=0.2, random_state=42)

# Tokenizer and Model Initialization
model_name = "t5-small"  # Replace with "t5-base" or "mbart-large" for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Dataset and DataLoader
train_dataset = CorrectionDataset(
    train_data["Incorrect Sentence"].tolist(),
    train_data["Correct Sentences"].tolist(),
    tokenizer,
)
val_dataset = CorrectionDataset(
    val_data["Incorrect Sentence"].tolist(),
    val_data["Correct Sentences"].tolist(),
    tokenizer,
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs =20

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    val_loss = 0
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss / len(train_loader)}, Validation Loss = {val_loss / len(val_loader)}")

# Save the model
model.save_pretrained("grammar_correction_model")
tokenizer.save_pretrained("grammar_correction_model")

# Prediction Function
def predict(input_text):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=50, padding="max_length").to(device)
        outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=5, early_stopping=True)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Predictions
example_sentences = [
    "මම වීදිවල ඇවිදිනවා.",
    "අපි කඳු නගිනවා."
]
for sentence in example_sentences:
    corrected = predict(sentence)
    print(f"Original: {sentence}")
    print(f"Corrected: {corrected}")


Epoch 1: Train Loss = 1.2462338139512101, Validation Loss = 0.04717972405975865
Epoch 2: Train Loss = 0.058796348772486864, Validation Loss = 0.049479847082928304
Epoch 3: Train Loss = 0.03860225510232303, Validation Loss = 0.042126755380342086
Epoch 4: Train Loss = 0.033208879805645165, Validation Loss = 0.024119753465657274
Epoch 5: Train Loss = 0.030135586200167937, Validation Loss = 0.012483481332779892
Epoch 6: Train Loss = 0.027230193258776347, Validation Loss = 0.014574354214053
Epoch 7: Train Loss = 0.026189704535871137, Validation Loss = 0.01180592596515893
Epoch 8: Train Loss = 0.023978185288760126, Validation Loss = 0.01020276329044493
Epoch 9: Train Loss = 0.022137888720525162, Validation Loss = 0.012292570911437994
Epoch 10: Train Loss = 0.021385067798273295, Validation Loss = 0.011078840489649484
Epoch 11: Train Loss = 0.020442507083394696, Validation Loss = 0.01720425397563245
Epoch 12: Train Loss = 0.019027026539326322, Validation Loss = 0.011062426094506536
Epoch 13: T

In [3]:
# Updated Prediction Function
def predict(input_text, model, tokenizer, device, max_length=50):
    model.eval()
    with torch.no_grad():
        # Tokenize the input sentence
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding="max_length"
        ).to(device)

        # Generate the corrected output
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )

        # Decode the generated sequence
        corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return corrected_sentence

# Example Predictions
example_sentences = [
    "මම වීදිවල ඇවිදිනවා.",
    "අපි කඳු නගිනවා."
]

# Ensure the model and tokenizer are on the same device
model.to(device)

# Generate predictions for each example
for sentence in example_sentences:
    corrected = predict(sentence, model, tokenizer, device)
    print(f"Original: {sentence}")
    print(f"Corrected: {corrected}")


Original: මම වීදිවල ඇවිදිනවා.
Corrected: .
Original: අපි කඳු නගිනවා.
Corrected: .


LSTM based Method

In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Bidirectional
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/grammar_correction_pairs.csv'
data = pd.read_csv(file_path)

# Split the data into training and testing sets
train_incorrect, test_incorrect, train_correct, test_correct = train_test_split(
    data['incorrect_sentence'], data['correct_sentence'], test_size=0.2, random_state=42
)

# Tokenizer setup
tokenizer = Tokenizer(oov_token="")
tokenizer.fit_on_texts(train_incorrect.tolist() + train_correct.tolist())

# Convert text to sequences
train_incorrect_sequences = tokenizer.texts_to_sequences(train_incorrect)
train_correct_sequences = tokenizer.texts_to_sequences(train_correct)

test_incorrect_sequences = tokenizer.texts_to_sequences(test_incorrect)
test_correct_sequences = tokenizer.texts_to_sequences(test_correct)

# Padding sequences
max_len = max(
    max(len(seq) for seq in train_incorrect_sequences),
    max(len(seq) for seq in train_correct_sequences)
)
train_incorrect_padded = pad_sequences(train_incorrect_sequences, maxlen=max_len, padding='post')
train_correct_padded = pad_sequences(train_correct_sequences, maxlen=max_len, padding='post')[..., None]

test_incorrect_padded = pad_sequences(test_incorrect_sequences, maxlen=max_len, padding='post')
test_correct_padded = pad_sequences(test_correct_sequences, maxlen=max_len, padding='post')[..., None]

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Build the LSTM model
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(
    train_incorrect_padded, train_correct_padded,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

# Evaluate the model on test data
results = lstm_model.evaluate(test_incorrect_padded, test_correct_padded)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

# Function to predict corrected sentence
def predict_sentence(input_sentence):
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    input_padded = pad_sequences(input_sequence, maxlen=max_len, padding='post')
    predictions = lstm_model.predict(input_padded)
    predicted_sequence = tf.argmax(predictions[0], axis=-1).numpy()
    predicted_sentence = " ".join(
        [word for word in tokenizer.sequences_to_texts([predicted_sequence])[0].split() if word != ""]
    )
    return predicted_sentence

# Example usage
input_sentence = "මම ගෙදර යැවෙමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)




Epoch 1/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.2803 - loss: 3.6158 - val_accuracy: 0.6914 - val_loss: 1.2587
Epoch 2/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.7503 - loss: 1.0695 - val_accuracy: 0.8243 - val_loss: 0.7130
Epoch 3/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8273 - loss: 0.7026 - val_accuracy: 0.8723 - val_loss: 0.5190
Epoch 4/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8842 - loss: 0.5050 - val_accuracy: 0.9141 - val_loss: 0.3747
Epoch 5/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9149 - loss: 0.3734 - val_accuracy: 0.9398 - val_loss: 0.2779
Epoch 6/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9398 - loss: 0.2756 - val_accuracy: 0.9544 - val_loss: 0.2060
Epoch 7/10
[1m144/144

In [11]:
# Example usage
input_sentence = "මම ගෙදර යැවෙමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Input Sentence: මම ගෙදර යැවෙමු
Predicted Sentence: මම ගෙදර යැවෙමි


In [2]:
pip install langchain huggingface-hub sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [6]:
pip install --upgrade langchain


Collecting langchain
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain)
  Downloading langchain_core-0.3.29-py3-none-any.whl.metadata (6.3 kB)
Downloading langchain-0.3.14-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.29-py3-none-any.whl (411 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.6/411.6 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-core, langchain
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.25
    Uninstalling langchain-core-0.3.25:
      Successfully uninstalled langchain-core-0.3.25
  Attempting uninstall: langchain
    Found existing installation: langchain 0.3.12
    Uninstalling langchain-0.3.12:
      Successfully uninstalled langchain-0.3.12
Successf

In [8]:
pip install langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [10]:
pip install --upgrade langchain




In [43]:
from google.colab import userdata

# Fetch the API key securely
api_key = userdata.get('OPENAI_API_KEY')

# Use the fetched API key in your application
checker = GrammarChecker(api_key=api_key)


GPT-based Grammar and spell Checker with RAG Architecture

In [46]:
import json
from typing import List, Dict
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class GrammarChecker:
    def __init__(self, model_name: str = "gpt-3.5-turbo", api_key: str = None):
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0,
            api_key=api_key
        )
        self.vector_store = None
        self.prompt = PromptTemplate(
            template="""Context: {context}

Input sentence: {input_text}

Analyze the given Sinhala sentence and provide:
1. List of grammatical or spelling errors
2. Explanations for each error
3. Corrected sentence

Response in format:
Errors:
[List errors]

Explanations:
[Explain each error]

Corrected:
[Corrected sentence]""",
            input_variables=["context", "input_text"]
        )
        self.chain = LLMChain(llm=self.llm, prompt=self.prompt)

    def load_training_data(self, file_path: str):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        documents = []
        for item in data['training_data']:
            content = f"""
            Incorrect: {item['incorrect_sentence']}
            Correct: {item['correct_sentence']}
            Error: {item['error_details']}
            """
            documents.append(Document(page_content=content))

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        split_docs = text_splitter.split_documents(documents)
        self.vector_store = FAISS.from_documents(split_docs, self.embeddings)

    def check_grammar(self, text: str, k: int = 3) -> str:
        if not self.vector_store:
            raise ValueError("Training data not loaded. Call load_training_data first.")

        similar_examples = self.vector_store.similarity_search(text, k=k)
        context = "\n".join([doc.page_content for doc in similar_examples])

        response = self.chain.run(
            context=context,
            input_text=text
        )
        return response

    def batch_check(self, texts: List[str]) -> List[str]:
        return [self.check_grammar(text) for text in texts]

def main():


    # Load training data
    checker.load_training_data("/content/drive/MyDrive/sinhala_grammar_training.json")

    # Example usage
    text = "අපි උත්සාහයෙන් වැඩ කරමි"
    result = checker.check_grammar(text)
    print(f"Input: {text}\n")
    print(f"Analysis:\n{result}")

    # Batch processing example
    texts = [
        "මම කෑම ගනිමු",
        "අපි පොත කියවමි",
        "මේ දින වල සිග්‍රයන් පැතිර යන වසන්ගත උන රෝගයක් නිසා අප විශ්විද්‍යායේ බොහෝ සිසුන් පීඩාවට පත්වී සිටී."
    ]
    results = checker.batch_check(texts)
    for text, result in zip(texts, results):
        print(f"\nInput: {text}")
        print(f"Analysis:\n{result}")

if __name__ == "__main__":
    main()


Input: අපි උත්සාහයෙන් වැඩ කරමි

Analysis:
Errors:
1. වැඩ කරමි

Explanations:
1. Verb 'කරමි' does not match the subject 'අපි'. It should end with 'මු'.

Corrected:
අපි උත්සාහයෙන් වැඩ කරමු

Input: මම කෑම ගනිමු
Analysis:
Errors:
1. Verb Agreement Error

Explanations:
1. The verb 'ගනිමු' does not match the subject 'මම'. It should end with 'මි'.

Corrected:
මම කෑම ගනිමි

Input: අපි පොත කියවමි
Analysis:
Errors:
1. කියවමි should be කියවිමි

Explanations:
1. Verb 'කියවමි' does not match the subject 'අපි'. It should end with 'කියවිමි'.

Corrected:
අපි පොත කියවිමි

Input: මේ දින වල සිග්‍රයන් පැතිර යන වසන්ගත උන රෝගයක් නිසා අප විශ්විද්‍යායේ බොහෝ සිසුන් පීඩාවට පත්වී සිටී.
Analysis:
Errors:
1. වසන්ගත should be වසන්ගත්
2. උන should be උනා
3. පීඩාවට should be පීඩාවෙන්

Explanations:
1. The word වසන්ගත should have the case marker වසන්ගත් to indicate the direct object.
2. The verb උන should be conjugated as උනා to match the subject.
3. The preposition පීඩාවට should be followed by the case marker පීඩාවෙන

Best one is GPT based Grammar and spell checker