In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
dataset_path = 'Dataset/sinhala_dataset.csv'
data = pd.read_csv(dataset_path)

# Prepare data
input_texts = data['grammar_error_sentence'].values
target_texts = data['corrected_sentence'].values

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Padding sequences
max_seq_len = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_len, padding='post')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(input_sequences, target_sequences, test_size=0.2, random_state=42)

# Model parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 1024
lstm_units = 1024

# Define the model
inputs = Input(shape=(max_seq_len,))
x = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs)
x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x)
attention = Attention()([x, x])  
x = Dense(256, activation='relu')(x)
x = tf.keras.layers.Concatenate()([x, attention]) 
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
outputs = Dense(vocab_size, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

# Save the model
model.save('Models/Advanced_LSTM/advanced_lstm_sinhala_grammar_checker.h5')

# Example prediction function
def correct_sentence(input_sentence):
    sequence = tokenizer.texts_to_sequences([input_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_len, padding='post')
    prediction = model.predict(padded_sequence)
    predicted_sequence = np.argmax(prediction, axis=-1)[0]
    corrected_sentence = tokenizer.sequences_to_texts([predicted_sequence])[0]
    return corrected_sentence

# Test the model
test_sentence = "මම ගමට යනවා"
print("Corrected Sentence:", correct_sentence(test_sentence))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Corrected Sentence: මම ගමට වාහන කළේය කළේය කළේය කළේය


In [7]:
# Example prediction function
def correct_sentence(input_sentence):
    sequence = tokenizer.texts_to_sequences([input_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_len, padding='post')
    prediction = model.predict(padded_sequence)
    predicted_sequence = np.argmax(prediction, axis=-1)[0]
    corrected_sentence = tokenizer.sequences_to_texts([predicted_sequence])[0]
    return corrected_sentence

# Test the model
test_sentence = "මම ඔහුගෙන් පොතක් දෙන්නෙමු"
print("Corrected Sentence:", correct_sentence(test_sentence))

Corrected Sentence: වාහන ඔහුගෙන් පොතක් කළේය කළේය කළේය කළේය
