sinhala_grammar_checker

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset of (erroneous sentence, corrected sentence) pairs
dataset_path = 'Dataset/sinhala_dataset.csv'
data = pd.read_csv(dataset_path)

# Prepare data: the model input is the erroneous sentence and the target is
# its corrected counterpart.
input_texts = data['grammar_error_sentence'].values
target_texts = data['corrected_sentence'].values

# Tokenize the sentences with a single vocabulary shared by inputs and targets.
# NOTE: `input_texts + target_texts` on the arrays returned by `.values` is an
# element-wise string concatenation (row i becomes error_i + corrected_i with
# no separator), not a concatenation of the two corpora. That glues the last
# word of every input onto the first word of its target and fits the
# vocabulary on the wrong text, so both corpora are joined as plain lists.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(input_texts) + list(target_texts))
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Padding sequences: inputs and targets are padded (at the end) to one common
# length so that they line up position by position.
max_seq_len = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_len, padding='post')

# Split the data: 80% training, 20% validation, with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(input_sequences, target_sequences, test_size=0.2, random_state=42)

# Model hyperparameters
# Tokenizer word indices start at 1 and pad_sequences fills with 0, hence +1.
vocab_size = 1 + len(tokenizer.word_index)
embedding_dim = 128
lstm_units = 256

# Build the network: embedding -> LSTM emitting one output per time step ->
# softmax over the vocabulary at every position.
inputs = Input(shape=(max_seq_len,))
embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
x = LSTM(units=lstm_units, return_sequences=True)(embedded)
outputs = Dense(units=vocab_size, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs, validating on the held-out split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

# Persist the trained model to disk
model.save('Models/LSTM_Model/lstm_sinhala_grammar_checker.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Example prediction
def correct_sentence(input_sentence):
    """Return the model's predicted correction for ``input_sentence``.

    The sentence is converted to token ids with the module-level
    ``tokenizer``, post-padded to ``max_seq_len``, and passed through
    ``model``. The highest-scoring token id at each position is then mapped
    back to text with the same tokenizer.
    """
    # Encode as a batch of one and pad it to the length the model was built for.
    token_ids = tokenizer.texts_to_sequences([input_sentence])
    model_input = pad_sequences(token_ids, maxlen=max_seq_len, padding='post')

    # Scores for the single sentence in the batch, one row per position.
    position_scores = model.predict(model_input)[0]
    best_ids = tf.argmax(position_scores, axis=-1).numpy()

    decoded = tokenizer.sequences_to_texts([best_ids])
    return decoded[0]

# Run one sample sentence through the trained model and show the result
test_sentence = "මම කමින් වේගයෙන් යනනෙමු"
corrected_output = correct_sentence(test_sentence)
print("Corrected Sentence:", corrected_output)

Corrected Sentence: මම නැටුම් වේගයෙන් යවමි
