In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.utils import to_categorical

# Load your dataset
df = pd.read_csv('Error Annotated Corpus.csv')  # Replace with your dataset path in Colab

# Names of the two corpus columns used below (erroneous text / corrected text).
ERROR_COL = 'Error word & consecutive word'
CORRECT_COL = 'Corrected words & its'

# Drop rows with missing values. The explicit .copy() makes `df` an independent
# frame, so the column assignments below never write into a slice of the raw
# frame (which is what triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=[ERROR_COL, CORRECT_COL]).copy()

# Preprocess the text (strip extra spaces)
df[ERROR_COL] = df[ERROR_COL].str.strip()
df[CORRECT_COL] = df[CORRECT_COL].str.strip()

# Prepare features and labels
X = df[ERROR_COL].values  # Features: error words
y = df[CORRECT_COL].values  # Labels: corrected words

# Tokenization: separate word-level vocabularies for input and output text
tokenizer_x = Tokenizer(char_level=False)  # Word-level tokenizer for input
tokenizer_x.fit_on_texts(X)

tokenizer_y = Tokenizer(char_level=False)  # Word-level tokenizer for output
tokenizer_y.fit_on_texts(y)

# Convert text to sequences of integer word ids
X_sequences = tokenizer_x.texts_to_sequences(X)
y_sequences = tokenizer_y.texts_to_sequences(y)

# Pad inputs and targets to ONE shared length. The model emits one prediction per
# input time step, so both arrays must have the same width. Using the longest
# sequence from either side guarantees that a target which is longer than every
# input is not silently truncated (previously y was cut to X's maximum length).
max_len = max(len(seq) for seq in X_sequences + y_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post')
y_padded = pad_sequences(y_sequences, maxlen=max_len, padding='post')

# Vocabulary sizes; the +1 accounts for id 0, which is the padding value.
num_input_tokens = len(tokenizer_x.word_index) + 1
num_output_tokens = len(tokenizer_y.word_index) + 1

# Prepare output for categorical cross-entropy: one one-hot matrix per target sequence.
# NOTE(review): this array has shape (samples, max_len, num_output_tokens) and can get
# large; integer targets with a sparse loss would avoid materialising it.
y_onehot = np.array(
    [to_categorical(seq, num_classes=num_output_tokens) for seq in y_padded]
)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, y_onehot, test_size=0.2, random_state=42)

# Define the RNN model: embedding -> LSTM over the sequence -> per-time-step softmax
model = Sequential()
model.add(Embedding(input_dim=num_input_tokens, output_dim=128))  # No fixed input_length
model.add(LSTM(64, return_sequences=True))
model.add(TimeDistributed(Dense(num_output_tokens, activation='softmax')))

# Compile the model
# NOTE(review): padded positions are not masked, so they count towards loss and
# accuracy; the reported accuracy is presumably inflated by padding -- confirm.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy:.2f}')

# Function to correct grammar
def correct_grammar(sentence):
    """Run the trained model on one sentence and decode its prediction to text.

    The sentence is encoded with ``tokenizer_x``, post-padded to the width of
    ``X_padded``, and passed to ``model.predict``. For every time step the most
    probable output id is taken; id 0 (padding) is skipped and the remaining
    ids are mapped back to words through ``tokenizer_y.index_word``.

    Args:
        sentence: The text to correct, as a single string.

    Returns:
        The predicted words joined by single spaces (an empty string when
        every time step predicts padding).
    """
    max_len = X_padded.shape[1]
    encoded = pad_sequences(
        tokenizer_x.texts_to_sequences([sentence]),
        maxlen=max_len,
        padding='post',
    )

    # Only one sentence is in the batch, so take row 0 of the prediction and
    # pick the most probable output id at each time step.
    step_probabilities = model.predict(encoded)[0]
    predicted_ids = step_probabilities.argmax(axis=-1)

    return ' '.join(
        tokenizer_y.index_word[token_id]
        for token_id in predicted_ids
        if token_id != 0  # id 0 is padding, not a word
    )

# Emit the label line (no sentence is appended here; the example cells print actual corrections).
print('Corrected Sentence:')

# Persist the trained model to disk (optional)
model.save('tamil_spell_checker_model.h5')

Epoch 1/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.5099 - loss: 7.2012 - val_accuracy: 0.5351 - val_loss: 4.1143
Epoch 2/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.5357 - loss: 4.0325 - val_accuracy: 0.5351 - val_loss: 4.1121
Epoch 3/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - accuracy: 0.5346 - loss: 3.9358 - val_accuracy: 0.5351 - val_loss: 4.1424
Epoch 4/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 59ms/step - accuracy: 0.5365 - loss: 3.8631 - val_accuracy: 0.5351 - val_loss: 4.1569
Epoch 5/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.5347 - loss: 3.8289 - val_accuracy: 0.5351 - val_loss: 4.1701
Epoch 6/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.5371 - loss: 3.7709 - val_accuracy: 0.5351 - val_loss: 4.1801
Epoch 7/10
[1m126/



Corrected Sentence: 


In [30]:
# Demo: feed one erroneous phrase through the trained model and show its output.
incorrect_sentence = "மனவுளைச்சலே ஏற்படுகிறவை"
corrected_sentence = correct_grammar(sentence=incorrect_sentence)
print(f'Corrected Sentence: {corrected_sentence}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Corrected Sentence: மனவுளைச்சலே ஏற்படுகிறது


In [31]:
# Demo: a second erroneous phrase run through the same correction function.
incorrect_sentence = "மரங்கள் வேரோடு சாய்ந்தது"
corrected_sentence = correct_grammar(sentence=incorrect_sentence)
print(f'Corrected Sentence: {corrected_sentence}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Corrected Sentence: மரங்கள் வேரோடு சாய்ந்தன


In [39]:
!pip install stanza

import stanza
from collections import Counter

# Load Tamil Stanza pipeline for POS tagging
stanza.download('ta')  # Download the Tamil language model if not already done
nlp = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=False)
from collections import Counter

# Load Tamil Stanza pipeline for POS tagging
stanza.download('ta')  # Download the Tamil language model if not already done
nlp = stanza.Pipeline('ta', processors='tokenize,pos', use_gpu=False)
def analyze_and_correct_tamil_sentence(input_sentence):
    """Check a Tamil sentence against three heuristic rules and propose a correction.

    The sentence is tagged with the module-level Stanza pipeline ``nlp``; the
    rules then work on the flat list of words and their universal POS tags:

    1. Word order: the first PRON, first NOUN and first VERB must appear in that
       order (subject-object-verb). If not, those three words are moved to the
       front in that order and the remaining words follow in their original order.
    2. Adjective order: an ADJ that has no NOUN after it but has a NOUN before it
       is reported and moved directly in front of the nearest preceding NOUN.
    3. Plural agreement: if the first PRON ends with "ள்" and the first VERB does
       not end with "ோம்", the verb is rewritten ("ேன்" replaced by "ோம்",
       otherwise "ோம்" is appended).

    All corrections are applied to one shared ordering of the tagger's words, so
    a verb rewritten by rule 3 stays correct even after rule 1 or 2 moved words.

    Args:
        input_sentence: The sentence to analyse, as a string.

    Returns:
        ``{"status": "correct", "details": <message str>}`` when no rule fires,
        otherwise ``{"status": "errors", "details": <list of messages>,
        "corrected_sentence": <str>}`` where the corrected sentence is the
        tagger's words joined by single spaces.
    """
    # Process the sentence using Stanza NLP
    doc = nlp(input_sentence)

    # Extract words and POS tags from the processed sentence
    word_list = []
    pos_list = []
    for sentence in doc.sentences:
        for token in sentence.words:
            word_list.append(token.text)
            pos_list.append(token.upos)

    grammar_issues = []
    # `order` lists indices into word_list in corrected reading order, and
    # `replacements` maps such an index to a rewritten word. Keeping positions
    # and rewrites separate means every rule can keep addressing words by their
    # original index no matter what an earlier rule moved.
    order = list(range(len(word_list)))
    replacements = {}

    # Rule 1: Subject-Object-Verb (SOV) Order
    if 'PRON' in pos_list and 'NOUN' in pos_list and 'VERB' in pos_list:
        pronoun_idx = pos_list.index('PRON')
        noun_idx = pos_list.index('NOUN')
        verb_idx = pos_list.index('VERB')

        if not (pronoun_idx < noun_idx < verb_idx):
            grammar_issues.append("Error: Sentence should follow Subject-Object-Verb (SOV) order.")
            # Adjust word order to SOV, remaining words keep their relative order
            core = (pronoun_idx, noun_idx, verb_idx)
            order = list(core) + [i for i in order if i not in core]

    # Rule 2: Adjective-Noun Order
    # NOTE(review): "nearest preceding noun" is a heuristic for the noun the
    # adjective modifies; the tagger output carries no dependency information here.
    for adj_idx, pos_tag in enumerate(pos_list):
        if pos_tag != 'ADJ':
            continue
        if 'NOUN' in pos_list[adj_idx + 1:]:
            continue  # the adjective already precedes a noun
        preceding_nouns = [j for j in range(adj_idx) if pos_list[j] == 'NOUN']
        if not preceding_nouns:
            continue  # no noun at all before it; nothing to attach the adjective to
        grammar_issues.append("Error: Adjectives should precede the nouns they modify.")
        # Move the adjective directly in front of that noun in the corrected order
        order.remove(adj_idx)
        order.insert(order.index(preceding_nouns[-1]), adj_idx)

    # Rule 3: Plural Agreement
    if 'PRON' in pos_list and 'VERB' in pos_list:
        pronoun_idx = pos_list.index('PRON')
        verb_idx = pos_list.index('VERB')
        pronoun_word = word_list[pronoun_idx]
        verb_word = word_list[verb_idx]

        # NOTE(review): "ோம்" looks like a first-person-plural ending; other plural
        # pronouns ending in "ள்" presumably need a different verb ending -- confirm.
        if pronoun_word.endswith("ள்") and not verb_word.endswith("ோம்"):
            grammar_issues.append("Error: Plural pronouns should match plural verb forms.")
            # Adjust verb to plural form; keyed by the verb's own index so the
            # rewrite lands on the verb wherever it sits in `order`.
            if "ேன்" in verb_word:
                replacements[verb_idx] = verb_word.replace("ேன்", "ோம்")
            else:
                replacements[verb_idx] = verb_word + "ோம்"

    # Return results
    if grammar_issues:
        corrected_words = [replacements.get(i, word_list[i]) for i in order]
        return {
            "status": "errors",
            "details": grammar_issues,
            "corrected_sentence": " ".join(corrected_words),
        }
    return {
        "status": "correct",
        "details": "The sentence is grammatically correct.",
    }






Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...
INFO:stanza:File exists: /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ta (Tamil) ...
INFO:stanza:File exists: /root/stanza_resources/ta/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ta (Tamil):
| Processor | Package      |
----------------------------
| tokenize  | ttb          |
| mwt       | ttb          |
| pos       | ttb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


In [41]:

input_sentence = "கணிதம்  நான் படிக்கிறேன்"   # Example incorrect Tamil sentence
analysis_result = analyze_and_correct_tamil_sentence(input_sentence)

# Report the outcome: list every detected issue and the suggested sentence,
# or print the confirmation message when nothing was flagged.
if analysis_result["status"] != "correct":
    print("Grammar Issues Detected:")
    for issue in analysis_result["details"]:
        print(f"- {issue}")
    if "corrected_sentence" in analysis_result:
        print(f"Corrected Sentence: {analysis_result['corrected_sentence']}")
else:
    print(analysis_result["details"])

Grammar Issues Detected:
- Error: Sentence should follow Subject-Object-Verb (SOV) order.
Corrected Sentence: நான் கணிதம் படிக்கிறேன்


In [42]:
input_sentence = "வரலாறு படிக்கிறேன் நான்"
analysis_result = analyze_and_correct_tamil_sentence(input_sentence)

# Report the outcome: issues plus suggested sentence, or the all-clear message.
if analysis_result["status"] != "correct":
    print("Grammar Issues Detected:")
    for issue in analysis_result["details"]:
        print(f"- {issue}")
    if "corrected_sentence" in analysis_result:
        print(f"Corrected Sentence: {analysis_result['corrected_sentence']}")
else:
    print(analysis_result["details"])

Grammar Issues Detected:
- Error: Sentence should follow Subject-Object-Verb (SOV) order.
Corrected Sentence: நான் வரலாறு படிக்கிறேன்


In [43]:
input_sentence = "படிக்கிறேன் நான் அறிவியல்"
analysis_result = analyze_and_correct_tamil_sentence(input_sentence)

# Report the outcome: issues plus suggested sentence, or the all-clear message.
if analysis_result["status"] != "correct":
    print("Grammar Issues Detected:")
    for issue in analysis_result["details"]:
        print(f"- {issue}")
    if "corrected_sentence" in analysis_result:
        print(f"Corrected Sentence: {analysis_result['corrected_sentence']}")
else:
    print(analysis_result["details"])

Grammar Issues Detected:
- Error: Sentence should follow Subject-Object-Verb (SOV) order.
Corrected Sentence: நான் அறிவியல் படிக்கிறேன்


In [2]:

# # Install TensorFlow if not already installed
# !pip install tensorflow

# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.utils import to_categorical

# # Load your dataset
# df = pd.read_csv('Error Annotated Corpus.csv')



In [16]:
# Preview the first 20 rows of the corpus frame.
df.head(n=20)

Unnamed: 0,Error word & consecutive word,Corrected words & its,Annotation
0,10கனநீர் உலைகள்,10கணநீர் உலைகள்,வேற்றெழுத்து
1,அகளவிரித்து,அகலவிரித்து,வேற்றெழுத்து
2,அணைத்து ஊழியர்களுக்குமான,அனைத்து ஊழியர்களுக்குமான,வேற்றெழுத்து
3,அதள பாதாளதுக்கு,அதல பாதாளத்துக்கு,வேற்றெழுத்து
4,அதற்குறிய தீர்வுகளை,அதற்குரிய தீர்வுகளை,வேற்றெழுத்து
5,அதற்கேற்றார்போல அதன்,அதற்கேற்றாற்போல அதன்,வேற்றெழுத்து
6,அந்நியமாக நடந்தது,"அன்னியமாக,அன்னியமாக்க நடந்தது",வேற்றெழுத்து
7,அம்பிலாந்துறை,அம்பிளாந்துறை,வேற்றெழுத்து
8,அரவனைத்துச் செல்கின்ற,அரவணைத்துச் செல்கின்ற,வேற்றெழுத்து
9,அருவறுப்பாக நினைக்கிறார்கள்,அறுவறுப்பாக நினைக்கிறார்கள்,வேற்றெழுத்து


In [3]:
# Preview the first 5 rows (the default for head) of the corpus frame.
df.head(n=5)

Unnamed: 0,Error word & consecutive word,Corrected words & its,Annotation
0,10கனநீர் உலைகள்,10கணநீர் உலைகள்,வேற்றெழுத்து
1,அகளவிரித்து,அகலவிரித்து,வேற்றெழுத்து
2,அணைத்து ஊழியர்களுக்குமான,அனைத்து ஊழியர்களுக்குமான,வேற்றெழுத்து
3,அதள பாதாளதுக்கு,அதல பாதாளத்துக்கு,வேற்றெழுத்து
4,அதற்குறிய தீர்வுகளை,அதற்குரிய தீர்வுகளை,வேற்றெழுத்து


In [4]:
# # Drop rows with missing values
# df = df.dropna(subset=['Error word & consecutive word', 'Corrected words & its'])

# # Preprocess the text (strip extra spaces)
# df['Error word & consecutive word'] = df['Error word & consecutive word'].str.strip()
# df['Corrected words & its'] = df['Corrected words & its'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Error word & consecutive word'] = df['Error word & consecutive word'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Corrected words & its'] = df['Corrected words & its'].str.strip()


In [5]:
# # Prepare features and labels
# X = df['Error word & consecutive word']  # Features: error words
# y = df['Corrected words & its']  # Labels: corrected words

In [6]:
# # Label Encoding for the output labels
# le = LabelEncoder()
# y_encoded = le.fit_transform(y)


In [7]:
# # Tokenization
# tokenizer = Tokenizer(char_level=True)  # Set char_level=True for character-level
# tokenizer.fit_on_texts(X)

In [8]:
# # Convert text to sequences
# X_sequences = tokenizer.texts_to_sequences(X)
# X_padded = pad_sequences(X_sequences, padding='post')

# # Convert labels to one-hot encoding
# y_onehot = to_categorical(y_encoded)

In [9]:
# # Train-test split
# X_train, X_val, y_train, y_val = train_test_split(X_padded, y_onehot, test_size=0.2, random_state=42)

In [10]:
# # Define the RNN model
# model = Sequential()
# model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=X_padded.shape[1]))
# model.add(LSTM(64))
# model.add(Dense(len(le.classes_), activation='softmax'))



In [11]:
# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [12]:
# # Train the model
# model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.0000e+00 - loss: 8.5099 - val_accuracy: 0.0000e+00 - val_loss: 8.6059
Epoch 2/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.0011 - loss: 8.5019 - val_accuracy: 0.0000e+00 - val_loss: 8.5996
Epoch 3/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 8.4477 - val_accuracy: 0.0000e+00 - val_loss: 9.3777
Epoch 4/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 49ms/step - accuracy: 0.0011 - loss: 8.3826 - val_accuracy: 0.0000e+00 - val_loss: 9.6735
Epoch 5/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 8.2237e-04 - loss: 8.3548 - val_accuracy: 0.0000e+00 - val_loss: 10.2698
Epoch 6/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 4.8233e-04 - loss: 8.3392 - val_accuracy: 0.0000e+00 

<keras.src.callbacks.history.History at 0x7b3f8a0bf850>

In [15]:
# # Save the model (optional)
# model.save('tamil_checker_model.h5')



In [None]:
# # Function to correct grammar
# def correct_grammar(sentence):
#     seq = tokenizer_x.texts_to_sequences([sentence])
#     padded = pad_sequences(seq, maxlen=X_padded.shape[1], padding='post')
#     pred = model.predict(padded)
#     pred_sentence = ' '.join([tokenizer_y.index_word[np.argmax(word)] for word in pred[0]])
#     return pred_sentence
