In [69]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored on Drive
# can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Download the NLTK 'punkt' and 'punkt_tab' resources.
# Presumably these back word_tokenize / sent_tokenize used in this notebook — confirm
# which of the two the installed NLTK version actually needs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [71]:
# Load the labelled corpus. Every non-blank line is split at its first space into
# an integer label ("category") and the remaining text ("text").
file_path = '/content/drive/MyDrive/new_dataset.txt'
with open(file_path, 'r', encoding='utf-16') as file:
    # Stream the handle line by line instead of materialising readlines(), and
    # skip blank lines: a blank line would produce an empty label and make the
    # integer conversion below raise ValueError.
    data = [line.split(" ", 1) for line in map(str.strip, file) if line]

data_frame = pd.DataFrame(data, columns=["category", "text"])
data_frame["category"] = data_frame["category"].astype(int)
# Rows whose line had no text part after the label come through as missing
# values; store an empty string for them so downstream text handling sees str.
data_frame['text'] = data_frame['text'].fillna('')

text_data = data_frame['text']
labels = data_frame['category']

In [72]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified by label — consider stratify=labels if the classes are imbalanced; confirm.
X_train_data, X_test_data, y_train_labels, y_test_labels = train_test_split(text_data, labels, test_size=0.2, random_state=42)

In [73]:
# Token-count features built with NLTK's word_tokenize as the tokenizer.
# token_pattern=None is passed alongside the custom tokenizer — presumably to avoid
# scikit-learn's warning about an unused token_pattern; confirm against the docs.
vectorizer_instance = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
# The vocabulary is learned from the training split only...
X_train_vec = vectorizer_instance.fit_transform(X_train_data)
# ...and reused unchanged to encode the test split.
X_test_vec = vectorizer_instance.transform(X_test_data)

In [74]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier with 100 estimators; random_state=42 for repeatable fits.
# This is the model the later prediction and grammar-check cells use.
classifier_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
classifier_model.fit(X_train_vec, y_train_labels)


In [75]:
# Predict labels for the held-out split with the gradient-boosting model and
# report plain accuracy, formatted to two decimals.
predictions = classifier_model.predict(X_test_vec)
model_accuracy = accuracy_score(y_test_labels, predictions)
print(f"Accuracy: {model_accuracy:.2f}")

Accuracy: 0.90


In [76]:
def grammar_check(input_text, vectorizer, model, dataframe):
    """
    Classify each sentence of ``input_text`` and substitute flagged sentences.

    The text is split with ``nltk.sent_tokenize``. Each sentence is vectorized
    and passed to ``model.predict``; a prediction of ``1`` keeps the sentence
    unchanged. Any other prediction marks the input as not fully correct, and
    the sentence is replaced by the ``dataframe`` row with ``category == 1``
    whose whitespace-token set has the highest Jaccard similarity
    (|intersection| / |union|) with the sentence's token set. When no such row
    shares a token with the sentence, the sentence is kept as it is.

    Args:
        input_text: Text to check; may contain several sentences.
        vectorizer: Fitted object whose ``transform`` accepts a list of strings.
        model: Fitted object whose ``predict`` returns one label per row.
        dataframe: DataFrame with a ``category`` column and a ``text`` column.

    Returns:
        tuple: ``(corrected_text, all_correct)`` where ``corrected_text`` is
        the resulting sentences joined by single spaces and ``all_correct`` is
        True only if every sentence was predicted as ``1``.
    """
    sentences = nltk.sent_tokenize(input_text)
    corrected_sentences = []
    all_correct = True
    # (text, token-set) pairs for the category-1 rows. Built lazily and at most
    # once per call, so the frame is neither filtered nor re-tokenised for
    # every flagged sentence, and not touched at all when nothing is flagged.
    reference_sentences = None

    for sentence in sentences:
        vectorized_text = vectorizer.transform([sentence])
        result = model.predict(vectorized_text)[0]

        if result == 1:
            corrected_sentences.append(sentence)  # Keep the original sentence if correct
            continue

        all_correct = False
        if reference_sentences is None:
            correct_texts = dataframe[dataframe['category'] == 1]['text']
            reference_sentences = [(text, set(text.split())) for text in correct_texts]

        # Tokenise the flagged sentence once rather than once per reference row.
        input_set = set(sentence.split())
        best_match = None
        highest_similarity = 0

        for correct_text, correct_set in reference_sentences:
            union_size = len(input_set | correct_set)
            similarity_score = len(input_set & correct_set) / union_size if union_size else 0
            # Strict '>' keeps the earliest row among equally similar candidates.
            if similarity_score > highest_similarity:
                highest_similarity = similarity_score
                best_match = correct_text

        corrected_sentences.append(best_match if best_match else sentence)

    corrected_text = " ".join(corrected_sentences)
    return corrected_text, all_correct

In [77]:
# 5-fold cross-validated accuracy of the gradient-boosting model on the training features.
# NOTE(review): X_train_vec comes from a vectorizer fitted on the whole training split, so each
# fold's vocabulary was built with its validation rows included — consider a Pipeline; confirm.
cross_val_scores = cross_val_score(classifier_model, X_train_vec, y_train_labels, cv=5, scoring='accuracy')
print(f"Cross-Validation Mean Accuracy: {cross_val_scores.mean():.2f}")

Cross-Validation Mean Accuracy: 0.89


In [78]:
# Hyper-parameter grid for a random forest: 3 x 3 x 3 x 3 = 81 combinations.
param_grid_values = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 3-fold search over the grid, using all cores (n_jobs=-1).
# NOTE(review): this tunes a RandomForestClassifier, while the prediction, report and
# grammar-check cells in this notebook use classifier_model (gradient boosting);
# best_classifier_model is not referenced by any other visible cell — confirm intent.
grid_search_model = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_values, cv=3, n_jobs=-1, verbose=2)
grid_search_model.fit(X_train_vec, y_train_labels)
best_classifier_model = grid_search_model.best_estimator_

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [79]:
# Confusion matrix and per-class precision/recall/F1 on the held-out split.
# NOTE(review): `predictions` was produced by classifier_model in an earlier cell,
# not by the grid-searched best_classifier_model — confirm this is the model meant here.
conf_matrix = confusion_matrix(y_test_labels, predictions)
print(conf_matrix)
print(classification_report(y_test_labels, predictions))

[[2878    0]
 [ 344  166]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      2878
           1       1.00      0.33      0.49       510

    accuracy                           0.90      3388
   macro avg       0.95      0.66      0.72      3388
weighted avg       0.91      0.90      0.88      3388



In [82]:
def test_new_sentence(input_sentence, vectorizer, model, dataframe):
    """
    Run ``grammar_check`` on ``input_sentence`` and print the verdict.

    Prints a confirmation when every sentence was accepted; otherwise prints a
    notice followed by the corrected text. Nothing is returned.
    """
    corrected, is_clean = grammar_check(input_sentence, vectorizer, model, dataframe)

    if not is_clean:
        print("The sentence has grammar issues. Corrected version:")
        print(corrected)
        return

    print("The sentence is grammatically correct!")

# Sample multi-sentence input used to exercise the checker end to end.
new_sentence = "මම උදෑසන ආහාරය සදහා බොහෝවිට බත් ආහාරයට ගැනීමට පුරුදුවී සිටින්නෙමු. නමුත් ඉදහිට පිටි වලින් සාදාගත් ආහාරද පරිබෝජනය කරන්නෙය. උදේ ආහාරය ලෙස බත් ආහාරයට ගත් විට දවසේ ඉදිරි වැඩ කටයුතු උද්‍යෝගිමත්ව සිදුකිරීමට හැකියාව ලැබේ. එලෙස පෝෂනය පිළිබඳ බලණවා."
# Alternative sample inputs; uncomment one of the lines below to try it instead.
#new_sentence = "අපි පසුගිය දිනවල අත්පන්දු ක්‍රීඩා තරග සදහා පුහුනුවීම් කටයුතු සිදුකලේය"
#new_sentence = "මම සෑම පෝය දිනකම සිල් සමාදන් වීමට පන්සල් යන්නෙමු. අපි පසුගිය දිනවල අත්පන්දු ක්‍රීඩා තරග සදහා පුහුනුවීම් කටයුතු සිදුකලේය."

# Check the sample with the fitted vectorizer, the gradient-boosting model and the full dataset frame.
test_new_sentence(new_sentence, vectorizer_instance, classifier_model, data_frame)

The sentence has grammar issues. Corrected version:
මම උදෑසන ආහාරය සදහා බොහෝවිට බත් ආහාරයට ගැනීමට පුරුදුවී සිටින්නෙමි නමුත් ඉදහිට පිටි වලින් සාදාගත් ආහාරද පරිබෝජනය කරන්නෙය. උදේ ආහාරය ලෙස බත් ආහාරයට ගත් විට දවසේ ඉදිරි වැඩ කටයුතු උද්‍යෝගිමත්ව සිදුකිරීමට හැකියාව ලැබේ. එලෙස පෝෂනය පිළිබඳ බලණවා.


In [86]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [88]:
!pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

In [90]:
import ipywidgets as widgets
from IPython.display import display

# Build the UI controls: a text area for the input, an output area for the
# results, and the button that triggers the check.
input_box = widgets.Textarea(
    description="Input Sentence:",
    placeholder="Enter a Sinhala sentence...",
    layout=widgets.Layout(width='80%', height='100px')
)

# Output area; the click handler prints inside this widget's context.
output_box = widgets.Output()

check_button = widgets.Button(
    description="Check Grammar",
    button_style="primary",
    tooltip="Click to check grammar of the input sentence"
)

# Function to handle button click
def on_button_click(b):
    """Handle a click on the "Check Grammar" button.

    Clears ``output_box``, then either asks for input when the text area is
    blank or runs ``test_new_sentence`` on the stripped text. Everything that
    is printed, including the message for a caught exception, is emitted
    inside the ``output_box`` context.

    Args:
        b: Object passed by the click event; not used.
    """
    with output_box:
        output_box.clear_output()  # Drop whatever the previous click printed
        sentence_text = input_box.value.strip()

        if sentence_text:
            try:
                test_new_sentence(sentence_text, vectorizer_instance, classifier_model, data_frame)
            except Exception as error:
                # Report the failure in the output area instead of raising from the callback.
                print(f"An error occurred: {error}")
        else:
            print("Please enter a sentence.")

# Register on_button_click as the handler for clicks on the button
check_button.on_click(on_button_click)

# Render the text area, the button and the output area, in that order
display(input_box, check_button, output_box)


Textarea(value='', description='Input Sentence:', layout=Layout(height='100px', width='80%'), placeholder='Ent…

Button(button_style='primary', description='Check Grammar', style=ButtonStyle(), tooltip='Click to check gramm…

Output()