In [42]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
# Try importing Tokenizer from the new location
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fetch the NLTK resources used by this notebook; the call only reports
# "already up-to-date" when a package is cached locally.
nltk.download('punkt')      # presumably the tokenizer models behind nltk.word_tokenize
nltk.download('wordnet')    # presumably the corpus backing the WordNetLemmatizer import above
nltk.download('punkt_tab')  # NOTE(review): looks like the punkt data newer NLTK releases expect — confirm

# If the TensorFlow imports above fail, TensorFlow is probably not installed.
# In that case, uncomment the following line to install it.
# !pip install tensorflow

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [43]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# Load the grammar-correction CSV from the mounted Drive folder into `df`.
df = pd.read_csv('/content/drive/MyDrive/Grammar_Correction.csv')

In [45]:
# Preview the first five rows to check the column layout.
df.head()

Unnamed: 0,Serial Number,Error Type,Ungrammatical Statement,Standard English
0,1,Verb Tense Errors,I goes to the store everyday.,I go to the store everyday.
1,2,Verb Tense Errors,They was playing soccer last night.,They were playing soccer last night.
2,3,Verb Tense Errors,She have completed her homework.,She has completed her homework.
3,4,Verb Tense Errors,He don't know the answer.,He doesn't know the answer.
4,5,Verb Tense Errors,The sun rise in the east.,The sun rises in the east.


In [46]:
# Drop the 'Serial Number' column.
# `df` is rebound rather than mutated in place, and errors='ignore' skips the
# drop when the column is already gone. The original in-place drop raised
# KeyError whenever this cell was executed a second time.
df = df.drop(columns='Serial Number', errors='ignore')

In [47]:
# Count missing values per column (isnull marks each cell, sum totals them by column).
df.isnull().sum()

Unnamed: 0,0
Error Type,0
Ungrammatical Statement,0
Standard English,0


In [48]:
# Lowercase every string cell; non-string values pass through unchanged.
# DataFrame.applymap is deprecated in recent pandas and emits a FutureWarning,
# so the same element-wise function is applied column by column through
# Series.map, which is available on every pandas version.
df = df.apply(
    lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x)
)

  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [49]:
# Function to retain only specific punctuation
#  Retains only word characters, whitespace, and the punctuation . , ? ! '
def retain_selected_punctuation(text):
    """Remove every character that is not a word character, whitespace or one of . , ? ! '

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r"[^\w\s.,?!']", '', text)

# Run the punctuation filter over both sentence columns
for column_name in ('Ungrammatical Statement', 'Standard English'):
    df[column_name] = df[column_name].apply(retain_selected_punctuation)

df.head()

Unnamed: 0,Error Type,Ungrammatical Statement,Standard English
0,verb tense errors,i goes to the store everyday.,i go to the store everyday.
1,verb tense errors,they was playing soccer last night.,they were playing soccer last night.
2,verb tense errors,she have completed her homework.,she has completed her homework.
3,verb tense errors,he don't know the answer.,he doesn't know the answer.
4,verb tense errors,the sun rise in the east.,the sun rises in the east.


In [50]:
# Tokenization
def tokenize_text(text):
    """
    Split a sentence into word-level tokens using nltk.word_tokenize.

    Returns the list of tokens for string input, and an empty list for any
    non-string value.
    """
    if not isinstance(text, str):
        return []
    return nltk.word_tokenize(text)

# Build a token-list column next to each sentence column
for source_col, token_col in (
    ('Ungrammatical Statement', 'Tokenized Ungrammatical'),
    ('Standard English', 'Tokenized Standard'),
):
    df[token_col] = df[source_col].apply(tokenize_text)

# Preview the frame with the new token columns
df.head()


Unnamed: 0,Error Type,Ungrammatical Statement,Standard English,Tokenized Ungrammatical,Tokenized Standard
0,verb tense errors,i goes to the store everyday.,i go to the store everyday.,"[i, goes, to, the, store, everyday, .]","[i, go, to, the, store, everyday, .]"
1,verb tense errors,they was playing soccer last night.,they were playing soccer last night.,"[they, was, playing, soccer, last, night, .]","[they, were, playing, soccer, last, night, .]"
2,verb tense errors,she have completed her homework.,she has completed her homework.,"[she, have, completed, her, homework, .]","[she, has, completed, her, homework, .]"
3,verb tense errors,he don't know the answer.,he doesn't know the answer.,"[he, do, n't, know, the, answer, .]","[he, does, n't, know, the, answer, .]"
4,verb tense errors,the sun rise in the east.,the sun rises in the east.,"[the, sun, rise, in, the, east, .]","[the, sun, rises, in, the, east, .]"


In [51]:
# Vectorizing for Machine Learning models
# N-gram range (1, 3): unigrams, bigrams and trigrams, so short word sequences
# (local grammar context) become features alongside single words.
ngram_range = (1, 3)

# TF-IDF vectorizer over those n-grams (all other settings left at their defaults)
tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)

# Learn the vocabulary and IDF weights from the ungrammatical sentences, then
# project the corrected sentences into that same feature space so both
# matrices have the same number of columns.
# NOTE(review): the vocabulary comes from the ungrammatical column only, so
# n-grams that occur solely in 'Standard English' are presumably dropped by
# transform — confirm this is intended (the deep-learning cell fits on both).
X_ung = tfidf_vectorizer.fit_transform(df['Ungrammatical Statement'])
X_std = tfidf_vectorizer.transform(df['Standard English'])  # reuses the fitted vocabulary; no refit

# Display the shape of the resulting matrices (rows = sentences, columns = n-gram features)
print(f"Ungrammatical Matrix Shape: {X_ung.shape}")
print(f"Standard Matrix Shape: {X_std.shape}")


Ungrammatical Matrix Shape: (2018, 24906)
Standard Matrix Shape: (2018, 24906)


In [52]:
# Vectorizing for Deep Learning models
# Initialize tokenizer
tokenizer = Tokenizer()

# Fit on both columns to maintain a consistent vocabulary.
# The columns are stacked with pd.concat so every sentence is its own text.
# `Series + Series` concatenates the paired strings element-wise instead; when
# an ungrammatical sentence does not end in a character the tokenizer filters
# out, its last word fuses with the first word of the corrected sentence into
# a bogus token, and the real words may be missing from word_index.
all_sentences = pd.concat(
    [df['Ungrammatical Statement'], df['Standard English']],
    ignore_index=True,
)
tokenizer.fit_on_texts(all_sentences)

# Convert each sentence to a list of integer word indices
X_ung_seq = tokenizer.texts_to_sequences(df['Ungrammatical Statement'])
X_std_seq = tokenizer.texts_to_sequences(df['Standard English'])

# Longest sequence across both columns, so neither side is truncated by padding
max_len = max(max(len(seq) for seq in X_ung_seq), max(len(seq) for seq in X_std_seq))

# Pad sequences at the end ('post') up to the shared length
X_ung_padded = pad_sequences(X_ung_seq, maxlen=max_len, padding='post')
X_std_padded = pad_sequences(X_std_seq, maxlen=max_len, padding='post')

# Vocabulary size for embedding layer (+1 because index 0 is reserved for padding)
vocab_size = len(tokenizer.word_index) + 1

print(f"Padded Ungrammatical Shape: {X_ung_padded.shape}")
print(f"Padded Standard Shape: {X_std_padded.shape}")
print(f"Vocabulary Size: {vocab_size}")


Padded Ungrammatical Shape: (2018, 22)
Padded Standard Shape: (2018, 22)
Vocabulary Size: 3677
