### RNN model

In [68]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed
embedding_dim = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train = train_data['label']
y_test = test_data['label']

# RNN model
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len))
rnn_model.add(SimpleRNN(64, activation='relu'))
rnn_model.add(Dense(64, activation='relu'))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(1, activation='sigmoid'))

# optimizer = Adam(learning_rate=0.001)
# rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# Train the model
rnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f'RNN Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions
rnn_predictions = rnn_model.predict(X_test)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN Test Loss: 0.6170213222503662, Test Accuracy: 0.6925995945930481


In [69]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

class Grader:
    def __init__(self, model, tokenizer, max_len):
        self.model = model
        self.tokenizer = tokenizer
        self.max_len = max_len

    def evaluate(self, sentence):
        # Tokenize and pad the input sentence
        sequence = self.tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequence, maxlen=self.max_len, padding='post')

        # Make prediction using the trained model
        likelihood = self.model.predict(padded_sequence)[0][0]

        return likelihood

# Create an instance of Grader
mygrader = Grader(model=rnn_model, tokenizer=tokenizer, max_len=max_len)

# Test the Grader with a sentence
sentence_to_evaluate = "Cat am swimming in the pool."
result = mygrader.evaluate(sentence_to_evaluate)

# Print the result
if result < 0.7:
    print("Grammatically \033[1mINCORRECT\033[0m with probability:", result)
else:
    print("Grammatically \033[1mCORRECT\033[0m with probability:", result)


Grammatically [1mINCORRECT[0m with probability: 0.69524056


### RNN model with GloVe embeddings

In [73]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd

# Define Grader class
class Grader:
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def evaluate(self, sentence):
        # Tokenize and pad the input sentence
        sequence = pad_sequences(self.tokenizer.texts_to_sequences([sentence]), maxlen=max_len, padding='post')

        # Make predictions
        prediction = self.model.predict(sequence)[0][0]
        return prediction

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train = train_data['label']
y_test = test_data['label']

# Load GloVe embeddings
embedding_dim = 200  
embedding_index = {}
glove_path = 'glove.twitter.27B.200d.txt'  

with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create an embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# RNN model with pre-trained GloVe embeddings
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))
rnn_model.add(SimpleRNN(64, activation='relu'))
rnn_model.add(Dense(64, activation='relu'))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(1, activation='sigmoid'))

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f'RNN Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions
rnn_predictions = rnn_model.predict(X_test)





Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN Test Loss: 0.6176514625549316, Test Accuracy: 0.6925995945930481


In [64]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd


# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train = train_data['label']
y_test = test_data['label']

# Load GloVe embeddings
embedding_dim = 300  
embedding_index = {}
glove_path = 'glove.6B.300d.txt'  

with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create an embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# RNN model with pre-trained GloVe embeddings
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=max_len, weights=[embedding_matrix], trainable=False))
rnn_model.add(SimpleRNN(128, activation='tanh'))  # Changed activation function
rnn_model.add(Dense(128, activation='relu'))
rnn_model.add(Dropout(0.6))
rnn_model.add(Dense(1, activation='sigmoid'))

# Compile and train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f'RNN Test Loss: {loss}, Test Accuracy: {accuracy}')





Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN Test Loss: 0.6172659993171692, Test Accuracy: 0.6925995945930481


In [65]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def evaluate_sentence(sentence, model, tokenizer, max_len):
    # Tokenize and pad the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Print for debugging
    print(f'Sequence: {sequence}')
    # print(f'Padded Sequence: {padded_sequence}')

    # Make prediction using the trained model
    likelihood = model.predict(padded_sequence)[0][0]

    return likelihood

# Example usage:
sentence_to_evaluate = "He don’t come here no more."
result = evaluate_sentence(sentence_to_evaluate, cnn_model, tokenizer, max_len)
if result<0.6:
    print("Grammatically \033[1m INCORRECT \033[0m with probability: ",1- result)
else:
    print("Grammatically \033[1m CORRECT \033[0m with probability: ",result)


In [66]:
# Create a Grader instance
mygrader = Grader(tokenizer, rnn_model)

# Test the Grader
sentence = "He don’t come here no more."
result = mygrader.evaluate(sentence)
print(f"The likelihood that the sentence is grammatically correct: {result}")

Tokenized sequence for "He don’t come here no more.": [[ 12 216 235  90  25   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[0.70154154]]
The likelihood that the sentence is grammatically correct: 0.701541543006897


In [63]:


# Test the Grader
sentence_1 = "I comes."
sentence_2 = "This is a different sentence."
sentence_3 = "Another example sentence."

result_1 = mygrader.evaluate(sentence_1)
result_2 = mygrader.evaluate(sentence_2)
result_3 = mygrader.evaluate(sentence_3)

print(f"The likelihood that the first sentence is grammatically correct: {result_1}")
print(f"The likelihood that the second sentence is grammatically correct: {result_2}")
print(f"The likelihood that the third sentence is grammatically correct: {result_3}")

Tokenized sequence for "I comes.": [[   5 1210    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[1.]]
Tokenized sequence for "This is a different sentence.": [[  28    6    3  581 2366    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[1.]]
Tokenized sequence for "Another example sentence.": [[1840 2441 2366    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[1.]]
The likelihood that the first sentence is grammatically 