### CNN model

In [75]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed
embedding_dim = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train = train_data['label']
y_test = test_data['label']

# CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions
predictions = cnn_model.predict(X_test)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.2586108446121216, Test Accuracy: 0.6622390747070312


In [87]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def evaluate_sentence(sentence, model, tokenizer, max_len):
    # Tokenize and pad the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Print for debugging
    print(f'Sequence: {sequence}')
    # print(f'Padded Sequence: {padded_sequence}')

    # Make prediction using the trained model
    likelihood = model.predict(padded_sequence)[0][0]

    return likelihood

# Example usage:
sentence_to_evaluate = "He don’t come here no more."
result = evaluate_sentence(sentence_to_evaluate, cnn_model, tokenizer, max_len)
if result<0.6:
    print("Grammatically \033[1m INCORRECT \033[0m with probability: ",1- result)
else:
    print("Grammatically \033[1m CORRECT \033[0m with probability: ",result)


Sequence: [[12, 216, 235, 90, 25]]
Grammatically [1m INCORRECT [0m with probability:  0.7341004908084869


In [83]:
min_value = np.min(predictions)
max_value = np.max(predictions)
scaled_values = (predictions - min_value) / (max_value - min_value)

binary_predictions = np.where(scaled_values > 0.4, 1, 0)
y_true = test_data['label'].astype(int)

# Calculate precision and recall
precision = precision_score(y_true, binary_predictions)
recall = recall_score(y_true, binary_predictions)

print(f'Precision: {precision}, Recall: {recall}')

Precision: 0.7251184834123223, Recall: 0.8383561643835616


### CNN model with GloVe embeddings

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import tensorflow as tf

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train = train_data['label']
y_test = test_data['label']


class Grader:
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def evaluate(self, sentence):
        # Tokenize and pad the input sentence using the same tokenizer and padding method used during training
        sequence = pad_sequences(self.tokenizer.texts_to_sequences([sentence]), maxlen=max_len,  padding='post')

        # Print the tokenized sequence for debugging
        print(f'Tokenized sequence for "{sentence}": {sequence}')

        # Make predictions
        prediction = self.model.predict(sequence)[0][0]
        print(self.model.predict(sequence))
        return prediction


# Load GloVe embeddings
embedding_dim = 300  
embedding_index = {}
glove_path = 'glove.6B.300d.txt'  

with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create an embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

# Compile and train the model
optimizer = Adam(learning_rate=0.01)
cnn_model.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f'CNN Test Loss: {loss}, Test Accuracy: {accuracy}')




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN Test Loss: 0.2147371917963028, Test Accuracy: 0.6925995945930481


In [67]:
# Create a Grader instance
mygrader = Grader(tokenizer, cnn_model)

# Test the Grader
sentence = "He don’t come here no more."
result = mygrader.evaluate(sentence)
print(f"The likelihood that the sentence is grammatically correct: {result}")

Tokenized sequence for "He don’t come here no more.": [[ 12 216 235  90  25   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[0.63015014]]
The likelihood that the sentence is grammatically correct: 0.6301501393318176


In [68]:


# Test the Grader
sentence_1 = "I comes."
sentence_2 = "This is a different sentence."
sentence_3 = "Another example sentence."

result_1 = mygrader.evaluate(sentence_1)
result_2 = mygrader.evaluate(sentence_2)
result_3 = mygrader.evaluate(sentence_3)

print(f"The likelihood that the first sentence is grammatically correct: {result_1}")
print(f"The likelihood that the second sentence is grammatically correct: {result_2}")
print(f"The likelihood that the third sentence is grammatically correct: {result_3}")

Tokenized sequence for "I comes.": [[   5 1210    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[0.69435024]]
Tokenized sequence for "This is a different sentence.": [[  28    6    3  581 2366    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[0.71503776]]
Tokenized sequence for "Another example sentence.": [[1840 2441 2366    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
[[0.62563956]]
The likelihood that the first se