In [4]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Example sentences (grammatically correct for self-supervised training)
sentenceDataset = pd.read_csv("/Users/kana/Desktop/nlp/project/Grammar_Correction.csv")
sentences = sentenceDataset["Standard English"]
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1

# Convert sentences to sequences
sequences = tokenizer.texts_to_sequences(sentences)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Shift output for self-supervised learning (Next Word Prediction)
y_train = [seq[1:] + [0] for seq in sequences]  # Remove first word, append 0 for padding
y_train = pad_sequences(y_train, maxlen=max_length, padding='post')  # Ensure equal length

# Define LSTM Language Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    Dense(vocab_size, activation='softmax')  # Predict next word probabilities
])

# Compile Model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model
model.fit(padded_sequences, y_train, epochs=5, batch_size=2)

# Test Sentence (Incorrect Grammar)
test_sentence = ["She go to school every day."]  # Incorrect grammar
test_seq = tokenizer.texts_to_sequences(test_sentence)
test_padded = pad_sequences(test_seq, maxlen=max_length, padding='post')

# Compute Perplexity for Grammar Error Detection
loss = model.evaluate(test_padded, y_train[:1], verbose=0)
perplexity = np.exp(loss)
print(f"Sentence Perplexity Score: {perplexity}")

Epoch 1/5




[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.5690 - loss: 3.8681
Epoch 2/5
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.5905 - loss: 2.8258
Epoch 3/5
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.6008 - loss: 2.6327
Epoch 4/5
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.6197 - loss: 2.4198
Epoch 5/5
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.6223 - loss: 2.3239
Sentence Perplexity Score: [4.98796796 2.30097585]


Hello
