In [1]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [17]:
import nltk
import warnings
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import LearningRateScheduler, TensorBoard, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, RNN
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from tensorflow.keras.optimizers import Adam, Nadam
from nltk.translate.gleu_score import sentence_gleu
from tensorflow.keras.models import Model
from google.colab import drive
from nltk.tokenize import word_tokenize
warnings.filterwarnings("ignore")
from textblob import TextBlob
from nltk.util import ngrams
import matplotlib.cm as cm
import matplotlib as mpl
from tqdm import tqdm
import pandas as pd
import numpy as np
import zipfile
import random
import pickle
import re

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
tokenizer_enc = pickle.load(open(r"/content/drive/MyDrive/GecDataset/tokenizer_encoder.pkl", "rb"))
tokenizer_dec = pickle.load(open(r"/content/drive/MyDrive/GecDataset/tokenizer_decoder.pkl", "rb"))

train_enc_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/train_encoder_input.pkl", "rb"))
val_enc_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/validation_encoder_input.pkl", "rb"))
test_enc_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/test_encoder_input.pkl", "rb"))

train_dec_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/train_decoder_input.pkl", "rb"))
val_dec_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/validation_decoder_input.pkl", "rb"))
test_dec_inp = pickle.load(open(r"/content/drive/MyDrive/GecDataset/test_decoder_input.pkl", "rb"))

train_dec_out = pickle.load(open(r"/content/drive/MyDrive/GecDataset/train_decoder_output.pkl", "rb"))
val_dec_out = pickle.load(open(r"/content/drive/MyDrive/GecDataset/validation_decoder_output.pkl", "rb"))
test_dec_out = pickle.load(open(r"/content/drive/MyDrive/GecDataset/test_decoder_output.pkl", "rb"))

In [6]:
class Encoder(tf.keras.Model):


    def __init__(self, vocab_size, output_dim, enc_units, input_length):
        super().__init__()
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.input_length = input_length
        self.enc_units = enc_units
        self.enc_output = 0
        self.enc_state_h = 0
        self.enc_state_c = 0

        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.output_dim,
                                   input_length=self.input_length, mask_zero=True, name="embedding_layer_encoder")

        self.enc = LSTM(self.enc_units, return_state=True, return_sequences=True, name="Encoder_LSTM")

    def call(self,input_sequence, states):
        input_embedding = self.embedding(input_sequence)
        self.enc_output, self.enc_state_h, self.enc_state_c = self.enc(input_embedding, initial_state=states)
        return self.enc_output, self.enc_state_h, self.enc_state_c

    def initialize_states(self, batch_size):
        ini_hidden_state = tf.zeros(shape=(batch_size, self.enc_units))
        ini_cell_state = tf.zeros(shape=(batch_size, self.enc_units))
        return ini_hidden_state, ini_cell_state

In [7]:
class Decoder(tf.keras.Model):


    def __init__(self, vocab_size, output_dim, dec_units, input_length):
        super().__init__()
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.dec_units = dec_units
        self.input_length = input_length

        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.output_dim,
                                   input_length=self.input_length, mask_zero=True, name="embedding_layer_decoder")

        self.dec = LSTM(self.dec_units, return_sequences=True, return_state=True, name="Decoder_LSTM")

    def call(self,input_sequence, initial_states):
        target_embedding = self.embedding(input_sequence)
        dec_output, dec_state_h, dec_state_c = self.dec(target_embedding, initial_state=initial_states)
        return dec_output, dec_state_h, dec_state_c

In [8]:
class Encoder_decoder(tf.keras.Model):


    def __init__(self, enc_vocab_size, enc_output_dim, enc_inp_length, enc_units,
                 dec_vocab_size, dec_output_dim, dec_inp_length, dec_units, batch_size):

        super().__init__()
        self.encoder = Encoder(vocab_size=enc_vocab_size, output_dim=enc_output_dim,
                               input_length=enc_inp_length, enc_units=enc_units)
        self.decoder = Decoder(vocab_size=dec_vocab_size, output_dim=dec_output_dim,
                               input_length=dec_inp_length, dec_units=dec_units)
        self.dense   = Dense(dec_vocab_size, activation='softmax')
        self.ini_states = self.encoder.initialize_states(batch_size=batch_size)

    def call(self,data):
        input,output = data[0], data[1]
        enc_output, enc_h,enc_c = self.encoder(input, self.ini_states)
        dec_output, dec_h,dec_c = self.decoder(output, [enc_h, enc_c])
        output = self.dense(dec_output)
        return output

In [9]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [10]:
enc_voc_size = len(tokenizer_enc.word_index) + 1
dec_voc_size = len(tokenizer_dec.word_index) + 1
embedd_dim = 100
enc_inp_len = 12
dec_inp_len = 13
lstm_size = 64
batch_size = 1024

In [11]:
train_trunc_idx = (train_enc_inp.shape[0]//batch_size)*batch_size
val_trunc_idx = (val_enc_inp.shape[0]//batch_size)*batch_size

train_enc_inp_truncated = train_enc_inp[:train_trunc_idx]
train_dec_inp_truncated = train_dec_inp[:train_trunc_idx]
train_dec_out_truncated = train_dec_out[:train_trunc_idx]

val_enc_inp_truncated = val_enc_inp[:val_trunc_idx]
val_dec_inp_truncated = val_dec_inp[:val_trunc_idx]
val_dec_out_truncated = val_dec_out[:val_trunc_idx]

In [12]:
model  = Encoder_decoder(enc_vocab_size=enc_voc_size,enc_output_dim=embedd_dim,
                         enc_inp_length=enc_inp_len,enc_units=lstm_size,
                         dec_vocab_size=dec_voc_size,dec_output_dim=embedd_dim,
                         dec_inp_length=dec_inp_len,dec_units=lstm_size,
                         batch_size=batch_size)

log_dir=f'/lstm/Logs/'
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', min_delta=0.001, patience=5, verbose=1, factor=0.9)
check_point = ModelCheckpoint('/lstm/model/', monitor='val_loss',
                              save_best_only=True, save_weights_only=True, mode='min', verbose=0)

In [13]:
model.compile(optimizer=Adam(learning_rate=0.001), loss=loss_function)

In [None]:
model.fit(x=[train_enc_inp_truncated, train_dec_inp_truncated], y=train_dec_out_truncated,
          validation_data=([val_enc_inp_truncated, val_dec_inp_truncated], val_dec_out_truncated),
          epochs=30, batch_size=batch_size, callbacks=[early_stop, reduce_lr, check_point])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fd04c1d7b50>

In [None]:
model.summary()

Model: "encoder_decoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  9240440   
                                                                 
 decoder_1 (Decoder)         multiple                  7826340   
                                                                 
 dense_1 (Dense)             multiple                  5059665   
                                                                 
Total params: 22126445 (84.41 MB)
Trainable params: 22126445 (84.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
correct_idx_word_dict = tokenizer_dec.word_index
correct_idx_word_dict = {v: k for k, v in correct_idx_word_dict.items()}

def predict(input_sentence):
    input_sentence = tokenizer_enc.texts_to_sequences([input_sentence])
    initialize_hidden_state = tf.zeros([1, 64])
    initialize_cell_state = tf.zeros([1, 64])
    initialize_states = [initialize_hidden_state, initialize_cell_state]
    pred_total = []
    encoder_output, encoder_state_h, encoder_state_c = model.layers[0](np.expand_dims(input_sentence[0], 0), ini_states)
    states_values = [encoder_state_h, encoder_state_c]
    pred = []
    sentence = []
    cur_vec = np.array([[1]])

    for i in range(12):
        decoder_output, decoder_state_h, decoder_state_c = model.layers[1](cur_vec, states_values)
        output = model.layers[2](decoder_output)
        states_values = [decoder_state_h, decoder_state_c]
        cur_vector = np.reshape(np.argmax(output), (1, 1))
        if correct_idx_word_dict[cur_vec[0][0]] == '<end>':
            break
        pred.append(cur_vector[0][0])

    for i in pred:
        sentence.append(correct_idx_word_dict[i])
    return " ".join(sentence)

In [None]:
model.save('/content/drive/MyDrive/GecDataset/lstm/model/')

In [16]:
test_dataset = pd.read_csv('/content/drive/MyDrive/GecDataset/gec_test.csv', na_filter=False)

In [None]:
def gleu_score(gec_dataset):
    gleu_score_arr = []
    for i in tqdm(range(500)):
        reference = [gec_dataset['correct'].iloc[i].split()]
        pred = predict(gec_dataset['incorrect'].iloc[i])
        candidate = pred.split()
        gleu_score_arr.append(sentence_gleu(reference, candidate))
    return np.mean(gleu_score_arr)

In [None]:
print(f"GLEU Score on Test data: {gleu_score(test_dataset)}")

  0%|          | 0/500 [00:00<?, ?it/s]

GLEU Score on Test data: 0.21755782533261941
