<a href="https://colab.research.google.com/github/Hitenjain20/Grammar-Error-correction/blob/main/Encoder%20and%20decoder%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import datetime
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import keras
import tensorflow as tf
from sklearn.metrics import fbeta_score
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from tensorflow.keras.layers import Embedding,LSTM, TimeDistributed, Dense, Bidirectional
from tensorflow.keras.initializers import HeNormal, GlorotNormal, GlorotUniform
from nltk.translate.bleu_score import sentence_bleu
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Colab-only: mount Google Drive so the dataset and checkpoint paths under
# /content/drive used throughout this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the preprocessed sentence pairs. The preview below shows three columns:
# 'Unnamed: 0' (presumably a saved row index), 'error' and 'correct'.
data = pd.read_csv('/content/drive/MyDrive/Data/final_preprocessed_15.csv')

In [5]:
# Widen the column display so whole sentences are visible, then preview the first 50 rows.
pd.options.display.max_colwidth = 500
data[:50]

Unnamed: 0,error,correct
0,And he took in my favorite subject like soccer .,And he took in my favorite subjects like soccer .
1,"Actually , who let me know about Lang - was him .","Actually , he was the one who let me know about Lang - . ."
2,His Kanji is ability is much better than me .,His Kanji ability is much better than mine .
3,I heard a sentence last night when I watched TV .,I heard a sentence last night when I was watching TV .
4,"When you go uphill , you hvae to bend your back .","When you go uphill , you have to bend your back ."
5,"When you are go smoothly , you have to be more modest .","When everything is going smoothly , you have to be more modest ."
6,The making souvenir is a hard and interesting work .,Making souvenirs is a hard but interesting work .
7,"You know , you can take them at slot machine .","You know , you can ? them at a slot machine ."
8,The third memory is the house we lived .,The third memory is the house where we lived .
9,I liked the winter Finland .,I liked Finland in the Winter .


In [6]:
# Preview rows 50-99 of the sentence pairs.
data[50:100]


Unnamed: 0,error,correct
50,I am listening to music with the commuter train .,I am listening to music on the commuter train .
51,"Today , I listen the music , it looks like in the cafe .","Today , I listen the music , it feels like I am in a cafe ."
52,Spring is very exciting season .,Spring is a very exciting season .
53,"Japanese school , work starts spring .","In Japan , the new school and work year starts in spring ."
54,I hope to get any quarifications in english .,I hope to become qualified in English .
55,I hope to get my new promotions .,I hope to get a new promotion .
56,Everyday starts new thinngs to try .,"Everyday , there are new things to try ."
57,". . . No , extremely sometimes .",". . . No , very frequently ."
58,"Since today , I am going to try write a diary ! !","Starting today , I am going to try write a diary everyday ! !"
59,Today is nothing specal to write ! !,Today has nothing specal to write ! !


In [7]:
def preprocess(t, add_start_token, add_end_token):
  """Attach sequence markers to a sentence and normalise its spacing.

  Args:
    t: Raw sentence text.
    add_start_token: When truthy, prepend the ``<start>`` marker.
    add_end_token: When truthy, append the ``<end>`` marker.

  Returns:
    The sentence with the requested markers attached (space separated) and
    every run of spaces collapsed to a single space.
  """
  # Each flag is tested on its own by truthiness. The previous chained
  # ``== True`` / ``== False`` comparisons dropped a requested marker whenever
  # the other flag was not exactly bool-equal (e.g. None).
  pieces = []
  if add_start_token:
    pieces.append('<start>')
  pieces.append(t)
  if add_end_token:
    pieces.append('<end>')

  return re.sub(' +', ' ', ' '.join(pieces))

In [8]:
# Encoder text: the erroneous sentence wrapped in both markers.
encoder_input = [
    preprocess(sentence, add_start_token=True, add_end_token=True)
    for sentence in data['error']
]
# Decoder input: the corrected sentence prefixed with <start> only.
decoder_input = [
    preprocess(sentence, add_start_token=True, add_end_token=False)
    for sentence in data['correct']
]
# Decoder target: the corrected sentence suffixed with <end> only.
decoder_output = [
    preprocess(sentence, add_start_token=False, add_end_token=True)
    for sentence in data['correct']
]

In [9]:
# Sanity check: show the three text variants produced for the first sentence pair.
print(encoder_input[0])
print(decoder_input[0])
print(decoder_output[0])

<start> And he took in my favorite subject like soccer . <end>
<start> And he took in my favorite subjects like soccer .
And he took in my favorite subjects like soccer . <end>


In [10]:
# Source-side vocabulary. filters='' keeps punctuation and the <start>/<end>
# markers as ordinary tokens; text is split on single spaces.
tokenizer = Tokenizer(filters='', split=" ")
tokenizer.fit_on_texts(encoder_input)
word_index = tokenizer.word_index #vocabulary

# The longest encoder sentence (counted in space-separated tokens) fixes the padded width.
max_length = max(len(sentence.split(" ")) for sentence in encoder_input)
INPUT_ENCODER_LENGTH = max_length

# Map words to ids, then right-pad every sequence with zeros to the common width.
enc_input_encoded = tokenizer.texts_to_sequences(encoder_input)
enc_input_padded = pad_sequences(
    enc_input_encoded,
    maxlen=INPUT_ENCODER_LENGTH,
    padding="post",
)

print(enc_input_padded.shape)

(289572, 17)


In [11]:
# Sanity check: first encoder sentence next to its padded id sequence.
print(encoder_input[0])
print(enc_input_padded[0])

<start> And he took in my favorite subject like soccer . <end>
[  1  11  46 177  13  10 281 901  39 619   3   2   0   0   0   0   0]


In [12]:
# Fit one target-side vocabulary on both decoder text variants so that
# <start> (inputs only) and <end> (targets only) are both part of it.
decoder_data = decoder_input + decoder_output

out_tokenizer = Tokenizer(filters='', split=" ")
out_tokenizer.fit_on_texts(decoder_data)
word_index = out_tokenizer.word_index #vocabulary

# The longest decoder input (counted in space-separated tokens) fixes the padded width.
max_length = max(len(sentence.split(" ")) for sentence in decoder_input)
INPUT_DECODER_LENGTH = max_length

In [13]:
# Decoder inputs as id sequences, right-padded / right-truncated to INPUT_DECODER_LENGTH.
dec_input_encoded = out_tokenizer.texts_to_sequences(decoder_input)
dec_input_padded = pad_sequences(
    dec_input_encoded,
    maxlen=INPUT_DECODER_LENGTH,
    padding="post",
    truncating="post",
)

print(dec_input_padded.shape)

(289572, 29)


In [14]:
# Sanity check: first decoder input sentence next to its padded id sequence.
print(decoder_input[0])
print(dec_input_padded[0])

<start> And he took in my favorite subjects like soccer .
[   3   12   46  178   13   10  266 1490   41  610    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0]


In [15]:
# Decoder targets as id sequences, padded to the same width as the decoder inputs.
dec_output_encoded = out_tokenizer.texts_to_sequences(decoder_output)
dec_output_padded = pad_sequences(
    dec_output_encoded,
    maxlen=INPUT_DECODER_LENGTH,
    padding="post",
    truncating="post",
)

print(dec_output_padded.shape)

(289572, 29)


In [16]:
# Download the fastText wiki-news 300-d vectors. `-c` resumes a partial download
# (the captured 416 response below just means the file is already complete)
# and `-O` fixes the local file name.
!wget --header="Host: dl.fbaipublicfiles.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9,kn;q=0.8" --header="Referer: https://fasttext.cc/" "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip" -c -O 'wiki-news-300d-1M.vec.zip'


--2022-02-07 12:49:30--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
!unzip wiki-news-300d-1M.vec.zip


Archive:  wiki-news-300d-1M.vec.zip
replace wiki-news-300d-1M.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import io

def load_vectors(fname):
    """Load fastText ``.vec`` word vectors into a dictionary.

    The first line of the file is a header of two integers; every following
    line is a token followed by its space-separated vector components.

    Args:
        fname: Path to the ``.vec`` text file.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array of its
        components.

    Raises:
        ValueError: If the header line does not consist of exactly two integers.
    """
    vectors = {}
    # The context manager closes the handle even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header is parsed only to validate it and to move past it.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            try:
                # Store numbers, not strings: a numeric array is several times
                # smaller in memory and can be copied into a float matrix as is.
                vectors[tokens[0]] = np.asarray(tokens[1:], dtype=np.float32)
            except ValueError:
                # Components that are not numeric cannot serve as an embedding;
                # skipping the line makes the word an ordinary lookup miss.
                continue
    return vectors

In [None]:
# Token -> vector lookup read from the unzipped fastText file (large; this cell is slow).
embedding_index = load_vectors('wiki-news-300d-1M.vec')

In [None]:
# Build the encoder-side embedding matrix: row i holds the fastText vector of the
# word whose tokenizer id is i; words without a usable vector keep an all-zero row.
word_index = tokenizer.word_index
# NOTE(review): this allocates len(word_index) + 2 rows while INPUT_VOCAB_SIZE is
# len(word_index) + 1. The row count has to equal the Embedding layer's
# input_dim before this matrix can be used as its weights.
num_tokens = len(word_index) + 2
embedding_dim = 300
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)

    # Accept only real arrays of the configured width (checked against
    # embedding_dim rather than a second hard-coded 300).
    if isinstance(embedding_vector, np.ndarray) and embedding_vector.shape[0] == embedding_dim:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))
np.save('/content/drive/MyDrive/Data/in_embedding.npy', embedding_matrix)

In [None]:
# Build the decoder-side embedding matrix: row i holds the fastText vector of the
# word whose out_tokenizer id is i; words without a usable vector keep an all-zero row.
word_index = out_tokenizer.word_index
# NOTE(review): this allocates len(word_index) + 2 rows while OUTPUT_VOCAB_SIZE is
# len(word_index) + 1. The row count has to equal the Embedding layer's
# input_dim before this matrix can be used as its weights.
num_tokens = len(word_index) + 2
embedding_dim = 300
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)

    # Accept only real arrays of the configured width (checked against
    # embedding_dim rather than a second hard-coded 300).
    if isinstance(embedding_vector, np.ndarray) and embedding_vector.shape[0] == embedding_dim:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))
np.save('/content/drive/MyDrive/Data/out_embedding.npy', embedding_matrix)

In [18]:
# Reload the matrices saved by the two cells above, so those slow cells
# (download, unzip, vector parsing) do not have to be re-run in every session.
in_embedding_matrix = np.load('/content/drive/MyDrive/Data/in_embedding.npy')
out_embedding_matrix = np.load('/content/drive/MyDrive/Data/out_embedding.npy')
print(in_embedding_matrix.shape, out_embedding_matrix.shape)

(52482, 300) (41170, 300)


In [19]:
#Encoder
class Encoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM that encodes the input sentence.

    Args:
        inp_vocab_size: Number of rows of the embedding table.
        embedding_size: Width of each token embedding.
        lstm_size: Number of LSTM units.
        input_length: Value forwarded to the Embedding layer's ``input_length``.
    """

    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.vocab_size = inp_vocab_size
        self.embedding_size = embedding_size
        self.lstm_units = lstm_size
        self.input_length = input_length

    def build(self, input_sequence):
        """Create the embedding and LSTM sub-layers."""
        # The embedding is created without preset weights, so the saved fastText
        # matrix (in_embedding_matrix) is not used by this layer.
        self.embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_size,
            input_length=self.input_length,
            mask_zero=True,
            name="embedding_layer_encoder",
        )
        self.lstm = LSTM(
            self.lstm_units,
            return_sequences=True,
            return_state=True,
            name="Encoder_LSTM",
        )

    def call(self,input_sequence,states, training = True):
        """Encode a batch of token ids.

        Args:
            input_sequence: Batch of token-id sequences.
            states: Initial ``[hidden, cell]`` states handed to the LSTM.
            training: Accepted but not used by this method.

        Returns:
            Tuple ``(lstm_output, state_h, state_c)``; the same three values
            are also stored on the instance.
        """
        embedded = self.embedding(input_sequence)  # (batch_size, sequence length, embedding_size)
        encoded = self.lstm(embedded, initial_state=states)
        self.lstm_output, self.state_h, self.state_c = encoded
        return self.lstm_output, self.state_h, self.state_c

    def initialize_states(self,batch_size):
        """Return initial ``(hidden, cell)`` states of shape (batch_size, lstm_units).

        Both tensors are drawn from a GlorotNormal initializer, hidden first.
        NOTE(review): the draws are random rather than zeros, so two encodings
        of the same sentence presumably start from different states - confirm
        this is intended at inference time.
        """
        state_shape = (batch_size, self.lstm_units)
        draw = GlorotNormal()
        hidden_state = draw(shape=state_shape)
        cell_state = draw(shape=state_shape)
        return hidden_state, cell_state

#DECODER
class Decoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM that decodes the corrected sentence.

    Args:
        out_vocab_size: Number of rows of the embedding table.
        embedding_size: Width of each token embedding.
        lstm_size: Number of LSTM units.
        input_length: Value forwarded to the Embedding layer's ``input_length``.
    """

    def __init__(self,out_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.vocab_size = out_vocab_size
        self.embedding_size = embedding_size
        self.lstm_units = lstm_size
        self.input_length = input_length

    def build(self,input_sequence):
        """Create the embedding and LSTM sub-layers."""
        # The embedding is created without preset weights, so the saved fastText
        # matrix (out_embedding_matrix) is not used by this layer.
        self.embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_size,
            input_length=self.input_length,
            mask_zero=True,
            name="embedding_layer_decoder",
        )
        self.lstm = LSTM(
            self.lstm_units,
            return_sequences=True,
            return_state=True,
            name="Decoder_LSTM",
        )

    def call(self,input_sequence,initial_states, training = True):
        """Run the decoder LSTM over a batch of token ids.

        Args:
            input_sequence: Batch of decoder token-id sequences.
            initial_states: ``[hidden, cell]`` states the LSTM starts from.
            training: Accepted but not used by this method.

        Returns:
            Tuple ``(lstm_output, state_h, state_c)``; the same three values
            are also stored on the instance.
        """
        embedded = self.embedding(input_sequence)
        decoded = self.lstm(embedded, initial_state=initial_states)
        self.lstm_output, self.state_h, self.state_c = decoded
        return self.lstm_output, self.state_h, self.state_c

In [20]:
class Encoder_decoder(tf.keras.Model):
    """Sequence-to-sequence model: Encoder -> Decoder -> Dense vocabulary logits.

    Args:
        encoder_inputs_length: Padded length of the encoder inputs; forwarded to
            the encoder's embedding layer.
        decoder_inputs_length: Accepted for interface compatibility. The decoder
            is built with ``input_length=None``.
        output_vocab_size: Size of the target vocabulary; sizes both the decoder
            embedding table and the final Dense projection.
    """

    def __init__(self, encoder_inputs_length,decoder_inputs_length, output_vocab_size):

        super().__init__()
        # The constructor arguments are used instead of the notebook globals they
        # duplicate (INPUT_ENCODER_LENGTH, OUTPUT_VOCAB_SIZE). INPUT_VOCAB_SIZE
        # has no matching parameter and is still read from the notebook namespace.
        # Creation order (encoder, decoder, dense) fixes their positions in model.layers.
        self.encoder = Encoder(INPUT_VOCAB_SIZE, embedding_size = 256, lstm_size= 1200 , input_length= encoder_inputs_length)
        self.decoder = Decoder(output_vocab_size, embedding_size = 256, lstm_size = 1200, input_length = None)
        # No softmax here: the layer emits raw logits.
        self.dense = Dense(output_vocab_size)

    def call(self,data):
        """Compute vocabulary logits for every decoder position.

        Args:
            data: Pair ``(encoder token ids, decoder token ids)``.

        Returns:
            Output of the Dense layer applied to the decoder LSTM outputs.
        """
        encoder_tokens, decoder_tokens = data[0], data[1]
        # Reads the static batch dimension, so the batch size must be known when
        # this runs (the notebook enables eager execution before fitting).
        states = self.encoder.initialize_states(encoder_tokens.shape[0])
        _, encoder_final_state_h, encoder_final_state_c = self.encoder(encoder_tokens, states)
        # The decoder starts from the encoder's final hidden and cell states.
        decoder_output, _, _ = self.decoder(decoder_tokens, [encoder_final_state_h, encoder_final_state_c])
        outputs = self.dense(decoder_output)

        return outputs

In [21]:
# +1 accounts for id 0, which the padded sequences use for padding. No OOV token
# is configured on either Tokenizer.
INPUT_VOCAB_SIZE = len(tokenizer.word_index) + 1
OUTPUT_VOCAB_SIZE = len(out_tokenizer.word_index) + 1
BATCH_SIZE = 16
print(
    INPUT_VOCAB_SIZE,
    INPUT_ENCODER_LENGTH,
    OUTPUT_VOCAB_SIZE,
    INPUT_DECODER_LENGTH,
    BATCH_SIZE,
)

52481 17 41169 29 16


In [22]:
# Only the first NUMBER_OF_DATAPOINTS sentence pairs are used.
NUMBER_OF_DATAPOINTS = 10000

tf.random.set_seed(32)

encoder_input_datatset = tf.data.Dataset.from_tensor_slices(enc_input_padded)
decoder_input_datatset = tf.data.Dataset.from_tensor_slices(dec_input_padded)
decoder_output_datatset = tf.data.Dataset.from_tensor_slices(dec_output_padded)

# Elements are ((encoder ids, decoder input ids), decoder target ids).
# reshuffle_each_iteration=False keeps ONE fixed shuffled order. With the default
# (True) the order changes on every pass, so splitting this dataset with
# take(n) / skip(n) would hand different, overlapping examples to the two splits
# in every epoch and leak training examples into validation.
full_dataset = tf.data.Dataset.zip((
    (
        encoder_input_datatset.take(NUMBER_OF_DATAPOINTS),
        decoder_input_datatset.take(NUMBER_OF_DATAPOINTS),
    ),
    decoder_output_datatset.take(NUMBER_OF_DATAPOINTS),
)).shuffle(1000, reshuffle_each_iteration=False)


In [23]:
# Hold out the first 50 elements for validation and train on the rest, both in
# batches of 32 (the BATCH_SIZE constant defined earlier is not used here).
# NOTE(review): take/skip only give disjoint splits if full_dataset yields the
# same element order on every iteration.
test_dataset = full_dataset.take(50).batch(32)
train_dataset = full_dataset.skip(50).batch(32)

print(train_dataset, test_dataset)

<BatchDataset shapes: (((None, 17), (None, 29)), (None, 29)), types: ((tf.int32, tf.int32), tf.int32)> <BatchDataset shapes: (((None, 17), (None, 29)), (None, 29)), types: ((tf.int32, tf.int32), tf.int32)>


In [24]:
#LEARNING RATE SCHEDULER: keep the rate for epoch 0, then multiply it by exp(-0.1) every epoch
def scheduler(epoch, lr):
    """Learning-rate schedule for ``tf.keras.callbacks.LearningRateScheduler``.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged for epoch 0, otherwise ``lr * exp(-0.1)`` as a float.
    """
    if epoch < 1:
        return lr
    # numpy yields a float here; the previous tf.math.exp returned a tensor,
    # whereas the callback documents a float return value.
    return lr * np.exp(-0.1)
# NOTE(review): lr_scheduler is created here but is not included in the callback
# lists of the model.fit calls in this notebook.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

#EARLY STOPPING
# Monitors val_loss with a patience of 5 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

#TENSORBOARD PLOTS
# Event files go to the local 'logs' directory read by the %tensorboard cell.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir='logs')

#SAVE MODEL WEIGHTS
class SaveModel(tf.keras.callbacks.Callback):
  """Callback that checkpoints weights and persists the loss history every epoch.

  Attributes are documented inline in ``__init__``.
  """

  def __init__(self):
    # Let the base Callback set up its own attributes first.
    super().__init__()
    # Per-epoch values collected so far; 'val_loss' only grows in epochs whose
    # logs contain a validation loss.
    self.history = { 'loss' : [],  'val_loss' : []}
    # Offset added to the epoch number in checkpoint file names.
    self.init = 0

  def on_epoch_end(self, epoch, logs = None):
    """Record losses, save the weights and rewrite the history CSV.

    Args:
      epoch: Zero-based epoch index supplied by Keras.
      logs: Metric dictionary for the finished epoch (may be None).
    """
    # None instead of a shared mutable default dictionary.
    logs = logs or {}

    self.history['loss'].append(logs.get('loss'))
    if logs.get('val_loss', -1) != -1:
        self.history['val_loss'].append(logs.get('val_loss'))

    self.model.save_weights('/content/drive/MyDrive/Data/ENC_DEC_EMB/weights_{}.h5'.format(epoch+self.init))

    # Wrapping each list in a Series pads the shorter column with NaN. Assigning
    # the raw lists raised a length-mismatch error when no validation loss had
    # been logged.
    history_frame = pd.DataFrame({
        column: pd.Series(self.history[column], dtype='float64')
        for column in ['loss', 'val_loss']
    })
    history_frame.to_csv('history.csv')
    # Written directly instead of shelling out to `cp`, so the method is plain Python.
    history_frame.to_csv("/content/drive/MyDrive/Data/ENC_DEC_EMB/history.csv")

save_model = SaveModel()

In [25]:
# Per-position cross-entropy computed from raw logits. reduction='none' leaves
# the averaging to loss_function so that padded positions can be zeroed first.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none', from_logits=True
)


def loss_function(real, pred):
    """Cross-entropy that ignores padded target positions.

    Args:
        real: Target token ids, where 0 marks padding.
        pred: Logits produced by the model.

    Returns:
        Mean of the per-position losses after positions whose target id is 0
        have been set to zero. The mean runs over all positions, padded ones
        included.
    """
    per_position_loss = loss_object(real, pred)

    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_position_loss.dtype)

    return tf.reduce_mean(per_position_loss * weights)

In [26]:
def f_beta_score(y_true, y_pred):
  """Batch-averaged F0.5 score between target ids and arg-max predictions.

  Args:
    y_true: Target token ids, one row per sentence.
    y_pred: Model scores; the arg-max over the last axis is the predicted id.

  Returns:
    Mean over the batch of sklearn's macro-averaged ``fbeta_score`` (beta=0.5)
    computed row by row.
  """
  predicted_ids = tf.convert_to_tensor(np.argmax(y_pred, axis = -1), dtype = tf.float32)

  per_sentence_scores = []
  for row in range(y_true.shape[0]):
    per_sentence_scores.append(
        fbeta_score(y_true[row], predicted_ids[row], average = 'macro', beta = 0.5)
    )

  return sum(per_sentence_scores)/len(per_sentence_scores)

In [None]:
# Run tf.functions eagerly. Presumably required because f_beta_score calls
# numpy/sklearn on tensor values and Encoder_decoder.call reads input.shape[0].
tf.config.run_functions_eagerly(True)

#Create an object of encoder_decoder Model class, 
# Compile the model and fit the model
# Random dummy batch: one fit step on it creates the model's layers so that
# summary() below can report them.
# NOTE(review): `input` rebinds the builtin of the same name in the notebook namespace.
input = np.random.randint(0, 64, size=(BATCH_SIZE, INPUT_ENCODER_LENGTH))
output = np.random.randint(0, 64, size=(BATCH_SIZE, INPUT_DECODER_LENGTH))
target = np.random.randint(0, 64, size=(BATCH_SIZE, INPUT_DECODER_LENGTH))#tf.keras.utils.to_categorical(output, OUTPUT_VOCAB_SIZE)

model = Encoder_decoder(encoder_inputs_length = INPUT_ENCODER_LENGTH, decoder_inputs_length =INPUT_DECODER_LENGTH, output_vocab_size= OUTPUT_VOCAB_SIZE)
#model = encoder_decoder(enc_units = 1024, dec_units = 1024, scoring_func = 'dot', att_units = 1024)
model.compile(optimizer=tf.keras.optimizers.Adam(),loss=loss_function, metrics = [f_beta_score])#tf.keras.metrics.categorical_crossentropy)
model.fit([input, output], target, steps_per_epoch=1)

model.summary()

In [None]:
# Train for up to 50 epochs; early stopping watches val_loss, and SaveModel
# checkpoints the weights and the loss history after every epoch.
model.fit(train_dataset,
          validation_data = test_dataset, 
          epochs = 50, 
          callbacks = [early_stopping,tensorboard_cb, save_model])

In [None]:
# Same call as the previous cell: running it continues training the same model
# object for up to 50 further epochs.
# NOTE(review): SaveModel numbers checkpoints from `epoch + self.init` and init
# stays 0, so this run presumably overwrites weights_0.h5, weights_1.h5, ... - confirm.
model.fit(train_dataset,
          validation_data = test_dataset, 
          epochs = 50, 
          callbacks = [early_stopping,tensorboard_cb, save_model])

In [None]:
# Open TensorBoard inline on the 'logs' directory written by tensorboard_cb.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# Restore a saved checkpoint before evaluation.
# NOTE(review): this relative path (drive/MyDrive/GEC/...) and the file name
# 'weights_24_best.h5' differ from what SaveModel writes
# (/content/drive/MyDrive/Data/ENC_DEC_EMB/weights_<epoch>.h5) - confirm the file exists.
model.load_weights('drive/MyDrive/GEC/ENC_DEC_EMB/weights_24_best.h5')

In [None]:
def input_processor(input_sentence, pad_seq):
  """Turn one raw sentence into a batch-of-one tensor of encoder token ids.

  Args:
    input_sentence: Raw sentence text.
    pad_seq: When equal to True, right-pad the ids to INPUT_ENCODER_LENGTH.

  Returns:
    float32 tensor built from a single-element list of id sequences.
  """
  # Add <start>/<end> and collapse repeated spaces (all that preprocess does).
  marked_sentence = preprocess(input_sentence, add_start_token=True, add_end_token=True)

  # Look the words up in the encoder vocabulary.
  token_ids = tokenizer.texts_to_sequences([marked_sentence])
  if pad_seq == True:
    token_ids = pad_sequences(token_ids, maxlen=INPUT_ENCODER_LENGTH, padding="post")

  return tf.convert_to_tensor(token_ids, dtype = tf.float32)


def remove_end_token(words):
  """Drop the last space-separated token of a sentence.

  The token is removed whatever it is; callers pass sentences that end in
  ``<end>``. A sentence without any space becomes the empty string.
  """
  # rpartition splits at the LAST space; the part before it is everything but
  # the final token (and is '' when there is no space at all).
  everything_but_last, _, _ = words.rpartition(' ')
  return everything_but_last

In [None]:
def predict(input_sentence, max_length=None):
  """Greedy-decode the model's correction for one sentence.

  Args:
    input_sentence: Raw (erroneous) sentence text.
    max_length: Upper bound on the number of generated tokens. Defaults to
      INPUT_DECODER_LENGTH, the padded decoder length used for training.

  Returns:
    A one-element list holding the generated tokens joined into a string. The
    string ends with ``<end>`` unless the length limit was reached first.
  """
  encoder_tokens = input_processor(input_sentence, pad_seq = False)
  batch_size = encoder_tokens.shape[0]  # a single sentence, so 1

  # Look the marker ids up instead of hard-coding them: in this notebook's
  # vocabulary '<start>' is id 3 (see the printed dec_input_padded[0]), not the
  # previously hard-coded 2.
  start_id = out_tokenizer.word_index['<start>']
  end_id = out_tokenizer.word_index['<end>']
  if max_length is None:
    max_length = INPUT_DECODER_LENGTH

  # Encode the sentence; the encoder's final states seed the decoder.
  states = model.encoder.initialize_states(batch_size)
  _, state_h, state_c = model.encoder(encoder_tokens, states)
  states = [state_h, state_c]

  input_decoder = np.array([[start_id]])
  predicted_ids = []

  # Bounded loop: without the limit a model that never emits <end> would
  # decode forever.
  while len(predicted_ids) < max_length:
    decoder_output, state_h, state_c = model.decoder(input_decoder, states)
    states = [state_h, state_c]

    logits = model.dense(decoder_output)
    next_id = np.argmax(logits, -1)  # shape (1, 1)
    predicted_ids.append(int(next_id[0][0]))

    if predicted_ids[-1] == end_id:
      break
    # Feed the prediction back in as the next decoder input.
    input_decoder = next_id

  #Convert to text
  output_words = out_tokenizer.sequences_to_texts([predicted_ids])

  return output_words
  

In [None]:
# Evaluate on 1000 randomly drawn sentence pairs.
# NOTE(review): randint includes both bounds, so index 10000 can be drawn even
# though training uses rows 0..9999 - confirm the intended range.
indices = [random.randint(0, 10000) for i in range(1000)]
sent_list = [data['error'].iloc[i] for i in indices]
bleu_scores_ = []
actual_output = []
output_sent_list = []

# Preview a few inputs only; the complete list goes into the results frame.
print(sent_list[:5])

#Translate and calculate BLEU scores
for i, sent in enumerate(tqdm(sent_list)):
  out = predict(sent) 
  actual_ = decoder_output[indices[i]]

  output_sent_list.append(out[0])
  actual_output.append(actual_)

  #Remove <end> token
  out_words = remove_end_token(out[0])
  actual_output_ = remove_end_token(actual_)

  #Calculate BLEU scores
  # sentence_bleu takes a LIST of reference token lists plus one hypothesis
  # token list. Passing the bare token list made every word its own reference.
  # The reference is lower-cased because out_tokenizer is created with the Keras
  # Tokenizer default lower=True, so predicted words are always lower case.
  reference_tokens = actual_output_.lower().split(' ')
  hypothesis_tokens = out_words.split(' ')
  bleu_scores_.append(sentence_bleu([reference_tokens], hypothesis_tokens))


print('Average BLEU score :',sum(bleu_scores_)/len(bleu_scores_))

In [None]:
# One row per evaluated sentence: input, reference, model output and BLEU score.
df = pd.DataFrame({
    'input_sentence': sent_list,
    'actual_output': actual_output,
    'translated_output': output_sent_list,
    'bleu_score': bleu_scores_,
})

In [None]:
# Load a second checkpoint (directory ENC_DEC rather than ENC_DEC_EMB) so the
# evaluation below can be repeated with it.
# NOTE(review): the path is relative and differs from the directory SaveModel
# writes to - confirm the file exists.
model.load_weights('drive/MyDrive/GEC/ENC_DEC/weights_49_.h5')

In [None]:
# Evaluate the newly loaded weights on a fresh draw of 1000 sentence pairs.
# NOTE(review): randint includes both bounds, so index 10000 can be drawn even
# though training uses rows 0..9999 - confirm the intended range.
indices = [random.randint(0, 10000) for i in range(1000)]
sent_list = [data['error'].iloc[i] for i in indices]
bleu_scores_ = []
actual_output = []
output_sent_list = []

# Preview a few inputs only; the complete list goes into the results frame.
print(sent_list[:5])

#Translate and calculate BLEU scores
for i, sent in enumerate(tqdm(sent_list)):
  out = predict(sent) 
  actual_ = decoder_output[indices[i]]

  output_sent_list.append(out[0])
  actual_output.append(actual_)

  #Remove <end> token
  out_words = remove_end_token(out[0])
  actual_output_ = remove_end_token(actual_)

  #Calculate BLEU scores
  # sentence_bleu takes a LIST of reference token lists plus one hypothesis
  # token list. Passing the bare token list made every word its own reference.
  # The reference is lower-cased because out_tokenizer is created with the Keras
  # Tokenizer default lower=True, so predicted words are always lower case.
  reference_tokens = actual_output_.lower().split(' ')
  hypothesis_tokens = out_words.split(' ')
  bleu_scores_.append(sentence_bleu([reference_tokens], hypothesis_tokens))


print('Average BLEU score :',sum(bleu_scores_)/len(bleu_scores_))

In [None]:
# One row per evaluated sentence: input, reference, model output and BLEU score.
df = pd.DataFrame({
    'input_sentence': sent_list,
    'actual_output': actual_output,
    'translated_output': output_sent_list,
    'bleu_score': bleu_scores_,
})

In [None]:
# Show the first 50 evaluated sentences with their BLEU scores.
df[:50]