# LSTM

In [None]:
from matplotlib import pyplot as plt

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import numpy as np

import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

# **1. Load dataset**

In [None]:
def load_json(path):
    '''
    Reads a SQuAD-format JSON file from `path` (UTF-8) and returns the
    decoded object unchanged.

    As a quick sanity check it prints the number of entries under the
    top-level 'data' key, plus the keys and title of the first entry.
    '''
    with open(path, mode='r', encoding='utf-8') as handle:
        dataset = json.load(handle)

    articles = dataset['data']
    print("Length of data: ", len(articles))

    first_article = articles[0]
    print("Data Keys: ", first_article.keys())
    print("Title: ", first_article['title'])

    return dataset

def parse_data(data:dict)->list:
    '''
    Flattens the nested SQuAD structure (article -> paragraph -> question
    -> answer) into a list with one dictionary per answer.

    Each dictionary has the keys 'id', 'context', 'question', 'label' and
    'answer', where 'label' is [answer_start, answer_start + len(answer)].
    Questions whose 'answers' list is empty contribute no rows.
    '''
    records = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']

            for qa_entry in paragraph['qas']:
                question_id = qa_entry['id']
                question_text = qa_entry['question']

                for gold in qa_entry['answers']:
                    answer_text = gold['text']
                    begin = gold['answer_start']

                    records.append({
                        'id': question_id,
                        'context': passage,
                        'question': question_text,
                        'label': [begin, begin + len(answer_text)],
                        'answer': answer_text,
                    })

    return records

In [None]:
import json

# Train on the training split and validate on the dev split. The two paths
# were previously swapped, which also misaligned the saved predictions with
# the train/dev frames that the evaluation cells at the end reload.
train_data = load_json('/kaggle/input/squad-20/train-v2.0.json')
valid_data = load_json('/kaggle/input/squad-20/dev-v2.0.json')

# parse the json structure to return the data as a list of dictionaries

train_list = parse_data(train_data)
valid_list = parse_data(valid_data)
print('--------------------------')

print('Train list len: ',len(train_list))
print('Valid list len: ',len(valid_list))

# converting the lists into dataframes

train_ds = pd.DataFrame(train_list)
val_ds = pd.DataFrame(valid_list)

# Keep only the text columns. Renaming by key (instead of assigning a
# positional list) stays correct even if parse_data changes its key order.
column_names = {'context': 'Paragraph', 'question': 'Question', 'answer': 'Answer'}
train_ds = train_ds.drop(columns=['id', 'label']).rename(columns=column_names)
val_ds = val_ds.drop(columns=['id', 'label']).rename(columns=column_names)

In [None]:
# Preview the first rows of the training frame (Paragraph / Question / Answer).
train_ds.head()

In [None]:
# Preview the first rows of the validation frame.
val_ds.head()

# 2. Preprocessing

# Tokenization

In [None]:
# Replace every text cell with its list of NLTK word tokens.
# `%time` is the IPython line magic that reports how long each statement took.
%time train_ds['Paragraph'] = train_ds['Paragraph'].apply(nltk.word_tokenize)
%time train_ds['Question'] = train_ds['Question'].apply(nltk.word_tokenize)
%time train_ds['Answer'] = train_ds['Answer'].apply(nltk.word_tokenize)
%time val_ds['Paragraph'] = val_ds['Paragraph'].apply(nltk.word_tokenize)
%time val_ds['Question'] = val_ds['Question'].apply(nltk.word_tokenize)
%time val_ds['Answer'] = val_ds['Answer'].apply(nltk.word_tokenize)

In [None]:
# Inspect the training frame after tokenization (cells now hold token lists).
train_ds.head()

In [None]:
# Inspect the validation frame after tokenization.
val_ds.head()

In [None]:
# Longest tokenized paragraph and question across both splits. These two
# values are the fixed sequence lengths used for padding and for the
# one-hot answer labels further down.
paragraph_length = max(
    train_ds['Paragraph'].map(len).max(),
    val_ds['Paragraph'].map(len).max(),
)
question_length = max(
    train_ds['Question'].map(len).max(),
    val_ds['Question'].map(len).max(),
)
print('Max paragraph length:', paragraph_length)
print('Max question length:', question_length)

# Encode answers

In [None]:
# Bookkeeping for answers that could not be located in their paragraph.
num_not_found = 0
not_found = []


# Map answer tokens to one-hot encodings of start and end positions of the answer span extracted from the paragraph
def encode_answer(paragraph_tokens, answer_tokens, length=None):
    '''
    Locates the first contiguous occurrence of `answer_tokens` inside
    `paragraph_tokens` and returns [start, end]: two one-hot lists marking
    the index of the first and of the last answer token.

    paragraph_tokens: sequence of tokens of the paragraph.
    answer_tokens: sequence of tokens of the answer.
    length: size of the returned one-hot lists; defaults to the global
        `paragraph_length`.

    When the answer is empty or does not occur in the paragraph, both lists
    are all zeros, `num_not_found` is incremented and the pair is appended
    to `not_found`.
    '''
    global num_not_found

    if length is None:
        length = paragraph_length

    start = [0] * length
    end = [0] * length

    target = list(answer_tokens)
    span = len(target)
    start_pos = None

    # Compare a window of the answer's size at every offset. The previous
    # single-pass scan never rewound its answer pointer after a failed
    # partial match, so it could report a span that starts mid-answer.
    if span:
        for offset in range(len(paragraph_tokens) - span + 1):
            if list(paragraph_tokens[offset:offset + span]) == target:
                start_pos = offset
                break

    if start_pos is None:
        num_not_found += 1
        not_found.append([paragraph_tokens, answer_tokens])
    else:
        start[start_pos] = 1
        end[start_pos + span - 1] = 1
    return [start, end]

# Row by row, replace the tokenized answer with the [start, end] one-hot pair
# that encode_answer derives from that row's paragraph tokens.
train_ds['Answer'] = train_ds.apply(lambda row: encode_answer(row['Paragraph'], row['Answer']), axis=1)
val_ds['Answer'] = val_ds.apply(lambda row: encode_answer(row['Paragraph'], row['Answer']), axis=1)

In [None]:
# Inspect the training frame after the answers were turned into one-hot labels.
train_ds.head()

# 3. Embeddings - GloVe

In [None]:
# Name of the GloVe file; in this cell it is only printed for reference.
embedding_file = 'glove.6B.50d.txt'
# Dimensionality of each word vector (the literal below holds 50 values).
embedding_size = 50

# Pre-computed vector for out-of-vocabulary tokens (average of all word vectors).
# Reference: https://stackoverflow.com/questions/49239941/what-is-unk-in-the-pretrained-glove-vector-files-e-g-glove-6b-50d-txt
unknown_vector = np.array([-0.12920076, -0.28866628, -0.01224866, -0.05676644, -0.20210965, -0.08389011,
    0.33359843,  0.16045167,  0.03867431,  0.17833012,  0.04696583, -0.00285802,
    0.29099807,  0.04613704,  -0.20923874, -0.06613114, -0.06822549, 0.07665912,
    0.3134014,   0.17848536,  -0.1225775,  -0.09916984, -0.07495987, 0.06413227,
    0.14441176,  0.60894334,  0.17463093,  0.05335403,  -0.01273871, 0.03474107,
    -0.8123879,  -0.04688699, 0.20193407,  0.2031118,   -0.03935686, 0.06967544,
    -0.01553638, -0.03405238, -0.06528071, 0.12250231,  0.13991883, -0.17446303,
    -0.08011883, 0.0849521,   -0.01041659, -0.13705009, 0.20127155, 0.10069408,
    0.00653003,  0.01685157], np.float32)
print(unknown_vector)
print(embedding_file)

In [None]:
import numpy as np
import io

def get_glove_dict(path="/kaggle/input/glove6b50dtxt/glove.6B.50d.txt", encoding='utf-8'):
    '''
    Parses a GloVe word-vector text file and returns a dictionary with the
    words as keys and their pretrained vectors (float32 arrays) as values.

    path: location of the GloVe file; defaults to the Kaggle dataset path.
    encoding: text encoding of the file. It is passed to open() explicitly
        so the result does not depend on the platform's default encoding.

    Each non-empty line is expected to be "<word> <v1> <v2> ...", separated
    by single spaces; blank lines are skipped.
    '''
    glove_dict = {}
    with open(path, "r", encoding=encoding) as f:
        for line in f:
            # Split on the space separator only, so a token that contains
            # other Unicode whitespace is not broken apart.
            values = line.rstrip().split(' ')
            word = values[0]
            if not word:
                continue
            glove_dict[word] = np.asarray(values[1:], "float32")

    return glove_dict

In [None]:
embeddings = get_glove_dict()
# Report the vocabulary size only; printing the mapping itself would dump
# every word vector into the cell output.
print('Loaded word vectors:', len(embeddings))

In [None]:
import numpy as np

# Assuming embeddings and unknown_vector are defined somewhere above this code

def embed(tokens):
    '''
    Looks up the GloVe vector of every token (lower-cased) and returns the
    vectors stacked into one float32 array, in token order. Tokens missing
    from `embeddings` are represented by `unknown_vector`.
    '''
    return np.asarray(
        [embeddings.get(token.lower(), unknown_vector) for token in tokens],
        np.float32,
    )

# Replace every token list in the Paragraph / Question columns with the array
# of word vectors returned by embed. The cells already hold plain Python lists
# after tokenization, so no conversion step is needed before mapping.
train_ds['Paragraph'] = train_ds['Paragraph'].map(embed)
train_ds['Question'] = train_ds['Question'].map(embed)
val_ds['Paragraph'] = val_ds['Paragraph'].map(embed)
val_ds['Question'] = val_ds['Question'].map(embed)

# Convenience aliases for the embedded columns
train_paragraph_vectors = train_ds['Paragraph']
train_question_vectors = train_ds['Question']

val_paragraph_vectors = val_ds['Paragraph']
val_question_vectors = val_ds['Question']

# These are Series shapes, i.e. (number of rows,), not per-row array shapes
print(train_paragraph_vectors.shape)
print(train_question_vectors.shape)
print(val_paragraph_vectors.shape)
print(val_question_vectors.shape)


In [None]:
# Transform word tokens into word embeddings
def embed(tokens):
    '''
    Returns a float32 array holding one GloVe vector per token (lower-cased
    lookup); tokens missing from `embeddings` map to `unknown_vector`.
    '''
    vectors = []
    for token in tokens:
        token = token.lower()
        if token in embeddings:
            vectors.append(embeddings[token])
        else:
            vectors.append(unknown_vector)
    return np.asarray(vectors, np.float32)


def _is_embedded(column):
    '''True when every cell of the column already holds a NumPy array.'''
    return all(isinstance(cell, np.ndarray) for cell in column)


# The preceding cell already embeds these columns. Mapping embed over them a
# second time would call .lower() on float vectors and raise AttributeError,
# so only columns that still contain token lists are converted here.
for frame in (train_ds, val_ds):
    for column_name in ('Paragraph', 'Question'):
        if not _is_embedded(frame[column_name]):
            frame[column_name] = frame[column_name].map(embed)

# Per-row array shapes of the first few embedded paragraphs. (`vectors` is
# local to embed and cannot be printed from here.)
print(train_ds['Paragraph'].head().map(np.shape))


In [None]:
# Inspect the training frame after embedding.
train_ds.head()

# 4. Prepare training and validation data

In [None]:
#print(vector)
# Pad paragraph and question embeddings
def pad_vectors(vectors, target_length, width):
    '''
    Returns a float32 array of shape (target_length, width) whose leading
    rows are `vectors` and whose remaining rows are zeros.

    vectors: array-like of shape (n, width); an empty sequence is accepted
        and yields an all-zero result.
    Raises ValueError when `vectors` has more than `target_length` rows or
    cannot be arranged into rows of `width` values.
    '''
    # reshape gives an empty input the (0, width) shape it needs; for a
    # regular (n, width) input it is a no-op.
    rows = np.asarray(vectors, dtype=np.float32).reshape(-1, width)
    if len(rows) > target_length:
        raise ValueError(
            f'sequence of length {len(rows)} exceeds target length {target_length}'
        )
    padded = np.zeros((target_length, width), dtype=np.float32)
    padded[:len(rows)] = rows
    return padded


def pad_paragraph(vectors):
    '''Zero-pads one embedded paragraph to `paragraph_length` rows.'''
    return pad_vectors(vectors, paragraph_length, embedding_size)


def pad_question(vectors):
    '''Zero-pads one embedded question to `question_length` rows.'''
    return pad_vectors(vectors, question_length, embedding_size)

# Pad every embedded sequence to the common length and collect the arrays in
# plain lists. (The earlier print of `vector` referenced an undefined name.)
paragraph_train = train_ds['Paragraph'].map(pad_paragraph).to_list()
question_train = train_ds['Question'].map(pad_question).to_list()

paragraph_val = val_ds['Paragraph'].map(pad_paragraph).to_list()
question_val = val_ds['Question'].map(pad_question).to_list()

# Report how many sequences were padded rather than printing function objects
print('Padded training rows:', len(paragraph_train), len(question_train))
print('Padded validation rows:', len(paragraph_val), len(question_val))


In [None]:
# Each 'Answer' cell is a [start_one_hot, end_one_hot] pair; split the pairs
# into separate start and end label lists for both splits.
answer_train = train_ds['Answer'].to_list()
answer_val = val_ds['Answer'].to_list()

start_train = [pair[0] for pair in answer_train]
end_train = [pair[1] for pair in answer_train]

start_val = [pair[0] for pair in answer_val]
end_val = [pair[1] for pair in answer_val]

In [None]:
# Turn the Python lists of inputs and labels into float32 constant tensors
paragraph_train = tf.constant(paragraph_train, np.float32)
question_train = tf.constant(question_train, np.float32)

paragraph_val = tf.constant(paragraph_val, np.float32)
question_val = tf.constant(question_val, np.float32)

start_train = tf.constant(start_train, np.float32)
end_train = tf.constant(end_train, np.float32)

start_val = tf.constant(start_val, np.float32)
end_val = tf.constant(end_val, np.float32)

# 5. Feature Extraction - Bidirectional LSTM encoder

In [None]:
# `paragraph_inputs` / `question_inputs` must stay bound to the Input tensors
# because the Model is later built with them as `inputs`. They used to be
# overwritten by the Masking outputs, which left the Masking layers outside
# the model. The masked tensors now get their own names.
# A timestep is masked when all of its features equal `mask_value`, so the
# scalar 0.0 skips exactly the all-zero padding rows.

# LSTM encoder layer for paragraph
paragraph_inputs = tf.keras.Input(shape=(paragraph_length, embedding_size))
paragraph_masked = tf.keras.layers.Masking(mask_value=0.0)(paragraph_inputs)
paragraph_encoded = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_size, return_sequences=True))(paragraph_masked)

# LSTM encoder layer for question
question_inputs = tf.keras.Input(shape=(question_length, embedding_size))
question_masked = tf.keras.layers.Masking(mask_value=0.0)(question_inputs)
question_encoded = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_size, return_sequences=True))(question_masked)

# 6. Paragraph-Question Interaction - Bidirectional attention + One-hop interaction

In [None]:
# Co-attention layer
# NOTE(review): the shape remarks below assume paragraph_encoded is
# (batch, P, H) and question_encoded is (batch, Q, H) — confirm against the
# encoder cell.
# Dot-product affinity between every paragraph position and every question
# position: (batch, P, Q) under that assumption.
scores = tf.matmul(paragraph_encoded, question_encoded, transpose_b=True)
# Softmax over the last axis, i.e. across question positions.
question_weights = tf.nn.softmax(scores)
# Same affinities transposed, normalised across paragraph positions.
paragraph_weights = tf.nn.softmax(tf.transpose(scores, perm=[0,2,1]))
# Paragraph encodings (transposed via transpose_a) combined with question_weights.
question_context = tf.matmul(paragraph_encoded, question_weights, transpose_a=True)
# Stack the transposed question encodings and question_context along axis 1.
question_concat = tf.concat([tf.transpose(question_encoded, perm=[0,2,1]), question_context], axis=1)
# Apply paragraph_weights, then swap the last two axes of the product.
paragraph_context = tf.transpose(tf.matmul(question_concat, paragraph_weights), perm=[0,2,1])

# 7. Span prediction - Unidirectional boundary model

In [None]:
# Answer pointer layer
# Unidirectional LSTM over paragraph_context, returning one output per step.
boundary = tf.keras.layers.LSTM(embedding_size, return_sequences=True)(paragraph_context)
# One start logit per step.
start_pos_logits = tf.keras.layers.Dense(1)(boundary)
# The end head sees the LSTM features concatenated with the start logit.
boundary = tf.keras.layers.Concatenate()([boundary, start_pos_logits])
end_pos_logits = tf.keras.layers.Dense(1)(boundary)

# Drop the trailing size-1 axis produced by Dense(1), then apply a softmax to
# obtain the start and end distributions.
start_pos_distribution = tf.keras.layers.Softmax()(tf.squeeze(start_pos_logits, axis=-1))
end_pos_distribution = tf.keras.layers.Softmax()(tf.squeeze(end_pos_logits, axis=-1))

# 8. Build, fit and evaluate model

In [None]:
def cross_entropy_loss(y_true, y_pred):
    '''
    Categorical cross-entropy between a one-hot position label and a
    predicted position distribution, computed per sample.

    The model is compiled with this single loss for two outputs, so Keras
    calls it once per output (start and end) with batch-first tensors and
    sums the two results. Indexing [0] / [1] on the first axis therefore
    selected the first two samples of the batch, not start and end; the
    reduction now runs over the last (position) axis for every sample.

    y_true: one-hot labels, shape (..., positions). All-zero rows (answers
        that were not located) contribute zero loss.
    y_pred: probabilities with the same shape.
    Returns the loss per sample, shape (...,).
    '''
    y_true = tf.cast(y_true, y_pred.dtype)
    # Clip before the log so a probability of exactly 0 cannot produce
    # -inf (and 0 * -inf = NaN for the zero entries of y_true).
    epsilon = tf.keras.backend.epsilon()
    safe_pred = tf.clip_by_value(y_pred, epsilon, 1.0)
    return -tf.reduce_sum(y_true * tf.math.log(safe_pred), axis=-1)

def exact_match(y_true, y_pred):
    '''
    Metric: 1.0 when the decoded (start, end) pair equals the labelled
    pair, 0.0 otherwise.

    Reads index 0 of y_true / y_pred as the start one-hot / distribution
    and index 1 as the end one-hot / distribution. The predicted end is the
    arg-max of the end distribution restricted to positions at or after
    the predicted start.

    NOTE(review): this metric is registered once for a two-output model and
    the history keys read later are per output ('softmax_1_exact_match'),
    so Keras presumably calls it per output with batch-first tensors. In
    that case [0] and [1] pick the first two samples of the batch rather
    than start and end — confirm, and compute the metric from both outputs
    if so.
    '''
    start_true, end_true = tf.math.argmax(y_true[0], output_type=tf.int32), tf.math.argmax(y_true[1], output_type=tf.int32)
    start_pred = tf.math.argmax(y_pred[0], output_type=tf.int32)
    # Search the end only from the predicted start onwards, then shift back
    end_pred = start_pred + tf.math.argmax(y_pred[1][start_pred:], output_type=tf.int32)

    if start_pred != start_true or end_pred != end_true:
        return float(0)
    else:
        return float(1)

def f1_score(y_true, y_pred):
    '''
    Metric: position-level F1 between the labelled span and the decoded
    span, both taken as inclusive index ranges.

    Reads index 0 of y_true / y_pred as start and index 1 as end, decoding
    the end at or after the predicted start (same rule as exact_match).

    NOTE(review): as with exact_match, Keras presumably calls this once per
    model output with batch-first tensors, in which case [0] / [1] select
    samples rather than start / end — confirm.
    NOTE(review): the Python `range` loop and chained comparisons operate on
    tensors; verify this traces as intended outside eager execution.
    '''
    start_true, end_true = tf.math.argmax(y_true[0], output_type=tf.int32), tf.math.argmax(y_true[1], output_type=tf.int32)
    start_pred = tf.math.argmax(y_pred[0], output_type=tf.int32)
    end_pred = start_pred + tf.math.argmax(y_pred[1][start_pred:], output_type=tf.int32)

    # Smallest window that covers both the true and the predicted span
    start_min = tf.math.minimum(start_true, start_pred)
    end_max = tf.math.maximum(end_true, end_pred)

    true_pos = 0
    false_neg = 0
    false_pos = 0

    # Classify every position of the window by its membership in each span
    for pos in range(start_min, end_max + 1):
        in_true = start_true <= pos <= end_true
        in_pred = start_pred <= pos <= end_pred

        if in_true:
            if in_pred:
                true_pos += 1
            else:
                false_neg += 1
        else:
            if in_pred:
                false_pos += 1

    # Guard against a zero denominator
    if true_pos == 0 and false_neg == 0 and false_pos == 0:
        return float(0)

    return float((2 * true_pos) / (2 * true_pos + false_pos + false_neg))

# Two-input (paragraph, question), two-output (start, end distribution) model
model = tf.keras.Model(
    inputs=[paragraph_inputs, question_inputs],
    outputs=[start_pos_distribution, end_pos_distribution],
)
# One loss and one metric list shared by both outputs
model.compile(
    optimizer='adam',
    loss=cross_entropy_loss,
    metrics=[exact_match, f1_score],
)
model.summary()

In [None]:
# Train for a fixed number of epochs and evaluate on the validation tensors
# after each one; `history` keeps the per-epoch losses and metrics.
epochs = 4
history = model.fit(
    x=[paragraph_train, question_train],
    y=[start_train, end_train],
    epochs=epochs,
    validation_data=(
        [paragraph_val, question_val],
        [start_val, end_val],
    ),
)

In [None]:
# Per-output metrics are stored as '<output name>_<metric name>'. Take the
# name of the second output (the end pointer) from the model instead of
# hard-coding 'softmax_1', which changes whenever the layers are rebuilt in
# the same session.
end_head = model.output_names[1]

em = history.history[f'{end_head}_exact_match']
f1 = history.history[f'{end_head}_f1_score']
loss = history.history['loss']

val_em = history.history[f'val_{end_head}_exact_match']
val_f1 = history.history[f'val_{end_head}_f1_score']
val_loss = history.history['val_loss']

# Use the number of recorded epochs so the x axis always matches the curves
epochs_range = range(len(loss))

plt.figure(figsize=(8, 8))
plt.subplot(2, 2, 1)
plt.plot(epochs_range, em, label='Training EM')
plt.plot(epochs_range, val_em, label='Validation EM')
plt.legend(loc='lower right')
plt.title('Bidirectional LSTM EM')

plt.subplot(2, 2, 2)
plt.plot(epochs_range, f1, label='Training F1')
plt.plot(epochs_range, val_f1, label='Validation F1')
plt.legend(loc='lower right')
plt.title('Bidirectional LSTM F1')

# No plt.show() before this point: showing the figure early made the loss
# panel open in a second, default-sized figure.
plt.subplot(2, 2, 3)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Bidirectional LSTM Loss')

plt.tight_layout()
plt.show()

In [None]:
# Decode one (start, end) index pair per validation row and store the pairs
# as an array in model3_val.npy.
results = model.predict([paragraph_val, question_val])
predictions = []
for start_probs, end_probs in zip(results[0], results[1]):
    span_start = np.argmax(start_probs)
    # The end is searched from the start onwards, so it never precedes it
    span_end = span_start + np.argmax(end_probs[span_start:])
    predictions.append([span_start, span_end])
with open('model3_val.npy', 'wb') as out_file:
    np.save(out_file, np.array(predictions))

In [None]:
# Decode one (start, end) index pair per training row and store the pairs
# as an array in model3_train.npy.
results = model.predict([paragraph_train, question_train])
predictions = []
for start_probs, end_probs in zip(results[0], results[1]):
    span_start = np.argmax(start_probs)
    # The end is searched from the start onwards, so it never precedes it
    span_end = span_start + np.argmax(end_probs[span_start:])
    predictions.append([span_start, span_end])
with open('model3_train.npy', 'wb') as out_file:
    np.save(out_file, np.array(predictions))

In [None]:
# IPython shell escape: list the current directory (presumably to check that
# the .npy prediction files were written).
!ls

In [None]:
# Reload the saved (start, end) predictions. The files were written relative
# to the current working directory, so they are read back the same way
# instead of through an absolute /kaggle/working path.
with open('model3_val.npy', 'rb') as f:
    model3_val = np.load(f)

with open('model3_train.npy', 'rb') as f:
    model3_train = np.load(f)

In [None]:
# Reload both splits untouched (raw text, ids and labels) for scoring
train_data_testing = load_json('/kaggle/input/new-squaddataset/train-v2.0.json')
valid_data_testing = load_json('/kaggle/input/new-squaddataset/dev-v2.0.json')

# flatten the json structure into one dictionary per answer
train_list_testing = parse_data(train_data_testing)
valid_list_testing = parse_data(valid_data_testing)

print('--------------------------')
print('Train list len: ',len(train_list_testing))
print('Valid list len: ',len(valid_list_testing))

# The frames keep parse_data's column names ('id', 'context', 'question',
# 'label', 'answer'); the scoring cells below read them by those names.
train_ds_testing = pd.DataFrame(train_list_testing)
val_ds_testing = pd.DataFrame(valid_list_testing)

In [None]:
# Preview the raw validation rows used for scoring.
val_ds_testing.head()

In [None]:
def _char_set_f1(predicted_text, reference_text):
    '''
    F1 between the sets of distinct characters of the two strings.
    Returns 0 when either string is empty or they share no character.
    '''
    predicted_chars = set(predicted_text)
    reference_chars = set(reference_text)
    if not predicted_chars or not reference_chars:
        return 0
    common = len(predicted_chars & reference_chars)
    if common == 0:
        return 0
    precision = common / len(predicted_chars)
    recall = common / len(reference_chars)
    return 2 * precision * recall / (precision + recall)


predictions = {}
# Many rows share one paragraph, so each distinct context is tokenized once
tokens_by_context = {}

for index, row in val_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    answer = row['answer']

    tokens = tokens_by_context.get(paragraph)
    if tokens is None:
        tokens = tokens_by_context[paragraph] = nltk.word_tokenize(paragraph)

    # The model predicts inclusive *token* positions (it was trained on
    # nltk.word_tokenize output), so the span is cut from the token list.
    # Slicing the raw string with these indices returned unrelated
    # characters. Joining with spaces only approximates the original text.
    m3_start = int(model3_val[index][0])
    m3_end = int(model3_val[index][1])
    m3 = ' '.join(tokens[m3_start:m3_end + 1])

    # Stored under a local name so the Keras metric function `f1_score`
    # defined earlier is not overwritten.
    score = _char_set_f1(m3, answer)

    # Rows with the same id overwrite each other; the last answer wins
    predictions[id_val] = [m3, score]

prediction_validDataset_BiLSTM = 'prediction_validDataset_BiLSTM.txt'
with open(prediction_validDataset_BiLSTM, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

In [None]:
def _char_set_f1(predicted_text, reference_text):
    '''
    F1 between the sets of distinct characters of the two strings.
    Returns 0 when either string is empty or they share no character.
    '''
    predicted_chars = set(predicted_text)
    reference_chars = set(reference_text)
    if not predicted_chars or not reference_chars:
        return 0
    common = len(predicted_chars & reference_chars)
    if common == 0:
        return 0
    precision = common / len(predicted_chars)
    recall = common / len(reference_chars)
    return 2 * precision * recall / (precision + recall)


predictions = {}
# Many rows share one paragraph, so each distinct context is tokenized once
tokens_by_context = {}

for index, row in train_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    answer = row['answer']

    tokens = tokens_by_context.get(paragraph)
    if tokens is None:
        tokens = tokens_by_context[paragraph] = nltk.word_tokenize(paragraph)

    # The model predicts inclusive *token* positions (it was trained on
    # nltk.word_tokenize output), so the span is cut from the token list.
    # Slicing the raw string with these indices returned unrelated
    # characters. Joining with spaces only approximates the original text.
    m3_start = int(model3_train[index][0])
    m3_end = int(model3_train[index][1])
    m3 = ' '.join(tokens[m3_start:m3_end + 1])

    # Stored under a local name so the Keras metric function `f1_score`
    # defined earlier is not overwritten.
    score = _char_set_f1(m3, answer)

    # Rows with the same id overwrite each other; the last answer wins
    predictions[id_val] = [m3, score]

prediction_trainDataset_BiLSTM = 'prediction_trainDataset_BiLSTM.txt'
with open(prediction_trainDataset_BiLSTM, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

# F1 score


In [None]:
# Mean of the stored scores (second item of every entry) in whichever
# `predictions` dictionary was built last; 0 when it is empty.
total_sum = 0
for entry in predictions.values():
    total_sum += entry[1]

num_elements = len(predictions)
average = total_sum / num_elements if num_elements > 0 else 0


In [None]:
# Show the mean score computed in the previous cell.
print(average)