<a href="https://colab.research.google.com/github/Jaskeerat23/QNAModel/blob/main/Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries**

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import string
import random
import re

from sklearn.model_selection import train_test_split
from google.colab import drive

In [2]:
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/QNA_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,subjects,Difficulty,Questions
0,0,Physics,Medium,Discuss the theoretical foundation of Waves an...
1,1,Physics,Medium,Discuss the theoretical foundation of Waves an...
2,2,Physics,Easy,Discuss the theoretical foundation of Relativity.
3,3,Physics,Hard,What is Thermodynamics and how is it observed ...
4,4,Physics,Medium,Discuss the theoretical foundation of Electrom...


In [4]:
def generatePrompt(subject, diff):
  """Build a randomly phrased request for a question on `subject` at level `diff`.

  One template is drawn at random. The whitespace-separated token containing
  `<SUB>` is replaced by `subject` and the one containing `<DIFF>` by `diff`.
  The whole token is swapped, so punctuation glued to a placeholder (the comma
  in "<SUB>,") is not kept in the result.
  """
  templates = (
    "Generate <SUB> question of <DIFF> difficulty",
    "Provide question on <SUB> with <DIFF> difficulty",
    "subject : <SUB>, difficulty : <DIFF>",
    "Give me <DIFF> question of <SUB>",
    "Formulate a theoretical <SUB> question that is <DIFF>",
    "Create one <DIFF> level question from <SUB>",
    "I need a <DIFF> difficulty question in <SUB>",
    "Please generate a conceptual question in <SUB> at <DIFF> level",
    "Suggest a <DIFF> level question from <SUB>",
    "Write a <DIFF> difficulty theoretical question on <SUB>",
    "Prepare a question from <SUB> that is <DIFF> and theoretical",
    "Generate a non-numerical <SUB> question with <DIFF> difficulty",
    "Produce a conceptual <SUB> question categorized as <DIFF>",
    "Create one theoretical and <DIFF> level question from the topic <SUB>"
  )

  # Uniformly random template, tokenised on whitespace.
  tokens = templates[random.randint(0, len(templates) - 1)].split()

  # Position of each placeholder token; stays -1 (the last token) if absent.
  subPos = diffPos = -1
  for pos in range(len(tokens)):
    if tokens[pos].find('<SUB>') >= 0:
      subPos = pos
    if tokens[pos].find('<DIFF>') >= 0:
      diffPos = pos

  tokens[subPos] = subject
  tokens[diffPos] = diff

  return ' '.join(tokens)

generatePrompt("Maths", "easy")

'Provide question on Maths with easy difficulty'

# **Adding a new column in Dataframe "Prompts"**

In [5]:
def addPromptsDataFrame(df):
  """Add a "Prompts" column to `df` in place, one randomly phrased prompt per row.

  Every prompt is produced by `generatePrompt(subject, difficulty)`; the prompts
  are the inputs of the ENCODER module. If `df` already has a "Prompts" column
  the frame is left untouched, so the cell can be re-run without side effects.

  Args:
    df: DataFrame to extend. It is modified in place.

  Returns:
    None.
  """
  # Check up front instead of catching ValueError from insert(): no prompts are
  # generated just to be thrown away, and unrelated ValueErrors are not hidden.
  if "Prompts" in df.columns:
    print("Already done you can see it :)")
    return

  # Positions inside the tuples yielded by df.itertuples(); position 0 is the row index.
  # NOTE(review): assumes subject and difficulty are the 2nd and 3rd columns of the
  # frame -- confirm if the CSV layout changes.
  subIdx = 2
  diffIdx = 3

  prompts = [generatePrompt(dataRow[subIdx], dataRow[diffIdx]) for dataRow in df.itertuples()]

  assert len(prompts) == len(df), f"length of prompt list is {len(prompts)} and that of df is {len(df)}"

  df.insert(3, "Prompts", prompts)

addPromptsDataFrame(df)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,subjects,Difficulty,Prompts,Questions
0,0,Physics,Medium,Formulate a theoretical Physics question that ...,Discuss the theoretical foundation of Waves an...
1,1,Physics,Medium,Please generate a conceptual question in Physi...,Discuss the theoretical foundation of Waves an...
2,2,Physics,Easy,I need a Easy difficulty question in Physics,Discuss the theoretical foundation of Relativity.
3,3,Physics,Hard,Suggest a Hard level question from Physics,What is Thermodynamics and how is it observed ...
4,4,Physics,Medium,Please generate a conceptual question in Physi...,Discuss the theoretical foundation of Electrom...


# **Basic Preprocessing**

Since we are essentially building a language model, the preprocessing step will not remove stopwords or ordinary punctuation. We will only remove special characters such as "?,/+-!@#$%&*" so that they do not hurt performance in Phase-1.

In [6]:
def removeSpecialCharacters(sentence):
  """Return `sentence` with every character outside a-z, A-Z and 0-9 replaced by a space."""
  nonAlphaNumeric = re.compile('[^a-zA-Z0-9]')
  return nonAlphaNumeric.sub(' ', sentence)

print(removeSpecialCharacters("!@#!$%%@%#%@@!Jas"))


              Jas


# **Getting the max lengths of input and output**

In [7]:
import math
def analyzeLengths(df):
  """Print and return the longest prompt and the longest question, in tokens.

  Tokens are counted with `str.split()` (any run of whitespace is one
  separator), the same tokenisation used for padding and vocabulary building
  in this notebook, so repeated spaces do not inflate the lengths.

  Args:
    df: DataFrame with string columns 'Prompts' and 'Questions'.

  Returns:
    Tuple (promptMaxLen, quesMaxLen) of ints; both are 0 for an empty frame.
  """
  promptMaxLen = max((len(prompt.split()) for prompt in df['Prompts']), default = 0)
  quesMaxLen = max((len(question.split()) for question in df['Questions']), default = 0)

  print(f"Max prompt len : {promptMaxLen}")
  print(f"Max question len : {quesMaxLen}")

  return promptMaxLen, quesMaxLen

promptMaxLen, quesMaxLen = analyzeLengths(df)

Max prompt len : 11
Max question len : 13


# **Building Prompt and Question Vocab**

In [8]:
def addUniqueWordstoVocab(feature):
  """Collect the distinct whitespace-separated tokens found in an iterable of sentences.

  Returns a set; an empty iterable gives an empty set.
  """
  return {token for text in feature for token in text.split()}

promptVocab = addUniqueWordstoVocab(df['Prompts'])
print(f"vocab size for prompts are {len(promptVocab)}")
quesVocab = addUniqueWordstoVocab(df['Questions'])
# This line reports the question vocabulary (it was previously labelled "prompts").
print(f"vocab size for questions are {len(quesVocab)}")

# Report the words shared by both vocabularies (sorted so the log is stable between runs).
for word in sorted(promptVocab & quesVocab):
  print(f"Word {word} appeared in both sets")

# The final vocabulary is the union of both sets.
vocab = quesVocab | promptVocab

print(f"Final vocab size is {len(vocab)}")

vocab size for prompts are 43
vocab size for prompts are 90
Word the appeared in both sets
Word in appeared in both sets
Word Physics appeared in both sets
Word is appeared in both sets
Word of appeared in both sets
Word theoretical appeared in both sets
Word with appeared in both sets
Word Chemistry appeared in both sets
Word and appeared in both sets
Final vocab size is 124


# **Indexing words of vocab**

In Phase-1 the maximum prompt length is 11 and the maximum question length is 13 (15 once the start- and end-of-sentence tokens are added), so we set both Tx and Ty to 15.

In [9]:
Tx = Ty = 15
def indexVocab(vocab):
  """Map every word to a unique integer id, with the special tokens first.

  Ids 0-3 are reserved for '<UNK>', '<SOS>', '<EOS>' and '<PAD>'. The remaining
  words receive consecutive ids starting at 4, in sorted order.

  Args:
    vocab: iterable of word strings (typically a set).

  Returns:
    dict mapping word -> id.
  """
  wordIdx = {'<UNK>' : 0, '<SOS>' : 1, '<EOS>' : 2, '<PAD>' : 3}

  # sorted(): the iteration order of a set of strings can change between Python
  # sessions, which would give different ids after a kernel restart and
  # invalidate anything stored against the old ids (e.g. a saved embedding matrix).
  # Words that collide with a special token keep the reserved id.
  newWords = sorted(word for word in set(vocab) if word not in wordIdx)

  for idx, word in enumerate(newWords, len(wordIdx)):
    wordIdx[word] = idx

  return wordIdx

wordIdx = indexVocab(vocab)
# (wordIdx)

# **Doing Basic Preprocessing**



*   Padding short sequences with the padding token
*   Indexing each word in a sequence
*   Appending the end-of-sentence token and prepending the start-of-sentence token to the output sentences (questions)
*   Appending the end-of-sentence token to the input sentences (prompts)



In [10]:
def pushSOS(sent):
  """Prefix `sent` with the start-of-sentence token and a single space."""
  return " ".join(("<SOS>", sent))

def pushEOS(sent):
  """Append a single space and the end-of-sentence token to `sent`."""
  return " ".join((sent, "<EOS>"))

def padSeq(sent, maxLen = None):
  """Right-pad `sent` with '<PAD>' tokens until it holds `maxLen` tokens.

  Args:
    sent: sentence whose tokens are separated by whitespace.
    maxLen: target number of tokens. Defaults to the notebook-wide `Ty`, which
      keeps existing `padSeq(sent)` calls unchanged; pass a value explicitly to
      pad to a different length (e.g. `Tx` for encoder inputs).

  Returns:
    The tokens joined by single spaces. A sentence that already has `maxLen`
    or more tokens is returned without padding and without truncation.
  """
  if maxLen is None:
    maxLen = Ty

  tokens = sent.split()
  padding = ['<PAD>'] * max(maxLen - len(tokens), 0)
  return " ".join(tokens + padding)

# NOTE: this cell rewrites the columns in place, so running it twice appends a
# second <EOS>/<SOS>; re-create df before re-running it.

# Encoder inputs: prompt + <EOS>, then padding.
df['Prompts'] = df['Prompts'].apply(pushEOS).apply(padSeq)
# Decoder sequences: <SOS> + question + <EOS>, then padding.
df['Questions'] = df['Questions'].apply(pushEOS).apply(pushSOS).apply(padSeq)

# padSeq does not truncate, so check that nothing in either column exceeds Ty tokens.
for column in ('Prompts', 'Questions'):
  for sentence in df[column]:
    assert len(sentence.split()) == Ty, f"Length of a {column} entry is {len(sentence.split())}, expected {Ty}"

df['Prompts'].iloc[0], df['Questions'].iloc[0]

('subject : Physics difficulty : Medium <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 '<SOS> Discuss the theoretical foundation of Waves and Optics. <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>')

In [11]:
def convertWordsToIndex(sentList : pd.Series, wordIdx):
  """Turn each sentence into the list of integer ids of its tokens.

  Args:
    sentList: iterable of sentences (e.g. a DataFrame column); each sentence is
      tokenised with `str.split()`.
    wordIdx: dict mapping word -> id.

  Returns:
    List with one list of ids per sentence, in input order.

  Raises:
    KeyError: for an unknown word when `wordIdx` has no '<UNK>' entry.
  """
  unkIdx = wordIdx.get('<UNK>')

  def toIndex(word):
    # Known word, or no '<UNK>' id to fall back on (then the lookup raises KeyError).
    if word in wordIdx or unkIdx is None:
      return wordIdx[word]
    # Out-of-vocabulary word: use the id reserved for unknown tokens.
    return unkIdx

  return [[toIndex(word) for word in sentence.split()] for sentence in sentList]


indexedQuestions = convertWordsToIndex(df['Questions'], wordIdx)
indexedPrompts = convertWordsToIndex(df['Prompts'], wordIdx)

# **Creating Embedding Matrix**

In [None]:
def createEmbeddingMatrix(vocab, wordIdx, filePath = "/content/drive/MyDrive/glove.42B.300d.txt", embDim = 300):
  """Fill an embedding matrix with the pre-trained vectors of the words in `vocab`.

  The file is scanned line by line and scanning stops as soon as every
  requested word has been found.

  Args:
    vocab: collection of words to look up in the embedding file.
    wordIdx: dict mapping word -> row index; the matrix has len(wordIdx) rows.
    filePath: text file with one "<word> <v1> ... <vN>" entry per line.
    embDim: number of values per vector (300 for the default GloVe file).

  Returns:
    np.ndarray of shape (len(wordIdx), embDim). Rows of words that do not occur
    in the file (including the special tokens) stay all-zero. The lookup is
    case-sensitive.
  """
  embeddingMatrix = np.zeros(shape = (len(wordIdx), embDim))

  print(f"Embedding matrix initialized with shape {embeddingMatrix.shape}")

  # Words still to be found. Only vocab words can ever match, so the stop
  # condition is based on them and not on len(wordIdx), which also counts the
  # special tokens and therefore could never be reached.
  pending = {word for word in vocab if word in wordIdx}
  wordCount = 0 # Number of words whose vector has been copied into the matrix

  with open(file = filePath, mode = 'r', encoding = 'utf-8') as embeddingFile:
    for line in embeddingFile:
      if not pending:
        break # Everything found: do not read the rest of a multi-GB file

      parts = line.split()
      if not parts:
        continue # Skip blank lines

      key = parts[0]
      if key in pending:
        embeddingMatrix[wordIdx[key], : ] = np.array(parts[1:], dtype = np.float32)
        pending.discard(key)
        wordCount += 1

  if pending:
    print(f"{wordCount} words have been embedded, {len(pending)} were not found in the file")
  else:
    print(f"All {wordCount} words have been embedded :)")

  return embeddingMatrix

embeddingMatrix = createEmbeddingMatrix(vocab, wordIdx)

Embedding matrix initialized with shape (128, 300)


KeyboardInterrupt: 

In [None]:
np.save(file = "/content/drive/MyDrive/PHASE-1_EMBEDDING_MATRIX_DIM_124_300.npy", arr = embeddingMatrix)

NameError: name 'embeddingMatrix' is not defined

In [12]:
# Load the embedding matrix cached on Drive instead of scanning the GloVe file again.
# NOTE(review): the rows were written at the positions given by wordIdx when the file
# was created -- confirm that mapping is identical to the current wordIdx, otherwise
# words are paired with the wrong vectors.
embeddingMatrix = np.load(file = "/content/drive/MyDrive/PHASE-1_EMBEDDING_MATRIX_DIM_124_300.npy")
embeddingMatrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.035835  ,  0.77844   , -0.51806003, ...,  0.022321  ,
        -0.31591001, -0.35608   ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.56686997,  0.1235    ,  0.19329   , ...,  0.026026  ,
         0.13852   ,  0.017205  ]])


# **Encoder Module**

Our Encoder module currently has 4 layers and the embeddings have size 300. As a rule of thumb, and to keep things simple, I have set the hidden size to 128 units. It may change in later phases.

The nn.LSTM has been initialized with 4 layers. For an attention-based encoder-decoder model we need the hidden state of the last layer at every time step (used by the attention) and the final hidden and cell states of every layer (used to initialize the decoder); all of these are provided by the LSTM's outputs.

nn.LSTM returns -


1.   The hidden states of the final layer at each time step
2.   The final hidden state of each layer
3.   The final cell state of each layer

Outputs (2, 3) are returned together in a tuple, which is stored in the variable 'tup'



In [13]:
class Encoder(nn.Module):
  """Stacked LSTM that encodes a batch of embedded input sequences.

  Args:
    ipSize: size of each input embedding vector.
    hiddenSize: number of hidden units per LSTM layer.
    Tx: input sequence length. Not used by the module; kept so existing
      `Encoder(ipSize, hiddenSize, Tx)` calls keep working.
    numLayers: number of stacked LSTM layers (default 4, the previous fixed value).
    dropout: dropout applied between LSTM layers (default 0.5, the previous fixed value).
  """
  def __init__(self, ipSize, hiddenSize, Tx, numLayers = 4, dropout = 0.5):
    super().__init__()
    self.lstm = nn.LSTM(input_size = ipSize, hidden_size = hiddenSize, num_layers = numLayers, dropout = dropout, batch_first = True)

  def forward(self, emb):
    """Encode `emb` of shape [batch, seq, ipSize].

    Returns:
      encoded: last-layer hidden state at every time step, [batch, seq, hiddenSize].
      (hiddenFinal, cellFinal): final hidden and cell state of every layer,
        each [numLayers, batch, hiddenSize].
    """
    encoded, (hiddenFinal, cellFinal) = self.lstm(emb)
    return encoded, (hiddenFinal, cellFinal)

enc = Encoder(300, 128, 15)
randArr = np.random.randn(32, 15, 300)
encoded, tup = enc(torch.from_numpy(randArr).type(torch.float))
encoded.shape, tup[0].shape, tup[1].shape

(torch.Size([32, 15, 128]), torch.Size([4, 32, 128]), torch.Size([4, 32, 128]))

# **Attention Module**

In [None]:
lyr = nn.Linear(in_features = 128, out_features = 64)
arr1 = torch.randn(1, 2, 128)
lyr(arr1).shape

torch.Size([1, 2, 64])

In [None]:
encHiddenStates = torch.randn(1, 1, 128)
decHiddenStates = torch.randn(1, 1, 128)
X = torch.cat((encHiddenStates, decHiddenStates), dim = 1)
X.shape

torch.Size([1, 2, 128])

In [None]:
arr = torch.randn(size = (32, 128))
arr.unsqueeze(1).repeat(1, Tx, 1).shape

torch.Size([32, 15, 128])

In [14]:
class AttentionModule(nn.Module):
  """Additive attention over the encoder time steps.

  For every time step the score is v(tanh(encLayer(h_enc) + decLayer(h_dec))).
  The scores are normalised over the time axis and used to form a weighted
  average of the encoder hidden states.

  Args:
    batchSize: not used by the module; kept so existing calls keep working.
    hiddenDim: size of the encoder and decoder hidden states.
    Tx: not used by the module (the length is read from the input); kept so
      existing calls keep working.
  """
  def __init__(self, batchSize, hiddenDim, Tx):
    super().__init__()
    self.encLayer = nn.Linear(in_features = hiddenDim, out_features = hiddenDim)
    self.decLayer = nn.Linear(in_features = hiddenDim, out_features = hiddenDim)
    self.v = nn.Linear(in_features = hiddenDim, out_features = 1)
    # The energies have shape [B, Tx, 1]; the weights must sum to 1 across the
    # time axis (dim 1). A softmax over the last axis (size 1) would turn every
    # weight into 1 and the context into a plain sum of the encoder states.
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, encHiddenStates, decHiddenStates : torch.Tensor):
    """Compute the context vector for one decoder step.

    Args:
      encHiddenStates: encoder hidden states, [B, Tx, H].
      decHiddenStates: current decoder hidden state, [B, H].

    Returns:
      Context vector of shape [B, H].
    """
    # [B, H] -> [B, 1, H]; broadcasting spreads it over the Tx time steps.
    decProjected = self.decLayer(decHiddenStates).unsqueeze(dim = 1)

    energies = self.v(torch.tanh(self.encLayer(encHiddenStates) + decProjected)) # [B, Tx, 1]
    alpha = self.softmax(energies) # attention weights, sum to 1 over Tx

    context = torch.sum(alpha * encHiddenStates, dim = 1)
    return context

In [15]:
attentionTest = AttentionModule(32, 128, 15)
encHiddenStates = torch.randn(size = (32, 15, 128))
decHiddenStates = torch.randn(size = (32, 128))
attentionTest(encHiddenStates, decHiddenStates).shape

torch.Size([32, 128])

# **Decoder Module**

In [None]:
class Decoder(nn.Module):
  """Attention-based LSTM decoder driven by teacher forcing.

  At every step the embedding of the current target token is concatenated with
  an attention context vector computed from the encoder hidden states and the
  decoder's current top-layer hidden state, and fed through a 4-layer LSTM.

  Args:
    embDim: size of the target token embeddings.
    hiddenDim: hidden size of the LSTM (must equal the encoder hidden size,
      because the encoder's final states initialise the decoder).
    Tx: input sequence length, passed on to the attention module.
    Ty: output sequence length; stored on the module.
  """
  def __init__(self, embDim, hiddenDim, Tx, Ty):
    super().__init__()
    self.lstm = nn.LSTM(input_size = embDim + hiddenDim, hidden_size = hiddenDim, num_layers = 4, dropout = 0.5, batch_first = True)
    self.attention = AttentionModule(32, hiddenDim, Tx)
    self.Tx = Tx
    self.Ty = Ty

  def forward(self,
              encoderHiddenStates : torch.Tensor, #The hidden states of last layer of LSTM (encoder)
              encFinal : torch.Tensor, #The final hidden states of each layer of LSTM (encoder)
              encCellFinal : torch.Tensor, #The final cell states of each layer of LSTM (encoder)
              X : torch.Tensor #The input Sequences to LSTM (decoder) during training -> Teacher Forcing
              ):
    """Decode one step per time step of `X`.

    Args:
      encoderHiddenStates: [batch, Tx, hiddenDim] (the encoder uses batch_first).
      encFinal: [num_layers, batch, hiddenDim], initial decoder hidden state.
      encCellFinal: [num_layers, batch, hiddenDim], initial decoder cell state.
      X: embedded target sequence, [batch, steps, embDim].

    Returns:
      Tensor [batch, steps, hiddenDim] with the top-layer output of every step.
    """
    outputSeq = []
    hidden, cell = encFinal, encCellFinal

    # One LSTM step per target position. X is only read here: the step input gets
    # its own name, because overwriting X with the [batch, 1, ...] step tensor
    # makes X[:, 1, :] fail on the second step.
    for step in range(X.shape[1]):
      # Attend with the top layer's current hidden state.
      contextVector = self.attention(encoderHiddenStates, hidden[-1])
      stepInput = torch.cat(tensors = (X[:, step, :], contextVector), dim = -1).unsqueeze(1)
      output, (hidden, cell) = self.lstm(stepInput, (hidden, cell))
      outputSeq.append(output)

    return torch.cat(outputSeq, dim = 1)