In [21]:
!pip install --quiet transformers[sentencepiece]
!pip install --quiet tokenizers

!pip install --quiet torchtext
!pip install --quiet pytorch-lightning
!pip install --quiet sentencepiece
!pip install lightning[extra]




In [22]:
!pip install --quiet pandas numpy

In [23]:
import json
import pandas as pd
import numpy as np
import random
import logging
import glob
import os
import re
import argparse
import time
from string import punctuation
import torch
from torch.utils.data import DataLoader,Dataset
import textwrap
from pathlib import Path
import pytorch_lightning as pl
# from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW,T5ForConditionalGeneration,T5Tokenizer,get_linear_schedule_with_warmup

In [24]:
#creating dataframe using this data
def extract_ques_ans(path):
  with Path(path).open() as json_file:
    data=json.load(json_file)
  # for i in range(len(data['data'])):
  listofdata = data['data']
  data_rows=[]
  for i in listofdata:
    questions=i['paragraphs']
    for question in questions:
      context=question['context']
      for ques_and_ans in question['qas']:
        question=ques_and_ans['question']
        answers=ques_and_ans['answers']

        for answer in answers:
          answer_text=answer['text']
          answer_start=answer['answer_start']
          # answer_end=answer_start + len(answer_text)

          data_rows.append(
              {
                  'question':question,
                  'context':context,
                  'answer_text':answer_text,
                  'answer_start':answer_start,
                  # 'answer_end':answer_end
              }
          )
  return pd.DataFrame(data_rows)

In [25]:
train_files_path=sorted(list(Path(r"./").glob("train*")))
# train_files_path
#appending all the data
dfs=[]
for path in train_files_path:
  df=extract_ques_ans(path)
  dfs.append(df)

train_df=pd.concat(dfs)
train_df = train_df.iloc[1:20,:]
print(train_df.size)

76


In [26]:
test_files_path=sorted(list(Path(r"./").glob("test*")))
# test_files_path
#appending all the data
dfs=[]
for path in test_files_path:
  df=extract_ques_ans(path)
  dfs.append(df)

test_df=pd.concat(dfs)

In [27]:
val_files_path=sorted(list(Path(r"./").glob("val*")))
# val_files_path
#appending all the data
dfs=[]
for path in val_files_path:
  df=extract_ques_ans(path)
  dfs.append(df)

val_df=pd.concat(dfs)

In [28]:
#tokenization
!pip install --quiet sentencepiece
model_name='t5-base'
tokenizer=T5Tokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
#as per the huggingface t5 documentation pad values should be -100 are ignored (masked) and model take padded values this way only
# labels[labels==0]=-100
# labels = list()

In [30]:
#creating training dataset
class MASHQADataset(Dataset):
  def __init__(self,dataframe,tokenizer,source_max_len=396,tar_max_len=32):
    super().__init__()
    self.tokenizer=tokenizer
    self.source_max_len=source_max_len
    self.tar_max_len=tar_max_len
    self.dataframe=dataframe

  def __len__(self):
    return len(self.dataframe)
  def __getitem__(self,index):
    data_row=self.dataframe.iloc[index]

    source_encoding=tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.source_max_len,
      padding='max_length',
      truncation='only_second',
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors='pt'
      )
    target_encoding=tokenizer(
        data_row['answer_text'],
        max_length=self.tar_max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'

    )
    labels=target_encoding['input_ids']
    labels[labels==0]=-100

    return  dict(
        question=data_row['question'],
        context=data_row['context'],
        answer_text=data_row['answer_text'],
        input_ids=source_encoding['input_ids'].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )



In [31]:
class MASHQADataModule(pl.LightningDataModule):
  def __init__(self,train_df,test_df,tokenizer,batch_size=8,src_max_len=396,tar_max_len=32):
    super().__init__()
    self.train_df=train_df
    self.test_df=test_df
    self.tokenizer=tokenizer
    self.batch_size=batch_size
    self.src_max_len=src_max_len
    self.tar_max_len=tar_max_len
    print(self.batch_size)

  def setup(self, stage=None):
    self.train_dataset=MASHQADataset(self.train_df,self.tokenizer,self.src_max_len,self.tar_max_len)
    self.test_dataset=MASHQADataset(self.test_df,self.tokenizer,self.src_max_len,self.tar_max_len)

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=2
    )

  def val_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=2
    )
  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=1,
      num_workers=2
  )

In [32]:
batch_size=8
no_epochs=1
data_module=MASHQADataModule(train_df,val_df,tokenizer,batch_size=batch_size)
data_module.setup()

8


In [33]:
class Model(pl.LightningModule):
  def __init__(self):
    super().__init__()
    self.model=T5ForConditionalGeneration.from_pretrained(model_name,return_dict=True)
  def forward(self,input_ids,attention_mask,labels=None):
    output=self.model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)

    return output.loss,output.logits

  def training_step(self,batch,batch_idx):
    input_ids=batch['input_ids']
    attention_mask=batch['attention_mask']
    labels=batch['labels']
    loss,output=self(input_ids,attention_mask,labels)

    return loss
  def validation_step(self,batch,batch_idx):
    input_ids=batch['input_ids']
    attention_mask=batch['attention_mask']
    labels=batch['labels']
    loss,output=self(input_ids,attention_mask,labels)

    return loss
  def test_step(self,batch,batch_idx):
    input_ids=batch['input_ids']
    attention_mask=batch['attention_mask']
    labels=batch['labels']
    loss,output=self(input_ids,attention_mask,labels)

    return loss

  def configure_optimizers(self):
    return torch.optim.AdamW(self.parameters(),lr=0.0001)



In [34]:
model=Model()

trainer=pl.Trainer(
    max_epochs=no_epochs
)

# training
trainer.fit(model,data_module)

trained_model = trainer.model

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (3) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [35]:
torch.save(trained_model.state_dict(), "./model1")

In [36]:
# import torch
# model_100 = torch.jit.script(trained_model)
# model_100.save('model_100.pt')

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from operator import itemgetter

def getContextList():
    # load json files
    contexts = []
    with open("./val_webmd_squad_v2_consec.json", "r+") as f:
        wrapper_dict = json.load(f)
        data = wrapper_dict["data"]
        contexts = list(set([d["paragraphs"][0]["context"] for d in data])) # list of unique contexts
    with open("./val_webmd_squad_v2_full.json", "r+") as f:
        wrapper_dict = json.load(f)
        data = wrapper_dict["data"]
        contexts.extend(list(set([d["paragraphs"][0]["context"] for d in data]))) # list of unique contexts    return contexts
    # with open("./train_webmd_squad_v2_full.json", "r+") as f:
    #     wrapper_dict = json.load(f)
    #     data = wrapper_dict["data"]
    #     contexts.extend(list(set([d["paragraphs"][0]["context"] for d in data]))) # list of unique contexts    return contexts
    with open("./test_webmd_squad_v2_consec.json", "r+") as f:
        wrapper_dict = json.load(f)
        data = wrapper_dict["data"]
        contexts.extend(list(set([d["paragraphs"][0]["context"] for d in data]))) # list of unique contexts    return contexts
    # with open("./test_webmd_squad_v2_full.json", "r+") as f:
    #     wrapper_dict = json.load(f)
    #     data = wrapper_dict["data"]
    #     contexts.extend(list(set([d["paragraphs"][0]["context"] for d in data]))) # list of unique contexts    return contexts
    with open("./train_webmd_squad_v2_consec.json", "r+") as f:
        wrapper_dict = json.load(f)
        data = wrapper_dict["data"]
        contexts.extend(list(set([d["paragraphs"][0]["context"] for d in data]))) # list of unique contexts
    return contexts

def getMatchingContext(query, contexts):
    # Preprocess the query
    query = " ".join(query.lower().split())

    # Calculate cosine similarity for each context string in all dataframes
    tfidf_vectorizer = TfidfVectorizer()
    query_tfidf = tfidf_vectorizer.fit_transform([query])
    context_tfidf = tfidf_vectorizer.transform(contexts)
    context_similarities = cosine_similarity(query_tfidf, context_tfidf)

    # associate contexts with their similarities
    pairs = [(i, context_similarities[i]) for i in range(len(context_similarities))]
    pairs.sort(reverse=True, key=itemgetter(1))
    # print(contexts[0])
    # print(pairs[0])
    return contexts[pairs[0][0]]

In [46]:

def generate_answer(sample_question, cxts):
  source_encoding=tokenizer(
  # sample_question['question'],
  sample_question,
  # sample_question['context'],
  getMatchingContext(sample_question, cxts),
  max_length=396,
  padding='max_length',
  truncation='only_second',
  return_attention_mask=True,
  add_special_tokens=True,
  return_tensors='pt'
  )

  generated_ids=trained_model.model.generate(
      input_ids=source_encoding['input_ids'],
      attention_mask=source_encoding['attention_mask'],
      num_beams=1,
      max_length=80,
      repetition_penalty=2.5,
      length_penalty=1.0,
      early_stopping=True,
      #use_cache=True
  )

  #print('predicted ids',generated_ids)
  pred=[
        tokenizer.decode(generated_id,skip_special_tokens=True,clean_up_tokenization_spaces=True) for generated_id in generated_ids
  ]

  return " ".join(pred), source_encoding

In [43]:
# for i in range(40,44):#running on some samples
#   sample_qu=val_df.iloc[i]#12
#   print('original---------------')
#   print(sample_qu['question'])
#   #print(sample_qu['context'])
#   print('original answer is ',sample_qu['answer_text'])

#   print('model prediction------------------------')
#   answer,source_encoding = generate_answer(sample_qu)
#   #print('source_encoding',source_encoding['input_ids'])
#   print('predicted answer',answer)


#   print('----------------------------------------------------')

  # from transformers import T5Tokenizer, T5ForConditionalGeneration

  # tokenizer = T5Tokenizer.from_pretrained('t5-small')
  # model = T5ForConditionalGeneration.from_pretrained('t5-small')


In [47]:
sample_q=input()
print('Question:', sample_q)
cxts = getContextList()
answer,source_encoding = generate_answer(sample_q, cxts)
print('Predicted answer:',answer)

print('----------------------------------------------------')

What is depression
Question: What is depression




Predicted answer: Do you have a partner who isn't sleeping well? Does your not sleeping at all? Or do they both You can help reduce the noise. turn and on during the day,??ly raise your bed? Does your? every night. Exercise daily for 20 minutes before going to sleep. insomnia
----------------------------------------------------
