In [5]:
!pip install -q -U transformers[sentencepiece] rouge git+https://github.com/deepset-ai/haystack.git grpcio-tools==1.34.1 spacy

In [6]:
import re
import spacy
import nltk
import json
import pandas as pd 
from tqdm import tqdm
from rouge import Rouge
from pprint import pprint
from typing import List
from haystack import Document
from haystack.reader import TransformersReader
from haystack.pipeline import ExtractiveQAPipeline 
from haystack.retriever.dense import DensePassageRetriever 
from haystack.document_store.faiss import FAISSDocumentStore
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [7]:
!spacy download en_core_web_md 
!spacy link en_core_web_md en

Collecting en-core-web-md==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl (45.4 MB)
[K     |████████████████████████████████| 45.4 MB 18 kB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ As of spaCy v3.0, model symlinks are not supported anymore. You can
load trained pipeline packages using their full names or from a directory
path.[0m


In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
from spacy.lang.en import English
# nlp = spacy.load('en_core_web_md')
nlp = English() 
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb859acc730>

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# !unzip "drive/MyDrive/titleText Datasets.zip" -d drive/MyDrive/

In [11]:
with open('drive/MyDrive/qa-SQUAD.json', "r") as f:
    qa = json.loads(f.read())['data']

# df = pd.read_csv('drive/MyDrive/titleText-paragraphs.csv', index_col=0)
df = pd.read_csv('drive/MyDrive/titleText-threeSentences.csv', index_col=0)

# df = df.replace(r'\n',' ', regex=True) 
df = df.reset_index()

In [12]:
df.iloc[0]['text']

'Tioconazole is an antifungal medication that fights infections caused by fungus. 1-Day (for use in the vagina) is used to treat vaginal Candida (yeast) infections. 1-Day may also be used for other purposes not listed in this medication guide.'

In [13]:
titles = list(df["title"].values)
texts  = list(df["text"].values)
documents: List[Document] = []
 
for title, text in zip(titles, texts):
    documents.append(
        Document(
            text=text,
            meta={
                "name": title or ""
            }
        )
    )

In [24]:
document_store = FAISSDocumentStore(
    similarity="dot_product",
    faiss_index_factory_str="Flat",
    return_embedding=True
)

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

# retriever = DensePassageRetriever(
#     document_store=document_store,
#     query_embedding_model="drive/MyDrive/bert-large-finetuned",
#     passage_embedding_model="drive/MyDrive/bert-large-finetuned",
#     use_gpu=True,
#     embed_title=True,
# )

document_store.delete_documents()
document_store.write_documents(documents)
document_store.update_embeddings(
    retriever=retriever
)

Updating Embedding:   0%|          | 0/16341 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  61%|██████    | 10000/16341 [05:05<03:14, 32.68 docs/s]

Create embeddings:   0%|          | 0/6352 [00:00<?, ? Docs/s]

Documents Processed: 20000 docs [08:19, 40.01 docs/s]


In [25]:
# reader = TransformersReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512", use_gpu=0)
# reader = TransformersReader(model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad", use_gpu=0)

reader = TransformersReader(model_name_or_path="ktrapeznikov/albert-xlarge-v2-squad-v2", 
                            context_window_size=175,
                            max_seq_len=256,
                            doc_stride=128,
                            use_gpu=0)

# reader = TransformersReader(model_name_or_path="drive/MyDrive/bert_basefi_qafi", 
#                             context_window_size=70,
#                             max_seq_len=256,
#                             doc_stride=128,
#                             use_gpu=0)

pipe = ExtractiveQAPipeline(reader, retriever)

# Answers Bleu and Rouge

In [16]:
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougel_scores = []
context_detection = []
context_accuracy = []


rouge = Rouge()
smoothie = SmoothingFunction().method4

for data in tqdm(qa):
    true_context = data['paragraphs'][0]['context']
    # true_context = true_context.replace('\n', ' ')

    for q_a in data['paragraphs'][0]['qas']:
        question = q_a['question']
        reference = " ".join([answer['text'] for answer in q_a['answers']])
        reference_sents = nlp(reference)
        reference_sents = list(reference_sents.sents)
        reference_sents = [sent.text.lstrip().rstrip() for sent in reference_sents]

        preds = pipe.run(
            query=question,
            params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}}
            )

        candidate_sent_list = []

        for pred in preds['answers']:
            pred_answer = pred['answer']

            if pred_answer is not None:
                offset_start = pred['offset_start']
                offset_end = pred['offset_end']
                meta_name = pred['meta']['vector_id']

                pred_all_context_sents= []

                for pred in preds['documents']:
                    pred_all_context_sents += list(nlp(pred.to_dict()['text']).sents)

                    if pred.to_dict()['meta']['vector_id'] == meta_name:
                        pred_context = pred.to_dict()['text']
                        pred_context_sents = nlp(pred_context)
                        pred_context_sents = list(pred_context_sents.sents)
                        pred_context_sents = [sent.text for sent in pred_context_sents]
                        # pred_context = " ".join(pred_context_sents)

                pred_all_context_sents = [re.sub(r'\n+', ' ', sent.text).strip() for sent in pred_all_context_sents]

                doc = nlp(pred_answer)
                pred_answer_sents = list(doc.sents)
                pred_answer_sents = [sent.text for sent in pred_answer_sents] 


                for i, pred_context_sent in enumerate(pred_context_sents):
                    start_index = 0
                    end_index = len(pred_answer) 

                    for pred_answer_sent in pred_answer_sents:
                        right_reduction = len(pred_answer_sent) - len(pred_answer_sent.rstrip())
                        left_reduction = len(pred_answer_sent) - len(pred_answer_sent.lstrip())
                        end_index -= len(pred_answer_sent) + 0 if pred_context_sents[-1] == pred_answer_sent else 1

                        context_offset_start = pred_context.find(pred_context_sent)
                        context_offset_end = pred_context.find(pred_context_sent) + len(pred_context_sent)

                        if  context_offset_start - left_reduction <= offset_start + start_index and context_offset_end + right_reduction >= offset_end - end_index:
                            candidate_sent_list.append(pred_context_sent)
                        
                        start_index += len(pred_answer_sent) + 0 if pred_context_sents[-1] == pred_answer_sent else 1

                for reference_sent in reference_sents:
                    context_truth = 0

                    if reference_sent in pred_all_context_sents:
                        context_truth = 1
        
                    context_accuracy.append(context_truth)

        candidate_sent_set = set(candidate_sent_list)
        candidate = " ".join(candidate_sent_set)
        token_reference = nltk.word_tokenize(reference)
        token_candidate = nltk.word_tokenize(candidate)

        bleu_score = sentence_bleu(token_reference, 
                                    token_candidate, 
                                    smoothing_function=smoothie, 
                                    weights=(1, 0, 0, 0))
        rouge_score = rouge.get_scores(candidate, reference)

        bleu_scores.append(bleu_score)
        rouge1_scores.append(rouge_score[0]['rouge-1']['f'])
        rouge2_scores.append(rouge_score[0]['rouge-2']['f'])
        rougel_scores.append(rouge_score[0]['rouge-l']['f'])

100%|██████████| 4/4 [00:32<00:00,  8.02s/it]


In [17]:
context_accuracy.count(1)/len(context_accuracy)

0.16326530612244897

In [18]:
print("bleu -->", sum(bleu_scores)/len(bleu_scores))
print("rouge1 -->", sum(rouge1_scores)/len(rouge1_scores))
print("rouge2 -->", sum(rouge2_scores)/len(rouge2_scores))
print("rougel -->", sum(rougel_scores)/len(rougel_scores))

bleu --> 0.058515905624942674
rouge1 --> 0.2620780277795925
rouge2 --> 0.1480834323458272
rougel --> 0.24051164554439602


# Sample

#### Change data_index, question_index values to generate new samples


In [23]:
data_index = 0
data = qa[data_index]
true_context = data['paragraphs'][0]['context']
# true_context = true_context.replace('\n', ' ')
question_index = 0
q_a = data['paragraphs'][0]['qas'][question_index]
question = q_a['question']
reference = " ".join([answer['text'] for answer in q_a['answers']])
reference_sents = nlp(reference)
reference_sents = list(reference_sents.sents)
reference_sents = [sent.text.lstrip().rstrip() for sent in reference_sents]

preds = pipe.run(
    query=question,
    params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}}
    )

candidate_sent_list = []

for pred in preds['answers']:
    pred_answer = pred['answer']

    if pred_answer is not None:
        offset_start = pred['offset_start']
        offset_end = pred['offset_end']
        meta_name = pred['meta']['vector_id']

        pred_all_context_sents= []

        for pred in preds['documents']:
            pred_all_context_sents += list(nlp(pred.to_dict()['text']).sents)

            if pred.to_dict()['meta']['vector_id'] == meta_name:
                pred_context = pred.to_dict()['text']
                pred_context_sents = nlp(pred_context)
                pred_context_sents = list(pred_context_sents.sents)
                pred_context_sents = [sent.text for sent in pred_context_sents]
                # pred_context = " ".join(pred_context_sents)

        pred_all_context_sents = [re.sub(r'\n+', ' ', sent.text).strip() for sent in pred_all_context_sents]

        doc = nlp(pred_answer)
        pred_answer_sents = list(doc.sents)
        pred_answer_sents = [sent.text for sent in pred_answer_sents] 


        for i, pred_context_sent in enumerate(pred_context_sents):
            start_index = 0
            end_index = len(pred_answer) 

            for pred_answer_sent in pred_answer_sents:
                right_reduction = len(pred_answer_sent) - len(pred_answer_sent.rstrip())
                left_reduction = len(pred_answer_sent) - len(pred_answer_sent.lstrip())
                end_index -= len(pred_answer_sent) + 0 if pred_context_sents[-1] == pred_answer_sent else 1

                context_offset_start = pred_context.find(pred_context_sent)
                context_offset_end = pred_context.find(pred_context_sent) + len(pred_context_sent)

                if  context_offset_start - left_reduction <= offset_start + start_index and context_offset_end + right_reduction >= offset_end - end_index:
                    candidate_sent_list.append(pred_context_sent)
                
                start_index += len(pred_answer_sent) + 0 if pred_context_sents[-1] == pred_answer_sent else 1

        for reference_sent in reference_sents:
            context_truth = 0

            if reference_sent in pred_all_context_sents:
                context_truth = 1

            context_accuracy.append(context_truth)

candidate_sent_set = set(candidate_sent_list)
candidate = " ".join(candidate_sent_set)


print(f"""Question:  {question}
###################################
Reference answers: {reference_sents}
###################################
Reference context: {true_context}
###################################
Predicted answers: {candidate_sent_set}
###################################
Retrieved context: {pred_all_context_sents}""")

Question:  How should pregnant women use Inlyta?
###################################
Reference answers: ['Do not use Inlyta if you are pregnant.', 'It could harm the unborn baby.', 'Use birth control to prevent pregnancy while you are receiving this medicine, whether you are a man or a woman Both men and women using this medicine should use effective birth control to prevent pregnancy.']
###################################
Reference context: Do not use Inlyta if you are pregnant. It could harm the unborn baby. Use birth control to prevent pregnancy while you are receiving this medicine, whether you are a man or a woman. Inlyta use by either parent may cause birth defects or miscarriage.
Axitinib can increase your risk of serious bleeding. Stop using Inlyta and call your doctor at once if you have severe stomach pain, bloody or tarry stools, coughing up blood, or any heavy or unusual bleeding.
Some people taking this medicine have developed a perforation (a hole or tear) or a fistula (a