In [1]:
from IPython.display import HTML

# Render a small HTML/JavaScript snippet as this cell's output.
# The script calls code_toggle() once the document is ready; since code_show
# starts as true, that first call hides every `div.input` element. The link
# below calls code_toggle() again to flip the visibility back and forth.
# The script relies on a global `$` (jQuery) being present on the page.
HTML('''<script>
code_show=true;
function code_toggle() {
    if (code_show){
        $('div.input').hide();
    } else {
        $('div.input').show();
    }
    code_show = !code_show
}
$(document).ready(code_toggle);
</script>
Click <a href="javascript:code_toggle()">here</a> to toggle on/off the code cells.''')



In [2]:
import ipywidgets as widgets
from IPython.display import display
import warnings
import PyPDF2
import datetime
import functools

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Create the file upload widget
file_upload = widgets.FileUpload(
    accept='',  # Accepted file types (e.g., '.txt', '.pdf', '.jpg', '.csv', etc.); leave empty for all file types
    multiple=True  # Set to True if you want to allow multiple files to be uploaded
)

# Output area into which on_value_change() renders the list of uploaded files.
fileListOutput = widgets.Output()

# Display the widget
display(file_upload, fileListOutput)

# Files accumulated over successive uploads; rebuilt by on_value_change() and
# reset to [] by the per-file "Remove" callback.
ongFileUploadVals = []

def on_value_change(change):
    """Re-render the uploaded-file list whenever ``file_upload.value`` changes.

    The files already tracked in ``ongFileUploadVals`` are merged with the
    files currently held by the widget, so that successive uploads accumulate
    instead of replacing each other. For every tracked file a label with its
    name and page count plus a "Remove" button is rendered into
    ``fileListOutput``.

    The page count is read from disk via ``open(file['name'])``, i.e. each
    uploaded PDF must also exist under that name in the current working
    directory.

    Args:
        change: change notification passed by ``file_upload.observe``; an
            empty dict only renders the header.
    """
    global ongFileUploadVals

    currentFiles = list(file_upload.value)
    nothingTracked = ongFileUploadVals == [] and currentFiles == []
    if not (nothingTracked or ongFileUploadVals != currentFiles):
        # Widget already mirrors the tracked files (this is the re-entrant
        # notification caused by the assignment further down): nothing to do.
        return

    def remove_button_onclick(button, fileInfo=""):
        """Drop the file named ``fileInfo`` from the upload widget."""
        global ongFileUploadVals
        remaining = [f for f in file_upload.value if f['name'] != fileInfo]
        # Forget the accumulated list first so the value change below rebuilds
        # it from the widget instead of merging the removed file back in.
        ongFileUploadVals = []
        file_upload.value = tuple(remaining)

    @fileListOutput.capture()
    def file_list_output_render():
        global ongFileUploadVals

        fileListOutput.clear_output()
        display(widgets.Label("Uploaded Files: "))

        if change == {}:
            return

        # Previously tracked files first, then any new ones from the widget.
        updatedFileUploadVal = list(ongFileUploadVals)
        for file in file_upload.value:
            if file not in updatedFileUploadVal:
                updatedFileUploadVal.append(file)
        ongFileUploadVals = updatedFileUploadVal

        for file in ongFileUploadVals:
            # Close the handle as soon as the page count is known; the
            # original left one open file descriptor per render and file.
            with open(file['name'], 'rb') as fileOpen:
                totalPgs = len(PyPDF2.PdfReader(fileOpen).pages)

            remButton = widgets.Button(description='Remove')
            remButton.on_click(functools.partial(remove_button_onclick, fileInfo=file['name']))

            display(widgets.Label("\n  file name: " + file['name'] + "  #pages: " + str(totalPgs)), remButton)

            # Push the merged list back into the widget. This re-enters
            # on_value_change, which returns early because the widget then
            # equals ongFileUploadVals.
            file_upload.value = tuple(updatedFileUploadVal)

    file_list_output_render()

file_upload.observe(on_value_change, names='value')

FileUpload(value=(), description='Upload', multiple=True)

Output()

In [3]:
# Debug aid: show the raw value currently held by the upload widget.
print(file_upload.value)

()


In [4]:
import io
import PyPDF2
import ipywidgets as widgets
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Module-level stores filled by upload_file(); one entry is appended to
# documents, embeddings and docPgNums per processed file.
documents = []    # per file: list of text chunks produced by split_string()
embeddings = []   # per file: result of create_embeddings() for those chunks
indexPgNums = []  # one page number per extracted character, extended page by page
docPgNums = []    # per file: page number looked up for the start offset of each chunk

def create_embeddings(documents, model_name="paraphrase-MiniLM-L6-v2"):
    """Encode ``documents`` with a freshly loaded SentenceTransformer.

    Args:
        documents: the texts handed to ``SentenceTransformer.encode``.
        model_name: name of the sentence-transformers model to load.

    Returns:
        Whatever ``encode`` returns for ``documents``.
    """
    # A new model instance is created on every call and left on its default
    # device (no explicit CUDA placement).
    encoder = SentenceTransformer(model_name)
    return encoder.encode(documents)

def split_string(input_string, chunk_size):
    """Cut ``input_string`` into consecutive pieces of ``chunk_size`` items.

    Returns:
        A pair ``(chunks, offsets)``: the pieces in order (the last one may be
        shorter) and the start index of each piece within ``input_string``.
    """
    offsets = list(range(0, len(input_string), chunk_size))
    chunks = [input_string[start:start + chunk_size] for start in offsets]
    return chunks, offsets

def upload_file(file_upload, i):
    """Extract, chunk and embed the text of the ``i``-th uploaded PDF.

    The text of all pages is concatenated, cut into 1000-character chunks and
    embedded. One entry each is appended to the module-level ``documents``
    (the chunks), ``docPgNums`` (page on which each chunk starts) and
    ``embeddings`` (the chunk embeddings). ``indexPgNums`` is extended with
    one page number per extracted character.

    A PDF that yields no text appends an empty chunk list.

    Args:
        file_upload: the ``FileUpload`` widget holding the uploaded files.
        i: index of the file inside ``file_upload.value``.
    """
    # Only materialise the bytes of the requested file instead of converting
    # every upload on each call.
    uploaded_file_content = file_upload.value[i].content.tobytes()

    # Read the PDF from an in-memory buffer using PyPDF2
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file_content))

    pageTexts = []
    # Page number of every character of THIS file. The chunk offsets returned
    # by split_string() are relative to this file's text, so they must not be
    # looked up in the global indexPgNums, which also holds the characters of
    # previously loaded files.
    filePgNums = []
    for page_num in tqdm(range(len(pdf_reader.pages)), desc='Processing pages'):
        # Extract once per page; fall back to '' should no text be returned.
        pageText = pdf_reader.pages[page_num].extract_text() or ''
        pageTexts.append(pageText)
        filePgNums.extend([page_num] * len(pageText))
    text_content = ''.join(pageTexts)

    # Keep the module-level per-character index growing as before.
    indexPgNums.extend(filePgNums)

    # No dependence on the loop variable here: a zero-page PDF used to raise
    # NameError because `page_num` was never bound.
    splitStringOg, splitStringPgNums = split_string(text_content, 1000)
    documents.append(splitStringOg)
    docPgNums.append([filePgNums[idx] for idx in splitStringPgNums])

    # Embed exactly the chunks appended above rather than documents[i], which
    # is only the same list while documents is aligned with the upload order.
    embeddings.append(create_embeddings(splitStringOg))

# for file, attributes in file_upload.value.items():
#     content = attributes.get('content') 
#     uploaded_file_data.append(content)
    
button = widgets.Button(description="Load the data")

def on_button_click(b):
    """Load every uploaded file into the document / embedding stores.

    The stores are emptied first: they are indexed by upload position later
    on, so appending a second copy on a repeated click would leave them out
    of step with ``file_upload.value``.

    Args:
        b: the clicked button (unused).
    """
    if not file_upload.value:
        print("No files uploaded")
        return

    # Clear in place so every other reference to these lists stays valid.
    for store in (documents, embeddings, indexPgNums, docPgNums):
        store.clear()

    for i in range(len(file_upload.value)):
        upload_file(file_upload, i)
        print("Data loaded successfully!")

button.on_click(on_button_click)

display(button)

Button(description='Load the data', style=ButtonStyle())

In [5]:
# Debug aid: show the raw value currently held by the upload widget.
print(file_upload.value)

()


In [7]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import clear_output
import ipywidgets as widgets
from IPython.display import display, IFrame
import fitz 
import functools
import string
import os, sys, subprocess
import webbrowser
from difflib import SequenceMatcher
import wordninja

from transformers import logging

logging.set_verbosity_error()

@functools.lru_cache(maxsize=None)
def _load_sentence_model(model_name):
    """Load a SentenceTransformer once per model name and reuse it afterwards."""
    return SentenceTransformer(model_name)


def retrieve_passages(query, documents, embeddings, docPgNums, top_k=5, threshold=0.4,
                      model_name="paraphrase-MiniLM-L6-v2"):
    """Return the chunks of one file that are most similar to ``query``.

    Args:
        query: the question text.
        documents: text chunks of a single file.
        embeddings: embeddings of those chunks, in the same order.
        docPgNums: page number of each chunk, in the same order.
        top_k: maximum number of chunks to return.
        threshold: minimum cosine similarity a chunk needs to be returned.
        model_name: sentence-transformers model used to embed the query; it
            has to be the model the chunk embeddings were created with.

    Returns:
        ``(passages, pageNums)``: the selected chunks, most similar first, and
        their page numbers. Both are empty when the file has no chunks.
    """
    # A file without extractable text has nothing to compare against, and
    # cosine_similarity cannot handle an empty embedding array.
    if len(documents) == 0 or len(embeddings) == 0:
        return [], []

    # The model is cached: this function runs once per file for every query.
    model = _load_sentence_model(model_name)

    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity(query_embedding.reshape(1, -1), embeddings)[0]

    # Best top_k chunk indices first, then drop those below the threshold.
    ranked = np.argsort(similarities)[::-1][:top_k]
    top_indices = ranked[similarities[ranked] >= threshold]

    return [documents[i] for i in top_indices], [docPgNums[i] for i in top_indices]
    
@functools.lru_cache(maxsize=2)
def _load_qa_model(model_name):
    """Load the (tokenizer, model) pair once per model name and reuse it."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    model.eval()
    return tokenizer, model


def extract_answer(question, passage, model_name="roberta-large"):
    """Extract the answer span for ``question`` from ``passage``.

    Args:
        question: the question text.
        passage: the context to search; converted with ``str()``.
        model_name: Hugging Face checkpoint loaded for question answering.

    Returns:
        The selected substring of the passage, or ``""`` when the passage is
        empty or no valid span is selected.
    """
    # NOTE(review): "roberta-large" looks like a base checkpoint rather than
    # one fine-tuned for question answering; if so its QA head is untrained
    # and the spans are arbitrary. Confirm and consider a QA checkpoint.
    passage = str(passage)
    if not passage:
        return ""

    # Loading the checkpoint for every passage dominated the run time.
    tokenizer, model = _load_qa_model(model_name)

    inputs = tokenizer(question, passage, return_tensors="pt", max_length=512,
                       truncation=True, padding='max_length', return_offsets_mapping=True)

    # Offsets are relative to the sequence a token belongs to, so only tokens
    # of the passage (sequence id 1) may be used to slice the passage.
    in_passage = torch.tensor([seq_id == 1 for seq_id in inputs.sequence_ids(0)], dtype=torch.bool)
    if not bool(in_passage.any()):
        return ""

    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

    # Exclude question, special and padding tokens from the span selection.
    start_logits = outputs.start_logits[0].masked_fill(~in_passage, float("-inf"))
    end_logits = outputs.end_logits[0].masked_fill(~in_passage, float("-inf"))
    answer_start = int(torch.argmax(start_logits))
    answer_end = int(torch.argmax(end_logits))
    if answer_end < answer_start:
        return ""

    offset_mapping = inputs["offset_mapping"][0]
    answer_start_char = int(offset_mapping[answer_start][0])
    answer_end_char = int(offset_mapping[answer_end][1])
    return passage[answer_start_char:answer_end_char]

def qa_system(question, documents, embeddings, docPgNums, model_name="roberta-large", top_k=5):
    """Answer ``question`` against every loaded file.

    For each file the most relevant passages are retrieved and an answer is
    extracted from each of them.

    Returns:
        A list of dicts with the keys ``fileName``, ``answer``, ``context``
        and ``pageNum``, ordered by file and then by passage rank. The file
        name is read from the upload widget at the file's index.
    """
    collected = []

    for docIdx, docChunks in enumerate(documents):
        passages, pageNums = retrieve_passages(
            question, docChunks, embeddings[docIdx], docPgNums[docIdx], top_k=top_k
        )

        for passage, pgNum in zip(passages, pageNums):
            extracted = extract_answer(question, passage, model_name=model_name)
            entry = {
                "fileName": file_upload.value[docIdx]['name'],
                "answer": extracted,
                "context": passage,
                "pageNum": pgNum,
            }
            collected.append(entry)

    return collected

import ipywidgets as widgets
from IPython.display import display
import string
import re

# Create the text field and button widgets
text_field = widgets.Text(placeholder='Enter your question here')
button = widgets.Button(description='Search')

# Answers of the most recent search; read by show_results() and its callbacks.
answers = []

def on_button_click(button):
    """Run the QA system for the entered question and show the results.

    Called when the "Search" button is clicked. Stores the result in the
    module-level ``answers``, prints every answer, removes the highlights of
    the previously opened file and renders the result buttons.

    Args:
        button: the clicked button.
    """
    global answers

    # The IPython-only `%clear` magic (a terminal clear) that used to be here
    # was dropped: it is not Python syntax and does nothing useful for widgets.
    clear_output(wait=True)

    # Get the text input from the text field
    text_input = text_field.value

    # Display the widgets again after clearing the output
    # NOTE(review): `out` is not re-displayed here; confirm the results stay
    # visible in the front end in use.
    display(text_field, button)

    out.clear_output()
    print("Generating answers, please wait...")

    # Retrieve answers using the QA system
    answers = qa_system(text_input, documents, embeddings, docPgNums)

    # Print the answers
    for idx, answer in enumerate(answers):
        print(f"Document : {answer['fileName']}, Answer {idx + 1}: {answer['answer']}\nContext: {answer['context']}\n")

    # clear on new question asked
    clear_file()

    if not answers:
        print("No answers available")
    else:
        show_results()

# Set the callback function for the button
button.on_click(on_button_click)

# output to display answers
out = widgets.Output(layout={'width': '100%', 'height': '100%'})

# Display the widgets
display(text_field, button, out)

# GUI OPTION #1 - OPENS FILE AS URL 
    
# Path of the PDF that currently carries highlights ('' when there is none).
fileToClear = ''
def clear_file():
    """Remove the annotations from the PDF named by ``fileToClear``.

    The ``Annots`` entry of every page is replaced by an empty array, so all
    annotations on the page are dropped, not only the highlights added by
    this notebook. The change is saved incrementally into the same file.
    Does nothing when ``fileToClear`` is empty.
    """
    if fileToClear == '':
        return

    pdfDoc = fitz.open(fileToClear)
    try:
        # iterates pages and empties their annotation arrays
        for pgNum in range(len(pdfDoc)):
            page = pdfDoc[pgNum]
            pdfDoc.xref_set_key(page.xref, "Annots", "[]")

        pdfDoc.save(fileToClear, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
    finally:
        # Release the file handle even when saving fails; the document was
        # previously left open on every call.
        pdfDoc.close()

def clear_btn_click(button):
    """Click handler of the "Clear" button: remove the current highlights."""
    clear_file()

def open_file(filename, pgNum):
    """Open ``filename`` from the current directory in a browser at ``pgNum``.

    Chrome is preferred; if it is not registered with ``webbrowser`` the
    system default browser is used instead, which may ignore the page
    fragment.

    Args:
        filename: name of the PDF, relative to the current working directory.
        pgNum: 1-based page number appended as ``#page=<pgNum>``.
    """
    from pathlib import Path

    # as_uri() yields a correctly percent-encoded file:// URL on every
    # platform; the previous version sliced repr(bytes) of the working
    # directory and escaped spaces only, which broke on Windows backslashes
    # and on other special characters.
    fileLoc = (Path.cwd() / filename).as_uri() + "#page=" + str(pgNum)

    try:
        webbrowser.get(using='chrome').open(fileLoc)
    except webbrowser.Error:
        # Chrome is not available: best effort with the default browser.
        webbrowser.open(fileLoc)
    

# https://bobbyhadz.com/blog/python-remove-non-ascii-characters-from-string
def remove_non_ascii(txt):
    """Return ``txt`` without the characters missing from ``string.printable``."""
    allowed = frozenset(string.printable)
    kept = [ch for ch in txt if ch in allowed]
    return ''.join(kept)

# https://www.geeksforgeeks.org/python-remove-punctuation-from-string/
def remove_punct(txt):
    """Return ``txt`` with every ``string.punctuation`` character deleted."""
    # Translation table mapping each punctuation code point to None (= delete).
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return txt.translate(drop_table)

def fix_text(txt):
    """Normalise ``txt`` for comparison: printable ASCII only, no spaces, no punctuation."""
    ascii_only = remove_non_ascii(txt)
    without_spaces = ascii_only.replace(" ", "")
    return remove_punct(without_spaces)

def line_eq(context, text, ratio=0.75):
    """Fuzzy equality: True when the difflib similarity of the two strings exceeds ``ratio``."""
    similarity = SequenceMatcher(a=context, b=text).ratio()
    return similarity > ratio

def bbox_within(bbox1, bbox2):
    """Check whether rectangle ``bbox1`` lies inside a line given by its span boxes.

    ``bbox2`` is a sequence of boxes; the left/top edge is taken from its
    first box and the right/bottom edge from its last one.
    """
    first_box, last_box = bbox2[0], bbox2[-1]
    if not (first_box[0] <= bbox1[0]):
        return False
    if not (first_box[1] <= bbox1[1]):
        return False
    if not (last_box[2] >= bbox1[2]):
        return False
    return last_box[3] >= bbox1[3]

def is_valid_bbox(bbox):
    """True when the box is not inverted, i.e. x0 <= x1 and y0 <= y1."""
    if not (bbox[0] <= bbox[2]):
        return False
    return bbox[1] <= bbox[3]
    
# Characters that are treated as the end of a paragraph line.
_SENTENCE_END = (".", "!", "?", ":")

# Highlight colours used in "highlight all" mode, one per answer of a file;
# answers beyond the last entry reuse the last colour.
_ANSWER_COLORS = [(14/255, 48/255, 0), (34/255, 224/255, 27/255), (151/255, 250/255, 147/255), (188/255, 245/255, 98/255), (252/255, 255/255, 161/255)]


def _line_text_and_bboxes(line):
    """Return ``(text, span_bboxes)`` of a text line; every span text is prefixed by a space."""
    text, bboxes = "", []
    for span in line['spans']:
        bboxes.append(span['bbox'])
        text = text + " " + span['text']
    return text, bboxes


def _last_meaningful_char(text):
    """Return the last character of ``text`` ignoring trailing whitespace and digits ('' if none)."""
    stripped = re.sub(r'\d+$', "", text.rstrip()).rstrip()
    return stripped[-1] if stripped else ""


def _pick_line_rect(page, bboxes):
    """Return the rectangle to mark for a line: its full extent, else its first span with text, else None."""
    lineRect = (bboxes[0][0], bboxes[0][1], bboxes[-1][2], bboxes[-1][3])
    if is_valid_bbox(lineRect):
        return lineRect
    for b in bboxes:
        if page.get_textbox(b).strip() != "":
            return b
    return None


def _add_highlight(page, rect, all, fileAnswerIdx):
    """Add one translucent rectangle annotation over ``rect`` in the colour for this mode/answer."""
    if not all:
        color, opacity = (1, 1, 0), 0.3
    else:
        # Clamp to the last colour. Two of the original copies of this code
        # indexed the colour list with the unclamped answer index and raised
        # IndexError from the sixth answer of a file onwards.
        color = _ANSWER_COLORS[fileAnswerIdx] if fileAnswerIdx < len(_ANSWER_COLORS) else _ANSWER_COLORS[-1]
        opacity = 0.2
    hi = page.add_rect_annot(rect)
    hi.set_colors(stroke=color)
    hi.update(opacity=opacity, fill_color=color)


def _highlight_hanging_lines(page, hangingBBOX, hangingTEXT, seenTexts, all, fileAnswerIdx):
    """Highlight the collected unmatched lines of a paragraph that are not highlighted yet."""
    for rect, hangingText in zip(hangingBBOX, hangingTEXT):
        key = fix_text(hangingText)
        if key not in seenTexts:
            seenTexts.append(key)
            _add_highlight(page, rect, all, fileAnswerIdx)


def _paragraph_tail_in_previous_block(blocks, blockIdx, seenTexts):
    """Return the last line of the previous block if the paragraph seems to start there, else None.

    That is the case when the previous block is a text block whose last line
    is not highlighted yet and does not end a sentence, while the first line
    of the current block is already highlighted.
    """
    if blockIdx == 0 or blocks[blockIdx-1]['type'] != 0:
        return None
    topLineCurr, _ = _line_text_and_bboxes(blocks[blockIdx]['lines'][0])
    lastLinePrev, _ = _line_text_and_bboxes(blocks[blockIdx-1]['lines'][-1])
    lastChar = _last_meaningful_char(lastLinePrev)
    if (fix_text(lastLinePrev) not in seenTexts and lastChar != "" and lastChar not in _SENTENCE_END
            and fix_text(topLineCurr) in seenTexts):
        return lastLinePrev
    return None


def _context_from_pdf_lines(pdfDoc, contextLst, pgStart):
    """Rebuild a short context as the list of PDF lines that contain its sentences.

    Used when the context has few line breaks, i.e. one context line spans
    several lines of the PDF. The context is split into sentence-like
    fragments, each fragment is located on the pages from ``pgStart`` on, and
    the lines found on the page with the most hits (plus the directly
    adjacent runner-up page) become the new context. If nothing is found the
    fragments themselves are returned.
    """
    # preprocess given context lines
    splitGivenContext = []
    for cont in contextLst:
        splitGivenContext = splitGivenContext + cont.replace(":", ".").replace(";", ".").replace("!", ".").replace("?", ".").replace("--", " ").replace("-", "").replace("‘", ".").replace("[",".").replace("]",".").split(".")
    splitGivenContext = [frag for frag in splitGivenContext if len(frag.split(" ")) > 3]

    splitContextDict = dict()
    pgOccur = [0] * len(pdfDoc)
    for pgNum in range(pgStart, len(pdfDoc)):
        page = pdfDoc[pgNum]
        blocks = page.get_text("dict")["blocks"]

        # Text and span boxes of every text line of the page, extracted once.
        pageLines = [_line_text_and_bboxes(line)
                     for block in blocks if block['type'] == 0
                     for line in block['lines']]

        for cont in splitGivenContext:
            # search for split context in text
            locs = page.search_for(cont)

            for text, bbox in pageLines:
                if locs:
                    for loc in locs:
                        # remember every line that encloses a found location
                        if bbox_within(loc, bbox):
                            splitContextDict.setdefault(cont, []).append((pgNum, text))

                        # NOTE(review): this counts once per location AND line,
                        # whether or not the line encloses the location; kept
                        # as is, but confirm that this weighting is intended.
                        pgOccur[pgNum] = pgOccur[pgNum] + 1

                # if location not found attempt to look through line equality
                elif line_eq(fix_text(cont), fix_text(text)):
                    splitContextDict.setdefault(cont, []).append((pgNum, text))
                    pgOccur[pgNum] = pgOccur[pgNum] + 1

    newContextLst = []
    if any(pgOccur):
        ranked = sorted(pgOccur, reverse=True)

        pgHighOccur = pgOccur.index(ranked[0])
        # A single-page document has no runner-up (this used to raise IndexError).
        pgSecondOccur = pgOccur.index(ranked[1]) if len(ranked) > 1 else pgHighOccur
        if abs(pgHighOccur - pgSecondOccur) != 1:
            pgSecondOccur = pgHighOccur

        # one line per fragment, taken from the highest occurring page(s)
        for occurrences in splitContextDict.values():
            for (pg, txt) in occurrences:
                if (pg == pgHighOccur or pg == pgSecondOccur) and txt not in newContextLst:
                    newContextLst.append(txt)
                    break

    if newContextLst:
        return [x for x in newContextLst if fix_text(x.strip()) != "" and len(x.split(" ")) > 1]

    # backup plan: none of the fragments was found in the text, so use the
    # fragments themselves as the new context list
    return [x for x in splitGivenContext if fix_text(x.strip()) != "" and len(x.strip().split(" ")) > 1]


def highlight_file(pdfDoc, fileInfo, all, fileAnswerIdx, overallHighlightTEXT, pgStart, blockStart):
    """Highlight the paragraph(s) of ``pdfDoc`` that contain the given context.

    The context is matched line by line (fuzzily, on normalised text) against
    the text lines of the PDF. Matched lines are marked with rectangle
    annotations; the remaining lines of a partially matched paragraph
    ("hanging" lines) are marked as well, and the paragraph is followed into
    the previous / next text block when it does not end with sentence
    punctuation. The document is only annotated, not saved.

    Args:
        pdfDoc: an open PyMuPDF document.
        fileInfo: tuple whose second item is the context text; the first item
            is only passed on to recursive calls.
        all: False for a single citation (yellow); True for "highlight all"
            mode, where the colour depends on ``fileAnswerIdx``. (The name
            shadows the builtin but is part of the existing interface.)
        fileAnswerIdx: index of the answer within its file, selects the colour.
        overallHighlightTEXT: normalised texts of the lines highlighted so
            far; extended in place.
        pgStart: index of the first page to scan.
        blockStart: index of the first block to scan; note that it is applied
            to every scanned page, not only to ``pgStart``.

    Returns:
        ``overallHighlightTEXT`` including the lines highlighted by this call.
    """
    contextLst = [line for line in fileInfo[1].split('\n') if line.strip() not in {"", "-", "/", "."} and len(line.strip().split(" ")) > 1]

    if len(contextLst) > 5:
        # first and last line of a chunk are usually cut mid-line: drop them
        contextLst = contextLst[1:-1]
    else:
        # SPECIAL CASE - multiple lines of text in single line of context
        contextLst = _context_from_pdf_lines(pdfDoc, contextLst, pgStart)

    overallHighlight, found, done = False, False, False
    hangingLinesBBOX, hangingLinesTEXT = [], []

    # iterate through pages in file
    for pgNum in range(pgStart, len(pdfDoc)):
        # `done` only stops the scan at page granularity: the blocks left on
        # the current page are still processed.
        if done:
            break

        page = pdfDoc[pgNum]
        blocks = page.get_text("dict")["blocks"]

        for blockIdx, block in enumerate(blocks[blockStart:], blockStart):
            # to highlight entire paragraph
            paraHighlighted = False

            if not found:
                hangingLinesBBOX = []
                hangingLinesTEXT = []

            # look for text block by line
            if block['type'] != 0:
                continue

            lastLineIdx = len(block['lines']) - 1
            for i, line in enumerate(block['lines']):
                text, bbox = _line_text_and_bboxes(line)
                lineHighlighted = False

                # check if text corresponds to context in contextLst
                if text.rstrip() != "":
                    lineKey = fix_text(text)
                    for context in contextLst:
                        if line_eq(fix_text(context), lineKey):
                            found = True

                            # highlight found context in page unless already done
                            if lineKey not in overallHighlightTEXT:
                                overallHighlightTEXT.append(lineKey)
                                rect = _pick_line_rect(page, bbox)
                                if rect is not None:
                                    _add_highlight(page, rect, all, fileAnswerIdx)

                            contextLst.remove(context)

                            paraHighlighted = True
                            lineHighlighted = True
                            overallHighlight = True
                            break

                # if line is not highlighted then it is a hanging line
                if not lineHighlighted and text.strip() != "":
                    rect = _pick_line_rect(page, bbox)
                    if rect is not None:
                        hangingLinesBBOX.append(rect)
                        hangingLinesTEXT.append(text)

                lastChar = _last_meaningful_char(text)
                endsSentence = lastChar in _SENTENCE_END

                # a paragraph ended before anything was highlighted: its lines
                # do not belong to the context
                # CASE - same block multiple paragraphs
                if endsSentence and not paraHighlighted and not overallHighlight:
                    hangingLinesBBOX = []
                    hangingLinesTEXT = []

                # the highlighted paragraph ends here and no context is left
                # CASE - same block multiple paragraphs
                if endsSentence and contextLst == [] and paraHighlighted:
                    paraHighlighted = False

                    _highlight_hanging_lines(page, hangingLinesBBOX, hangingLinesTEXT, overallHighlightTEXT, all, fileAnswerIdx)
                    hangingLinesBBOX = []
                    hangingLinesTEXT = []

                    # check if any continuation of paragraph in previous block
                    # CASE - same paragraph multiple blocks
                    prevTail = _paragraph_tail_in_previous_block(blocks, blockIdx, overallHighlightTEXT)
                    if prevTail is not None:
                        continuation = (fileInfo[0], "temp temp" + "\n" + prevTail + "\n" + "\n".join(contextLst) + "\n" + "temp temp")
                        overallHighlightTEXT = highlight_file(pdfDoc, continuation, all, fileAnswerIdx, overallHighlightTEXT, pgNum, blockIdx-1)

                    done = True

                # if last line in block check if block was partially highlighted then highlight hanging lines
                if i == lastLineIdx and paraHighlighted:
                    _highlight_hanging_lines(page, hangingLinesBBOX, hangingLinesTEXT, overallHighlightTEXT, all, fileAnswerIdx)

                    # CASE - same paragraph multiple blocks
                    prevTail = _paragraph_tail_in_previous_block(blocks, blockIdx, overallHighlightTEXT)
                    if prevTail is not None:
                        continuation = (fileInfo[0], "temp temp" + "\n" + prevTail + "\n" + "\n".join(contextLst) + "\n" + "temp temp")
                        overallHighlightTEXT = highlight_file(pdfDoc, continuation, all, fileAnswerIdx, overallHighlightTEXT, pgNum, blockIdx-1)

                    # if the last line doesn't end with an end punctuation and there is no
                    # context left, include the first line of the next block into the paragraph
                    # CASE - same paragraph multiple blocks
                    if lastChar != "" and not endsSentence and blockIdx != len(blocks)-1 and blocks[blockIdx+1]['type'] == 0 and contextLst == []:
                        nextLine, _ = _line_text_and_bboxes(blocks[blockIdx+1]['lines'][0])

                        if nextLine.rstrip() != "":
                            contextLst.append(nextLine)

                    if contextLst == []:
                        done = True

    return overallHighlightTEXT
    
@out.capture()
def show_results():
    """Render the current ``answers`` with their action buttons into ``out``.

    Shows a "Clear" button, then for every answer its file name and context
    with a "Citation" button that highlights this context in the PDF and opens
    it. Before the first answer of each file a "Highlight All" button is
    shown, which highlights the contexts of all answers of that file.
    """
    import html

    def on_highlight_all_button_click(button, fileInfo=("", "")):
        """Highlight every answer of file ``fileInfo[0]`` and open it at page ``fileInfo[1]``."""
        global fileToClear

        # clear on direct change
        clear_file()

        fileName = fileInfo[0]
        pdfDoc = fitz.open(fileName)
        try:
            overallHighlightTEXT = []

            # use only the answers that belong to this file
            fileAnswers = [ans for ans in answers if ans['fileName'] == fileName]
            for fileAnswerIdx, ans in enumerate(fileAnswers):
                overallHighlightTEXT = highlight_file(pdfDoc, (ans['fileName'], ans['context']), True, fileAnswerIdx, overallHighlightTEXT, ans['pageNum'], 0)

            pdfDoc.save(fileName, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            # the document used to stay open after every click
            pdfDoc.close()

        fileToClear = fileName
        open_file(fileName, fileInfo[1]+1)

    def on_button_click(button, fileInfo=("", "", "")):
        """Highlight the context ``fileInfo[1]`` in file ``fileInfo[0]`` and open it at page ``fileInfo[2]``."""
        global fileToClear

        # if direct change from file to file, clear previous file
        clear_file()

        pdfDoc = fitz.open(fileInfo[0])
        try:
            # highlight corresponding file
            highlight_file(pdfDoc, (fileInfo[0], fileInfo[1]), False, 0, [], fileInfo[2], 0)
            pdfDoc.save(fileInfo[0], incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            pdfDoc.close()

        fileToClear = fileInfo[0]
        open_file(fileInfo[0], fileInfo[2] + 1)

    # button to remove highlight
    clearBtn = widgets.Button(description="Clear")
    clearBtn.on_click(clear_btn_click)
    display(clearBtn)

    prevFileName = ""

    # iterate through all answers
    for answer in answers:

        # first answer of a new file: offer to highlight all of its contexts
        if prevFileName != answer['fileName']:
            highlightAllButton = widgets.Button(description='Highlight All')
            # Open the file at the page of ITS first answer; the page of the
            # very first answer overall was used for every file before.
            highlightAllButton.on_click(functools.partial(on_highlight_all_button_click, fileInfo=(answer['fileName'], answer['pageNum'])))
            display(highlightAllButton)

        prevFileName = answer['fileName']

        # display answer; the text comes from the PDF / file name, so escape
        # it before embedding it into HTML
        label = widgets.HTML(value="<style>p{word-wrap: break-word}</style> <p> File Name: " + html.escape(answer['fileName']) + "</p>"  + "\n<p> Context: " + html.escape(answer['context']) + "</p>")
        display(label)

        # button to highlight context on original PDF
        button = widgets.Button(description='Citation')
        button.on_click(functools.partial(on_button_click, fileInfo=(answer['fileName'], answer['context'], answer['pageNum'])))
        display(button)
    

Text(value='', placeholder='Enter your question here')

Button(description='Search', style=ButtonStyle())

Output(layout=Layout(height='100%', width='100%'))