In [1]:
from IPython.display import HTML

# Inject a small script plus a link that shows/hides the notebook's code cells
# (every element matching `div.input`). code_toggle() is registered to run once
# the document is ready, and because code_show starts as true that first call
# hides the code cells. The script relies on `$` (jQuery) being available on
# the page that renders this output.
HTML('''<script>
code_show=true;
function code_toggle() {
    if (code_show){
        $('div.input').hide();
    } else {
        $('div.input').show();
    }
    code_show = !code_show
}
$(document).ready(code_toggle);
</script>
Click <a href="javascript:code_toggle()">here</a> to toggle on/off the code cells.''')



In [89]:
import ipywidgets as widgets
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

# Create the file upload widget; later cells read the uploaded files from `file_upload.value`
file_upload = widgets.FileUpload(
    accept='',  # Accepted file types (e.g., '.txt', '.pdf', '.jpg', '.csv', etc.); leave empty for all file types
    multiple=True  # Allow several files to be uploaded at once
)

# Output area that on_value_change fills with the names of the uploaded files
fileListOutput = widgets.Output()

# Display the upload widget followed by the file-name listing
display(file_upload, fileListOutput)

def on_value_change(change):
    """Re-print the list of uploaded file names inside `fileListOutput`.

    `change` is the change notification passed by `observe`; its 'new' entry
    is iterated and the 'name' of each item is printed.
    """
    with fileListOutput:
        # Wipe the previous listing so only the current selection is shown.
        fileListOutput.clear_output()
        print("Uploaded Files: ")
        for uploaded in change['new']:
            print("\n" + uploaded['name'])


# Refresh the listing every time the widget's `value` trait changes.
file_upload.observe(on_value_change, names='value')

FileUpload(value=(), description='Upload', multiple=True)

Output()

In [11]:
import io
import PyPDF2
import ipywidgets as widgets
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Module-level stores filled by upload_file(), one entry appended per processed file:
# documents[k]  -> list of text chunks produced by split_string() for that file
# embeddings[k] -> result of create_embeddings() for those chunks
documents = []
embeddings = []

def create_embeddings(documents, model_name="paraphrase-MiniLM-L6-v2"):
    """Encode `documents` with a SentenceTransformer model.

    Args:
        documents: Texts handed straight to `SentenceTransformer.encode`.
        model_name: Name of the sentence-transformers model to instantiate.

    Returns:
        The value returned by `encode(documents)` (presumably one embedding
        row per input text -- TODO confirm against the installed version).
    """
    # A new model instance is created on every call; no explicit device
    # placement is performed here.
    encoder = SentenceTransformer(model_name)
    return encoder.encode(documents)

def split_string(input_string, chunk_size):
    """Cut `input_string` into consecutive pieces of at most `chunk_size` items.

    The final piece holds whatever is left over and may be shorter. An empty
    input yields an empty list.
    """
    pieces = []
    for start in range(0, len(input_string), chunk_size):
        stop = start + chunk_size
        pieces.append(input_string[start:stop])
    return pieces

def upload_file(file_upload, i):
    """Extract the text of the i-th uploaded PDF and index it for search.

    The PDF is read from the upload widget's in-memory content, its page texts
    are concatenated, the text is cut into 1000-character chunks and the chunks
    are embedded. One entry is appended to the global `documents` (the chunk
    list) and one to the global `embeddings` (the chunk embeddings).

    Args:
        file_upload: The FileUpload widget holding the uploaded files.
        i: Index into `file_upload.value` of the file to process.
    """
    global documents
    global embeddings

    # Only materialise the bytes of the requested file instead of copying
    # every uploaded file on each call.
    uploaded_file_content = file_upload.value[i].content.tobytes()

    # Get the uploaded file's content - OLD IPYWIDGETS
    # for file, attributes in file_upload.value.items():
    #     content = attributes.get('content')
    #     uploaded_file_data.append(content)

    # Convert the uploaded file content to a readable file-like buffer
    file_buffer = io.BytesIO(uploaded_file_content)

    # Read the PDF file using PyPDF2
    pdf_reader = PyPDF2.PdfReader(file_buffer)

    # Extract the text content from the PDF. Collecting the pieces and joining
    # once avoids repeated string concatenation; `or ''` guards against a page
    # for which no text is returned.
    page_texts = []
    for page_num in tqdm(range(len(pdf_reader.pages)), desc='Processing pages'):
        page = pdf_reader.pages[page_num]
        page_texts.append(page.extract_text() or '')
    text_content = ''.join(page_texts)

    chunks = split_string(text_content, 1000)
    documents.append(chunks)

    # Embed the chunks that were just extracted rather than `documents[i]`,
    # which points at an older entry whenever `documents` already holds
    # results from a previous load. A file without extractable text gets an
    # empty embedding list so both stores stay aligned.
    embeddings.append(create_embeddings(chunks) if chunks else [])

button = widgets.Button(description="Load the data")

def on_button_click(b):
    """Index every uploaded file when the 'Load the data' button is clicked.

    Any previously loaded data is discarded first, so that entry k of
    `documents` / `embeddings` always corresponds to `file_upload.value[k]`.
    """
    if not file_upload.value:
        print("No files uploaded")
        return

    # Clear in place: without this a second click appends duplicates and the
    # stores no longer line up with the uploaded files, which qa_system
    # indexes by position.
    documents.clear()
    embeddings.clear()

    for i in range(len(file_upload.value)):
        upload_file(file_upload, i)
        print("Data loaded successfully!")

button.on_click(on_button_click)

display(button)

Button(description='Load the data', style=ButtonStyle())

In [15]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import clear_output


from transformers import logging

logging.set_verbosity_error()

def retrieve_passages(query, documents, embeddings, top_k=5):
    """Return the `top_k` passages most similar to `query`, best match first.

    Args:
        query: The question text to embed.
        documents: Sequence of passages; `documents[k]` belongs to `embeddings[k]`.
        embeddings: Passage embeddings compared against the query embedding
            with cosine similarity.
        top_k: Maximum number of passages to return.

    Returns:
        A list of passages ordered by decreasing similarity. The list is empty
        when there is nothing to rank or `top_k` is not positive.
    """
    # Guard before loading the model: cosine_similarity cannot handle an empty
    # embedding matrix, and a `[-0:]` slice would return every passage.
    if top_k <= 0 or len(documents) == 0 or len(embeddings) == 0:
        return []

    # Same model name as create_embeddings' default, so the query and the
    # passages are embedded by the same model.
    model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity(query_embedding.reshape(1, -1), embeddings)

    # argsort is ascending: keep the last top_k indices and reverse them.
    top_indices = np.argsort(similarities[0])[-top_k:][::-1]

    return [documents[i] for i in top_indices]

# Loaded (tokenizer, model) pairs keyed by model name, so the weights are not
# re-read for every passage.
_QA_MODEL_CACHE = {}


def _load_qa_model(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it only once."""
    if model_name not in _QA_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        _QA_MODEL_CACHE[model_name] = (tokenizer, model)
    return _QA_MODEL_CACHE[model_name]


def extract_answer(question, passage, model_name="roberta-large"):
    """Extract the answer span for `question` from `passage`.

    Args:
        question: The question text.
        passage: The context to search; it is converted with `str()` once so
            tokenisation and slicing operate on the same text.
        model_name: Hugging Face model id used for the tokenizer and QA model.

    Returns:
        The substring of the passage between the predicted start and end
        tokens, or an empty string when the prediction does not describe a
        valid span inside the passage.

    NOTE(review): the default "roberta-large" looks like a base checkpoint
    rather than one fine-tuned for question answering -- confirm, and pass a
    QA fine-tuned model name if the answers look random.
    """
    passage = str(passage)
    tokenizer, model = _load_qa_model(model_name)

    inputs = tokenizer.encode_plus(question, passage, return_tensors="pt", max_length=512, truncation=True, padding='max_length', return_offsets_mapping=True)
    offset_mapping = inputs["offset_mapping"].squeeze(0)
    # Per-token segment id: 0 = question, 1 = passage, None = special/padding.
    sequence_ids = inputs.sequence_ids(0)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits))

    # Offsets are relative to the segment a token came from. Slicing the
    # passage is only meaningful when both ends lie in the passage segment
    # and the span is not inverted.
    if sequence_ids[answer_start] != 1 or sequence_ids[answer_end] != 1 or answer_end < answer_start:
        return ""

    answer_start_char = int(offset_mapping[answer_start][0])
    answer_end_char = int(offset_mapping[answer_end][1])
    return passage[answer_start_char:answer_end_char]

def qa_system(question, documents, embeddings, model_name="roberta-large", top_k=5):
    """Answer `question` against every loaded file.

    For each entry of `documents` the `top_k` best passages are retrieved with
    the matching entry of `embeddings`, and an answer is extracted from each
    passage. The file name is looked up by position in the global
    `file_upload.value`.

    Returns:
        A list of dicts with the keys "fileName", "answer" and "context".
    """
    collected = []

    # One pass per file; position `file_idx` ties together the chunk list,
    # its embeddings and the uploaded file's name.
    for file_idx, file_chunks in enumerate(documents):
        best_passages = retrieve_passages(question, file_chunks, embeddings[file_idx], top_k=top_k)

        for passage in best_passages:
            extracted = extract_answer(question, passage, model_name=model_name)
            record = {"fileName" : file_upload.value[file_idx]['name'], "answer": extracted, "context": passage}
            collected.append(record)

    return collected

import ipywidgets as widgets
from IPython.display import display
# This function is called when the button is clicked

# Create the text field and button widgets
# NOTE: `button` rebinds the name used earlier for the "Load the data" button.
text_field = widgets.Text(placeholder='Enter your question here')
button = widgets.Button(description='Search')

# Latest QA results; overwritten by the search handler below and read by the
# highlighting cell, which only runs its body when this list is non-empty.
answers = []

def on_button_click(button):
    """Run the QA system on the entered question and print the results.

    The cell output is cleared, the input widgets are shown again, and the
    global `answers` is replaced with the new results.
    """
    global answers

    # clear_output already resets the cell's output area, so the terminal
    # `%clear` magic is not needed here.
    clear_output(wait=True)

    # Get the text input from the text field
    text_input = text_field.value

    # Re-display the widgets, because clearing the output removed them
    display(text_field, button)

    print("Generating answers, please wait...")
    # Retrieve answers using the QA system
    answers = qa_system(text_input, documents, embeddings)

    # Print the answers
    for idx, answer in enumerate(answers):
        print(f"Document : {answer['fileName']}, Answer {idx + 1}: {answer['answer']}\nContext: {answer['context']}\n")

    if not answers:
        print("No answers available")



# Set the callback function for the Search button
button.on_click(on_button_click)

# Display the question field and the Search button
display(text_field, button)

Text(value='', placeholder='Enter your question here')

Button(description='Search', style=ButtonStyle())

In [90]:
# OPTION #1 - OPENS FILE AS URL 

import ipywidgets as widgets
from IPython.display import display, IFrame
import fitz 
import functools
import string
import os, sys, subprocess
import webbrowser

if answers:
    
    # Extra button that strips the annotations (highlights) from every file
    # that appears in `answers`
    clearButton = widgets.Button(description='Clear Files')

    def clear_file(fileToClear):
        """Remove the annotations from every page of the PDF at `fileToClear`.

        Each page's "Annots" entry is replaced with an empty array, which drops
        every annotation reference on the page -- not only highlights added by
        this notebook. The change is written back to the same file with an
        incremental save. An empty file name is ignored.
        """
        if fileToClear == '':
            return

        pdfDoc = fitz.open(fileToClear)
        try:
            # iterates pages and empties their annotation arrays
            for pgNum in range(len(pdfDoc)):
                page = pdfDoc[pgNum]
                pdfDoc.xref_set_key(page.xref, "Annots", "[]")

            pdfDoc.save(fileToClear, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            # Always release the file handle, even if the save fails.
            pdfDoc.close()
            
    def clearing_files(button):
        """Click handler: clear the annotations of each distinct file in `answers`."""
        # A set keeps every file name once even when several answers share it.
        answered_files = {entry['fileName'] for entry in answers}
        for path in answered_files:
            clear_file(path)
            
    # Wire up the handler and show the "Clear Files" button above the answers
    clearButton.on_click(clearing_files)
    display(clearButton)
    
    def open_file(filename, pgNum):
        """Open the PDF `filename` in a web browser, jumping to page `pgNum`.

        `filename` is resolved against the current working directory. Chrome is
        preferred; when it is not registered with `webbrowser`, the default
        browser is used instead.

        Args:
            filename: Path of the PDF, relative to the current directory.
            pgNum: 1-based page to open. Values below 1 (e.g. the -1 "not
                found" marker) open the file without a page fragment.
        """
        from pathlib import Path

        # as_uri() builds a correctly escaped file:// URL on every platform,
        # unlike slicing the repr of the bytes returned by os.getcwdb().
        fileLoc = Path(filename).resolve().as_uri()
        if pgNum >= 1:
            fileLoc += "#page=" + str(pgNum)

        try:
            webbrowser.get(using='chrome').open(fileLoc)
        except webbrowser.Error:
            # Chrome is not available: fall back to the default browser.
            webbrowser.open(fileLoc)
        
    def fix_text(context):
        """Return `context` without hyphens and without characters outside `string.printable`.

        Printable whitespace such as newlines is kept.
        """
        allowed = set(string.printable)
        without_hyphens = context.replace("-", "")
        return ''.join(ch for ch in without_hyphens if ch in allowed)
        
    def preprocess_context(context):
        """Split a cleaned-up `context` into its non-blank lines.

        The text is first passed through fix_text (hyphens and non-printable
        characters removed), then split on newlines; lines that are empty or
        consist only of spaces are dropped.
        """
        cleaned_lines = fix_text(context).split("\n")
        # strip(" ") leaves an empty string exactly when the line is empty or
        # made up solely of spaces.
        return [line for line in cleaned_lines if line.strip(" ") != ""]
    
    # Page number (1-based) recorded by the Context handler; -1 means that no
    # page has been recorded yet
    foundPgNum = -1
    
    import html

    def on_button_click(button, fileInfo=("", "")):
        """Highlight an answer's context in its PDF and open the file.

        Args:
            button: The clicked widget (unused).
            fileInfo: Tuple of (file name, context text) for the answer.

        Every text span whose cleaned text (ignoring spaces) equals one of the
        cleaned context lines gets a highlight annotation. The file is saved
        incrementally and then opened at the first page containing a match;
        the global `foundPgNum` stays -1 when nothing matched.
        """
        global foundPgNum

        fileName, context = fileInfo

        # Normalise the context lines once instead of once per page; a set
        # also prevents highlighting a span repeatedly for duplicate lines.
        targets = {line.replace(" ", "") for line in preprocess_context(context)}

        # Reset so a miss does not reuse the page of an earlier click.
        foundPgNum = -1

        pdfDoc = fitz.open(fileName)
        try:
            # iterate through pages in file
            for pgNum in range(len(pdfDoc)):
                page = pdfDoc[pgNum]

                for block in page.get_text("dict")["blocks"]:
                    # only text blocks carry lines and spans
                    if block['type'] != 0:
                        continue

                    for line in block['lines']:
                        for span in line['spans']:
                            if fix_text(span['text']).replace(" ", "") in targets:
                                # remember the first page on which the context appears
                                if foundPgNum == -1:
                                    foundPgNum = pgNum + 1

                                # highlight found context in page
                                page.add_highlight_annot(span['bbox'])

            pdfDoc.save(fileName, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            # Release the file handle before the browser opens the file.
            pdfDoc.close()

        open_file(fileName, foundPgNum)

    # iterate through all answers
    for idx, answer in enumerate(answers):

        # display answer; the texts originate from uploaded PDFs, so escape
        # them before embedding them in HTML
        label = widgets.HTML(value="<style>p{word-wrap: break-word}</style> <p> File Name: " + html.escape(answer['fileName']) + "</p>" +
                             "\n<p> Answer: " + html.escape(answer['answer']) + "</p>")
        display(label)

        # button to highlight context on original PDF
        button = widgets.Button(description='Context')
        button.on_click(functools.partial(on_button_click, fileInfo=(answer['fileName'], answer['context'])))
        display(button)

Button(description='Clear Files', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: must c…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: g., Ex…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ed.\nU…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ﬁlter …

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ). We …

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: First,…

Button(description='Context', style=ButtonStyle())

In [76]:
# OPTION #2 - INLINE GUI 

# import ipywidgets as widgets
# from IPython.display import display, IFrame, FileLink
# import fitz 
# import functools
# import string
# import os, sys, subprocess
# import webbrowser

# if answers:
#     def fix_text(context):
#         contextFix = (context).replace("-", "")
    
#         ascii_chars = set(string.printable)
#         contextFix = ''.join(filter(lambda x: x in ascii_chars, contextFix))

#         return contextFix
        
#     def preprocess_context(context):
#         # removing hyphen, newline and non-ascii characters
#         contextFix = fix_text(context)
    
#         contextFixLst = contextFix.split("\n")
    
#         contextFixLstWOWhite = []
#         for contextFix in contextFixLst:
#             if not all(ch==" " for ch in contextFix):
#                 contextFixLstWOWhite.append(contextFix)
                
#         # split content into a list of lines
#         return contextFixLstWOWhite
    
#     # to display relevant PDF
#     out = widgets.Output(layout={'border': '1px solid black', 'width':'400px', 'height':'600px'})
#     display(out)
    
#     # to display page number
#     outPgNum = widgets.Output(layout={'width':'300px', 'height':'50px'})
#     display(outPgNum)
    
#     # button to remove highlight and clear output 
#     fileToClear = ''
#     clearBtn = widgets.Button(description="Clear")
    
#     foundPgNum = -1
    
#     def clear_file():
#         if fileToClear != '':
#             pdfDoc = fitz.open(fileToClear)
        
#             # iterates pages and removes annotations generated by fitz
#             for pgNum in range(len(pdfDoc)):
#                 page = pdfDoc[pgNum]
        
#                 pdfDoc.xref_set_key(page.xref, "Annots", "[]")
            
#             pdfDoc.save(fileToClear, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
    
#             # clear output cell 
#             out.clear_output()
#             outPgNum.clear_output()
    
#     def clear_btn_click(button): clear_file();
#     clearBtn.on_click(clear_btn_click)
#     display(clearBtn)
    
#     # iterate through all answers 
#     for idx, answer in enumerate(answers):
        
#         # display answer
#         label = widgets.HTML(value="<style>p{word-wrap: break-word}</style> <p> File Name: " + answer['fileName'] + "</p>" + 
#                              "\n<p> Answer: " + answer['answer'] + "</p>")
#         display(label)
    
#         # button to highlight context on original PDF
#         button = widgets.Button(description='Context')
    
#         def on_button_click(button, fileInfo=("", "")):
#             global foundPgNum
#             global fileToClear
            
#             # if direct change from file to file, clear previous file
#             clear_file()
            
#             pdfDoc = fitz.open(fileInfo[0])        
#             found = False                    
                
#             # iterate through pages in file
#             for pgNum in range(len(pdfDoc)):
            
#                 page = pdfDoc[pgNum]            
#                 contextLst = preprocess_context(fileInfo[1])
                
#                 found = False
                
#                 # highlight sections and open file
                
#                 blocks = page.get_text("dict")["blocks"]
                
#                 for block in blocks: 
                    
#                     # look for text block by line
#                     if block['type'] == 0:
                        
#                         for line in block['lines']:
#                             for span in line['spans']:
                                
#                                 # extract bbox and text of line
#                                 bbox = span['bbox']
#                                 text = span['text']

#                                 # check if text corresponds to context in contextLst
#                                 for context in contextLst:
    
#                                     if context.replace(" ", "") == fix_text(text).replace(" ", ""):
#                                         # save pg number of where context is found
#                                         if not found:
#                                             foundPgNum = pgNum + 1
#                                             found = True
                                            
#                                         # highlight found context in page
#                                         page.add_highlight_annot(bbox)
            
#             pdfDoc.save(fileInfo[0], incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
#             fileToClear = fileInfo[0]
            
#             # clear output cell and render new file
#             out.clear_output()
#             @out.capture()
#             def render():
#                 display(IFrame(src=fileInfo[0],width=400, height=600))
#             render()
    
#             outPgNum.clear_output()
#             @outPgNum.capture()
#             def renderPgNum():
#                 display(HTML("<p> File Name: " + fileInfo[0] + "</p>"))
#                 display(HTML("<p> Page Number: " + str(foundPgNum) + "</p>"))
#             renderPgNum()
            
#         button.on_click(functools.partial(on_button_click, fileInfo=(answer['fileName'], answer['context'])))
#         display(button)


Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…

Output(layout=Layout(height='50px', width='300px'))

Button(description='Clear', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: must c…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: g., Ex…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-1.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ed.\nU…

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ﬁlter …

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-2.pdf</p>\n<p> Answer: ). We …

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: </p>')

Button(description='Context', style=ButtonStyle())

HTML(value='<style>p{word-wrap: break-word}</style> <p> File Name: sample-pdf-cs-3.pdf</p>\n<p> Answer: First,…

Button(description='Context', style=ButtonStyle())

In [6]:
# TEST CELL

# def open_file(filename, pgNum):
#     """ UNUSED """
#     currDir = os.getcwdb()
    
#     # opens file in default browser at specific page, NOTE: assumes that file is in current directory and Chrome is installed
#     fileLoc = "file:///" + str(currDir)[2:][:-1] + "/" + filename + "#page=" + str(pgNum)
#     (webbrowser.get(using='chrome')).open(fileLoc)  

#     # alternative method - not specific to page
#     # fileLoc = "file:///" + str(currDir)[2:][:-1] + "/" + filename + "#page=" + str(pgNum)
#     # webbrowser.open(fileLoc)

In [7]:
# TEST CELL

# import ipywidgets as widgets
# from IPython.display import display, IFrame

# out = widgets.Output(layout={'border': '1px solid black', 'width':'400px', 'height':'600px'})
# display(out)

# @out.capture()
# def render():
#     display(IFrame(src="sample-pdf-cs-1.pdf",width=400, height=600))

# render()

# but = widgets.Button(description='test')
# def test_click(button):
#     out.clear_output()
# but.on_click(test_click)
# display(but)

In [8]:
# TEST CELL

# tab = widgets.Tab(children=[widgets.IntSlider(), widgets.Text()], titles=('Slider', 'Text'))
# test = widgets.Button(description='test')

# def test_click(button):
    
#     tab.titles = tab.titles + tuple(["test"])
#     print(tab.titles)
#     tab.children = tab.children + tuple([widgets.Text()])
    
# test.on_click(test_click)
# display(tab, test)

In [9]:
# TEST CELL

# from IPython.display import IFrame
# fra = IFrame("sample-pdf-cs-1.pdf", width=600, height=400)
# display(fra)
# fra.layout.visibility = 'hidden'

# img = widgets.Image(value=file_upload.value[0].content.tobytes(), width = 400, height=600)
# display(img)
# widgets.HTML(
#     value='''<embed src="sample-pdf-cs-1.pdf#page=2" type="application/pdf" width="100%" height="600px" />''',
#     placeholder='Some HTML',
#     description='Some HTML',
# )

# HTML(''' <embed src="sample-pdf-cs-1.pdf#page=2" type="application/pdf" width="100%" height="600px" /> ''')
# HTML('''<object
# 	data="sample-pdf-cs-1.pdf#page=2"
# 	type="application/pdf"
# 	width="600"
# 	height="400">
# 	<p>
# 		Your browser does not support PDFs.
# 		<a href="sample-pdf-cs-1.pdf">Download the PDF</a>.
# 	</p>
# </object>''')

# h = widgets.HTML('''<iframe src="sample-pdf-cs-1.pdf" width=400 height=600></iframe>''')
# display(h)

# HTML('''<p>
# 	Open a PDF file
# 	<a href="sample-pdf-cs-1.pdf#page=2">example</a>
# </p>''')

# from wand.image import Image as WImage
# img = WImage(filename='sample-pdf-cs-1.pdf[2]')
# img