In [172]:
from IPython.display import HTML

# Inject a small script plus a link into the notebook page. `code_toggle`
# hides or shows every `div.input` element and flips `code_show` each time it
# runs. It is registered for document-ready, and `code_show` starts as true,
# so the first run hides the inputs. The link calls `code_toggle` again.
# NOTE(review): relies on `$` (jQuery) and on `div.input` being the code-cell
# container in the page that renders this output -- confirm for your front end.
HTML('''<script>
code_show=true;
function code_toggle() {
    if (code_show){
        $('div.input').hide();
    } else {
        $('div.input').show();
    }
    code_show = !code_show
}
$(document).ready(code_toggle);
</script>
Click <a href="javascript:code_toggle()">here</a> to toggle on/off the code cells.''')



In [173]:
import ipywidgets as widgets
from IPython.display import display
import warnings
# Suppress all Python warnings from here on (no category filter is given).
warnings.filterwarnings('ignore')

# Create the file upload widget. Later cells read the uploaded files back
# through `file_upload.value`.
file_upload = widgets.FileUpload(
    accept='',  # Accepted file types (e.g., '.txt', '.pdf', '.jpg', '.csv', etc.); leave empty for all file types
    multiple=True  # Set to True if you want to allow multiple files to be uploaded
)

# Display the widget (a bare last expression is rendered as the cell output)
file_upload

FileUpload(value={}, description='Upload', multiple=True)

In [174]:
import io
import PyPDF2
import ipywidgets as widgets
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Per-file state filled in by upload_file(): each entry of `documents` is the
# list of text chunks extracted from one uploaded PDF, and each entry of
# `embeddings` is the result of create_embeddings() for such a chunk list.
documents = []
embeddings = []
# Raw 'content' values collected from the upload widget's `value` mapping.
uploaded_file_data = []

def create_embeddings(documents, model_name="paraphrase-MiniLM-L6-v2"):
    """Encode ``documents`` with a SentenceTransformer model.

    Args:
        documents: the texts to encode; passed unchanged to ``model.encode``.
        model_name: name of the SentenceTransformer model to instantiate.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``documents``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(documents)

def split_string(input_string, chunk_size):
    """Cut ``input_string`` into consecutive pieces of ``chunk_size`` characters.

    The final piece holds whatever is left over and may therefore be shorter.
    An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(input_string), chunk_size):
        stop = start + chunk_size
        pieces.append(input_string[start:stop])
    return pieces

def upload_file(file_upload, i):
    """Extract, chunk and embed the ``i``-th file held by the upload widget.

    The PDF bytes are read from ``file_upload.value``, the text of all pages is
    concatenated, split into 1000-character chunks with ``split_string`` and
    embedded with ``create_embeddings``. The results are stored at index ``i``
    of the module-level ``documents`` and ``embeddings`` lists so that they
    stay aligned with the order of the uploaded files.

    Args:
        file_upload: upload widget whose ``value`` maps file names to dicts
            that carry the raw bytes under the ``'content'`` key.
        i: position of the file to process within ``file_upload.value``.

    Raises:
        IndexError: if ``i`` is not a valid position in ``file_upload.value``.
    """
    global documents
    global embeddings

    # Read straight from the widget rather than from the module-level
    # `uploaded_file_data` list: appending every upload to that list on each
    # call makes it grow without bound, and index `i` can then point at bytes
    # from an earlier upload.
    uploaded_files = list(file_upload.value.values())
    uploaded_file_content = uploaded_files[i].get('content')

    # Convert the uploaded file content to a readable file-like buffer
    file_buffer = io.BytesIO(uploaded_file_content)

    # Read the PDF file using PyPDF2
    pdf_reader = PyPDF2.PdfReader(file_buffer)

    # Extract the text content from the PDF. Collecting per-page strings and
    # joining once avoids repeated string concatenation, and `or ''` guards
    # against a page for which no text is returned.
    page_texts = []
    for page_num in tqdm(range(len(pdf_reader.pages)), desc='Processing pages'):
        page_texts.append(pdf_reader.pages[page_num].extract_text() or '')
    text_content = ''.join(page_texts)

    # Compute both results before touching the shared lists so that a failure
    # while embedding cannot leave `documents` and `embeddings` out of step.
    chunks = split_string(text_content, 1000)
    chunk_embeddings = create_embeddings(chunks)

    # Overwrite the slot for this file when it already exists (the data is
    # being loaded again); otherwise append a new slot.
    for store, value in ((documents, chunks), (embeddings, chunk_embeddings)):
        if i < len(store):
            store[i] = value
        else:
            store.append(value)



        
# When this cell is executed, append the 'content' of every file currently
# held by the upload widget to `uploaded_file_data`.
# NOTE(review): this only captures files that were uploaded before the cell
# ran; files uploaded afterwards are not added here.
for file, attributes in file_upload.value.items():
    content = attributes.get('content') 
    uploaded_file_data.append(content)
    
button = widgets.Button(description="Load the data")

def on_button_click(b):
    """Process every uploaded file when the "Load the data" button is clicked.

    Prints "Data loaded successfully!" after each processed file, or
    "No files uploaded" when the upload widget holds nothing.

    Args:
        b: the clicked button (unused).
    """
    if not file_upload.value:
        print("No files uploaded")
        return

    # Reset the shared state first. Otherwise a second click would add the
    # same files again, and the positions in `documents`/`embeddings` would no
    # longer match the positions of the files in `file_upload.value`.
    documents.clear()
    embeddings.clear()

    for i in range(len(file_upload.value)):
        upload_file(file_upload, i)
        print("Data loaded successfully!")

button.on_click(on_button_click)

display(button)

Button(description='Load the data', style=ButtonStyle())

Processing pages: 100%|█████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 27.84it/s]


Data loaded successfully!


Processing pages: 100%|█████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:06<00:00,  7.50it/s]


Data loaded successfully!


In [175]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import clear_output


from transformers import logging

logging.set_verbosity_error()

# Loaded SentenceTransformer instances, keyed by model name, so that the model
# is not re-created for every query/document pair.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def retrieve_passages(query, documents, embeddings, top_k=5, threshold=0.4):
    """Return the passages most similar to ``query``.

    Args:
        query: the question text.
        documents: sequence of passages; ``documents[i]`` corresponds to
            ``embeddings[i]``.
        embeddings: one embedding per passage, in a form accepted by
            ``cosine_similarity`` as its second argument.
        top_k: maximum number of passages to return.
        threshold: minimum cosine similarity a passage needs to be returned.

    Returns:
        Up to ``top_k`` passages whose similarity is at least ``threshold``,
        ordered from most to least similar. Empty when there is nothing to
        search.
    """
    # Nothing to rank; cosine_similarity would reject an empty embedding array.
    if len(documents) == 0 or len(embeddings) == 0:
        return []

    # Same model name as the default of create_embeddings(), so the query and
    # the passages are embedded by the same model.
    model = _get_sentence_model("paraphrase-MiniLM-L6-v2")

    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity(query_embedding.reshape(1, -1), embeddings)[0]

    # Rank once (best first), keep the first top_k, then drop weak matches.
    ranked = np.argsort(similarities)[::-1][:top_k]
    top_indices = ranked[similarities[ranked] >= threshold]

    return [documents[i] for i in top_indices]

# Loaded (tokenizer, model) pairs keyed by model name, so the checkpoint is
# not reloaded for every passage.
_QA_MODEL_CACHE = {}


def _get_qa_components(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``."""
    if model_name not in _QA_MODEL_CACHE:
        _QA_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForQuestionAnswering.from_pretrained(model_name),
        )
    return _QA_MODEL_CACHE[model_name]


def extract_answer(question, passage, model_name="roberta-large"):
    """Extract the span of ``passage`` that the QA model picks as the answer.

    Args:
        question: the question text.
        passage: the context to search; converted with ``str()``.
        model_name: checkpoint passed to ``AutoTokenizer`` and
            ``AutoModelForQuestionAnswering``.
            NOTE(review): "roberta-large" looks like a base checkpoint rather
            than one fine-tuned for question answering -- confirm that the
            answers are meaningful or pass a QA checkpoint.

    Returns:
        The selected substring of the passage. Empty when no passage token
        survived truncation or when the predicted end precedes the start.
    """
    passage_text = str(passage)
    tokenizer, model = _get_qa_components(model_name)

    inputs = tokenizer(question, passage_text, return_tensors="pt", max_length=512, truncation=True, padding='max_length', return_offsets_mapping=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    offset_mapping = inputs["offset_mapping"].squeeze(0)

    # Offsets of question tokens are relative to the question, and special or
    # padding tokens have no passage offsets at all, so only tokens that
    # belong to the second sequence (the passage) may be chosen as the answer.
    context_mask = torch.tensor([seq_id == 1 for seq_id in inputs.sequence_ids(0)])
    if not bool(context_mask.any()):
        return ""

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_logits = outputs.start_logits[0].masked_fill(~context_mask, float("-inf"))
    end_logits = outputs.end_logits[0].masked_fill(~context_mask, float("-inf"))

    answer_start = int(torch.argmax(start_logits))
    answer_end = int(torch.argmax(end_logits))

    answer_start_char = int(offset_mapping[answer_start][0])
    answer_end_char = int(offset_mapping[answer_end][1])

    # Slice the same string that was tokenised so the offsets line up.
    return passage_text[answer_start_char:answer_end_char]

def qa_system(question, documents, embeddings, model_name="roberta-large", top_k=5):
    """Answer ``question`` against every loaded file.

    For each position in ``documents`` the matching passages are retrieved
    with ``retrieve_passages`` and an answer is extracted from each passage
    with ``extract_answer``.

    Args:
        question: the question text.
        documents: per-file passage lists.
        embeddings: per-file embeddings; ``embeddings[i]`` belongs to
            ``documents[i]``.
        model_name: forwarded to ``extract_answer``.
        top_k: forwarded to ``retrieve_passages``.

    Returns:
        A list of dicts with the keys ``"fileName"``, ``"answer"`` and
        ``"context"``. The file name is looked up in the global
        ``file_upload`` widget at the same position as the document.
    """
    file_keys = list(file_upload.value.keys())
    results = []

    # One pass per loaded file; position `doc_index` ties together the
    # passages, their embeddings and the uploaded file's metadata.
    for doc_index, file_passages in enumerate(documents):
        matches = retrieve_passages(question, file_passages, embeddings[doc_index], top_k=top_k)

        for match in matches:
            extracted = extract_answer(question, match, model_name=model_name)
            source_name = file_upload.value[file_keys[doc_index]]['metadata']['name']
            results.append({"fileName": source_name, "answer": extracted, "context": match})

    return results

import ipywidgets as widgets
from IPython.display import display
# Search UI: a text field for the question and a button that runs the QA system.
text_field = widgets.Text(placeholder='Enter your question here')
button = widgets.Button(description='Search')

# Results of the most recent search; show_results() reads this list.
answers = []
# Set to True by a citation click once the citation output areas have been
# displayed; reset on every new search because the cell output is cleared.
first = False


def on_button_click(button):
    """Run a search for the text currently in ``text_field``.

    Clears the cell output, shows the input widgets again, stores the result
    of ``qa_system`` in the global ``answers`` and renders it with
    ``show_results``.

    Args:
        button: the clicked button; displayed again next to the text field.
    """
    global first
    global answers

    # Plain Python only: IPython's `%clear` line magic is not valid syntax
    # inside a function outside of IPython's input transformer, and
    # clear_output() already wipes what this cell has rendered.
    clear_output(wait=True)
    first = False

    # Re-display the widgets, since clear_output() removes them as well.
    display(text_field, button)

    print("Generating answers, please wait...")
    # Retrieve answers using the QA system
    answers = qa_system(text_field.value, documents, embeddings)

    if not answers:
        print("No answers available")

    show_results()


# Set the callback function for the button
button.on_click(on_button_click)

# Display the widgets
display(text_field, button)

Text(value='', placeholder='Enter your question here')

Button(description='Search', style=ButtonStyle())

In [176]:
import ipywidgets as widgets
from IPython.display import display, IFrame
import fitz 
import functools
import string
import os, sys, subprocess
import webbrowser
        
    
 
    
#show_results()        
    
    
def preprocess_context(context):
    """Clean ``context`` and return it as a list of lines.

    Hyphens are removed, every character outside ``string.printable`` is
    dropped, the text is split on newlines, and lines that are empty or
    consist only of space characters are discarded.
    """
    allowed = frozenset(string.printable)
    without_hyphens = context.replace("-", "")
    cleaned = "".join(ch for ch in without_hyphens if ch in allowed)

    # strip(" ") leaves an empty string exactly when the line is empty or made
    # of spaces only; lines holding other whitespace (e.g. tabs) are kept.
    return [line for line in cleaned.split("\n") if line.strip(" ") != ""]



def clear_file():
    """Remove the annotations from the file named by the global ``fileToClear``.

    Does nothing when ``fileToClear`` is undefined or empty. Otherwise every
    page's ``Annots`` entry is set to an empty array, the file is saved
    incrementally in place, and the ``out`` / ``outPgNum`` output widgets are
    cleared if they exist at module level.

    NOTE(review): setting ``Annots`` to ``[]`` drops every annotation on the
    page, not only highlights added by this notebook -- confirm that this is
    acceptable for the source PDFs.
    """
    # Look the names up dynamically: they are only created once a search or a
    # citation click has happened, so a plain global read can raise NameError.
    module_state = globals()
    target = module_state.get("fileToClear", "")
    if not target:
        return

    pdfDoc = fitz.open(target)
    try:
        # iterates pages and empties each page's annotation array
        for page in pdfDoc:
            pdfDoc.xref_set_key(page.xref, "Annots", "[]")

        pdfDoc.save(target, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
    finally:
        # Release the file handle even when saving fails.
        pdfDoc.close()

    # clear output cell
    for widget_name in ("out", "outPgNum"):
        output_widget = module_state.get(widget_name)
        if output_widget is not None:
            output_widget.clear_output()

def clear_btn_click(button):
    """Click handler for the "Clear" button; delegates to ``clear_file``."""
    clear_file()

def open_file(filename, pgNum):
    """Open ``filename`` in a web browser at page ``pgNum``.

    Args:
        filename: path of the PDF; a relative path is resolved against the
            current working directory.
        pgNum: page number appended to the URL as ``#page=<pgNum>``.

    Raises:
        webbrowser.Error: if no browser can be located at all.
    """
    from pathlib import Path  # local import: only this function needs it

    # as_uri() yields a well-formed, percent-encoded file:// URL on every
    # platform, which slicing the repr of a bytes path does not.
    fileLoc = Path(os.path.abspath(filename)).as_uri() + "#page=" + str(pgNum)

    # Chrome is tried first, as before; fall back to the default browser when
    # no browser is registered under that name instead of failing.
    # NOTE(review): whether the default browser honours `#page=` for local
    # PDFs depends on the browser -- confirm on the target machines.
    try:
        browser = webbrowser.get(using='chrome')
    except webbrowser.Error:
        browser = webbrowser.get()
    browser.open(fileLoc)
    
    
def show_results():
    """Render the global ``answers`` list with one "Citation" button per answer.

    A "Clear" button is shown first. For every answer the file name and the
    context are displayed, followed by a button that highlights the context
    in the PDF named by the answer and opens that PDF in the browser at the
    first page containing a match.

    NOTE(review): the PDF is opened by its uploaded name, so it is presumably
    expected to exist under that name in the current working directory, and it
    is modified in place -- confirm.
    """
    import html  # local import: only needed to escape PDF text for display

    # clear_file() reads `out`, `outPgNum` and `fileToClear` from module scope,
    # so they have to be globals rather than locals of this function.
    global out, outPgNum
    # Keep a file name recorded by an earlier citation click (its highlight may
    # still need removing); only create the name when it does not exist yet.
    globals().setdefault('fileToClear', '')

    # button to remove highlight and clear output
    clearBtn = widgets.Button(description="Clear")

    # to display relevant PDF
    out = widgets.Output(layout={'border': '1px solid white', 'width':'950px', 'height':'600px'})

    # to display page number
    outPgNum = widgets.Output(layout={'width':'300px', 'height':'50px'})

    clearBtn.on_click(clear_btn_click)
    display(clearBtn)

    def on_button_click(button, fileInfo=("", "")):
        """Highlight ``fileInfo[1]`` inside the PDF ``fileInfo[0]`` and open it."""
        global foundPgNum
        global fileToClear
        global first

        # Show the output areas once per search.
        if not first:
            display(out)
            display(outPgNum)
            first = True

        # if direct change from file to file, clear previous file
        clear_file()

        fileName, contextText = fileInfo

        # The searchable lines do not depend on the page, so build them once.
        # Lines that are blank or a lone "-", "/" or "." are skipped.
        contextLst = [line for line in contextText.split('\n') if line.strip() not in {"", "-", "/", "."}]

        # Horizontal padding added to the highlight on both sides.
        extra_width = 150
        firstHitPage = None

        pdfDoc = fitz.open(fileName)
        try:
            # iterate through pages in file
            for pgNum in range(len(pdfDoc)):
                page = pdfDoc[pgNum]

                # collect the rectangles of every context line found on this page
                foundTxtLst = []
                for context in contextLst:
                    foundTxtLst += page.search_for(context)

                if not foundTxtLst:
                    continue

                # save pg number of where context is first found
                if firstHitPage is None:
                    firstHitPage = pgNum + 1

                # Bounding box of all hits, so that a single continuous
                # highlight covers them; widened to the left and right.
                leftX = min(rect.top_left.x for rect in foundTxtLst) - (extra_width + 100)
                leftY = min(rect.top_left.y for rect in foundTxtLst)
                rightX = max(rect.bottom_right.x for rect in foundTxtLst) + extra_width
                # The bottom edge must come from the y coordinates.
                rightY = max(rect.bottom_right.y for rect in foundTxtLst)

                page.add_highlight_annot(fitz.Rect(fitz.Point(leftX, leftY),
                                                   fitz.Point(rightX, rightY)))

            pdfDoc.save(fileName, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
        finally:
            # Release the file handle even when searching or saving fails.
            pdfDoc.close()

        fileToClear = fileName

        # Fall back to the first page when the context was not found anywhere,
        # instead of reusing the page number of an earlier click.
        foundPgNum = firstHitPage if firstHitPage is not None else 1
        open_file(fileName, foundPgNum)

        # clear output cell
        out.clear_output()

    # iterate through all answers
    for answer in answers:
        # display answer; the text comes from the PDF, so escape it before
        # embedding it in HTML
        label = widgets.HTML(value="<style>p{word-wrap: break-word}</style> <p> File Name: " + html.escape(answer['fileName']) + "</p>"  + "\n<p> Context: " + html.escape(answer['context']) + "</p>")
        display(label)

        # button to highlight context on original PDF
        button = widgets.Button(description='Citation')
        button.on_click(functools.partial(on_button_click, fileInfo=(answer['fileName'], answer['context'])))
        display(button)

        