# Import

In [None]:
import re
import fitz
import pandas as pd
from tqdm.notebook import tqdm
from langchain import PromptTemplate
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from transformers import pipeline

# Extracting the data

## Extraction function

In [None]:
# Summarization pipeline shared by the extraction code below; it is built once
# at cell execution time so the model is not reloaded for every page.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device_map="auto",
)


def extract_text(pdf_file, summary_threshold=4000):
    """Extract per-page text, summaries and external hyperlinks from a PDF.

    Args:
        pdf_file: Path of the PDF; passed straight to ``fitz.open``.
        summary_threshold: Pages whose cleaned text is longer than this many
            characters are condensed with the module-level ``summarizer``;
            shorter pages use their cleaned text as the summary.

    Returns:
        A 4-tuple of parallel lists with one entry per page, in this order:
        cleaned page text, 1-based page number, ``{uri: rect}`` mapping of the
        page's URI links, and the page summary.
    """
    pdf_text = []
    summaries = []
    page_numbers = []
    hyperlinks = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_file) as pdf_document:
        pages = tqdm(pdf_document, total=len(pdf_document))
        for page_num, page in enumerate(pages, start=1):
            # Flatten line breaks and drop U+FFFD replacement characters
            # left behind by undecodable glyphs.
            text_cleaned = page.get_text().replace("\n", " ").replace("\ufffd", " ")

            if len(text_cleaned) > summary_threshold:
                # truncation=True keeps long pages within the model's maximum
                # input length instead of failing inside the model.
                summary = summarizer(text_cleaned, truncation=True)[0]["summary_text"]
            else:
                summary = text_cleaned

            # PyMuPDF stores a link's hot-spot rectangle under the "from" key.
            # Links without a URI (e.g. internal jumps) are skipped so they do
            # not all collapse onto a single ``None`` key.
            link_dict = {
                link["uri"]: link.get("from")
                for link in page.get_links()
                if link.get("uri")
            }

            pdf_text.append(text_cleaned)
            page_numbers.append(page_num)
            hyperlinks.append(link_dict)
            summaries.append(summary)

    return pdf_text, page_numbers, hyperlinks, summaries


## Content extraction

In [None]:
# Path of the PDF to index.
# TODO: replace this placeholder with the real file path before running.
pdf_file_path = "path/to/document.pdf"
pdf_text, page_numbers, hyperlinks, summaries = extract_text(pdf_file_path)

# One row per page; the four lists returned above are parallel.
data = {
    'Page Number': page_numbers,
    'Text Block': pdf_text,
    'Summary': summaries,
    'Hyperlinks': hyperlinks,
}

df = pd.DataFrame(data)

# Creating the Database

In [None]:
persitent_path = "./chromadb/"

# Embedding model used to index the documents. It is created in this cell so
# the cell runs on its own; it must match the model used when the collection
# is re-opened for querying.
hf_embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# One Document per page: the page summary is the indexed content and the page
# number is kept as the "source" metadata.
documents = [
    Document(page_content=row["Summary"], metadata={"source": row["Page Number"]})
    for _, row in df.iterrows()
]

db = Chroma.from_documents(
    collection_name="document_content",
    documents=documents,
    embedding=hf_embed,
    persist_directory=persitent_path,
)
# Throwaway query against the freshly built collection; its result is discarded.
db.similarity_search("dummy")
db.persist()

# Extracting Data from the Database

In [None]:
# Open the "document_content" collection stored under the persistence
# directory, using the sentence-transformers embedding model for queries.
persitent_path = "./chromadb/"

hf_embed = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
db = Chroma(
    persist_directory=persitent_path,
    collection_name="document_content",
    embedding_function=hf_embed,
)

# Retrieving relevant documents from ChromaDB

## Functions

In [None]:
def get_similar_docs(question, similar_doc_count):
    """Return the stored documents most similar to ``question``.

    Delegates to ``similarity_search`` on the module-level ``db``, passing
    ``similar_doc_count`` as ``k``.
    """
    matches = db.similarity_search(question, k=similar_doc_count)
    return matches

## Search

In [None]:
# Print the two best matches with their rank, metadata and content.
# enumerate supplies the counter, which was previously never initialised.
# NOTE(review): ranks start at 1 here; the intended start value was not visible.
similar_docs = get_similar_docs(
    "What is the difference between an overlap and an inline junction?", 2
)
for i, doc in enumerate(similar_docs, start=1):
    print(i)
    print(doc.metadata)
    print(doc.page_content)

## Chatbot prompt

In [None]:
# Lazily created text-generation pipeline, shared by all calls.
_TEXT_GENERATOR = None


def _get_text_generator():
    """Build the Mistral instruct pipeline on first use and reuse it afterwards."""
    global _TEXT_GENERATOR
    if _TEXT_GENERATOR is None:
        _TEXT_GENERATOR = pipeline(
            model="mistralai/mistral-7b-instruct-v0.1",
            trust_remote_code=True,
            device_map="auto",
        )
    return _TEXT_GENERATOR


def answer_question_no_pipeline(question, similar_doc_count=3):
    """Answer ``question`` using the most similar stored documents as references.

    The retrieved documents are concatenated into a reference section, embedded
    in a one-shot instruction prompt and sent to the Mistral instruct model.

    Args:
        question: The user question; also used as the similarity-search query.
        similar_doc_count: Number of documents to retrieve as references.

    Returns:
        The ``generated_text`` field of the first pipeline result.
        NOTE(review): for text-generation pipelines this usually contains the
        prompt as well as the completion - confirm before post-processing.
    """
    # Each reference is followed by the same blank-line separator as before.
    content = "".join(
        doc.page_content + "\n\n        "
        for doc in get_similar_docs(question, similar_doc_count)
    )

    print(content)
    # Loading a 7B-parameter model is expensive, so the pipeline is built once
    # and reused instead of being recreated for every question.
    generate_text = _get_text_generator()
    text = """
            <s>[INST] Below is an instruction that describes a task. Write an explanation that appropriately completes the question.
            You are a STEM scientist and your job is to provide helpful information. 
            Use only information in the following paragraphs (References) to answer the question at the end. Explain the answer with reference to these paragraphs. If you don't know, say that you do not know.
            References:
            Topological insulators represent a new quantum state of matter which is characterized by peculiar edge or surface states that show up due to a topological character of the bulk wave functions. 
            This review presents a pedagogical account on topological insulator materials with an emphasis on basic theory and materials properties. 
            After presenting a historical perspective and basic theories of topological insulators, it discusses all the topological insulator materials discovered as of May 2013, with some illustrative descriptions of the developments in materials discoveries in which the author was involved. 
            A summary is given for possible ways to confirm the topological nature in a candidate material. 
            Various synthesis techniques as well as the defect chemistry that are important for realizing bulk-insulating samples are discussed. 
            Characteristic properties of topological insulators are discussed with an emphasis on transport properties. 
            In particular, the Dirac fermion physics and the resulting peculiar quantum oscillation patterns are discussed in detail. 
            It is emphasized that proper analyses of quantum oscillations make it possible to unambiguously identify surface Dirac fermions through transport measurements. 
            The prospects of topological insulator materials for elucidating novel quantum phenomena that await discovery conclude the review.
            
            Question: 
            What are topological insualtors?

            Answer: 
            A topological insulator (TI) is characterized by peculiar edge or surface states. 
            Those states emerge due to a topological character of the bulk wave functions.
            TIs can be realized with different materials and they are considered a new quantum state of matter (discovered in 2013).
            
            [/INST]
            References: 
        """ + content + """ </s>
            
            [INST] Question: 
            """ + question +"""
            Answer:
            [/INST]"""
    res = generate_text(text)
    return res[0]["generated_text"]

## Test

In [None]:
# Sample question used to exercise the retrieval + generation chain end to end.
question = "What is the difference between an overlap and an inline junction?"

# Keep the model output in a variable for inspection in later cells.
answer1 = answer_question_no_pipeline(question)