In [3]:
!pip install pymupdf python-docx langchain langchain_community chromadb faiss-gpu
import os
import fitz
import re
import faiss
import numpy as np

from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/input-data-pdfs/Maple_plant_guide.txt
/kaggle/input/input-data-pdfs/Jade_Bonsai_Care_guidelines.docx
/kaggle/input/input-data-pdfs/attention.pdf
/kaggle/input/input-data-pdfs/Fish_care.pdf


# Plan of Action and Project Overview

### Project overview
    - 1. The user can choose to "Summarize" the document or "Chat" with the document
    - 2. The user uploads the document to work with
    - 3. The model processes the document according to the selected option

### Plan of action
    - 1. Read the documents (supported types: pdf (✔), docx (✔), txt (✔))
         a. For pdf, read it using pdfplumber (unable to install) or PyMuPDF (✔)
         b. For txt, read it directly, since it is just a string (✔)
         c. For docx, use docx (✔)
    - 2. Clean the documents (✔), chunk the documents (✔), vectorize the documents (optional, in case we want our own vector DB)
    - 3. Initialize the models (both the summarizer (✔) and the one used for similarity search (✔))
    - 4. Summarize the text - directly use the invoke method to summarize the text (✔)
    - 5. Chat
         a. Perform a similarity search between the query and the chunks (✔)
         b. Create a prompt from the query and the most similar document chunks (✔)
         c. Pass the prompt to an LLM to get the answer (✔)

### For PDF Files

In [None]:
# Extracting text from PDFs
def extract_pdf(file_path):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding the ``page.get_text()`` output of each page in
        document order, with nothing inserted between pages.
    """
    # The context manager closes the document handle even if a page raises,
    # instead of leaving it open until garbage collection.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass rather than re-copying the
        # accumulated string on every page.
        return "".join(page.get_text() for page in doc)

# Cleaning the extracted text
def clean_text(text):
    """Normalise line breaks and strip non-ASCII characters from extracted text.

    Single newlines (soft line wraps) become spaces, runs of two or more
    newlines (paragraph breaks) collapse to exactly one newline, and every run
    of non-ASCII characters is replaced by a single space.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text, with one newline between paragraphs.
    """
    # Join soft-wrapped lines FIRST. If the blank-line collapse ran first, every
    # paragraph break would already be a lone newline and would be turned into a
    # space here, flattening the whole document into one paragraph.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    # Replace multiple newlines with a single newline (preserves paragraphs)
    text = re.sub(r"\n{2,}", "\n", text)
    # Hyperlinks come through as "\xa0" (and similar non-ASCII bytes), so replace them with a space
    text = re.sub(r'[^\x00-\x7F]+', " ", text)
    return text

# Split text into overlapping chunks (defaults: chunk_size=200, chunk_overlap=50)
def chunks_of_text(extracted_text, chunk_size=200, chunk_overlap=50):
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_text: The text to split.
        chunk_size: Maximum size of each chunk, forwarded to the splitter.
            With the splitter's default length function this is a character
            count, not a word count.
        chunk_overlap: Amount of overlap between consecutive chunks, forwarded
            to the splitter.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(extracted_text)


def create_chunks(text, max_length = 1024):
    """Slice ``text`` into consecutive, non-overlapping pieces.

    Args:
        text: The sequence (normally a string) to slice.
        max_length: Maximum length of each piece; the last piece may be shorter.

    Returns:
        A list of the pieces in order; empty when ``text`` is empty.
    """
    pieces = []
    # Step through the start offset of every piece.
    for start in range(0, len(text), max_length):
        stop = start + max_length
        pieces.append(text[start:stop])
    return pieces

In [None]:
# Read the raw text of the sample PDF.
extracted_text = extract_pdf("/kaggle/input/input-data-pdfs/attention.pdf")
# cleaned_text_pdf = clean_text(extracted_text)
# Cleaning is disabled above, so the raw (uncleaned) text is sliced into fixed-size pieces.
chunks_pdf = create_chunks(extracted_text)


### For Docx files

In [None]:
def extract_doc(file):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file: Path (or file-like object) handed to ``Document``.

    Returns:
        The ``text`` of every paragraph in ``doc.paragraphs``, joined with
        newline characters.
    """
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

In [None]:
# DOCX walkthrough: extract the text, clean it, then split it into overlapping chunks.
# Note: `extracted_text` and `cleaned_text` are rebound here and again in the TXT cell.
extracted_text = extract_doc("/kaggle/input/input-data-pdfs/Jade_Bonsai_Care_guidelines.docx")
cleaned_text = clean_text(extracted_text)
chunks = chunks_of_text(cleaned_text)

### For txt files

In [None]:
def extract_txt(file, encoding=None):
    """Read a plain-text file and return its full contents.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching a plain ``open(file, "r")``.

    Returns:
        The file contents as a single string.
    """
    # `with` guarantees the handle is closed; a bare open(...).read() leaves it
    # open until the garbage collector happens to reclaim it.
    with open(file, "r", encoding=encoding) as handle:
        return handle.read()

In [None]:
# TXT walkthrough: read the file, clean the text, then split it into overlapping chunks.
# Note: this rebinds `extracted_text` and `cleaned_text` from the DOCX cell.
extracted_text = extract_txt("/kaggle/input/input-data-pdfs/Maple_plant_guide.txt")
cleaned_text = clean_text(extracted_text)
chunks_txt = chunks_of_text(cleaned_text)

### Initializing the models Summarizer, Embedding, q_a

In [None]:
# Summarization pipeline used by Langchain_pipeline.
summarizer = pipeline("summarization", model="google/pegasus-large")
# MBZUAI/LaMini-Flan-T5-248M :- another option
# Sentence-embedding model used to vectorise chunks and queries for similarity search.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Text-to-text generation pipeline that answers the question from the retrieved context.
q_a_model = pipeline("text2text-generation", model="google/flan-t5-large")

In [None]:
def Langchain_pipeline(file, model):
    """Summarise a PDF, DOCX or TXT document chunk by chunk.

    Args:
        file: Path of the document. Its extension (matched case-insensitively)
            selects the reader: ``.pdf``, ``.docx`` or ``.txt``.
        model: Summarizer callable, invoked as
            ``model(chunk, max_length=100, min_length=50, do_sample=False)``;
            the first item of its result must have a ``"summary_text"`` key.

    Returns:
        The per-chunk summaries joined with single spaces (an empty string
        when the document produces no chunks).

    Raises:
        ValueError: If the file extension is not a supported type.
    """
    # Pick the reader from the file extension. An if/elif/else chain guarantees
    # `text` is always bound; an unknown extension fails with a clear message
    # instead of an unbound-variable error further down.
    ext = os.path.splitext(file)[1].lower()
    if ext == ".pdf":
        text = extract_pdf(file)
    elif ext == ".docx":
        # extract_doc separates paragraphs with newlines, so the last word of
        # one paragraph is not fused with the first word of the next.
        text = extract_doc(file)
    elif ext == ".txt":
        text = extract_txt(file)
    else:
        raise ValueError(
            f"Unsupported file type {ext!r}; expected one of '.pdf', '.docx', '.txt'"
        )

    # Chunk the data so each piece can be summarised separately.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = text_splitter.split_text(text)

    # Summarise each chunk separately.
    summaries = [
        model(chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]

    # Concatenate the summaries into a single string.
    return ' '.join(summaries)

In [None]:
# Summarise the sample DOCX guide with the summarization pipeline; as the cell's
# last expression, the returned summary string is the cell output.
Langchain_pipeline("/kaggle/input/input-data-pdfs/Jade_Bonsai_Care_guidelines.docx", summarizer)


## Q and A with Document

In [None]:
# We will create chunks of the whole document
# We will embed all the chunks into vectors and store them in a vector index (FAISS is used below)
# We will embed the query entered by the user
# Then perform a similarity search between the query embedding and the chunk embeddings and return the top 3 most similar chunks
# Then we will feed the user query along with the similar chunks to an LLM to generate a human-like response

# I/O operation

In [None]:
def extract_pdf(file_path):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding the ``page.get_text()`` output of each page in
        document order, with nothing inserted between pages.
    """
    # The context manager closes the document handle even if a page raises.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated concatenation.
        return "".join(page.get_text() for page in doc)

# Load the sample PDF and split it into overlapping chunks (chunk_size=1000, chunk_overlap=50).
# Note: this rebinds `chunks`, which the DOCX cell also assigned.
data = extract_pdf("/kaggle/input/input-data-pdfs/attention.pdf")
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,  chunk_overlap = 50)
chunks = text_splitter.split_text(data)

# Embed the chunks and cast the result to float32 before it is added to the FAISS index.
chunks_embeddings = embedding_model.encode(chunks).astype(np.float32)

In [None]:
# User query, embedded with the same model as the chunks.
# NOTE(review): unlike the chunk embeddings, this result is not explicitly cast to float32 — confirm the model's output dtype.
user_query = "What are the two most commonly used attention function"
user_query_embedding = embedding_model.encode(user_query)

In [None]:
# FAISS functions
def create_faiss_index(chunks_embeddings):
    """Build a flat L2-distance FAISS index holding the given embeddings.

    Args:
        chunks_embeddings: 2-D array of embeddings, one row per chunk; its
            second dimension sets the index dimensionality.

    Returns:
        A ``faiss.IndexFlatL2`` with every row of ``chunks_embeddings`` added.
    """
    # Vector width comes from the embedding matrix (noted as 384 for the
    # embedding model used in this notebook).
    flat_index = faiss.IndexFlatL2(chunks_embeddings.shape[1])
    flat_index.add(chunks_embeddings)
    return flat_index

def search_faiss(chunks, chunks_embeddings, user_query, top_k=3):
    """Return the chunks whose embeddings are nearest to the query embedding.

    Args:
        chunks: Sequence of text chunks, aligned row-for-row with
            ``chunks_embeddings``.
        chunks_embeddings: 2-D array of chunk embeddings used to build the index.
        user_query: Query embedding(s) passed to ``index.search``; callers in
            this notebook reshape a single query to ``(1, dim)``.
        top_k: Maximum number of chunks to return (default 3).

    Returns:
        Up to ``top_k`` chunks for the first query row, nearest first. Returns
        an empty list when there are no chunks or ``top_k`` is not positive.
    """
    if len(chunks) == 0 or top_k <= 0:
        return []

    index = create_faiss_index(chunks_embeddings)
    # Never request more neighbours than there are vectors: FAISS pads missing
    # results with label -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    _, neighbour_ids = index.search(user_query, k)
    # Keep the >= 0 filter as a second guard against padded labels.
    return [chunks[idx] for idx in neighbour_ids[0] if idx >= 0]

In [None]:
# Retrieve the chunks most similar to the query. `chunks_embeddings` is a required
# argument of search_faiss, and the query is reshaped to a single-row 2-D array.
similar_queries = search_faiss(
    chunks=chunks,
    chunks_embeddings=chunks_embeddings,
    user_query=user_query_embedding.reshape(1, -1),
)

In [None]:
# Designing the Prompt
# The retrieved chunks live in `similar_queries` (the search result above); join them into one context string.
context = " ".join(similar_queries)
prompt = f"Answer the Following Question and explain the concept in brief based on the given context:\n\nContext: {context} \n\nQuestion: {user_query} \nAnswer:"
print(prompt)

In [None]:
# Generate the answer with the text2text-generation pipeline initialised earlier as `q_a_model`.
result = q_a_model(prompt, max_length=200)

In [None]:
# Show the question alongside the generated answer; `result` is indexed as a
# list whose first item carries the text under the "generated_text" key.
print("Question: ",user_query)
print("Answer: ", result[0]["generated_text"])

In [None]:
# pdf pipeline
def extract_pdf(file_path):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding the ``page.get_text()`` output of each page in
        document order, with nothing inserted between pages.
    """
    # The context manager closes the document handle even if a page raises.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated concatenation.
        return "".join(page.get_text() for page in doc)
    
# Full question-answering pipeline
def Q_a_pipeline(embedding_model, user_query, chunks, q_a_model):
    """Answer a question about a document using retrieved chunks as context.

    Args:
        embedding_model: Object whose ``encode`` method turns text into vectors.
        user_query: The question text.
        chunks: List of document chunks to search.
        q_a_model: Generation callable, invoked as ``q_a_model(prompt, max_length=500)``;
            the first item of its result must have a ``"generated_text"`` key.

    Returns:
        The generated answer text.
    """
    # Vectorise the document chunks, then the question (both cast to float32).
    chunk_vectors = embedding_model.encode(chunks).astype(np.float32)
    query_vector = embedding_model.encode(user_query).astype(np.float32)

    # Retrieve the chunks closest to the question; the query goes in as one row.
    retrieved = search_faiss(
        chunks=chunks,
        user_query=query_vector.reshape(1, -1),
        chunks_embeddings=chunk_vectors,
    )

    # Assemble the prompt from the retrieved context and the question.
    context = " ".join(retrieved)
    prompt = f"Answer the Following Question and explain the concept in brief based on the given context:\n\nContext: {context} \n\nQuestion: {user_query}"

    # Generate and unwrap the answer.
    generations = q_a_model(prompt, max_length = 500)
    return generations[0]["generated_text"]

In [None]:
# Load a different PDF and split it into overlapping chunks; this rebinds `data` and `chunks`.
data = extract_pdf("/kaggle/input/input-data-pdfs/Fish_care.pdf")
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,  chunk_overlap = 50)
chunks = text_splitter.split_text(data)

# Prompt for a question interactively; the pipeline's return value is the cell's last expression.
user_query = str(input("Enter the Query :- "))
Q_a_pipeline(embedding_model, user_query, chunks, q_a_model)