In [None]:
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_community.embeddings import OllamaEmbeddings
from operator import itemgetter
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from PyPDF2 import PdfFileMerger, PdfFileReader
import os
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
from langchain_text_splitters import CharacterTextSplitter

In [None]:
class RAG_Pipeline:
    """Retrieval-augmented question answering over a single PDF document.

    Constructing an instance does all the setup work: the PDF is loaded and
    split, the resulting pages are embedded into an in-memory vector store,
    and a retriever -> prompt -> LLM -> string-parser chain is assembled.
    """

    def __init__(self, default_model="llama2", doc="test.pdf"):
        """Build the full pipeline for one document.

        Args:
            default_model: Name of the Ollama model used for generation.
            doc: Path of the PDF file to load and index.
        """
        self.model_name = default_model
        self.doc_path = doc

        # Generation model, embedding model and output parser.
        self.model = Ollama(model=self.model_name)
        # The embeddings object is created with no arguments, so it is not
        # tied to `default_model`.
        self.embeddings = OllamaEmbeddings()
        self.parser = StrOutputParser()

        # Load and split the PDF, then index the pieces in memory.
        self.loader = PyPDFLoader(self.doc_path)
        self.pages = self.loader.load_and_split()
        self.vectorstore = DocArrayInMemorySearch.from_documents(
            self.pages,
            embedding=self.embeddings,
        )
        # Similarity retriever returning the 5 closest entries per query.
        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5},
        )

        # The caller's "question" is embedded in a fixed
        # "who is best suited for the project ..." prompt.
        self.prompt_template = """
        Answer the question based on the context below. If you cannot answer the question, reply "I don't know".

        Context: {context}

        Question: Who is the best suited for the project {question}, say the name and explain why that person is best suited for this project
        """
        self.prompt = PromptTemplate.from_template(self.prompt_template)

        # The same "question" value feeds both the retriever (to fetch
        # context) and the prompt's {question} slot.
        pick_question = itemgetter("question")
        chain_inputs = {
            "context": pick_question | self.retriever,
            "question": pick_question,
        }
        self.chain = chain_inputs | self.prompt | self.model | self.parser

    def fetch(self, question):
        """Invoke the chain with `question` and return the chain's result."""
        payload = {'question': question}
        return self.chain.invoke(payload)

    def get_metadata_for_chunk(self, chunk_id):
        """Return the vector store's metadata for `chunk_id`.

        NOTE(review): this delegates to `self.vectorstore.get_metadata`;
        confirm that the vector store in use actually provides that method.
        """
        return self.vectorstore.get_metadata(chunk_id)

In [None]:
# Build a pipeline with the class defaults (model "llama2", document "test.pdf").
model = RAG_Pipeline()

In [None]:
# Batch step: run the RAG pipeline over every PDF in `folder_path` and store
# each answer as "<original stem>_answer.txt" inside `output_folder`.
folder_path = 'data'
output_folder = 'extract'

os.makedirs(output_folder, exist_ok=True)
# Text substituted into the {question} slot of the pipeline's prompt.
user_input = ""

for filename in os.listdir(folder_path):
    stem, extension = os.path.splitext(filename)
    extension = extension.lower()  # accept ".PDF" as well as ".pdf"

    if extension == ".docx":
        # RAG_Pipeline always loads its document with PyPDFLoader, which
        # cannot parse Word files; skip them instead of aborting the batch.
        print(f"Skipping {filename}: only PDF input is supported")
        continue
    if extension != ".pdf":
        continue

    file_path = os.path.join(folder_path, filename)
    # A fresh pipeline per file, so each document gets its own index.
    model = RAG_Pipeline(doc=file_path)
    answer = model.fetch(user_input)

    output_file_path = os.path.join(output_folder, f"{stem}_answer.txt")
    with open(output_file_path, 'w') as outfile:
        outfile.write(answer + "\n\n\n")

In [None]:
# Embedding model and text splitter used when ranking the answer files.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=100,
)

In [None]:
# Search text the answer files are ranked against (empty until filled in).
query = ""

In [None]:
# Directory holding the "*_answer.txt" files to rank.
folder_path = 'extract'

In [None]:
# Rank every ".txt" answer file in `folder_path` against `query` and report
# the closest match.
#
# Chroma's similarity_search_with_score returns a *distance* for each hit:
# a lower value means a more similar chunk. The best file is therefore the
# one whose closest chunk has the smallest distance.
best_distance = float('inf')
best_file = None
for file_index, filename in enumerate(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    documents = loader.load()
    docs = text_splitter.split_documents(documents)
    if not docs:
        # Nothing to embed (e.g. a whitespace-only answer file).
        continue

    # Give each file its own collection: without an explicit name, every
    # from_documents call in the same process adds to the same default
    # collection, so chunks from earlier files would leak into this search.
    db = Chroma.from_documents(
        docs,
        embedding_function,
        collection_name=f"answer_ranking_{file_index}",
    )
    try:
        # Only the single closest chunk is needed for the comparison.
        sml_scr = db.similarity_search_with_score(query, k=1)
    finally:
        # Drop the temporary collection so re-running the cell starts clean.
        db.delete_collection()

    if not sml_scr:
        continue
    _, val = sml_scr[0]
    if val < best_distance:
        best_distance = val
        best_file = filename

# Original variable name kept so any later cell reading it still works; it
# now holds the best (lowest) distance.
highest_score = best_distance
print(f"File name is: {best_file} | score of : {highest_score}")