In [5]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [28]:
#importing the libraries:
#changes in redhat:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [39]:
# Load the leave-policy PDF from the working directory.
loader = PyPDFLoader("HR-DigivateLabs-Leave-Policy.pdf")
documents = loader.load()

# Split the loaded documents into chunks of at most 3000 characters,
# with 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200,
)
final_document = text_splitter.split_documents(documents)

# Show the first chunk as a quick sanity check.
final_document[0]

Document(metadata={'source': 'HR-DigivateLabs-Leave-Policy.pdf', 'page': 0}, page_content='HR-DigivateL abs-Leave -Policy \n \n  \n \n \n \n \n \n \n \n \n \nCONTROL  INFORMATION  Leave Policy  \nHR De partment – Digivate Labs  \n \n# Attribute  Value  \n1 Document  Title Digivate Labs  Leave  Policy  \n \n \nRELEASE  HISTORY  \n \n# Date Prepared By  Reviewed By  Approved By  Reason  \n0.1 1-Apr-22 HR Management  MD & CEO  Released')

In [40]:
# Set up the BGE embedding model: run it on the CPU and normalize the vectors.
hugging_face_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [41]:
# Embed every chunk and index the vectors in a FAISS vector store.
# A shallow copy of the chunk list is passed, as before.
vector_store = FAISS.from_documents(
    documents=list(final_document),
    embedding=hugging_face_embeddings,
)

In [42]:
# Run a plain similarity search against the vector store and print the
# text of the best-matching chunk.
query = "How many leaves can we carry forward?"
relevant_documents = vector_store.search(
    query=query,
    search_type="similarity",
)
print(relevant_documents[0].page_content)

HR-Digivate Labs -Leave-Policy 
  
 
The above policy shall be applicable to all full time/contract Employees.  
 
Short duration leaves should only be approved by HR after consideration or BU Head and can be done only 
once or twice in a month.  
 
Carry Forward  
 
You can carry forward a maximum of 7 leaves to a new calendar year. Thus, your leave balance cannot 
exceed 22 days at any given time. For instance, if you have accumula ted 7 days of leave by the end of a year 
and have added 13 days by 1st December, your leave balance will be 20. However, if you utilize 10 days 
during December, your leave balance as on 1st January will still be 7 only.  
 
Encashment  
 
NO ENCASHMENT OF LEAVE. Un -availed leave may be adjusted at the time of separation, at the sole 
discretion  of the management.  
 
Casual  & Sick Leave  
 
NO SEPARATE CASUAL or SICK LEAVE, ALL LEAVES COMBINED INTO ONE COMMON POOL AS 
ACCRUED LEAVE . Sick leave exce eding 3 days requires submission of a medical certif

In [43]:
# Wrap the vector store in a retriever that returns the 3 most similar chunks.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

# Display the retriever configuration.
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7ff875d91490>, search_kwargs={'k': 3})

In [56]:
# Load the Hugging Face API token without storing it in the notebook.
# The token is read from the HUGGINGFACEHUB_API_TOKEN environment variable;
# if that is unset or empty, the user is prompted for it (input is not echoed).
# SECURITY: credentials must never be written as literals in a notebook. Any
# token that was ever saved or committed in this file should be revoked.
import os
from getpass import getpass

HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
    "Hugging Face API token: "
)

In [46]:
# Create the hosted Hugging Face LLM client used to generate answers.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    # Hand the token to the client explicitly: it is only held in a notebook
    # variable and was otherwise never passed to HuggingFaceHub.
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    model_kwargs={
        # A low temperature keeps answers close to the retrieved context;
        # the previous value of 3 makes sampling close to random.
        "temperature": 0.1,
        # NOTE(review): for text-generation endpoints the length limit is
        # presumably `max_new_tokens` rather than `max_length` -- confirm.
        "max_length": 1000,
    },
)

In [47]:

# Build the question-answering prompt. The template is assembled line by line
# so that the trailing spaces and newlines of the text stay explicit.
template = (
    "You are an assistant for question-answering tasks. Use the following pieces of retrieved \n"
    "context to answer the question. If you don't know the answer, just say that you don't know. \n"
    "Please provide a detailed to-the-point summary of the following answer:\n"
    "Question: {question} ,\n"
    "Context: {context},\n"
    "Answer: "
)

# The template expects two placeholders: the retrieved context and the question.
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [48]:
# Assemble the retrieval QA chain: retrieved chunks are "stuffed" into the
# custom prompt, and the source documents are returned with the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=True,
)

In [49]:
# Try the chain end to end with a sample question.
query = "What is the process to avail a maternity leave?"
response = retrievalQA.invoke(dict(query=query))

# Print only the text of the first source document returned with the response.
context = response["source_documents"][0].page_content
print(context)

HR-Digivate Labs -Leave-Policy 
  
 
 
 
All eligible women employees are entitled to maternity leave, as shown in the table below. The maternity 
leave is inclusive of weekly offs and public & national holidays.  
 
Types of Maternity 
Leaves  Leave Entitlement 
(In Weeks)  Documents required to be 
submitted to HR Deptt to 
avail the leave  Leave 
Commencement  
Maternity leave in case 
of women employee up 
to two surviving children  26 1. Confirmation of pregnancy 
along with t he date of delivery.  
2. Medical certificate from 
certified medical practitioner.  Not earlier than 8 
weeks prior to the 
date of delivery.  
Maternity leave in case 
of women employee with 
two or more children  12 1. Confirmation of pregnancy 
along with the date of delivery.  
2. Medical certificate from 
certified medical practitioner.  Not earlier than 6 
weeks prior to the 
date of delivery.  
Commissioning Mother  12 1. Medical Documents  
2. Birth certificate of the ch ild From the date the 
child

In [None]:
import os

# Make sure the model output directory exists; nothing happens if it is
# already there.
os.makedirs(name='PDF_ChatBot_RatHat/models', exist_ok=True)

In [57]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token held in
# HUGGINGFACEHUB_API_TOKEN.
login(
    token=HUGGINGFACEHUB_API_TOKEN,
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


In [58]:
# Download the Mistral-7B checkpoint and attempt to export it to ONNX.
# NOTE(review): the recorded run of this cell stopped with
# "OSError: [Errno 28] No space left on device" while downloading the model
# shards, so the export call below has not been observed to succeed.
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.onnx import export
from pathlib import Path

# Load the tokenizer and model for the same repo id used by the hosted LLM
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Destination path for the exported ONNX model
output_path = Path("PDF_ChatBot_RatHat/models/onnx_model")

# Export the model to ONNX format (opset 13).
# NOTE(review): confirm this call against the signature of
# transformers.onnx.export -- it presumably also needs an ONNX config argument,
# and `use_external_format` may not be an accepted keyword. TODO verify.
export(tokenizer, model, output=output_path, opset=13, use_external_format=True)


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:

# Save the FAISS index
# `faiss` is not imported anywhere in this notebook, so calling it directly
# raises NameError. Obtain the module through the helper that LangChain's FAISS
# vector store itself uses.
from langchain_community.vectorstores.faiss import dependable_faiss_import

faiss = dependable_faiss_import()

# Only the raw index object is written to disk here.
# NOTE(review): the documents behind the index are not saved by this call; if
# the store must be reloaded for retrieval, `vector_store.save_local(...)` is
# presumably the better fit -- confirm.
faiss.write_index(vector_store.index, "faiss_index.index")
