In [5]:
import os
from typing import List, Dict, Tuple
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from presidio_analyzer import Pattern, PatternRecognizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)


In [6]:



class PIIAgent:
    """
    A class to mask Personally Identifiable Information (PII) in an email thread.

    This class encapsulates the functionality to anonymize and deanonymize Personally Identifiable Information (PII)
    using custom patterns for various entities such as credit card numbers, account numbers, CIF numbers,
    UAE phone numbers, and Emirates IDs. It uses the inbuilt entities for Email Addresses, Locations, Person Names and Date Times.

    A new anonymizer instance is initialized every time the class is instantiated.
    """

    def __init__(self):
        # Initialize the anonymizer
        self.anonymizer = PresidioReversibleAnonymizer(
            analyzed_fields=[
                "IBAN_CODE",
                "EMAIL_ADDRESS",
                "PERSON",
                "LOCATION",
                "DATE_TIME",
            ],
            add_default_faker_operators=False,
        )
        # Add recognizers
        self._add_recognizers()

    def _add_recognizers(self):
        # Add custom recognizers to the anonymizer

        ## Credit Card Number
        credit_card_number_pattern = Pattern(
            name="credit_card_number_pattern",
            regex=r"\b\d{16}\b",
            score=1,
        )
        credit_card_number_recognizer = PatternRecognizer(
            supported_entity="CREDIT_CARD_NUMBER", patterns=[credit_card_number_pattern]
        )
        self.anonymizer.add_recognizer(credit_card_number_recognizer)

        ## Account Number
        account_number_pattern = Pattern(
            name="account_number_pattern",
            regex=r"\b120\d{8}\b",
            score=1,
        )
        account_number_recognizer = PatternRecognizer(
            supported_entity="ACCOUNT_NUMBER", patterns=[account_number_pattern]
        )
        self.anonymizer.add_recognizer(account_number_recognizer)

        ## CIF Number
        cif_number_pattern = Pattern(
            name="cif_number_pattern",
            regex=r"\b\d{6}\b",
            score=1,
        )
        cif_number_recognizer = PatternRecognizer(
            supported_entity="CIF_NUMBER", patterns=[cif_number_pattern]
        )
        self.anonymizer.add_recognizer(cif_number_recognizer)

        ## UAE Phone Number
        phone_number_pattern = Pattern(
            name="phone_number_pattern",
            regex=r"(?:\+971|00971|971)[\s\-]?5[\s\-]?\d{1}[\s\-]?\d{3}[\s\-]?\d{4}",
            score=1,
        )
        phone_number_recognizer = PatternRecognizer(
            supported_entity="UAE_PHONE_NUMBER", patterns=[phone_number_pattern]
        )
        self.anonymizer.add_recognizer(phone_number_recognizer)

        ## Emirates ID
        emirates_id_pattern = Pattern(
            name="emirates_id_pattern",
            regex=r"784-?\d{4}-?\d{7}-?\d",
            score=1,
        )
        emirates_id_recognizer = PatternRecognizer(
            supported_entity="EMIRATES_ID", patterns=[emirates_id_pattern]
        )
        self.anonymizer.add_recognizer(emirates_id_recognizer)

    def mask(self, text):
        # Anonymize the given text
        return self.anonymizer.anonymize(text)

    def unmask(self, anonymized_text):
        # Deanonymize the given text
        return self.anonymizer.deanonymize(anonymized_text)


In [34]:
class RAGPII:
    def __init__(self, persist_directory = "./chroma_db",model_name = "gemini-pro",temperature = 0.1,top_p = 0.8,top_k = 40,max_output_tokens = 2048):
        self.pii_agent = PIIAgent()
        self.embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

        self.llm = ChatGoogleGenerativeAI(
            model=model_name,temperature=temperature,top_p=top_p,top_k=top_k,max_output_tokens=max_output_tokens,api_key = ""

        )

        self.persist_directory = persist_directory
        self.vector_store = Chroma( persist_directory= persist_directory,embedding_function=self.embeddings)

        self.text_mappings = {}
        self.qa_prompt = PromptTemplate(
            input_variables= ["context","question"],    
            template=""""
            Context : {context}

            Question : {question}

            Please provide a clear and concise answer based on the context above.If the information isn't available in the context,please say so.

            Answer:
            """
        )


    def process_document(self,document_dir,batch_size=100):
        loader = DirectoryLoader(document_dir,glob="**./*txt",loader_cls= TextLoader)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000,chunk_overlap = 200)

        splits = text_splitter.split_documents(documents)

        for i in range(0,len(splits),batch_size):
            batch = splits[i:i+batch_size]
            self._process_batch(batch)

        self.vector_store.persist()

    def _process_batch(self,batch):
        anony_docs = []
        for doc in batch:
            anonymized_text = self.pii_agent.mask(doc.text)
            doc_id = hash(doc.page_content)
            self.text_mappings[doc_id] = (doc.page_content,anonymized_text)
            anony_doc = Document(page_content=anonymized_text,metadata ={ **doc.metadata,"doc_id":doc_id})
            anony_docs.append(anony_doc)

        self.vector_store.add_documents(anony_docs)

    def query(self,query,k=4,system_prompt = None):
        anonymized_query = self.pii_agent.mask(query)
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": k}
            ),
            chain_type_kwargs={
                "prompt": self.qa_prompt,
                "verbose": True
            }
        )
        
        
        response = qa_chain.run(anonymized_query)

        deanonymized_response = self.pii_agent.unmask(response)
        return deanonymized_response

        
    def add_single_document(self,content,metadata=None):
        if metadata is None:
            metadata = {}
        
        anonymized_content = self.pii_agent.mask(content)
        doc_id = hash(content)
        self.text_mappings[doc_id] = (content,anonymized_content)

        doc = Document(
            page_content= anonymized_content,
            metadata = {**metadata,"doc_id":doc_id}
        )

        self.vector_store.add_documents([doc])
        self.vector_store.persist()


    def set_custom_prompt(self,template,input_variable=None):
        if input_variable is None:
            input_variable = ["context","question"]
        self.qa_prompt = PromptTemplate(
            input_variables= input_variable,
            template=template
        )

            






        

In [35]:
rag_app = RAGPII()



In [36]:
sample_doc = """
   From: Sanjay Wariyar <sanjay.wariyar@silco.ae<mailto:sanjay.wariyar@silco.ae>>
Date: Thursday, 6 June 2024 at 12:05 PM
To: "Afroze.Naseem" <Afroze.Naseem@nbf.ae<mailto:Afroze.Naseem@nbf.ae>>
Cc: Anupam Paul <anupam.paul@silco.ae<mailto:anupam.paul@silco.ae>>, Saiu George <saju@silco.ae<mailto:saju@silco.ae>>
Subject: Re: Secure Email Message



Dear Afroze



Attached Invoice for which payment US$ 262,845 has be remitted .







Best Regards




    """
    

In [37]:
dir(rag_app)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_process_batch',
 'add_single_document',
 'embeddings',
 'llm',
 'persist_directory',
 'pii_agent',
 'process_document',
 'qa_prompt',
 'query',
 'set_custom_prompt',
 'text_mappings',
 'vector_store']

In [38]:
rag_app.add_single_document(sample_doc)

In [39]:
custom_prompt = """
    Context: {context}
    
    Question: {question}
    
    Please provide a professional and concise response based on the given context.
    Focus only on the relevant information and maintain a helpful tone.
    
    Answer:
    """
rag_app.set_custom_prompt(custom_prompt)
    

In [40]:

query = "What is payment amount for the attached invoice?"
response = rag_app.query(query)
print(f"Query: {query}")
print(f"Response: {response}")

  response = qa_chain.run(anonymized_query)




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    Context: 
   From: <PERSON> <<EMAIL_ADDRESS><mailto:<EMAIL_ADDRESS>>>
Date: <DATE_TIME>, <DATE_TIME_2> at <DATE_TIME_3>
To: "<PERSON_2>.<PERSON_3>" <<PERSON_2>.<PERSON_3>@nbf.ae<mailto:<PERSON_2>.<PERSON_3>@nbf.ae>>
Cc: <PERSON_4> <<EMAIL_ADDRESS_3><mailto:<EMAIL_ADDRESS_3>>>, <PERSON_5>>>
Subject: Re: Secure Email Message



Dear <PERSON_2>



Attached Invoice for which payment US$ 262,845 has be remitted .







Best Regards




    


   From: <PERSON> <<EMAIL_ADDRESS><mailto:<EMAIL_ADDRESS>>>
Date: <DATE_TIME>, <DATE_TIME_2> at <DATE_TIME_3>
To: "<PERSON_2>.<PERSON_3>" <<PERSON_2>.<PERSON_3>@nbf.ae<mailto:<PERSON_2>.<PERSON_3>@nbf.ae>>
Cc: <PERSON_4> <<EMAIL_ADDRESS_3><mailto:<EMAIL_ADDRESS_3>>>, <PERSON_5>>>
Subject: Re: Secure Email Message



Dear <PERSON_2>



Attached Invoice for which payment US$ 262,845 has be remitted .







Best Reg