In [1]:
!pip install arabic-reshaper

Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0


In [2]:
!pip install presidio_analyzer
!pip install presidio-anonymizer
!pip install python-bidi
!pip install langchain-experimental

Collecting python-bidi
  Downloading python_bidi-0.6.3-cp311-none-win_amd64.whl.metadata (5.0 kB)
Downloading python_bidi-0.6.3-cp311-none-win_amd64.whl (157 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.6.3
Collecting langchain-core<0.3.0,>=0.2.38 (from langchain-experimental)
  Downloading langchain_core-0.2.43-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain<0.3.0,>=0.2.16 (from langchain-community<0.3.0,>=0.2.16->langchain-experimental)
  Downloading langchain-0.2.17-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain<0.3.0,>=0.2.16->langchain-community<0.3.0,>=0.2.16->langchain-experimental)
  Using cached langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_core-0.2.43-py3-none-any.whl (397 kB)
Downloading langchain-0.2.17-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ----------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
crewai 0.35.8 requires langchain<0.2.0,>=0.1.4, but you have langchain 0.2.17 which is incompatible.
crewai-tools 0.3.0 requires chromadb<0.5.0,>=0.4.22, but you have chromadb 0.5.16 which is incompatible.
crewai-tools 0.3.0 requires langchain<0.2.0,>=0.1.4, but you have langchain 0.2.17 which is incompatible.
embedchain 0.1.113 requires chromadb<0.5.0,>=0.4.24, but you have chromadb 0.5.16 which is incompatible.
embedchain 0.1.113 requires langchain<0.2.0,>=0.1.4, but you have langchain 0.2.17 which is incompatible.
embedchain 0.1.113 requires pypdf<5.0.0,>=4.0.1, but you have pypdf 5.1.0 which is incompatible.
langchain-google-genai 2.0.3 requires langchain-core<0.4,>=0.3.13, but you have langchain-core 0.2.43 which is incompatible.
langchain-google-vertexai 2.0.7 requires langchain-core<0.4,>=0.3.15, but you ha

In [1]:
from presidio_analyzer import Pattern, PatternRecognizer
from presidio_anonymizer.entities import OperatorConfig
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
import arabic_reshaper
from bidi.algorithm import get_display
import re

In [2]:

class ArabicPIIAgent:
    def __init__(self):
        self.anonymizer = PresidioReversibleAnonymizer(
            analyzed_fields=[
                "ARABIC_NAME",
                "PHONE_NUMBER",
                "ADDRESS",
                "EMAIL_ADDRESS",
                "NATIONAL_ID",
            ],
            add_default_faker_operators=False,
        )
        self._add_recognizers()

    def _add_recognizers(self):

        arabic_name_pattern = Pattern(
            name="arabic_name_pattern",
            regex=r'(?:السيد|السيدة|الدكتور|الأستاذ)?\s*[\u0600-\u06FF\s]{3,40}',
            score=0.85,
        )
        self.anonymizer.add_recognizer(
            PatternRecognizer(
                supported_entity="ARABIC_NAME",
                patterns=[arabic_name_pattern]
            )
        )


        phone_pattern = Pattern(
            name="uae_phone",
            regex=r'(?:\+971|00971|971)?[0-5]\d{8}',
            score=0.95
        )
        self.anonymizer.add_recognizer(
            PatternRecognizer(
                supported_entity="PHONE_NUMBER",
                patterns=[phone_pattern]
            )
        )

        email_pattern = Pattern(
            name="email_pattern",
            regex=r'[\w\.-]+@[\w\.-]+\.[a-zA-Z]{2,}',
            score=0.95
        )
        self.anonymizer.add_recognizer(
            PatternRecognizer(
                supported_entity="EMAIL_ADDRESS",
                patterns=[email_pattern]
            )
        )

        emirates_id_pattern = Pattern(
            name="emirates_id_pattern",
            regex=r'784-\d{4}-\d{7}-\d',
            score=0.95
        )
        self.anonymizer.add_recognizer(
            PatternRecognizer(
                supported_entity="NATIONAL_ID",
                patterns=[emirates_id_pattern]
            )
        )

        address_pattern = Pattern(
            name="arabic_address",
            regex=r'(?:شارع|طريق|حي|منطقة)[\u0600-\u06FF\s،,٠-٩0-9]{5,100}',
            score=0.8
        )
        self.anonymizer.add_recognizer(
            PatternRecognizer(
                supported_entity="ADDRESS",
                patterns=[address_pattern]
            )
        )

    def mask(self, text):
        """Mask PII in Arabic text"""
        return self.anonymizer.anonymize(text)

    def unmask(self, anonymized_text):
        """Unmask previously anonymized Arabic text"""
        return self.anonymizer.deanonymize(anonymized_text)


In [3]:
sample_text = """
    محضر اجتماع
    ------------------
    التاريخ: 15 نوفمبر 2023

    الحاضرون:
    1. السيد محمد عبدالله الهاشمي
    رقم الهوية: 784-1234-1234567-1
    رقم الهاتف: +971501234567
    البريد الإلكتروني: mohammed@example.com
    العنوان: شارع الشيخ زايد، منطقة الوحدة، أبوظبي

    2. السيدة فاطمة أحمد الكعبي
    رقم الهوية: 784-5678-7654321-2
    رقم الهاتف: +971562345678
    البريد الإلكتروني: fatima@company.ae
    العنوان: شارع خليفة، منطقة النهدة، دبي

    موضوع الاجتماع: مناقشة المشروع الجديد
    """

In [4]:
try:
      pii_agent = ArabicPIIAgent()

      print("Original Text:")
      print("-" * 50)
      print(sample_text)
      print("\n")

      masked_text = pii_agent.mask(sample_text)
      print("Masked Text:")
      print("-" * 50)
      print(masked_text)
      print("\n")


      unmasked_text = pii_agent.unmask(masked_text)
      print("Unmasked Text:")
      print("-" * 50)
      print(unmasked_text)

except Exception as e:
      print(f"An error occurred: {str(e)}")

Original Text:
--------------------------------------------------

    محضر اجتماع
    ------------------
    التاريخ: 15 نوفمبر 2023

    الحاضرون:
    1. السيد محمد عبدالله الهاشمي
    رقم الهوية: 784-1234-1234567-1
    رقم الهاتف: +971501234567
    البريد الإلكتروني: mohammed@example.com
    العنوان: شارع الشيخ زايد، منطقة الوحدة، أبوظبي

    2. السيدة فاطمة أحمد الكعبي
    رقم الهوية: 784-5678-7654321-2
    رقم الهاتف: +971562345678
    البريد الإلكتروني: fatima@company.ae
    العنوان: شارع خليفة، منطقة النهدة، دبي

    موضوع الاجتماع: مناقشة المشروع الجديد
    


Masked Text:
--------------------------------------------------
<ARABIC_NAME>------------------<ARABIC_NAME_2>: 15<ARABIC_NAME_3>2023<ARABIC_NAME_4>:<ARABIC_NAME_5>1. السيد محمد عبدالله الهاشمي<ARABIC_NAME_5>رقم الهوية: <NATIONAL_ID><ARABIC_NAME_5>رقم الهاتف: <PHONE_NUMBER><ARABIC_NAME_5>البريد الإلكتروني: <EMAIL_ADDRESS><ARABIC_NAME_5>العنوان: شارع الشيخ زايد، منطقة الوحدة، أبوظبي
<ARABIC_NAME_5>2. السيدة فاطمة أحمد الكع