<a href="https://colab.research.google.com/github/JNAbhishek27/DeID-Guard/blob/main/DeID_Guard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install -y poppler-utils
!pip install pytesseract pdf2image opencv-python PyMuPDF

In [None]:
import cv2
import pytesseract
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import matplotlib.pyplot as plt

In [None]:
# Convert PDF to image
# The result is indexed per page below; rendering is requested at 300 dpi.
pages = convert_from_path("/content/sample_pii.pdf", dpi=300)
page = pages[0]   # First page only

# Save as image for OCR
# Written to the working directory as "page1.png"; the OCR cell reads it back with OpenCV.
page.save("page1.png", "PNG")

In [None]:
# Load image
img = cv2.imread("page1.png")
# cv2.imread reports failure by returning None rather than raising, so stop
# here with a clear message instead of failing inside cvtColor.
if img is None:
    raise FileNotFoundError("Could not read 'page1.png'; run the PDF-to-image cell first")

# Convert to gray for better OCR
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# OCR extraction
text = pytesseract.image_to_string(gray)
print("Extracted Text:\n", text)

In [None]:
# Preview the loaded page; the image is converted from BGR to RGB before display.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.show()

In [None]:
!pip install spacy
!python -m spacy download en_core_web_trf

In [None]:
import spacy
import re

# Load transformer-based spaCy model
nlp = spacy.load("en_core_web_trf")

In [None]:
# Synthetic record used to exercise the NER and regex detectors
sample_text = """
Patient Name: Rahul Sharma
DOB: 12/08/1995
Phone: +91-9876543210
Address: 123 MG Road, Bangalore
Aadhaar: 1234-5678-9123
Diagnosis: Type II Diabetes
"""

doc = nlp(sample_text)

# Keep (text, label) pairs for the entity labels treated as PII
pii_entities = [
    (entity.text, entity.label_)
    for entity in doc.ents
    if entity.label_ in ("PERSON", "GPE", "ORG", "DATE", "CARDINAL", "MONEY")
]

# Pattern rules for identifiers the NER model is not relied on to catch
regex_patterns = {
    "PHONE": r"\+?\d[\d -]{8,12}\d",
    "AADHAAR": r"\d{4}-\d{4}-\d{4}",
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
}

# Append every regex hit, tagged with the name of the rule that produced it
for tag, rule in regex_patterns.items():
    pii_entities.extend((hit, tag) for hit in re.findall(rule, sample_text))

print("Detected PII:", pii_entities)

In [None]:
def redact_text(text):
    """Return ``text`` with every detected PII span replaced by ``[REDACTED]``.

    Detection combines spaCy entities (PERSON, GPE, ORG, DATE, CARDINAL, MONEY)
    with the patterns in ``regex_patterns``. All spans are located on the
    original text and replaced by position, so:

    * a short entity (e.g. a CARDINAL such as "12") is not also replaced
      wherever the same characters appear inside unrelated text, and
    * regex rules still see the untouched text rather than a string in which
      part of a number has already been replaced.

    Overlapping or touching spans are merged into a single placeholder.
    """
    doc = nlp(text)

    # Character ranges flagged by the NER model
    spans = [
        (ent.start_char, ent.end_char)
        for ent in doc.ents
        if ent.label_ in ("PERSON", "GPE", "ORG", "DATE", "CARDINAL", "MONEY")
    ]

    # Character ranges flagged by the regex rules
    for pattern in regex_patterns.values():
        spans.extend(match.span() for match in re.finditer(pattern, text))

    # Merge overlapping/adjacent ranges so each region is replaced exactly once
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Rebuild the string left to right, skipping the redacted ranges
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append("[REDACTED]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)

print("Original Text:\n", sample_text)
print("\nRedacted Text:\n", redact_text(sample_text))

In [None]:
!pip install spacy PyMuPDF
!python -m spacy download en_core_web_trf

In [None]:
import fitz  # PyMuPDF
import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_trf")

In [None]:
# Pattern rules applied in addition to the NER model
regex_patterns = {
    "PHONE": r"\+?\d[\d -]{8,12}\d",
    "AADHAAR": r"\d{4}-\d{4}-\d{4}",
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
}

def detect_pii(text):
    """Return the distinct PII strings found in ``text``.

    Combines spaCy entities whose label is PERSON, GPE, ORG, DATE, CARDINAL
    or MONEY with every match of the rules in ``regex_patterns``. Duplicates
    are removed through a set, so the order of the returned list is arbitrary.
    """
    sensitive_labels = ["PERSON", "GPE", "ORG", "DATE", "CARDINAL", "MONEY"]

    # NER hits first, then regex hits, collected into one flat list
    hits = [entity.text for entity in nlp(text).ents if entity.label_ in sensitive_labels]
    for rule in regex_patterns.values():
        hits += re.findall(rule, text)

    return list(set(hits))

In [None]:
def redact_pdf(input_pdf, output_pdf):
    """Black out detected PII in the text layer of a PDF and save a copy.

    For each page, the page text is passed to ``detect_pii``; every location
    where a detected string is found gets a black redaction annotation, and
    the redactions are then applied to the page.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the redacted PDF is written to.
    """
    # The context manager closes the document even if detection or saving raises.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            text = page.get_text()
            for pii in detect_pii(text):
                for area in page.search_for(pii):
                    page.add_redact_annot(area, fill=(0, 0, 0))
            # Applied once per page, after all annotations for it are added
            page.apply_redactions()
        doc.save(output_pdf)
    print(f"Redacted PDF saved as: {output_pdf}")

# Run on your uploaded PDF
redact_pdf("sample_pii.pdf", "redacted_sample.pdf")

In [None]:
from google.colab import files
files.download("redacted_sample.pdf")

In [None]:
!pip install opencv-python

In [None]:
import cv2
import matplotlib.pyplot as plt

# Load OpenCV face detector
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

In [None]:
def redact_faces(image_path, output_path="face_redacted.png"):
    """Cover every detected frontal face in an image with a filled black box.

    Args:
        image_path: Path of the image to read.
        output_path: Path the redacted image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
        OSError: If the redacted image cannot be written.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Detect faces
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4, minSize=(30, 30))

    # Redact each face with a black box (thickness -1 draws it filled)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    # cv2.imwrite returns False when nothing was written
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write image: {output_path}")

    # Show result
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()
    print(f"Redacted image saved as {output_path}")

In [None]:
redact_faces("Untitled design.jpg")

In [None]:
!pip install pyzbar opencv-python
!apt-get install -y libzbar0
!pip install pyzbar Pillow opencv-python

In [None]:
from pyzbar.pyzbar import decode
from PIL import Image
import cv2
import matplotlib.pyplot as plt

def redact_qr_barcode(image_path, output_path="qr_redacted.png"):
    """Cover every QR code / barcode that pyzbar decodes with a filled black box.

    Args:
        image_path: Path of the image to read.
        output_path: Path the redacted image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Open the PIL copy in a context manager so its file handle is released
    with Image.open(image_path) as pil_img:
        barcodes = decode(pil_img)

    for bc in barcodes:
        (x, y, w, h) = bc.rect
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    cv2.imwrite(output_path, img)

    # Show output
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()
    print(f"Redacted QR/Barcode saved as {output_path}")

# Run on your uploaded file
redact_qr_barcode("qr.png")

In [None]:
def redact_signatures(image_path, output_path="signature_redacted.png"):
    """Black out dark regions whose bounding box matches a signature-size heuristic.

    The image is thresholded so that dark pixels become foreground, external
    contours are extracted, and every contour whose bounding box satisfies
    ``50 < w < 500`` and ``10 < h < 200`` is filled with black.

    NOTE(review): the size test does not distinguish handwriting from other
    dark content of similar size; confirm this is acceptable on printed pages.

    Args:
        image_path: Path of the image to read.
        output_path: Path the redacted image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
        OSError: If the redacted image cannot be written.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold for ink regions (handwriting): inverted, so dark pixels become 255
    _, thresh = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY_INV)

    # Find contours (outermost only)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Heuristic: Signatures are usually small & wide
        if 50 < w < 500 and 10 < h < 200:
            cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    # cv2.imwrite returns False when nothing was written
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write image: {output_path}")
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()
    print(f"Redacted signatures/stamps saved as {output_path}")

In [None]:
redact_signatures("Signature.jpg")

In [None]:
import cv2
from pyzbar.pyzbar import decode
from PIL import Image
import matplotlib.pyplot as plt

# Load Haar cascade for faces
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

def redact_all(image_path, output_path="final_redacted.png"):
    """Apply face, signature/stamp and QR/barcode redaction to one image.

    All three detectors work from the image as read from disk; their regions
    are filled with black on a single output image.

    Args:
        image_path: Path of the image to read.
        output_path: Path the redacted image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Face Detection
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4, minSize=(30, 30))
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    # 2. Signature/Stamp Detection (ink-based): dark pixels become foreground
    _, thresh = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if 50 < w < 500 and 10 < h < 200:  # heuristic for signatures
            cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    # 3. QR/Barcode Detection (context manager releases the PIL file handle)
    with Image.open(image_path) as pil_img:
        barcodes = decode(pil_img)
    for bc in barcodes:
        (x, y, w, h) = bc.rect
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 0, 0), -1)

    # Save + show
    cv2.imwrite(output_path, img)
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()
    print(f"Redacted image saved as {output_path}")

In [None]:
# NOTE(review): redact_all() is defined in the preceding cell but never called; this runs the
# face-only redact_faces() on the QR sample. Possibly redact_all("qr.png") was intended — confirm.
redact_faces("qr.png")

In [None]:
!apt-get install -y poppler-utils libzbar0
!pip install pytesseract pdf2image opencv-python PyMuPDF spacy pyzbar Pillow
!python -m spacy download en_core_web_trf

In [None]:
import spacy, re
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import cv2
from pyzbar.pyzbar import decode
from PIL import Image

# Load the transformer-based spaCy pipeline used for entity detection
nlp = spacy.load("en_core_web_trf")

# Pattern rules applied in addition to the NER model
regex_patterns = {
    "PHONE": r"\+?\d[\d -]{8,12}\d",
    "AADHAAR": r"\d{4}-\d{4}-\d{4}",
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
}

def detect_pii_text(text):
    """Return the distinct PII strings found in ``text``.

    Combines spaCy entities whose label is PERSON, GPE, ORG, DATE, CARDINAL
    or MONEY with every match of the rules in ``regex_patterns``. Duplicates
    are removed through a set, so the order of the returned list is arbitrary.
    """
    sensitive_labels = ["PERSON", "GPE", "ORG", "DATE", "CARDINAL", "MONEY"]

    # NER hits first, then regex hits, collected into one flat list
    hits = [entity.text for entity in nlp(text).ents if entity.label_ in sensitive_labels]
    for rule in regex_patterns.values():
        hits += re.findall(rule, text)

    return list(set(hits))

In [None]:
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

def redact_visual(img_path, out_path):
    """Black out faces, signature-sized ink regions and QR/barcodes in an image.

    Args:
        img_path: Path of the image to read.
        out_path: Path the redacted image is written to.

    Returns:
        ``out_path``.

    Raises:
        ValueError: If ``img_path`` is missing or in a format OpenCV cannot decode.
    """
    img = cv2.imread(img_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise ValueError(f"Could not read image (missing file or unsupported format): {img_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Faces
    faces = face_cascade.detectMultiScale(gray,1.1,4)
    for (x,y,w,h) in faces:
        cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,0),-1)

    # Signatures/Stamps: dark pixels become foreground, then filter contours by box size
    _, thresh = cv2.threshold(gray,120,255,cv2.THRESH_BINARY_INV)
    contours,_ = cv2.findContours(thresh,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contours:
        x,y,w,h=cv2.boundingRect(cnt)
        if 50<w<500 and 10<h<200:
            cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,0),-1)

    # QR/Barcode (context manager releases the PIL file handle)
    with Image.open(img_path) as pil_img:
        barcodes = decode(pil_img)
    for bc in barcodes:
        (x,y,w,h)=bc.rect
        cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,0),-1)

    cv2.imwrite(out_path,img)
    return out_path

In [None]:
def redact_pdf_full(input_pdf, output_pdf):
    """Redact PII from both the text layer and the embedded images of a PDF.

    Text: strings returned by ``detect_pii_text`` are located on each page and
    covered with black redactions. Images: each embedded image is extracted,
    passed through ``redact_visual`` and put back into the document.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the redacted PDF is written to.
    """
    import os
    import tempfile

    # The temporary directory keeps the extracted, still-unredacted images out
    # of the working directory and deletes them when the function returns.
    with fitz.open(input_pdf) as doc, tempfile.TemporaryDirectory() as workdir:
        seen_xrefs = set()
        for i, page in enumerate(doc):
            # --- Text Redaction ---
            text = page.get_text()
            for pii in detect_pii_text(text):
                for area in page.search_for(pii):
                    page.add_redact_annot(area, fill=(0,0,0))
            page.apply_redactions()

            # --- Image Redaction ---
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                # replace_image swaps the image document-wide, so an image
                # shared by several pages only needs processing once.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_img = doc.extract_image(xref)
                img_name = f"page{i}_img{img_index}.{base_img['ext']}"
                img_path = os.path.join(workdir, img_name)
                with open(img_path, "wb") as f:
                    f.write(base_img["image"])

                # Run visual redaction
                redacted_path = os.path.join(workdir, f"redacted_{img_name}")
                redact_visual(img_path, redacted_path)

                # Replace in PDF. Writing the encoded file bytes with
                # update_stream leaves the image object's metadata describing
                # the old data, which corrupts the image; replace_image
                # re-inserts the new image at the same xref.
                page.replace_image(xref, filename=redacted_path)

        doc.save(output_pdf)
    print(f"✅ Final redacted PDF saved as {output_pdf}")

In [None]:
redact_pdf_full("test_doc.pdf", "final_redacted_test.pdf")

In [None]:
import numpy as np

def redact_visual(img_path, out_path, style="black"):
    """Redact faces, signature-sized ink regions and QR/barcodes in an image.

    Args:
        img_path: Path of the image to read.
        out_path: Path the redacted image is written to.
        style: How each region is hidden:
            ``"black"`` fills it with black,
            ``"blur"`` replaces it with a heavy Gaussian blur,
            ``"pseudonym"`` blanks it to white and writes "XXXX" over it.

    Returns:
        ``out_path``.

    Raises:
        ValueError: If ``style`` is not one of the three supported values, or
            if ``img_path`` is missing or in a format OpenCV cannot decode.
    """
    # Reject unknown styles up front; otherwise nothing would be redacted
    # while the output is still written as if it had been.
    if style not in ("black", "blur", "pseudonym"):
        raise ValueError(f"Unknown redaction style: {style!r}")

    img = cv2.imread(img_path)
    # cv2.imread returns None on failure instead of raising
    if img is None:
        raise ValueError(f"Could not read image (missing file or unsupported format): {img_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_h, img_w = img.shape[:2]

    # Helper function for region styling
    def apply_style(x, y, w, h):
        # Clip to the image so slicing never sees negative or empty ranges
        x0, y0 = max(int(x), 0), max(int(y), 0)
        x1, y1 = min(int(x + w), img_w), min(int(y + h), img_h)
        if x1 <= x0 or y1 <= y0:
            return
        if style == "black":
            cv2.rectangle(img, (x0, y0), (x1, y1), (0,0,0), -1)
        elif style == "blur":
            img[y0:y1, x0:x1] = cv2.GaussianBlur(img[y0:y1, x0:x1], (51,51), 30)
        elif style == "pseudonym":
            # Blank the region first: text alone would leave the content visible
            cv2.rectangle(img, (x0, y0), (x1, y1), (255,255,255), -1)
            cv2.putText(img, "XXXX", (x0, y0 + (y1 - y0)//2), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,0), 2)

    # Faces
    faces = face_cascade.detectMultiScale(gray,1.1,4)
    for (x,y,w,h) in faces:
        apply_style(x,y,w,h)

    # Signatures: dark pixels become foreground, then filter contours by box size
    _, thresh = cv2.threshold(gray,120,255,cv2.THRESH_BINARY_INV)
    contours,_ = cv2.findContours(thresh,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contours:
        x,y,w,h = cv2.boundingRect(cnt)
        if 50<w<500 and 10<h<200:
            apply_style(x,y,w,h)

    # QR/Barcodes (context manager releases the PIL file handle)
    with Image.open(img_path) as pil_img:
        barcodes = decode(pil_img)
    for bc in barcodes:
        (x,y,w,h) = bc.rect
        apply_style(x,y,w,h)

    cv2.imwrite(out_path, img)
    return out_path

In [None]:
# Exercise each redaction style once, each on a different sample image and output file.
redact_visual("Signature.jpg", "out_black.png", style="black")
redact_visual("Untitled design.jpg", "out_blur.png", style="blur")
redact_visual("qr.png", "out_pseudo.png", style="pseudonym")

In [None]:
import json

def redact_pdf_with_log(input_pdf, output_pdf, style="black", log_file="audit_log.json", log_values=True):
    """Redact text and image PII in a PDF and write a JSON audit log.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the redacted PDF is written to.
        style: Redaction style forwarded to ``redact_visual`` for embedded
            images. Text redactions are always filled with black.
        log_file: Path the JSON audit log is written to.
        log_values: When True (the default, matching earlier behaviour) each
            text entry in the log carries the redacted string itself, so the
            log then contains the PII in clear text and must be protected like
            the original document. Pass False to omit the values.

    Returns:
        ``(output_pdf, log_file)`` so callers can locate both artefacts.
    """
    import os
    import tempfile

    log_data = {"file": input_pdf, "redactions": []}

    # The temporary directory keeps the extracted, still-unredacted images out
    # of the working directory and deletes them when the function returns.
    with fitz.open(input_pdf) as doc, tempfile.TemporaryDirectory() as workdir:
        seen_xrefs = set()
        for i, page in enumerate(doc):
            text = page.get_text()
            for pii in detect_pii_text(text):
                for area in page.search_for(pii):
                    page.add_redact_annot(area, fill=(0,0,0))
                entry = {"page": i+1, "type": "text"}
                if log_values:
                    entry["value"] = pii
                log_data["redactions"].append(entry)
            page.apply_redactions()

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                # replace_image swaps the image document-wide, so an image
                # shared by several pages only needs processing once.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_img = doc.extract_image(xref)
                img_name = f"page{i}_img{img_index}.{base_img['ext']}"
                img_path = os.path.join(workdir, img_name)
                with open(img_path,"wb") as f:
                    f.write(base_img["image"])

                redacted_path = os.path.join(workdir, f"redacted_{img_name}")
                redact_visual(img_path, redacted_path, style=style)

                # Writing the encoded file bytes with update_stream leaves the
                # image object's metadata describing the old data, which
                # corrupts the image; replace_image re-inserts the new image
                # at the same xref.
                page.replace_image(xref, filename=redacted_path)

                log_data["redactions"].append({"page": i+1, "type": "image", "action": style})

        doc.save(output_pdf)

    with open(log_file,"w") as f:
        json.dump(log_data,f,indent=2)

    print(f"✅ Saved {output_pdf} and audit log {log_file}")
    return output_pdf, log_file

In [None]:
redact_pdf_with_log("test_doc.pdf", "redacted_with_log.pdf", style="blur")

In [None]:
!apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-tam
!pip install pytesseract

In [None]:
import pytesseract
from PIL import Image
import cv2, numpy as np

# Example Hindi text image
hindi_img = Image.new("RGB", (400, 100), (255, 255, 255))

# np.array() copies the PIL pixels, so the text has to be drawn on a named
# array and converted back. Drawing on an unnamed temporary leaves hindi_img
# blank and OCR then runs on an empty white image.
canvas = np.array(hindi_img)
# NOTE(review): per the OpenCV putText documentation, symbols the Hershey
# fonts cannot render are drawn as question marks, so this Devanagari string
# will not appear as written. A real Hindi sample needs PIL ImageDraw with a
# Devanagari TrueType font, or a scanned image — confirm which is intended.
cv2.putText(canvas, "आधार संख्या: १२३४-५६७८-९१२३", (10,60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,0), 2)
hindi_img = Image.fromarray(canvas)

# Run Hindi OCR
hindi_text = pytesseract.image_to_string(hindi_img, lang="hin")
print("Extracted Hindi Text:", hindi_text)

In [None]:

# NOTE(review): this runs the Tamil model on hindi_img, the image built for the
# Hindi example — confirm whether a separate Tamil sample was intended.
tamil_text = pytesseract.image_to_string(hindi_img, lang="tam")
print("Extracted Tamil Text:", tamil_text)


In [None]:
# Add a rule for Aadhaar numbers written with Devanagari digits (० to ९),
# in the same 4-4-4 hyphenated layout as the existing AADHAAR rule.
regex_patterns["AADHAAR_HINDI"] = (
    r"[०१२३४५६७८९]{4}-[०१२३४५६७८९]{4}-[०१२३४५६७८९]{4}"
)

In [None]:
# Expected PII strings per document, keyed by file name.
# The evaluation below compares these against detected strings by exact match.
ground_truth = {
    "test_doc.pdf": ["Rahul Sharma", "+91-9876543210", "123 MG Road, Bangalore", "1234-5678-9123"]
}

In [None]:
def evaluate_redaction(file, ground_truth_entities):
    """Score PII detection on a PDF against a list of expected strings.

    Runs ``detect_pii_text`` over the text of every page and compares the
    detected strings with ``ground_truth_entities`` by exact string equality.

    Args:
        file: Path of the PDF to evaluate.
        ground_truth_entities: Iterable of the PII strings expected in the file.

    Returns:
        Dict with ``precision``, ``recall`` and the raw ``tp``, ``fp``, ``fn``
        counts. Precision and recall are 0 when their denominator is 0.
    """
    detected = set()
    # The context manager closes the document once its text has been read
    with fitz.open(file) as doc:
        for page in doc:
            detected.update(detect_pii_text(page.get_text()))

    expected = set(ground_truth_entities)
    tp = len(expected & detected)
    fp = len(detected - expected)
    fn = len(expected - detected)

    precision = tp / (tp+fp) if (tp+fp)>0 else 0
    recall = tp / (tp+fn) if (tp+fn)>0 else 0
    return {"precision": precision, "recall": recall, "tp": tp, "fp": fp, "fn": fn}

In [None]:
metrics = evaluate_redaction("test_doc.pdf", ground_truth["test_doc.pdf"])
print(metrics)

In [None]:
def compare_pdfs(original, redacted, page_num=0):
    """Show one page of the original and the redacted PDF side by side.

    Both files are rendered at 150 dpi; ``page_num`` is the zero-based index
    of the page displayed from each.
    """
    import matplotlib.pyplot as plt

    # Render both documents up front, keyed by the panel title they appear under
    rendered = {
        "Original": convert_from_path(original, dpi=150),
        "Redacted": convert_from_path(redacted, dpi=150),
    }

    fig, axs = plt.subplots(1, 2, figsize=(12,6))
    for panel, (title, page_images) in zip(axs, rendered.items()):
        panel.imshow(page_images[page_num])
        panel.set_title(title)
        panel.axis("off")
    plt.show()

compare_pdfs("test_doc.pdf", "final_redacted_test.pdf")

In [None]:
!pip install streamlit

In [None]:
import streamlit as st
import fitz
import os
from tempfile import NamedTemporaryFile

# NOTE(review): this script calls redact_pdf_with_log, redact_visual and
# convert_from_path, which are defined/imported in other notebook cells. No
# visible cell writes this code to the app.py that `streamlit run` launches;
# confirm app.py exists and defines or imports these names.
st.set_page_config(page_title="DeID-Guard", layout="centered")

st.title("🛡️ DeID-Guard – Privacy by Design")
st.write("Upload a document and choose how to anonymize sensitive data.")

uploaded_file = st.file_uploader("Upload a PDF/Image", type=["pdf","jpg","png"])
style = st.radio("Choose Redaction Style", ["black", "blur", "pseudonym"])

if uploaded_file:
    st.success(f"File `{uploaded_file.name}` uploaded successfully!")

    if st.button("Run De-identification"):
        # Use only the base name so a client-supplied name cannot pick a directory
        input_path = os.path.basename(uploaded_file.name)
        with open(input_path,"wb") as f:
            f.write(uploaded_file.getbuffer())

        if input_path.lower().endswith(".pdf"):
            # Paths are chosen here and passed in explicitly, so nothing
            # depends on the function's return value.
            output_pdf = "redacted_output.pdf"
            log_file = "audit_log.json"
            redact_pdf_with_log(input_path, output_pdf, style=style, log_file=log_file)

            st.success("✅ De-identification completed!")

            # Preview first page (render only that page)
            pages = convert_from_path(output_pdf, dpi=150, first_page=1, last_page=1)
            st.image(pages[0], caption="Redacted PDF Preview (Page 1)", use_container_width=True)

            with open(output_pdf,"rb") as f:
                st.download_button("⬇️ Download Redacted PDF", f, file_name="redacted.pdf")

            with open(log_file,"rb") as f:
                st.download_button("⬇️ Download Audit Log (JSON)", f, file_name="audit_log.json")
        else:
            # jpg/png uploads cannot go through the PDF pipeline; redact the
            # image directly instead.
            output_img = "redacted_output.png"
            redact_visual(input_path, output_img, style=style)

            st.success("✅ De-identification completed!")
            st.image(output_img, caption="Redacted Image Preview", use_container_width=True)

            with open(output_img,"rb") as f:
                st.download_button("⬇️ Download Redacted Image", f, file_name="redacted.png")


In [None]:
!pip install pyngrok
!pip install streamlit


In [None]:
!streamlit run app.py --server.port 8501
!nohup streamlit run app.py --server.port 8501 &

In [None]:
from pyngrok import ngrok
import getpass
import time
import os
import psutil

# Kill any running ngrok processes
for proc in psutil.process_iter(['pid', 'name']):
    if proc.info['name'] == 'ngrok':
        print(f"Killing ngrok process with PID: {proc.info['pid']}")
        proc.kill()

# Run Streamlit in background
os.system("nohup streamlit run app.py --server.port 8501 &")

time.sleep(5)  # wait for streamlit

# Auth for ngrok
# Never store the token in the notebook: read it from the NGROK_AUTH_TOKEN
# environment variable, or prompt for it without echoing.
# The token that used to be hardcoded here was published with the notebook
# and should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass.getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Open tunnel
public_url = ngrok.connect(8501)
print("🌐 Your DeID-Guard app is live at:", public_url)