In [1]:
# Install these only in a notebook/script environment
!pip install spacy scikit-learn joblib pandas numpy tensorflow gardio
!python -m spacy download en_core_web_sm

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.7-py3-none-any.whl (23 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, s

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import re
import spacy
from typing import Dict, List, Any

class PIIMasker:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.placeholder_map = {
            "full_name": "[full_name]",
            "email": "[email]",
            "phone_number": "[phone_number]",
            "dob": "[dob]",
            "aadhar_num": "[aadhar_num]",
            "credit_debit_no": "[credit_debit_no]",
            "cvv_no": "[cvv_no]",
            "expiry_no": "[expiry_no]"
        }
        self.patterns = {
            "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
            "phone_number": r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b|\b\d{3}[-.\s]?\d{4}\b',
            "dob": r'\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)?\d{2}\b',
            "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
            "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',
            "cvv_no": r'\b\d{3,4}\b',
            "expiry_no": r'\b(?:0[1-9]|1[0-2])[/-](?:\d{4}|\d{2})\b'
        }

    def mask_text(self, text: str) -> Dict[str, Any]:
        # First pass: Protect existing placeholders
        protected = {}
        for pii_type, placeholder in self.placeholder_map.items():
            pattern = re.compile(re.escape(placeholder))
            for i, match in enumerate(pattern.finditer(text)):
                protected[f"PROTECTED_{pii_type}_{i}"] = match.group()
                text = text.replace(match.group(), f"PROTECTED_{pii_type}_{i}")

        # Second pass: Mask new PII
        entities = []
        masked_text = text
        masked_positions = set()

        # 1. Mask names using spaCy
        doc = self.nlp(masked_text)
        for ent in doc.ents:
            if ent.label_ == "PERSON" and len(ent.text.split()) >= 2:
                start, end = ent.start_char, ent.end_char
                if not any(s <= start < e or s < end <= e for (s, e) in masked_positions):
                    original = masked_text[start:end]
                    masked_text = masked_text[:start] + "[full_name]" + masked_text[end:]
                    entities.append({
                        "position": [start, end],
                        "classification": "full_name",
                        "entity": original
                    })
                    masked_positions.add((start, end))

        # 2. Mask other PII with regex (process from end to start)
        spans = []
        for pii_type, pattern in self.patterns.items():
            for match in re.finditer(pattern, masked_text):
                start, end = match.span()
                if not any(s <= start < e or s < end <= e for (s, e) in masked_positions):
                    spans.append((start, end, pii_type, match.group()))

        # Sort spans in reverse order to avoid offset issues
        for start, end, pii_type, original in sorted(spans, key=lambda x: x[0], reverse=True):
            masked_text = masked_text[:start] + self.placeholder_map[pii_type] + masked_text[end:]
            entities.append({
                "position": [start, end],
                "classification": pii_type,
                "entity": original
            })

        # Third pass: Restore protected placeholders
        for protected_key, original in protected.items():
            masked_text = masked_text.replace(protected_key, original)

        return {"masked_email": masked_text, "entities": sorted(entities, key=lambda x: x["position"][0])}

# Load and preprocess data
def load_and_preprocess_data(filepath: str):
    df = pd.read_csv(filepath)
    emails = df['email'].tolist()
    categories = df['type'].tolist()
    return emails, categories

# Main training function
def train_classifier():
    # Load data
    emails, categories = load_and_preprocess_data('/content/drive/MyDrive/combined_emails_with_natural_pii.csv')

    # Initialize and apply PII masking
    masker = PIIMasker()
    masked_emails = [masker.mask_text(email)['masked_email'] for email in emails]

    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(categories)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        masked_emails, y, test_size=0.2, random_state=42, stratify=y
    )

    # Vectorize text
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words='english'
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    classifier = LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        C=0.1,
        solver='liblinear'
    )
    classifier.fit(X_train_vec, y_train)

    # Evaluate
    y_pred = classifier.predict(X_test_vec)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    # Save artifacts
    joblib.dump(classifier, 'email_classifier.joblib')
    joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
    joblib.dump(label_encoder, 'label_encoder.joblib')

    return classifier, vectorizer, label_encoder

# Prediction function
def predict_email_category(email: str, classifier, vectorizer, label_encoder):
    masker = PIIMasker()
    masked_email = masker.mask_text(email)['masked_email']
    email_vec = vectorizer.transform([masked_email])
    pred = classifier.predict(email_vec)
    return label_encoder.inverse_transform(pred)[0]

# Example usage
if __name__ == "__main__":
    # Train the model
    print("Training classifier...")
    classifier, vectorizer, label_encoder = train_classifier()

    # Test predictions
    test_emails = [
        "Hello John Doe, your invoice for $100 is due on 05/30/2023. Contact us at billing@company.com",
        "Password reset requested for account jane.smith@example.com",
        "Your appointment with Dr. Johnson is confirmed for 06/15 at 3 PM"
    ]

    print("\nTest Predictions:")
    for email in test_emails:
        category = predict_email_category(email, classifier, vectorizer, label_encoder)
        print(f"\nEmail: {email}\nCategory: {category}")

Training classifier...
Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

      Change       0.82      0.75      0.78       504
    Incident       0.67      0.88      0.76      1917
     Problem       0.57      0.19      0.29      1007
     Request       0.86      0.91      0.89      1372

    accuracy                           0.73      4800
   macro avg       0.73      0.69      0.68      4800
weighted avg       0.72      0.73      0.70      4800


Test Predictions:

Email: Hello John Doe, your invoice for $100 is due on 05/30/2023. Contact us at billing@company.com
Category: Problem

Email: Password reset requested for account jane.smith@example.com
Category: Incident

Email: Your appointment with Dr. Johnson is confirmed for 06/15 at 3 PM
Category: Incident


In [14]:
import gradio as gr
import spacy
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained model, vectorizer, and label encoder
classifier = joblib.load('email_classifier.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')
label_encoder = joblib.load('label_encoder.joblib')

class PIIMasker:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.placeholder_map = {
            "full_name": "[full_name]",
            "email": "[email]",
            "phone_number": "[phone_number]",
            "dob": "[dob]",
            "aadhar_num": "[aadhar_num]",
            "credit_debit_no": "[credit_debit_no]",
            "cvv_no": "[cvv_no]",
            "expiry_no": "[expiry_no]"
        }
        self.patterns = {
            "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            "phone_number": r'(\+?\d{1,3}[-.\s]?)?\(?\d{2,3}\)?[-.\s]?\d{2,4}[-.\s]?\d{4}\b',
            "dob": r'\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)?\d{2}\b',
            "aadhar_num": r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b',
            "credit_debit_no": r'\b(?:\d[ -]*?){13,16}\b',
            "cvv_no": r'\b\d{3,4}\b',
            "expiry_no": r'\b(?:0[1-9]|1[0-2])[/-](?:\d{4}|\d{2})\b'
        }

    def mask_text(self, text: str):
        # First pass: Protect existing placeholders
        protected = {}
        for pii_type, placeholder in self.placeholder_map.items():
            pattern = re.compile(re.escape(placeholder))
            for i, match in enumerate(pattern.finditer(text)):
                protected[f"PROTECTED_{pii_type}_{i}"] = match.group()
                text = text.replace(match.group(), f"PROTECTED_{pii_type}_{i}")

        # Second pass: Mask new PII
        # 1. Mask using regex patterns
        for pii_type, pattern in self.patterns.items():
            text = re.sub(
                pattern,
                self.placeholder_map[pii_type],
                text
            )

        # 2. Mask names using spaCy
        doc = self.nlp(text)
        spans = []
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                # Check if this is a multi-word name
                if len(ent.text.split()) >= 2:
                    spans.append((ent.start_char, ent.end_char))

        # Replace from end to start to avoid offset issues
        for start, end in sorted(spans, reverse=True):
            text = text[:start] + "[full_name]" + text[end:]

        # Third pass: Restore protected placeholders
        for protected_key, original in protected.items():
            text = text.replace(protected_key, original)

        return text


def classify_and_mask_email(email_body):
    masker = PIIMasker()

    # Mask PII
    masked_email = masker.mask_text(email_body)

    # Classify
    email_vector = vectorizer.transform([masked_email])
    predicted_class = classifier.predict(email_vector)
    predicted_category = label_encoder.inverse_transform(predicted_class)

    return masked_email, predicted_category[0]

# Create a Gradio interface
iface = gr.Interface(
    fn=classify_and_mask_email,
    inputs="text",
    outputs=[gr.Textbox(label="Masked Email"), gr.Textbox(label="Category")], # Label the outputs here
    live=True,
    title="Email PII Masking & Classification",
    description="This app masks PII in an email and classifies the email category."
)

# Launch the interface
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://69cb9d6d2593673c13.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


