In [None]:

# 📩 Email Classification with PII Masking (Akaike Assignment Ready)

# STEP 1: Install required libraries
!pip install pandas scikit-learn joblib

# STEP 2: Upload your dataset
from google.colab import files
uploaded = files.upload()

# STEP 3: Rename uploaded file
import os
for filename in uploaded.keys():
    os.rename(filename, "emails.csv")

# STEP 4: Imports
import pandas as pd
import re
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# STEP 5: Define PII Masking Rules
# Maps an entity label to the regular expression that detects it. The label is
# also what replaces the match in the masked text, wrapped in square brackets.
PII_PATTERNS = {
    # Two consecutive capitalised words.
    "full_name": r"\b[A-Z][a-z]+ [A-Z][a-z]+\b",
    "email": r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
    # Ten digits starting with 6-9, optionally prefixed with +91.
    "phone_number": r"\b(?:\+91[-\s]?)?[6-9]\d{9}\b",
    # Two digits, two digits, four digits, separated by "/" or "-".
    "dob": r"\b\d{2}[/-]\d{2}[/-]\d{4}\b",
    # Twelve digits in groups of four, optionally separated by "-" or space.
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    # 13 to 16 digits, optionally separated by spaces or hyphens.
    "credit_debit_no": r"\b(?:\d[ -]*?){13,16}\b",
    # Any standalone three-digit number.
    "cvv_no": r"\b\d{3}\b",
    # Month 01-12, a slash, then a two- to four-digit year.
    "expiry_no": r"\b(0[1-9]|1[0-2])\/\d{2,4}\b",
}


def mask_pii_with_metadata(text):
    """Replace every PII match in *text* with a ``[label]`` placeholder.

    All patterns in ``PII_PATTERNS`` are searched against the original,
    unmodified text. When matches overlap, the one that starts first wins;
    for equal starts the longest wins; remaining ties go to the pattern
    listed first in ``PII_PATTERNS``. The masked string is then assembled in
    a single left-to-right pass, so one replacement can never shift or
    corrupt another.

    Args:
        text: The raw string to scan.

    Returns:
        A ``(masked_text, entities)`` tuple. ``entities`` is a list of dicts
        ordered by position, each with:

        * ``"position"``: ``[start, end]`` offsets into the original *text*,
          such that ``text[start:end]`` equals ``"entity"``.
        * ``"classification"``: the ``PII_PATTERNS`` label.
        * ``"entity"``: the original matched substring.
    """
    # Gather candidates from every pattern before touching the text. The tuple
    # layout makes a plain sort yield leftmost-longest-first ordering.
    candidates = []
    for priority, (entity, pattern) in enumerate(PII_PATTERNS.items()):
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end:
                # An empty match would insert a placeholder for nothing.
                continue
            candidates.append((start, start - end, priority, end, entity))
    candidates.sort()

    entities = []
    pieces = []
    cursor = 0  # End of the last accepted match in the original text.
    for start, _neg_length, _priority, end, entity in candidates:
        if start < cursor:
            # Overlaps a span that has already been masked.
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{entity}]")
        entities.append({
            "position": [start, end],
            "classification": entity,
            "entity": text[start:end],
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entities

# STEP 6: Load and process the dataset
df = pd.read_csv("emails.csv")
# Validate with an explicit raise rather than `assert`, which is skipped
# entirely when Python runs with -O and would let a bad file through.
if "email" not in df.columns or "type" not in df.columns:
    raise ValueError("Dataset must have 'email' and 'type' columns.")

# The classifier is trained on masked text only. str() guards against
# non-string cells, since the masking function runs regexes on its argument.
df['masked_email'] = df['email'].apply(lambda x: mask_pii_with_metadata(str(x))[0])

# STEP 7: Train-test split
# 80/20 split; the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df['masked_email'], df['type'], test_size=0.2, random_state=42)

# STEP 8: Build and train the model
# TF-IDF features (English stop words removed) feeding a logistic regression.
model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(max_iter=300))
])
model.fit(X_train, y_train)

# STEP 9: Evaluate model
y_pred = model.predict(X_test)
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

# Save model
# The whole pipeline (vectorizer + classifier) is persisted, so it can later
# be loaded and applied directly to masked text.
joblib.dump(model, "email_classifier.pkl")

# STEP 10: Final classifier function (API-ready structure)
def classify_email_with_format(raw_email):
    """Mask PII in *raw_email*, classify it, and print the result as JSON.

    The masked text is classified with the pipeline stored in
    ``email_classifier.pkl``, which is loaded from disk on every call.

    Args:
        raw_email: The unmasked email body.

    Returns:
        None. The response is printed as indented JSON with the keys
        ``input_email_body``, ``list_of_masked_entities``, ``masked_email``
        and ``category_of_the_email``.
    """
    import json

    masked_email, entities = mask_pii_with_metadata(raw_email)
    clf = joblib.load("email_classifier.pkl")
    predicted = clf.predict([masked_email])[0]
    # predict() may yield a NumPy scalar (e.g. int64 for numeric labels),
    # which json.dumps cannot serialise; convert it to the native Python
    # value. Plain Python labels have no .item() and pass through unchanged.
    if hasattr(predicted, "item"):
        predicted = predicted.item()

    response = {
        "input_email_body": raw_email,
        "list_of_masked_entities": entities,
        "masked_email": masked_email,
        "category_of_the_email": predicted
    }

    print(json.dumps(response, indent=2))

# STEP 11: Test it on a sample email
# Demo message carrying a name, a card number, an email address and a phone
# number; the result is printed by the classifier function.
sample_email = (
    "Hi, I’m Sneha Kapoor. I was charged twice on my credit card 4111 1111 1111 1234.\n"
    "Please fix this billing issue urgently. My email is sneha.kapoor@example.com and my phone number is 9876543210.\n"
)
classify_email_with_format(sample_email)
