In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC



from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("ai4privacy/pii-masking-300k", split="train")
df = pd.DataFrame(dataset)
print("Columns in dataset:", df.columns)


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/ai4privacy/pii-masking-300k/resolve/main/README.md (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 8be635d3-e58c-42c5-a350-25498395d569)')' thrown while requesting HEAD https://huggingface.co/datasets/ai4privacy/pii-masking-300k/resolve/main/README.md
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/ai4privacy/pii-masking-300k/resolve/main/README.md (Caused by NameResolutionError("HTTPSConnection(host=\'huggingface.co\', port=443): Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: ceb61908-c2a1-4ef8-9063-c27a10abd099)')' thrown while requesting HEAD https://huggingface.co/datasets/ai4privacy/pii-masking-300k/resolve

Columns in dataset: Index(['source_text', 'target_text', 'privacy_mask', 'span_labels',
       'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
      dtype='object')


In [3]:
df['has_pii'] = (df['source_text'] != df['target_text']).astype(int)
print("Class distribution before adding non-PII samples:")
print(df['has_pii'].value_counts())


Class distribution before adding non-PII samples:
has_pii
1    159027
0     18650
Name: count, dtype: int64


In [4]:
df_non_pii = df.sample(n=5000, random_state=42).copy()
df_non_pii['has_pii'] = 0
df_non_pii['target_text'] = df_non_pii['source_text']

df_combined = pd.concat([df, df_non_pii], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

print("Class distribution after adding non-PII samples:")
print(df_combined['has_pii'].value_counts())


Class distribution after adding non-PII samples:
has_pii
1    159027
0     23650
Name: count, dtype: int64


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df_combined['source_text'],
    df_combined['has_pii'],
    test_size=0.2,
    random_state=42,
    stratify=df_combined['has_pii']
)


In [6]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [7]:
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
   "Linear SVC": LinearSVC(max_iter=10000),  # fast linear kernel
    "Naive Bayes": MultinomialNB()
}


In [8]:
results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred)
    })


Training KNN...
Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training Linear SVC...
Training Naive Bayes...


In [9]:
results_df = pd.DataFrame(results)
print("\nModel performance metrics:")
print(results_df)



Model performance metrics:
                 Model  Accuracy  Precision    Recall        F1
0                  KNN  0.721562   0.895759  0.769729  0.827976
1  Logistic Regression  0.897033   0.904886  0.985286  0.943376
2        Decision Tree  0.824091   0.904730  0.891844  0.898241
3        Random Forest  0.867528   0.884723  0.974848  0.927601
4           Linear SVC  0.890190   0.900444  0.982488  0.939678
5          Naive Bayes  0.866132   0.940320  0.903572  0.921580


In [10]:
import joblib

# Assuming these exist in your notebook:
# vectorizer -> your trained TfidfVectorizer
# models -> dictionary of all trained models

# Create a dictionary to save everything
pipeline = {
    "vectorizer": vectorizer,
    "models": models
}

# Save to a single .pkl file
joblib.dump(pipeline, "../models/pii_pipeline.pkl")
print("All models and vectorizer saved as pii_pipeline.pkl ✅")


All models and vectorizer saved as pii_pipeline.pkl ✅
