In [1]:
# Mount Google Drive at /content/drive; every project path below lives under it.
import os

from google.colab import drive
drive.mount('/content/drive')

# Project layout on Drive. MODEL_DIR and DATA_DIR are read by later cells.
PROJECT_DIR = "/content/drive/MyDrive/DataShield_AI"
MODEL_DIR = f"{PROJECT_DIR}/models/ner-distilbert"
DATA_DIR  = f"{PROJECT_DIR}/data"

# Create both folders in pure Python (equivalent to `mkdir -p`) so the cell does
# not depend on IPython's `$VAR` interpolation inside a shell escape.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)


Mounted at /content/drive


In [2]:
%%capture
!pip install -U transformers datasets accelerate evaluate seqeval faker

In [3]:
# BIO label inventory: "O" (outside) first, then a B-/I- pair per entity type,
# in the order the entity types are listed here.
LABELS = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("EMAIL", "PHONE", "SSN", "CREDITCARD", "APIKEY", "PERSON", "ADDRESS")
    for prefix in ("B", "I")
]
# Bidirectional lookups between label strings and their integer ids
# (the id is the position of the label in LABELS).
label2id = {label: idx for idx, label in enumerate(LABELS)}
id2label = dict(enumerate(LABELS))

In [4]:
!nvidia-smi
import torch; print("CUDA?", torch.cuda.is_available())

Wed Nov  5 04:20:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import random, re, json, os
from faker import Faker
fake = Faker()

def make_sample():
    """Generate one synthetic sentence with word-level BIO tags.

    The sentence starts with a fixed preamble tagged "O", followed by one to
    three entity mentions (types drawn with replacement). Each mention is
    optionally preceded by a cue word and always followed by one filler token;
    cue and filler are tagged "O".

    Returns:
        (tokens, tags): two parallel lists of equal length, where tags[i] is
        the BIO label string for tokens[i].
    """
    # Fixed preamble; none of these tokens belongs to an entity.
    parts = ["Please","review","this","message","for","policy","compliance",":"]
    tags  = ["O"] * len(parts)

    # choose 1–3 entity types
    candidates = ["EMAIL","PHONE","SSN","CREDITCARD","APIKEY","PERSON","ADDRESS"]
    for _ in range(random.randint(1,3)):
        t = random.choice(candidates)
        if t=="EMAIL":
            # local part split on dots, then "@", then the whole domain as one token
            local, _at, domain = fake.email().partition("@")
            toks = local.split(".") + ["@", domain]
        elif t=="PHONE":
            v = fake.phone_number()
            # digit runs and non-digit runs become separate tokens
            toks = re.findall(r"\d+|[^\s\d]+", v)
        elif t=="SSN":
            v = fake.ssn()
            toks = v.split("-")
        elif t=="CREDITCARD":
            v = fake.credit_card_number()
            toks = [v[:4],v[4:8],v[8:12],v[12:]]
        elif t=="APIKEY":
            v = fake.sha1()
            toks = [v[:8],v[8:16],v[16:24],v[24:]]
        elif t=="PERSON":
            v = fake.name()
            toks = v.split()
        else: # ADDRESS
            v = fake.street_address()
            toks = v.split()

        # Fixed-width slicing (and dot-splitting) can yield empty strings, e.g.
        # v[12:] is "" when the card number has 12 or fewer characters. An empty
        # word would still receive an I- tag, so drop such pieces before tagging.
        toks = [tok for tok in toks if tok]
        if not toks:
            continue

        labs = [f"B-{t}"] + [f"I-{t}"]*(len(toks)-1)
        # prepend a cue (the empty string means "no cue")
        cue = random.choice(["Contact","Ref","Key","User","Info",""])
        if cue:
            parts.append(cue); tags.append("O")
        parts += toks; tags += labs
        # add punctuation / filler
        tail = random.choice([".",";","—","and"])
        parts.append(tail); tags.append("O")

    return parts, tags

def build_dataset(n=5000):
    """Draw `n` samples from make_sample() and return them column-wise.

    Returns:
        (sents, labels): `sents[i]` is the token list of the i-th sample and
        `labels[i]` the matching tag list.
    """
    samples = [make_sample() for _ in range(n)]
    sents = [tokens for tokens, _ in samples]
    labels = [tag_seq for _, tag_seq in samples]
    return sents, labels

# Each split is generated by its own independent call to the sampler.
train_tokens, train_tags = build_dataset(4500)
val_tokens,   val_tags   = build_dataset(400)
test_tokens,  test_tags  = build_dataset(400)

os.makedirs(DATA_DIR, exist_ok=True)
# Write each split inside a `with` block so the handle is flushed and closed
# deterministically before a later cell reads the file back.
with open(f"{DATA_DIR}/train.json","w") as fh:
    json.dump({"tokens":train_tokens,"tags":train_tags}, fh)
with open(f"{DATA_DIR}/val.json","w") as fh:
    json.dump({"tokens":val_tokens,  "tags":val_tags}, fh)
with open(f"{DATA_DIR}/test.json","w") as fh:
    json.dump({"tokens":test_tokens, "tags":test_tags}, fh)

In [6]:
from datasets import DatasetDict, Dataset
import json

def load_split(path):
    """Load one JSON split file into a `datasets.Dataset`.

    Args:
        path: JSON file holding a dict of column name -> list of rows
            (the split files written earlier use the keys "tokens" and "tags").

    Returns:
        The Dataset produced by `Dataset.from_dict` on the decoded dict.
    """
    # Context manager closes the handle even if decoding fails.
    with open(path) as fh:
        columns = json.load(fh)
    return Dataset.from_dict(columns)

# Assemble the three JSON splits into one DatasetDict keyed by split name.
# Note the validation split is stored on disk as val.json.
raw = DatasetDict({
    split: load_split(f"{DATA_DIR}/{stem}.json")
    for split, stem in (("train", "train"), ("validation", "val"), ("test", "test"))
})

from transformers import AutoTokenizer
# Checkpoint name; the tokenizer is loaded from it here and the model weights
# are loaded from the same name in the training cell.
MODEL_NAME = "distilbert-base-cased"  # fast + accurate enough
# Tokenizer used below on pre-split word lists (is_split_into_words=True).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(ex):
    """Tokenize a batch of pre-split sentences and project word tags onto subwords.

    Args:
        ex: batch with "tokens" (one word list per sentence) and "tags"
            (one tag-string list per sentence, parallel to the words).

    Returns:
        The tokenizer output with an extra "labels" entry: one id list per
        sentence in which the first subword of each word holds label2id[tag]
        and every other position (no word id, or a later subword of the same
        word) holds -100.
    """
    encoded = tokenizer(ex["tokens"], is_split_into_words=True, truncation=True)

    def align(word_ids, tag_seq):
        # Walk the subword -> word mapping of one sentence.
        out, last_word = [], None
        for word_idx in word_ids:
            if word_idx is None:
                # position not attached to any input word
                out.append(-100)
                continue
            tag = tag_seq[word_idx]
            # only label the first subword
            out.append(label2id[tag] if word_idx != last_word else -100)
            last_word = word_idx
        return out

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), ex["tags"][row])
        for row in range(len(ex["tokens"]))
    ]
    return encoded

# Apply the alignment to every split, one batch of examples at a time.
tokenized = raw.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [7]:
!pip install -U transformers



In [8]:
# 0) Speed/compat deps
!pip install -U transformers accelerate evaluate seqeval

# 1) Collect garbage and release cached GPU memory left over from earlier cells
import gc, torch
gc.collect(); torch.cuda.empty_cache()

# 2) Quick sanity check
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    # These two queries raise when no CUDA device is present, so guard them.
    print("Current device:", torch.cuda.current_device())
    print("Name:", torch.cuda.get_device_name(0))
else:
    print("No CUDA device visible - switch the runtime to GPU before training with fp16.")

CUDA available: True
Device count: 1
Current device: 0
Name: Tesla T4


In [9]:
from transformers import AutoTokenizer
# Same checkpoint name and tokenizer load as the earlier data-prep cell,
# repeated here so this cell can be run on its own.
MODEL_NAME = "distilbert-base-cased"  # keep as-is
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
import json, os
from datasets import Dataset, DatasetDict

DATA_DIR = "/content/drive/MyDrive/DataShield_AI/data"  # adjust if different

def load_split(path):
    """Read a JSON split file and wrap its columns in a `datasets.Dataset`.

    Args:
        path: JSON file containing a dict of column name -> list of rows.

    Returns:
        Dataset built with `Dataset.from_dict` from the decoded dict.
    """
    # `with` guarantees the file handle is closed once the JSON is decoded.
    with open(path) as fh:
        columns = json.load(fh)
    return Dataset.from_dict(columns)

# Rebuild the DatasetDict from the JSON files under DATA_DIR.
# Split name -> file stem; the validation split lives in val.json.
raw = DatasetDict({
    split: load_split(f"{DATA_DIR}/{stem}.json")
    for split, stem in (("train", "train"), ("validation", "val"), ("test", "test"))
})

In [11]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build subword-level label ids.

    Args:
        examples: batch with "tokens" (word lists) and "tags" (tag-string
            lists parallel to the words).

    Returns:
        The tokenizer output plus a "labels" entry. For each sentence the
        first subword of a word gets label2id[tag]; positions without a word
        id and later subwords of the same word get -100.
    """
    encoded = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)
    all_labels = []
    for row in range(len(examples["tokens"])):
        word_ids = encoded.word_ids(batch_index=row)
        tag_seq = examples["tags"][row]
        row_labels = []
        last_word = None
        for word_idx in word_ids:
            if word_idx is None:
                row_labels.append(-100)
                continue
            tag = tag_seq[word_idx]
            is_first_subword = word_idx != last_word
            # first subword gets the label, subsequent subwords are ignored
            row_labels.append(label2id[tag] if is_first_subword else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoded["labels"] = all_labels
    return encoded

# Run the alignment over every split in batches.
tokenized = raw.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [12]:
# Inspect the objects the training cell relies on: the tokenizer class, the raw
# splits, and the feature schema of the tokenized training split.
print(type(tokenizer))
print("Splits:", raw)
print("Tokenized features:", tokenized["train"].features)

<class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>
Splits: DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 4500
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 400
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 400
    })
})
Tokenized features: {'tokens': List(Value('string')), 'tags': List(Value('string')), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}


In [13]:
import evaluate
seq_f1 = evaluate.load("seqeval")
def compute_metrics(p):
    """Compute seqeval precision/recall/F1/accuracy for one evaluation pass.

    Args:
        p: pair (scores, label_ids). `scores` is reduced with argmax over its
            last axis to get predicted ids; positions whose label id is -100
            are excluded from scoring.

    Returns:
        Dict with keys "precision", "recall", "f1" and "accuracy", taken from
        the overall_* entries of the seqeval result.
    """
    scores, gold_ids = p
    pred_ids = scores.argmax(-1)

    true_labels, true_preds = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # keep only positions that carry a real label
        scored = [(pi, gi) for pi, gi in zip(pred_row, gold_row) if gi != -100]
        true_preds.append([id2label[pi] for pi, _ in scored])
        true_labels.append([id2label[gi] for _, gi in scored])

    results = seq_f1.compute(predictions=true_preds, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

from transformers import (
    AutoModelForTokenClassification, TrainingArguments, Trainer,
    DataCollatorForTokenClassification
)

# Rebuild the model AFTER enabling GPU
# The token-classification head is sized to len(LABELS), and the id<->label maps
# are passed to from_pretrained alongside it.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id
)

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best "f1" (a key returned by compute_metrics).
args = TrainingArguments(
    output_dir=MODEL_DIR,
    learning_rate=5e-5,
    per_device_train_batch_size=8,       # try 16 if VRAM allows
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",               # use this if your version expects eval_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,                           # << use half-precision on the T4 GPU
    dataloader_pin_memory=False,         # quiets the earlier pin_memory warning
    report_to="none",
    logging_steps=50,
    save_total_limit=2                   # keep checkpoints light
)

# Collator built from the tokenizer; handed to the Trainer as data_collator.
collator = DataCollatorForTokenClassification(tokenizer)

# Wire model, data, collator and metrics into the Trainer.
# `processing_class` is the current name for the argument formerly called
# `tokenizer`; the old keyword is deprecated, and this notebook installs the
# latest transformers release, so use the supported name.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

# Train & save
trainer.train()
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Store the label inventory next to the weights.
import json, os
os.makedirs(MODEL_DIR, exist_ok=True)
# `with` closes (and therefore flushes) the file as soon as the dump finishes.
with open(f"{MODEL_DIR}/labels.json","w") as fh:
    json.dump(LABELS, fh)

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0025,0.001828,0.993758,0.996245,0.995,0.99961
2,0.0009,0.000129,1.0,1.0,1.0,1.0
3,0.0002,0.000746,0.998748,0.998748,0.998748,0.99987


In [14]:
from transformers import pipeline
# Load the saved model and tokenizer from MODEL_DIR as a token-classification
# pipeline; with aggregation_strategy="simple" the results carry an
# "entity_group" plus character offsets ("start"/"end").
ner = pipeline("token-classification", model=MODEL_DIR, tokenizer=MODEL_DIR, aggregation_strategy="simple")

# Smoke test on free text containing an e-mail address, a phone number and an SSN.
# NOTE(review): the recorded output for this sample splits the e-mail into
# several EMAIL fragments, tags "me" as EMAIL and labels the SSN as PHONE,
# despite the near-perfect validation scores on the synthetic data.
sample = "Email me at alice.lee@example.com or call 415-555-1234. My SSN is 123-45-6789."
ner(sample)

Device set to use cuda:0


[{'entity_group': 'EMAIL',
  'score': np.float32(0.4712591),
  'word': 'me',
  'start': 6,
  'end': 8},
 {'entity_group': 'EMAIL',
  'score': np.float32(0.99574643),
  'word': 'al',
  'start': 12,
  'end': 14},
 {'entity_group': 'EMAIL',
  'score': np.float32(0.8027112),
  'word': '##ice.',
  'start': 14,
  'end': 18},
 {'entity_group': 'EMAIL',
  'score': np.float32(0.99916565),
  'word': 'le',
  'start': 18,
  'end': 20},
 {'entity_group': 'EMAIL',
  'score': np.float32(0.9992527),
  'word': '##e @ example. com',
  'start': 20,
  'end': 33},
 {'entity_group': 'PHONE',
  'score': np.float32(0.9997759),
  'word': '415 - 555 - 1234',
  'start': 42,
  'end': 54},
 {'entity_group': 'PERSON',
  'score': np.float32(0.83417004),
  'word': 'My SSN',
  'start': 56,
  'end': 62},
 {'entity_group': 'PHONE',
  'score': np.float32(0.99936163),
  'word': '123 - 45 - 6789',
  'start': 66,
  'end': 77}]

In [15]:
# Score the trained model on the test split; the bare last expression makes the
# notebook display the returned metrics dict.
eval_res = trainer.evaluate(tokenized["test"])
eval_res

{'eval_loss': 0.00036737590562552214,
 'eval_precision': 0.998766954377312,
 'eval_recall': 0.998766954377312,
 'eval_f1': 0.998766954377312,
 'eval_accuracy': 0.9998717784331325,
 'eval_runtime': 0.7374,
 'eval_samples_per_second': 542.417,
 'eval_steps_per_second': 67.802,
 'epoch': 3.0}

In [16]:
import re

# High-precision regexes (tune as needed)
# E-mail: local part, "@", dotted domain ending in a TLD of at least two letters.
RE_EMAIL = re.compile(r'\b[a-zA-Z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Phone: optional country code, optional (possibly parenthesised) 3-digit area
# code, then 3 + 4 digits; separators may be a space or a hyphen.
RE_PHONE = re.compile(r'\b(?:\+?\d{1,3}[\s-]?)?(?:\(?\d{3}\)?[\s-]?)?\d{3}[\s-]?\d{4}\b')
# SSN: strictly ddd-dd-dddd with hyphens.
RE_SSN   = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')
# Card number: 13 to 19 digits, each optionally followed by spaces/hyphens.
RE_CC    = re.compile(r'\b(?:\d[ -]*?){13,19}\b')  # catch groups; refine with a Luhn check if desired

# Severity reported for each entity label; lookups elsewhere fall back to "LOW"
# for labels missing from this table.
SEVERITY = {"SSN":"HIGH","CREDITCARD":"HIGH","APIKEY":"HIGH",
            "EMAIL":"MEDIUM","PHONE":"MEDIUM","ADDRESS":"MEDIUM","PERSON":"LOW"}

def normalize_email(s):
    """Remove whitespace around every "." and "@" in `s`.

    Repairs spaced-out fragments such as "example . com" or "user @ host".
    Dots are collapsed first, then the at-sign.
    """
    dots_joined = re.sub(r'\s*\.\s*', '.', s)
    return re.sub(r'\s*@\s*', '@', dots_joined)

def span_overlap(a, b):
    """Return True when spans `a` and `b` share at least one position.

    Each span is indexable as (start, end). Spans that merely touch
    (one ends where the other starts) do not overlap.
    """
    a_start, a_end = a[0], a[1]
    b_start, b_end = b[0], b[1]
    return a_end > b_start and b_end > a_start

In [17]:
from transformers import pipeline

ner = pipeline("token-classification", model=MODEL_DIR, tokenizer=MODEL_DIR, aggregation_strategy="simple")

def ner_entities(text, min_score=0.0):
    """Run the NER pipeline on `text` and convert its output to span dicts.

    Args:
        text: string to analyse.
        min_score: predictions whose score is below this value are dropped.
            The default of 0.0 keeps every prediction, matching the previous
            behaviour of this function.

    Returns:
        List of dicts with keys "start", "end" (character offsets into
        `text`), "label" (the pipeline's entity_group), "score" (plain float)
        and "source" (always "NER").
    """
    spans = []
    for pred in ner(text):
        score = float(pred["score"])
        if score < min_score:
            continue
        spans.append({
            "start": pred["start"],
            "end": pred["end"],
            "label": pred["entity_group"],
            "score": score,
            "source": "NER",
        })
    return spans

Device set to use cuda:0


In [18]:
def rule_entities(text):
    """Find regex-detectable entities in `text`.

    Returns:
        List of span dicts ("start", "end", "label", "score", "source") with
        score 1.0 and source "RULE". Matches are grouped by detector in the
        order SSN, CREDITCARD, EMAIL, PHONE, each group in match order.
    """
    detectors = (
        ("SSN", RE_SSN),
        ("CREDITCARD", RE_CC),
        ("EMAIL", RE_EMAIL),
        ("PHONE", RE_PHONE),
    )
    return [
        {"start": hit.start(), "end": hit.end(), "label": label, "score": 1.0, "source": "RULE"}
        for label, pattern in detectors
        for hit in pattern.finditer(text)
    ]

In [19]:
# Source precedence when two spans collide: rule matches beat model predictions.
PRIORITY = {"RULE": 2, "NER": 1}

def resolve_overlaps(spans):
    """Greedily keep the best-ranked spans so that no two kept spans overlap.

    Spans are ranked, best first, by source priority, then severity of the
    label, then score, then length. Each span is kept only if it overlaps
    none of the spans already kept.

    Returns:
        The kept spans ordered by their "start" offset.
    """
    severity_rank = {"HIGH":3,"MEDIUM":2,"LOW":1}

    def rank(span):
        return (
            PRIORITY.get(span["source"], 0),
            severity_rank[SEVERITY.get(span["label"], "LOW")],
            span["score"],
            span["end"]-span["start"],
        )

    winners = []
    for cand in sorted(spans, key=rank, reverse=True):
        cand_range = (cand["start"], cand["end"])
        clashes = any(span_overlap(cand_range, (w["start"], w["end"])) for w in winners)
        if not clashes:
            winners.append(cand)
    winners.sort(key=lambda x: x["start"])
    return winners

In [20]:
def redact_text(text, spans, mask_style="stars"):
    """Return `text` with every span replaced according to `mask_style`.

    Args:
        text: original string.
        spans: dicts with "start", "end" (character offsets into `text`) and
            "label". Spans are expected not to overlap each other; any input
            order is accepted.
        mask_style:
            "stars"         - one "*" per redacted character (length preserved);
            "label"         - "[<LABEL>_REDACTED]";
            "partial_email" - EMAIL spans keep the first character of the user
                              part and the domain, other labels become "***";
            anything else   - "***".

    Returns:
        The redacted string.
    """
    chars = list(text)
    limit = len(chars)
    # Replace from right to left: a replacement whose length differs from the
    # span only moves characters to its right, which have already been
    # handled, so the offsets of the spans still to process stay valid.
    for sp in sorted(spans, key=lambda item: item["start"], reverse=True):
        lab = sp["label"]
        # Clamp offsets to the text so a bad span cannot grow the output.
        s = max(0, min(sp["start"], limit))
        e = max(s, min(sp["end"], limit))
        if mask_style == "stars":
            replacement = "*" * (e - s)
        elif mask_style == "label":
            replacement = f"[{lab}_REDACTED]"
        elif mask_style == "partial_email" and lab == "EMAIL":
            user, at, domain = normalize_email(text[s:e]).partition("@")
            # No "@" in the chunk means it cannot be split: mask it entirely.
            replacement = f"{user[:1]}***@{domain}" if at else "***"
        else:
            replacement = "***"
        chars[s:e] = replacement
    return "".join(chars)

In [21]:
def detect_and_redact(text):
    """Detect sensitive spans in `text` and mask them with stars.

    Model and rule detections are concatenated, overlaps are resolved, and the
    surviving spans are both redacted and summarised.

    Returns:
        (redacted_text, findings) where each finding is a dict with "label",
        "severity" ("LOW" when the label has no SEVERITY entry),
        "span" ([start, end]) and "source".
    """
    candidates = ner_entities(text) + rule_entities(text)   # fuse
    kept = resolve_overlaps(candidates)

    findings = []
    for sp in kept:
        findings.append({
            "label": sp["label"],
            "severity": SEVERITY.get(sp["label"], "LOW"),
            "span": [sp["start"], sp["end"]],
            "source": sp["source"],
        })

    redacted = redact_text(text, kept, mask_style="stars")
    return redacted, findings

# End-to-end demo on the same sentence used for the raw pipeline smoke test.
# NOTE(review): in the recorded output the low-confidence NER spans ("me",
# "My SSN") are still redacted because nothing filters predictions by score.
sample = "Email me at alice.lee@example.com or call 415-555-1234. My SSN is 123-45-6789."
redacted, findings = detect_and_redact(sample)
print(redacted)
print(findings)

Email ** at ********************* or call ************. ****** is ***********.
[{'label': 'EMAIL', 'severity': 'MEDIUM', 'span': [6, 8], 'source': 'NER'}, {'label': 'EMAIL', 'severity': 'MEDIUM', 'span': [12, 33], 'source': 'RULE'}, {'label': 'PHONE', 'severity': 'MEDIUM', 'span': [42, 54], 'source': 'RULE'}, {'label': 'PERSON', 'severity': 'LOW', 'span': [56, 62], 'source': 'NER'}, {'label': 'SSN', 'severity': 'HIGH', 'span': [66, 77], 'source': 'RULE'}]


In [22]:
# --- Drift monitoring (log entity distribution changes) ---
import collections, json
from datetime import datetime

def log_entity_distribution(predictions, log_path=f"{MODEL_DIR}/drift_log.json"):
    """Append a timestamped count of predicted entity groups to a JSON log.

    Args:
        predictions: iterable of per-text prediction lists; every prediction
            is a dict with an "entity_group" key. Groups equal to "O" are not
            counted.
        log_path: JSON file holding a list of earlier entries. A missing or
            undecodable file is treated as an empty log.

    Side effects:
        Rewrites `log_path` with the previous entries plus the new one.
    """
    from datetime import timezone

    dist = collections.Counter(
        p["entity_group"]
        for pred in predictions
        for p in pred
        if p["entity_group"] != "O"
    )

    # datetime.utcnow() is deprecated; take an aware UTC "now" and strip the
    # tzinfo so the stored string keeps the same naive-UTC format as before.
    stamp = datetime.now(timezone.utc).replace(tzinfo=None)
    entry = {
        "timestamp": str(stamp),
        "distribution": dict(dist)
    }

    # Only "no log yet" and "log is not valid JSON" start a fresh list; other
    # errors (e.g. permissions) are raised instead of being swallowed.
    try:
        with open(log_path) as fh:
            data = json.load(fh)
    except (FileNotFoundError, json.JSONDecodeError):
        data = []

    data.append(entry)
    with open(log_path,"w") as fh:
        json.dump(data, fh, indent=2)