In [19]:
import os
import json
import random

# Fix the seed of the module-level RNG so repeated runs draw the same values.
random.seed(42)

# Labels an inserted entity span can carry.
PII_LABELS = [
    "CREDIT_CARD", "PHONE", "EMAIL", "PERSON_NAME",
    "DATE", "CITY", "LOCATION",
]

# Lower-case vocabularies the value generators draw from (single-word entries).
FIRST_NAMES = "john emma arun sophia mike lisa priya rohan".split()
LAST_NAMES = "doe kumar sharma patel lee gomez singh".split()
CITIES = "chennai mumbai delhi bangalore london paris".split()

# Entries here may contain spaces, so they stay a literal list.
LOCATIONS = [
    "central park",
    "marine drive",
    "mg road",
    "downtown",
    "near railway station",
]

MONTHS = (
    "january february march april may june "
    "july august september october november december"
).split()

# Alphabet used for card and phone numbers.
DIGITS = "0123456789"


def random_credit_card():
    """Return 16 random decimal digits as one unbroken string (no spaces)."""
    picked = []
    for _ in range(16):
        picked.append(random.choice(DIGITS))
    return "".join(picked)


def random_phone():
    """Return a 10-character string of random decimal digits."""
    number = ""
    # Each draw contributes exactly one character, so this makes 10 draws.
    while len(number) < 10:
        number += random.choice(DIGITS)
    return number


def random_email():
    """
    Build a fake e-mail address from a random first name, last name and domain.

    With probability 0.5 the address is written normally
    ("first.last@domain"); otherwise it is spelled out the way a
    speech-to-text transcript might render it
    ("first dot last at domain dot tld").
    """
    domains = ["gmail.com", "yahoo.com", "hotmail.com", "example.com"]

    # Draw order (first, last, domain, coin flip) is kept fixed so that
    # seeded runs stay reproducible.
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    domain = random.choice(domains)

    spoken_form = random.random() >= 0.5
    if not spoken_form:
        return f"{first}.{last}@{domain}"

    # Every domain listed above contains exactly one dot, so this unpacks cleanly.
    dom_user, dom_tld = domain.split(".")
    return " ".join([first, "dot", last, "at", dom_user, "dot", dom_tld])


def random_person():
    """Return "<first> <last>" with both parts drawn at random (first name drawn first)."""
    given = random.choice(FIRST_NAMES)
    family = random.choice(LAST_NAMES)
    return " ".join((given, family))


def random_date():
    """
    Return a date string of the form "<day> <month name> <year>".

    The day is 1-28 and the year 1995-2025, both inclusive; the month is a
    lower-case name taken from MONTHS.
    """
    day = random.randint(1, 28)
    month = random.choice(MONTHS)
    year = random.randint(1995, 2025)
    return " ".join((str(day), month, str(year)))


def random_city():
    """Return one entry of CITIES chosen at random."""
    city = random.choice(CITIES)
    return city


def random_location():
    """Return one entry of LOCATIONS chosen at random."""
    place = random.choice(LOCATIONS)
    return place


def generate_example(idx: int) -> dict:
    """
    Create one synthetic STT-style utterance with 1-3 entities inserted.

    A base template sentence is picked at random and split into words. Between
    one and three distinct word boundaries (including "before the first word"
    and "after the last word") are sampled, and a randomly labelled entity
    string is inserted at each. All pieces are joined with single spaces.

    Args:
        idx: Example number; used only to build the zero-padded id.

    Returns:
        dict with keys
            "id":       f"utt_{idx:05d}"
            "text":     the assembled utterance
            "entities": list of {"start", "end", "label"} dicts in
                        left-to-right order; start/end are character offsets
                        into "text" (end exclusive) covering the entity string.
    """
    base_templates = [
        "please update my account details",
        "can you check the status of my order",
        "i want to change my registered phone number",
        "this is regarding my previous payment",
        "i need help with my booking",
        "can you confirm my details again",
        "i am talking from the customer care line",
        "the delivery address has changed",
    ]

    # Label -> value generator. Any label missing from this table falls back
    # to random_location, like the catch-all branch it replaces.
    generators = {
        "CREDIT_CARD": random_credit_card,
        "PHONE": random_phone,
        "EMAIL": random_email,
        "PERSON_NAME": random_person,
        "DATE": random_date,
        "CITY": random_city,
        "LOCATION": random_location,
    }

    # The order of RNG calls (template, entity count, positions, then label and
    # value per entity from left to right) is deliberate: keeping it stable
    # means a fixed seed keeps producing the same dataset.
    text_tokens = random.choice(base_templates).split()
    num_entities = random.randint(1, 3)

    # Word indices in front of which an entity goes; len(text_tokens) means
    # "after the last word". random.sample guarantees they are distinct.
    insert_positions = set(
        random.sample(range(len(text_tokens) + 1), num_entities)
    )

    entities = []
    current_text = ""

    for word_idx in range(len(text_tokens) + 1):
        if word_idx in insert_positions:
            label = random.choice(PII_LABELS)
            ent_text = generators.get(label, random_location)()

            if current_text:
                current_text += " "
            # Offsets are taken after the separator so the span excludes it.
            start = len(current_text)
            current_text += ent_text
            entities.append(
                {"start": start, "end": len(current_text), "label": label}
            )

        if word_idx < len(text_tokens):
            # Plain template word: appended, but not recorded as an entity.
            if current_text:
                current_text += " "
            current_text += text_tokens[word_idx]

    return {
        "id": f"utt_{idx:05d}",
        "text": current_text,
        "entities": entities,
    }


def write_jsonl(path, examples):
    """
    Write `examples` to `path` in JSON Lines format.

    Each item is serialised with json.dumps on its own line, UTF-8 encoded,
    with non-ASCII characters kept as-is (ensure_ascii=False). An existing
    file at `path` is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in examples
        )


def main():
    """
    Generate the synthetic dataset files under ./data.

    Writes 800 training examples and 150 dev examples (dev indices continue
    where the training indices stop), plus a one-record placeholder test file,
    then prints the paths that were written.
    """
    os.makedirs("data", exist_ok=True)
    train_size, dev_size = 800, 150

    # Train is generated before dev so the RNG stream is consumed in a fixed order.
    train_examples = list(map(generate_example, range(train_size)))
    dev_examples = list(
        map(generate_example, range(train_size, train_size + dev_size))
    )

    outputs = (
        ("data/train.jsonl", train_examples),
        ("data/dev.jsonl", dev_examples),
        # Dummy test file (you can change later)
        ("data/test.jsonl", [{"id": "test_00001", "text": "dummy text", "entities": []}]),
    )
    for out_path, rows in outputs:
        write_jsonl(out_path, rows)

    print("Generated: data/train.jsonl, data/dev.jsonl, data/test.jsonl")


if __name__ == "__main__":
    main()



Generated: data/train.jsonl, data/dev.jsonl, data/test.jsonl


In [1]:
import os, sys

# Project root for this notebook. Set the PLIVO_PROJECT_DIR environment variable
# to point at your own checkout; the fallback is the original author's location.
project_dir = os.environ.get(
    "PLIVO_PROJECT_DIR",
    r"C:\Users\joelm\OneDrive\Pictures\Desktop\PLIVO",
)

# Fail early with an actionable message rather than a bare error from chdir.
if not os.path.isdir(project_dir):
    raise FileNotFoundError(
        f"Project directory not found: {project_dir!r}. "
        "Set the PLIVO_PROJECT_DIR environment variable to the folder containing train.py."
    )

# Later cells use relative paths (train.py, data/...), so work from the project root.
os.chdir(project_dir)
# Guarded so re-running the cell does not add duplicate sys.path entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

print("Current working directory:", os.getcwd())

Current working directory: C:\Users\joelm\OneDrive\Pictures\Desktop\PLIVO


In [None]:

!pip install -r requirements.txt



Collecting seqeval (from -r requirements.txt (line 5))
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: seqeval
  Building wheel for seqeval (pyproject.toml): started
  Building wheel for seqeval (pyproject.toml): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16283 sha256=9fabbbf733b5d26e8ce661698684a31c198f06e2645a50704cb6de7e267af1e0
  Stored in directory: c:\users\joelm\appdata\local\pip\cache\wheels\bc\92\f0\243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Success


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\joelm\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [7]:
import torch
# Report the installed PyTorch build and whether a CUDA device is usable.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Torch version: 2.9.1+cpu
CUDA available: False


In [29]:
# Fine-tune distilbert-base-uncased for token classification on data/train.jsonl
# (data/dev.jsonl is passed as the dev split); model and tokenizer are saved to out/.
%run train.py --model_name distilbert-base-uncased --train data/train.jsonl --dev data/dev.jsonl --out_dir out

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 100/100 [00:54<00:00,  1.85it/s]


Epoch 1 average loss: 0.8247


Epoch 2/3: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s]


Epoch 2 average loss: 0.0423


Epoch 3/3: 100%|██████████| 100/100 [00:55<00:00,  1.80it/s]


Epoch 3 average loss: 0.0117
Saved model + tokenizer to out


## First experiment: DistilBERT baseline (predict and evaluate on dev)

In [21]:
# Run predict.py with the model saved in out/ on data/dev.jsonl;
# predictions are written to out/dev_pred.json.
%run predict.py --model_dir out --input data/dev.jsonl --output out/dev_pred.json

Wrote predictions for 150 utterances to out/dev_pred.json


In [23]:
# Score out/dev_pred.json against the gold annotations in data/dev.jsonl
# (prints per-entity precision/recall/F1, macro-F1, and PII / non-PII aggregates).
# NOTE(review): train and dev are both produced by the same generator from the same
# small template and vocabulary lists, so a perfect score here likely reflects
# memorisation of the synthetic patterns rather than generalisation; confirm on
# independently collected data.
%run eval_span_f1.py --gold data/dev.jsonl --pred out/dev_pred.json

Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=1.000 R=1.000 F1=1.000
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 1.000

PII-only metrics: P=1.000 R=1.000 F1=1.000
Non-PII metrics: P=1.000 R=1.000 F1=1.000


## Second experiment: repeat DistilBERT prediction and evaluation on dev

In [30]:
# Same prediction command as the first experiment, run again.
# NOTE(review): presumably executed after re-training into out/; confirm.
%run predict.py --model_dir out --input data/dev.jsonl --output out/dev_pred.json

Wrote predictions for 150 utterances to out/dev_pred.json


In [31]:
# Re-score the refreshed out/dev_pred.json against the gold annotations in data/dev.jsonl.
%run eval_span_f1.py --gold data/dev.jsonl --pred out/dev_pred.json

Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=1.000 R=1.000 F1=1.000
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 1.000

PII-only metrics: P=1.000 R=1.000 F1=1.000
Non-PII metrics: P=1.000 R=1.000 F1=1.000


In [24]:
# Time inference for the model in out/ on data/dev.jsonl over 50 runs;
# the script reports p50 and p95 latency at batch_size=1.
%run measure_latency.py --model_dir out --input data/dev.jsonl --runs 50

Latency over 50 runs (batch_size=1):
  p50: 19.68 ms
  p95: 22.28 ms


## Second try: repeat the DistilBERT latency measurement

In [32]:
# Repeat the latency measurement with identical arguments (model in out/, 50 runs).
%run measure_latency.py --model_dir out --input data/dev.jsonl --runs 50

Latency over 50 runs (batch_size=1):
  p50: 19.41 ms
  p95: 22.71 ms


## ALBERT

In [33]:
# Fine-tune albert-base-v2 on the same train/dev files; model and tokenizer
# are saved to out_albert/ so the DistilBERT artifacts in out/ are left untouched.
%run train.py --model_name albert-base-v2 --train data/train.jsonl --dev data/dev.jsonl --out_dir out_albert


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 100/100 [00:55<00:00,  1.79it/s]


Epoch 1 average loss: 0.5765


Epoch 2/3: 100%|██████████| 100/100 [00:56<00:00,  1.77it/s]


Epoch 2 average loss: 0.0290


Epoch 3/3: 100%|██████████| 100/100 [00:55<00:00,  1.80it/s]

Epoch 3 average loss: 0.0045
Saved model + tokenizer to out_albert





In [34]:
# 1) Predict on data/dev.jsonl with the ALBERT model saved in out_albert/.
%run predict.py --model_dir out_albert --input data/dev.jsonl --output out_albert/dev_pred.json
# 2) Score those predictions against the gold annotations.
%run eval_span_f1.py --gold data/dev.jsonl --pred out_albert/dev_pred.json
# 3) Measure p50/p95 inference latency over 50 runs.
%run measure_latency.py --model_dir out_albert --input data/dev.jsonl --runs 50


Wrote predictions for 150 utterances to out_albert/dev_pred.json
Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=0.895 R=1.000 F1=0.944
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=0.818 R=1.000 F1=0.900
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=0.929 R=1.000 F1=0.963
PHONE           P=0.811 R=1.000 F1=0.896

Macro-F1: 0.958

PII-only metrics: P=0.882 R=1.000 F1=0.937
Non-PII metrics: P=1.000 R=1.000 F1=1.000
Latency over 50 runs (batch_size=1):
  p50: 39.89 ms
  p95: 61.35 ms


In [37]:
# Fine-tune distilbert-base-uncased again, saving to out_distil128/.
# NOTE(review): the directory name suggests a "128" setting, but this command line is
# identical to the baseline run apart from --out_dir. Presumably the value was changed
# inside train.py; confirm, and pass it as an explicit argument so the run can be reproduced.
%run train.py --model_name distilbert-base-uncased --train data/train.jsonl --dev data/dev.jsonl --out_dir out_distil128


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


Epoch 1 average loss: 0.8134


Epoch 2/3: 100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


Epoch 2 average loss: 0.0446


Epoch 3/3: 100%|██████████| 100/100 [00:56<00:00,  1.79it/s]


Epoch 3 average loss: 0.0142
Saved model + tokenizer to out_distil128


In [36]:
# 1) Predict on data/dev.jsonl with the model saved in out_distil128/.
%run predict.py --model_dir out_distil128 --input data/dev.jsonl --output out_distil128/dev_pred.json
# 2) Score those predictions against the gold annotations.
%run eval_span_f1.py --gold data/dev.jsonl --pred out_distil128/dev_pred.json
# 3) Measure p50/p95 inference latency over 50 runs.
%run measure_latency.py --model_dir out_distil128 --input data/dev.jsonl --runs 50


Wrote predictions for 150 utterances to out_distil128/dev_pred.json
Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=1.000 R=1.000 F1=1.000
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 1.000

PII-only metrics: P=1.000 R=1.000 F1=1.000
Non-PII metrics: P=1.000 R=1.000 F1=1.000
Latency over 50 runs (batch_size=1):
  p50: 21.48 ms
  p95: 25.56 ms


In [52]:
# Fine-tune distilbert-base-uncased again, saving to out_distil64/.
# NOTE(review): the directory name suggests a "64" setting, but this command line is
# identical to the baseline run apart from --out_dir. Presumably the value was changed
# inside train.py; confirm, and pass it as an explicit argument so the run can be reproduced.
%run train.py --model_name distilbert-base-uncased --train data/train.jsonl --dev data/dev.jsonl --out_dir out_distil64


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 100/100 [00:55<00:00,  1.80it/s]


Epoch 1 average loss: 0.8145


Epoch 2/3: 100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


Epoch 2 average loss: 0.0411


Epoch 3/3: 100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


Epoch 3 average loss: 0.0140
Saved model + tokenizer to out_distil64


In [62]:
%run predict.py --model_dir out_distil64 --input data/dev.jsonl --output out_distil64/dev_pred.json
%run eval_span_f1.py --gold data/dev.jsonl --pred out_distil64/dev_pred.json
%run measure_latency.py --model_dir out_distil64 --input data/dev.jsonl --runs 1


Wrote predictions for 150 utterances to out_distil64/dev_pred.json
Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=1.000 R=1.000 F1=1.000
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 1.000

PII-only metrics: P=1.000 R=1.000 F1=1.000
Non-PII metrics: P=1.000 R=1.000 F1=1.000
Latency over 1 runs (batch_size=1):
  p50: 17.04 ms
  p95: 17.04 ms
