# Classical ML Baseline for PII Tagging
This notebook implements classical machine learning baselines for PII detection in tokenized educational text.
It is designed to provide a reference point against which advanced transformer models (like DeBERTa) can be compared.

In [1]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:

from pathlib import Path
PROJ = Path('/content/drive/MyDrive/edu-pii-detector')
DATA_DIR = PROJ / 'input'
TRAIN_JSON = DATA_DIR / 'train.json'
TEST_JSON  = DATA_DIR / 'test.json'
TRAIN_FILE = DATA_DIR / 'train.csv'
VAL_FILE   = DATA_DIR / 'val.csv'
RUN_DIR    = DATA_DIR / 'pii_runs'
RUN_DIR.mkdir(parents=True, exist_ok=True)

LABELS = [
    'B-USERNAME','B-ID_NUM','I-PHONE_NUM','I-ID_NUM','I-NAME_STUDENT',
    'B-EMAIL','I-STREET_ADDRESS','B-STREET_ADDRESS','B-URL_PERSONAL',
    'O','I-URL_PERSONAL','B-PHONE_NUM','B-NAME_STUDENT'
]


In [3]:

import json, pandas as pd, random
def read_json_or_jsonl(p: Path):
    rows = []
    with open(p,'r',encoding='utf-8') as f:
        head=f.read(1024); f.seek(0)
        if '\n' in head.strip():
            try:
                for line in f:
                    line=line.strip()
                    if line: rows.append(json.loads(line))
                if rows: return rows
            except: f.seek(0)
        data=json.load(f)
        if isinstance(data,dict) and 'data' in data: data=data['data']
        return data
def ensure_csv_splits():
    if TRAIN_FILE.exists() and VAL_FILE.exists(): return
    rows=read_json_or_jsonl(TRAIN_JSON)
    clean=[{'tokens':r['tokens'],'labels':r['labels']} for r in rows if 'tokens'in r]
    random.shuffle(clean); cut=int(0.9*len(clean))
    def to_csv(rows,path):
        df=pd.DataFrame({'tokens':[json.dumps(r['tokens']) for r in rows],
                         'labels':[json.dumps(r['labels']) for r in rows]})
        df.to_csv(path,index=False)
    to_csv(clean[:cut],TRAIN_FILE); to_csv(clean[cut:],VAL_FILE)
ensure_csv_splits()


In [4]:

import ast
train_df=pd.read_csv(TRAIN_FILE); val_df=pd.read_csv(VAL_FILE)
def _to_list(x):
    if isinstance(x,str):
        try: return json.loads(x)
        except: return ast.literal_eval(x)
    return x
for df in [train_df,val_df]:
    df['tokens']=df['tokens'].apply(_to_list)
    df['labels']=df['labels'].apply(_to_list)


In [5]:

from collections import Counter
def flatten_tokens(df):
    X,y=[],[]
    for toks,labs in zip(df['tokens'],df['labels']):
        for i,(tok,lab) in enumerate(zip(toks,labs)):
            prev=toks[i-1] if i>0 else '<BOS>'
            nxt=toks[i+1] if i+1<len(toks) else '<EOS>'
            X.append({'tok':tok,'prev':prev,'next':nxt}); y.append(lab)
    return X,y
X_train,y_train=flatten_tokens(train_df)
X_val,y_val=flatten_tokens(val_df)
# downsample O
ratio=10
idx_o=[i for i,l in enumerate(y_train) if l=='O']
idx_p=[i for i,l in enumerate(y_train) if l!='O']
import random; keep_o=random.sample(idx_o,min(len(idx_o),ratio*len(idx_p)))
keep=set(keep_o)|set(idx_p)
X_train=[X_train[i] for i in range(len(X_train)) if i in keep]
y_train=[y_train[i] for i in range(len(y_train)) if i in keep]
print("Counts:",Counter(y_train))


Counts: Counter({'O': 24600, 'B-NAME_STUDENT': 1228, 'I-NAME_STUDENT': 979, 'B-URL_PERSONAL': 102, 'B-ID_NUM': 65, 'B-EMAIL': 35, 'I-STREET_ADDRESS': 20, 'I-PHONE_NUM': 15, 'B-USERNAME': 6, 'B-PHONE_NUM': 6, 'B-STREET_ADDRESS': 2, 'I-ID_NUM': 1, 'I-URL_PERSONAL': 1})


In [6]:

import re,numpy as np, sklearn
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from packaging import version

class DictFieldExtractor(BaseEstimator,TransformerMixin):
    def __init__(self,field): self.field=field
    def fit(self,X,y=None): return self
    def transform(self,X): return [str(x.get(self.field,'')) for x in X]
class ShapeExtractor(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None): return self
    def transform(self,X):
        def ws(s): return re.sub(r'(.)\1+',r'\1',''.join(['X' if c.isupper() else 'x' if c.islower() else 'D' if c.isdigit() else c for c in s]))
        return np.array([ws(str(x['tok'])) for x in X]).reshape(-1,1)
class RegexFlagsExtractor(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None): return self
    def transform(self,X):
        feats=[]
        for x in X:
            t=str(x['tok'])
            feats.append([int('@' in t),int('http' in t or 'www.' in t),
                          int(re.fullmatch(r'[A-Za-z0-9._-]{6,}',t)!=None),
                          int(any(c.isdigit() for c in t)),int(t.istitle()),int(t.isupper())])
        return np.array(feats)

if version.parse(sklearn.__version__)>=version.parse("1.2"):
    onehot=OneHotEncoder(handle_unknown='ignore',sparse_output=True)
else:
    onehot=OneHotEncoder(handle_unknown='ignore',sparse=True)

shape_feat=Pipeline([('shape',ShapeExtractor()),('onehot',onehot)])
regex_flags=Pipeline([('flags',RegexFlagsExtractor()),('scale',StandardScaler(with_mean=False))])
features=FeatureUnion([
    ('w_cur',Pipeline([('ext',DictFieldExtractor('tok')),('tfidf',TfidfVectorizer(ngram_range=(1,2)))])),
    ('c_ng',Pipeline([('ext',DictFieldExtractor('tok')),('tfidf',TfidfVectorizer(analyzer='char',ngram_range=(3,5)))])),
    ('shape',shape_feat),('rgx',regex_flags)])


In [7]:

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,precision_recall_fscore_support
le=LabelEncoder().fit(LABELS)
y_train_enc=le.transform(y_train); y_val_enc=le.transform(y_val)
models={'LogReg':LogisticRegression(max_iter=500,class_weight='balanced',multi_class='multinomial'),
        'LinearSVM':LinearSVC(class_weight='balanced'),
        'SGD':SGDClassifier(loss='log_loss',max_iter=2000,class_weight='balanced')}
preds={}
for name,clf in models.items():
    from sklearn.pipeline import Pipeline
    pipe=Pipeline([('feats',features),('clf',clf)])
    pipe.fit(X_train,y_train_enc)
    y_pred=le.inverse_transform(pipe.predict(X_val))
    preds[name]=y_pred
    print(name,classification_report(y_val,y_pred,labels=LABELS,zero_division=0))
    pii=[lab for lab in LABELS if lab!='O']
    p,r,f1,s=precision_recall_fscore_support(y_val,y_pred,labels=pii,zero_division=0)
    print(name,"PII-only macro F1:",f1.mean())




LogReg                   precision    recall  f1-score   support

      B-USERNAME       0.00      0.00      0.00         0
        B-ID_NUM       0.06      0.92      0.11        13
     I-PHONE_NUM       0.00      0.00      0.00         0
        I-ID_NUM       0.00      0.00      0.00         0
  I-NAME_STUDENT       0.01      0.57      0.02       117
         B-EMAIL       0.57      1.00      0.73         4
I-STREET_ADDRESS       0.00      0.00      0.00         0
B-STREET_ADDRESS       0.00      0.00      0.00         0
  B-URL_PERSONAL       0.30      1.00      0.46         8
               O       1.00      0.89      0.94    512607
  I-URL_PERSONAL       0.00      0.00      0.00         0
     B-PHONE_NUM       0.00      0.00      0.00         0
  B-NAME_STUDENT       0.01      0.72      0.02       137

        accuracy                           0.89    512886
       macro avg       0.15      0.39      0.17    512886
    weighted avg       1.00      0.89      0.94    512886

LogR



LinearSVM                   precision    recall  f1-score   support

      B-USERNAME       0.00      0.00      0.00         0
        B-ID_NUM       0.12      0.92      0.21        13
     I-PHONE_NUM       0.00      0.00      0.00         0
        I-ID_NUM       0.00      0.00      0.00         0
  I-NAME_STUDENT       0.02      0.52      0.04       117
         B-EMAIL       0.57      1.00      0.73         4
I-STREET_ADDRESS       0.00      0.00      0.00         0
B-STREET_ADDRESS       0.00      0.00      0.00         0
  B-URL_PERSONAL       0.30      1.00      0.46         8
               O       1.00      0.98      0.99    512607
  I-URL_PERSONAL       0.00      0.00      0.00         0
     B-PHONE_NUM       0.00      0.00      0.00         0
  B-NAME_STUDENT       0.03      0.70      0.06       137

        accuracy                           0.98    512886
       macro avg       0.16      0.39      0.19    512886
    weighted avg       1.00      0.98      0.99    512886

L

In [8]:

def regroup(df,y_pred):
    out=[]; idx=0
    for toks,labs in zip(df['tokens'],df['labels']):
        n=len(toks); pred=list(y_pred[idx:idx+n]); idx+=n
        out.append({'tokens':toks,'labels_true':labs,'labels_pred':pred})
    return out
import json
for name,y_pred in preds.items():
    data=regroup(val_df,list(y_pred))
    with open(RUN_DIR/f'val_predictions_{name.lower()}.jsonl','w') as f:
        for row in data: f.write(json.dumps(row)+'\n')
