In [20]:
import pandas as pd
import ast

df = pd.read_csv("data/PII43k.csv")

df['tokens'] = df['Tokenised Filled Template'].apply(ast.literal_eval)
df['ner_tags'] = df['Tokens'].apply(ast.literal_eval)

# Check mismatches
mismatch_rows = df[df['tokens'].str.len() != df['ner_tags'].str.len()]
print(f"Found {len(mismatch_rows)} mismatched rows.")
print(mismatch_rows[['Filled Template', 'tokens', 'ner_tags']].head())


Found 115 mismatched rows.
                                        Filled Template  \
701   Can you analyze Walter, Feil and Predovic's cu...   
1123  Please create a sales plan for Little, McKenzi...   
1906  13. Create a list of top 5 digital marketing b...   
2141  Could you help me design a mentoring program f...   
2192  Help Mertz, Kunde and Bernhard create a financ...   

                                                 tokens  \
701   [can, you, analyze, walter, ,, fei, ##l, and, ...   
1123  [please, create, a, sales, plan, for, little, ...   
1906  [13, ., create, a, list, of, top, 5, digital, ...   
2141  [could, you, help, me, design, a, mentoring, p...   
2192  [help, mer, ##tz, ,, kun, ##de, and, bernhard,...   

                                               ner_tags  
701   [O, O, O, B-NAME, I-NAME, I-NAME, I-NAME, I-NA...  
1123  [O, O, O, O, O, O, B-NAME, I-NAME, I-NAME, I-N...  
1906  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
2141  [O, O, O, O, O, O, O, O, 

In [21]:
df = df - mismatch_rows


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [8]:
mismatch_rows = df[df['tokens'].str.len() != df['ner_tags'].str.len()]
print(f"Found {len(mismatch_rows)} mismatched rows.")
print(mismatch_rows[['Filled Template', 'tokens', 'ner_tags']].head())

Found 0 mismatched rows.
Empty DataFrame
Columns: [Filled Template, tokens, ner_tags]
Index: []


In [11]:
from datasets import Dataset


In [10]:
dataset = Dataset.from_pandas(df[['tokens', 'ner_tags', 'Filled Template']])
dataset = dataset.train_test_split(test_size=0.2)

In [12]:
from sklearn.preprocessing import LabelEncoder

all_labels = [label for sublist in df['ner_tags'] for label in sublist]
le = LabelEncoder()
le.fit(all_labels)

label2id = {label: idx for idx, label in enumerate(le.classes_)}
id2label = {idx: label for label, idx in label2id.items()}

def encode_labels(example):
    example['labels'] = [label2id[label] for label in example['ner_tags']]
    return example

dataset = dataset.map(encode_labels)


Map: 100%|██████████| 34208/34208 [00:01<00:00, 21612.09 examples/s]
Map: 100%|██████████| 8552/8552 [00:00<00:00, 21837.11 examples/s]


In [14]:
!pip install transformers -q

In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"],
                                 is_split_into_words=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=128)

    word_ids = tokenized_inputs.word_ids()
    label_ids = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
            previous_word_idx = word_idx
        else:
            label_ids.append(-100)  # sub-token
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names  # clean previous list fields
)


Map:   0%|          | 0/34208 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values

In [17]:
print(dataset['train'].features)


{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Filled Template': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [18]:
import pandas as pd
import ast
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

# --------------------------
# Load and clean the dataset
# --------------------------
df = pd.read_csv("data/pii43k.csv")

# Convert stringified lists into real Python lists
df['tokens'] = df['Tokenised Filled Template'].apply(ast.literal_eval)
df['ner_tags'] = df['Tokens'].apply(ast.literal_eval)

# --------------------------
# Fix mismatched token-label rows
# --------------------------
def fix_label_alignment(row):
    tok_len = len(row['tokens'])
    lbl_len = len(row['ner_tags'])
    if lbl_len < tok_len:
        row['ner_tags'] += ['O'] * (tok_len - lbl_len)  # Pad with O
    elif lbl_len > tok_len:
        row['ner_tags'] = row['ner_tags'][:tok_len]     # Truncate extra
    return row

df = df.apply(fix_label_alignment, axis=1)
assert all(len(t) == len(l) for t, l in zip(df['tokens'], df['ner_tags']))

# --------------------------
# Encode labels (BIO to IDs)
# --------------------------
all_labels = [label for sublist in df['ner_tags'] for label in sublist]
le = LabelEncoder().fit(all_labels)
label2id = {label: i for i, label in enumerate(le.classes_)}
id2label = {i: label for label, i in label2id.items()}

df['label_ids'] = df['ner_tags'].apply(lambda tags: [label2id[tag] for tag in tags])

# --------------------------
# Convert to HuggingFace Dataset
# --------------------------
hf_dataset = Dataset.from_pandas(df[['tokens', 'label_ids']])
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

# --------------------------
# Tokenizer + Alignment
# --------------------------
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example['tokens'],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    word_ids = tokenized_inputs.word_ids()
    label_ids = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example['label_ids'][word_idx])
            previous_word_idx = word_idx
        else:
            label_ids.append(-100)

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# ✨ FIXED: Remove old columns to avoid ArrowInvalid issues
tokenized_dataset = hf_dataset.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=hf_dataset['train'].column_names
)

# --------------------------
# Save for training
# --------------------------
tokenized_dataset.save_to_disk("data/pii43k_tokenized")

# Save label mappings for later use
import json
with open("data/label2id.json", "w") as f:
    json.dump(label2id, f)
with open("data/id2label.json", "w") as f:
    json.dump(id2label, f)

print("✅ Data preprocessing complete. Tokenized dataset saved to: data/pii43k_tokenized")


Map: 100%|██████████| 34208/34208 [00:09<00:00, 3624.25 examples/s]
Map: 100%|██████████| 8552/8552 [00:02<00:00, 3645.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 34208/34208 [00:00<00:00, 991882.31 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 8552/8552 [00:00<00:00, 1102461.51 examples/s]

✅ Data preprocessing complete. Tokenized dataset saved to: data/pii43k_tokenized





In [22]:

import pandas as pd
import ast
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from seqeval.metrics import classification_report
import json


RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
No module named 'transformers.models.auto.modeling_auto'