In [1]:
# Imports
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Filesystem locations: the directory that holds the NER corpus (and later
# receives the processed arrays) and the raw CSV inside it.
DATA_DIR = '../../data/ner/'
DATA_FILE = os.path.join(
    DATA_DIR,
    'NER_dataset.csv',
)

In [3]:
# Load dataset
df = pd.read_csv(DATA_FILE, encoding='latin1')

# Rows with a missing 'Sentence #' inherit the closest label above them.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated and
# emits a FutureWarning on recent pandas releases.
# The literal 'Sentence: ' prefix is then stripped (regex=False: plain text
# match) and the remaining number is stored as an int.
df['Sentence #'] = (
    df['Sentence #']
    .ffill()
    .astype(str)
    .str.replace('Sentence: ', '', regex=False)
    .astype(int)
)

# Clean and filter data: force text columns to str and trim whitespace.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan', so such rows survive the empty-string filter below - confirm whether
# read_csv parsed real tokens (e.g. 'None', 'NA') as NaN in this dataset.
df['Word'] = df['Word'].astype(str).str.strip()
df['POS'] = df['POS'].astype(str).str.strip()
df['Tag'] = df['Tag'].astype(str).str.strip()

# Drop rows whose token is empty after stripping; .copy() gives an independent
# frame so later column assignments cannot trigger chained-assignment warnings.
df = df[df['Word'] != ''].copy()

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [4]:
# Collect the rows of every sentence into parallel per-sentence lists:
# one list of words, one of POS tags and one of NER tags, all produced from
# the same groupby so the three outputs share the same sentence order.
grouped = df.groupby('Sentence #')
sentences, pos_tags, ner_tags = (
    grouped[column].apply(list).tolist()
    for column in ('Word', 'POS', 'Tag')
)

# Quick sanity check of the grouping result
print(f"Total sentences: {len(sentences)}")
print(f"Example sentence: {sentences[0]}")
print(f"Example tags: {ner_tags[0]}")

Total sentences: 47959
Example sentence: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
Example tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [5]:
# Build vocabularies with special tokens
def build_vocab(sequences, specials=('<PAD>', '<UNK>')):
    """Build a token -> integer index mapping from tokenised sequences.

    Special tokens come first, in the order given, so their indices are stable
    (the first special is always index 0). The remaining tokens follow in
    descending frequency order; ties keep first-seen order, as produced by
    ``Counter.most_common()``.

    Args:
        sequences: Iterable of iterables of hashable tokens.
        specials: Reserved tokens placed at the start of the vocabulary.
            Any iterable is accepted; it is never mutated.

    Returns:
        dict mapping each token to a unique, contiguous index starting at 0.
    """
    # dict.fromkeys drops duplicate specials while keeping their order.
    vocab = list(dict.fromkeys(specials))
    reserved = set(vocab)

    counts = Counter(tok for seq in sequences for tok in seq)

    # A corpus token that equals a special token must not be appended again:
    # the duplicate entry would overwrite the reserved index in the final dict
    # (e.g. '<PAD>' would stop being 0) and leave a gap in the index range.
    vocab.extend(tok for tok, _ in counts.most_common() if tok not in reserved)

    return {tok: idx for idx, tok in enumerate(vocab)}

# POS and NER tag vocabularies reserve only a padding entry; the word
# vocabulary uses build_vocab's default special tokens.
tag2idx = build_vocab(ner_tags, specials=['<PAD>'])
pos2idx = build_vocab(pos_tags, specials=['<PAD>'])
word2idx = build_vocab(sentences)

# Report the size of each vocabulary (special tokens included)
print(f"Vocab sizes - Words: {len(word2idx)}, POS: {len(pos2idx)}, Tags: {len(tag2idx)}")

Vocab sizes - Words: 35179, POS: 43, Tags: 18


In [6]:
# Use the 95th percentile of the sentence lengths (truncated to an int) as
# the fixed sequence length instead of the length of the longest sentence.
lengths = list(map(len, sentences))
max_len = int(np.percentile(lengths, 95))
print(f"Using max sequence length: {max_len}")

Using max sequence length: 35


In [7]:
# Encode and pad sequences
def encode_and_pad(sequences, vocab, max_len):
    """Convert token sequences to a fixed-width 2-D array of vocabulary ids.

    Each sequence is cut to at most ``max_len`` tokens, mapped through
    ``vocab`` and right-padded with the ``'<PAD>'`` id up to ``max_len``.

    Args:
        sequences: Iterable of iterables of tokens.
        vocab: dict mapping token -> integer id. Must contain ``'<PAD>'``.
            If it contains ``'<UNK>'``, unknown tokens map to that id.
        max_len: Width of every encoded row.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(number of sequences, max_len)``,
        including ``(0, max_len)`` when ``sequences`` is empty.

    Raises:
        KeyError: if ``vocab`` has no ``'<PAD>'`` entry, or if a token within
            the first ``max_len`` positions is missing from ``vocab`` and
            there is no ``'<UNK>'`` entry to fall back on.
    """
    # Fail loudly on a missing pad id rather than padding with None.
    pad_id = vocab['<PAD>']
    # Vocabularies built without '<UNK>' (e.g. tag sets) have no fallback id.
    unk_id = vocab.get('<UNK>')

    encoded = []
    for seq in sequences:
        row = []
        # Truncate first: tokens beyond max_len never reach the output.
        for tok in list(seq)[:max_len]:
            idx = vocab.get(tok, unk_id)
            if idx is None:
                # Without this check None would leak into the result and
                # silently turn it into an object-dtype array.
                raise KeyError(
                    f"token {tok!r} is not in the vocabulary and the "
                    "vocabulary has no '<UNK>' entry"
                )
            row.append(idx)
        row.extend([pad_id] * (max_len - len(row)))
        encoded.append(row)

    # dtype=int matches what numpy infers for lists of Python ints; the
    # reshape keeps the result 2-D even when there are no sequences at all.
    return np.array(encoded, dtype=int).reshape(len(encoded), max_len)

# Encode words, POS tags and NER tags with their own vocabularies, all padded
# or truncated to the same max_len so the three arrays stay aligned.
X_words, X_pos, Y_tags = (
    encode_and_pad(seqs, vocab, max_len)
    for seqs, vocab in (
        (sentences, word2idx),
        (pos_tags, pos2idx),
        (ner_tags, tag2idx),
    )
)

In [8]:
# Hold out 20% of the sentences for validation. All three arrays go through a
# single train_test_split call so the same rows are selected in each of them;
# random_state is fixed so the split is repeatable.
(
    Xw_train, Xw_val,
    Xp_train, Xp_val,
    Yt_train, Yt_val,
) = train_test_split(
    X_words,
    X_pos,
    Y_tags,
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {len(Xw_train)}, Val size: {len(Xw_val)}")

Train size: 38367, Val size: 9592


In [9]:
# Persist the encoded splits (.npy) and the three vocabularies (.pkl) in DATA_DIR
os.makedirs(DATA_DIR, exist_ok=True)

# Encoded arrays: one .npy file per split and feature
for filename, array in (
    ('Xw_train.npy', Xw_train),
    ('Xw_val.npy', Xw_val),
    ('Xp_train.npy', Xp_train),
    ('Xp_val.npy', Xp_val),
    ('Yt_train.npy', Yt_train),
    ('Yt_val.npy', Yt_val),
):
    np.save(os.path.join(DATA_DIR, filename), array)

# Token -> index mappings, pickled so the same ids can be reloaded later
for filename, mapping in (
    ('word2idx.pkl', word2idx),
    ('pos2idx.pkl', pos2idx),
    ('tag2idx.pkl', tag2idx),
):
    with open(os.path.join(DATA_DIR, filename), 'wb') as f:
        pickle.dump(mapping, f)

print("Data preprocessing completed and saved.")

Data preprocessing completed and saved.
