# Data Preprocessing

## Load Dataset

In [1]:
# import libraries
import pandas as pd
from datasets import Dataset
from ast import literal_eval
from transformers import AutoTokenizer
import numpy as np
from evaluate import load

In [2]:
# Read the raw NER corpus from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset/ner.csv')

# Preview the first five rows to understand the column layout.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written out when the CSV was created - confirm whether it is needed.
df.head(n=5)

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


## Change the Representation in Columns

In [3]:
# Convert POS and Tag from their string representation (e.g. "['NNS', 'IN']")
# into real Python lists.
def _parse_list_cell(cell):
    """Return ``literal_eval(cell)`` for string cells; pass other values through.

    The pass-through keeps this notebook cell re-runnable: once a column already
    holds lists, calling ``literal_eval`` on them again would raise ``ValueError``.
    """
    return literal_eval(cell) if isinstance(cell, str) else cell


# .iloc[0] reads the first row by position, independent of the index labels
print("POS Type (Before): ", type(df["POS"].iloc[0]))
print("Tag Type (Before): ", type(df["Tag"].iloc[0]))

df['POS'] = df['POS'].apply(_parse_list_cell)
df['Tag'] = df['Tag'].apply(_parse_list_cell)

print("POS Type (After): ", type(df["POS"].iloc[0]))
print("Tag Type (After): ", type(df["Tag"].iloc[0]))

POS Type (Before):  <class 'str'>
Tag Type (Before):  <class 'str'>
POS Type (After):  <class 'list'>
Tag Type (After):  <class 'list'>


## Tokenize the Sentence

In [4]:
# Preprocess 1: Tokenize the sentence and store in another column called Token
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')

In [5]:
# Run NLTK's word_tokenize on every sentence and keep the resulting token list
# in a new 'Token' column
df['Token'] = df['Sentence'].apply(word_tokenize)

In [6]:
# Preview the first rows to check the newly added Token column
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,Token
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[Thousands, of, demonstrators, have, marched, ..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Families, of, soldiers, killed, in, the, conf..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[They, marched, from, the, Houses, of, Parliam..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Police, put, the, number, of, marchers, at, 1..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[The, protest, comes, on, the, eve, of, the, a..."


## Map the BIO Tags to dslim's Tags (Pre-trained model)

In [7]:
# Preprocess 2: translate the corpus' BIO tags into the tag set used by dslim's
# pre-trained model. Each source entity type is paired with its target type and
# expanded into the B- (begin) and I- (inside) variants; 'O' maps to itself.
tag_mapping = {
    f'{prefix}-{source}': f'{prefix}-{target}'
    for source, target in (
        ('art', 'MISC'),
        ('eve', 'MISC'),
        ('geo', 'LOC'),
        ('gpe', 'LOC'),
        ('nat', 'MISC'),
        ('org', 'ORG'),
        ('per', 'PER'),
        ('tim', 'MISC'),
    )
    for prefix in ('B', 'I')
}
tag_mapping['O'] = 'O'

In [8]:
def map_tags(ner_tags, mapping=None):
    """Translate a sequence of NER tags into the target tag scheme.

    Args:
        ner_tags: Iterable of source tag strings (e.g. ``'B-geo'``, ``'O'``).
        mapping: Optional dict from source tag to target tag. When omitted,
            the notebook-level ``tag_mapping`` is used.

    Returns:
        A new list with one mapped tag per input tag, in the same order.

    Raises:
        KeyError: If a tag has no entry in the mapping; the message names the
            offending tag.
    """
    if mapping is None:
        mapping = tag_mapping

    new_tags = []
    for tag in ner_tags:
        try:
            new_tags.append(mapping[tag])
        except KeyError:
            # Same exception type as a plain dict lookup, but say which tag failed
            raise KeyError(f"No mapping defined for NER tag {tag!r}") from None
    return new_tags

In [9]:
# Translate each row's tag list with map_tags and store the result in 'New_Tag'
df['New_Tag'] = df['Tag'].map(map_tags)

In [10]:
# Preview the first rows to check the newly added New_Tag column
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,Token,New_Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-LOC, O, O, O, O, O, B-LOC..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O,..."


## Check for Length Mismatches between Tokens and POS or NER Tags

In [11]:
# Row-level check used to separate well-formed rows from problematic ones
def is_valid_row(row):
    """Return True when Token, POS and New_Tag all have the same length.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that provides the
            'Token', 'POS' and 'New_Tag' entries.

    Returns:
        bool: True if the three sequences are equally long, otherwise False.
    """
    tokens, pos_tags, ner_tags = (row[key] for key in ('Token', 'POS', 'New_Tag'))
    return len(tokens) == len(pos_tags) == len(ner_tags)

In [12]:
# Boolean flag per row: True where is_valid_row accepts the row
valid_mask = df.apply(is_valid_row, axis=1)

# Split the frame on that flag: accepted rows vs. rejected rows
clean_df = df.loc[valid_mask]
problematic_df = df.loc[~valid_mask]

In [13]:
# Report how many rows ended up in each split (one line per count)
print(
    f"Total rows: {len(df)}",
    f"Problematic rows: {len(problematic_df)}",
    f"Clean rows: {len(clean_df)}",
    sep="\n",
)

Total rows: 47959
Problematic rows: 223
Clean rows: 47736


## Save the Separate Datasets
- Use clean_df directly for training
- problematic_df is only about 0.46 % of all instances (223 of 47,959)
- Manually correct problematic_df if required

In [14]:
# Write both splits to disk without the DataFrame index: the problematic rows
# for potential manual correction, the clean rows for training.
for _frame, _csv_path in (
    (problematic_df, 'dataset/problematic_instances.csv'),
    (clean_df, 'dataset/clean_instances.csv'),
):
    _frame.to_csv(_csv_path, index=False)


## Convert to HuggingFace Dataset

In [15]:
# Convert to HuggingFace Dataset.
# clean_df was produced by boolean filtering, so its index is no longer a plain
# 0..n-1 range; preserve_index=False stops that leftover index from being
# carried into the dataset as an extra '__index_level_0__' column.
dataset = Dataset.from_pandas(clean_df, preserve_index=False)

## Final Verification

In [16]:
# Final verification
def verify_dataset(dataset):
    """Count and report examples whose Token, POS and New_Tag lengths differ.

    Args:
        dataset: Sized iterable of examples, each providing 'Token', 'POS' and
            'New_Tag' sequences.

    Returns:
        None. A one-line summary is printed to standard output.
    """
    errors = sum(
        1
        for example in dataset
        if not (len(example['Token']) == len(example['POS']) == len(example['New_Tag']))
    )
    print(f"Clean dataset contains {errors} length mismatch errors out of {len(dataset)} examples")

# Run the length check on the HuggingFace dataset built from clean_df
verify_dataset(dataset)

Clean dataset contains 0 length mismatch errors out of 47736 examples
