In [32]:
import csv

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForTokenClassification
from transformers import Trainer, TrainingArguments

In [2]:
# Load your custom dataset
# The file holds tab-separated "token<TAB>label" lines rather than quoted CSV,
# so the parser defaults are overridden:
#   - quoting=csv.QUOTE_NONE keeps a literal '"' token from being treated as
#     the start of a quoted field;
#   - keep_default_na=False with na_values=[''] stops pandas from converting
#     word tokens such as "null", "n/a" or "nan" into NaN; only genuinely
#     empty fields become NaN;
#   - dtype=str keeps both columns as text.
custom_dataset = pd.read_csv(
    'ner_data.conll',
    sep='\t',
    header=None,
    names=['token', 'ner_label'],
    dtype=str,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    na_values=[''],
)

# Split dataset into train, validation, and test sets
# First hold out 20% of the rows for testing, then 20% of the remainder for
# validation; the fixed random_state keeps both splits reproducible.
# NOTE(review): each row is a single token, so these splits shuffle tokens
# independently of the sentences they came from -- confirm this is intended.
train_data, test_data = train_test_split(custom_dataset, test_size=0.2, random_state=42)
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [3]:
# Load tokenizer and model
# Both come from the same pretrained checkpoint. The classification head is
# sized to the number of distinct values in the dataset's 'ner_label' column.
# NOTE(review): label ids are assigned separately by the hand-written
# label_map further down -- confirm its size agrees with this count.
pretrained_name = 'bert-base-uncased'
distinct_label_count = len(custom_dataset['ner_label'].unique())

tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForTokenClassification.from_pretrained(
    pretrained_name,
    num_labels=distinct_label_count,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Show the first rows of the training and validation splits, each under its
# own heading.
for heading, frame in (
    ("Train dataset sample:", train_data),
    ("\nEval dataset sample:", validation_data),
):
    print(heading)
    print(frame.head())

Train dataset sample:
         token ner_label
2492     owner         O
513        NaN         O
1634      risk         O
3361   dometic         O
1758  increase         O

Eval dataset sample:
            token ner_label
1746   electrical         O
2315        crash         O
1293     position         O
2205  compartment         O
2943     movement         O


In [5]:
# Drop rows with NaN values
# The frames are rebound to the cleaned result instead of calling
# dropna(inplace=True): the splits were derived from custom_dataset, and an
# in-place edit on such a derived frame may raise SettingWithCopyWarning.
# Rebinding also keeps this cell safe to re-run.
# The held-out test split is cleaned as well so all three splits stay consistent.
train_data = train_data.dropna()
validation_data = validation_data.dropna()
test_data = test_data.dropna()

# Print the updated dataset sample
print("Train dataset sample after dropping NaN values:")
print(train_data.head())

print("\nEval dataset sample after dropping NaN values:")
print(validation_data.head())

Train dataset sample after dropping NaN values:
         token  ner_label
2492     owner          O
1634      risk          O
3361   dometic          O
1758  increase          O
1399  national  I-company

Eval dataset sample after dropping NaN values:
            token ner_label
1746   electrical         O
2315        crash         O
1293     position         O
2205  compartment         O
2943     movement         O


In [37]:
# Numerical id for every NER tag, assigned by position in the list below.
# 'O' marks tokens outside of any named entity; a 'B-' tag marks the beginning
# of an entity and an 'I-' tag marks a token inside one.
# The component tags are listed I- before B-, so 'I-component' is 13 and
# 'B-component' is 14.
ner_tags = [
    "O",
    "B-company", "I-company",
    "B-failure issue", "I-failure issue",
    "B-corrective action", "I-corrective action",
    "B-vehicle model", "I-vehicle model",
    "B-contact", "I-contact",
    "B-standard", "I-standard",
    "I-component", "B-component",
]
label_map = {tag: tag_id for tag_id, tag in enumerate(ner_tags)}

# Tokenize one token string and attach its numerical NER label
def tokenize_text(text, label, max_length=200):
    """Encode ``text`` with the module-level tokenizer and attach its label id.

    Args:
        text: The string to encode.
        label: NER tag string; must be a key of the module-level ``label_map``.
        max_length: Length every encoding is padded or truncated to. Defaults
            to 200, the value that used to be hard-coded here.

    Returns:
        A dict with the keys 'input_ids', 'attention_mask', 'token_type_ids'
        (``None`` when the tokenizer output has no such entry) and 'labels'
        (a tensor holding the single integer id of ``label``).

    Raises:
        KeyError: If ``label`` is not present in ``label_map``.

    NOTE(review): 'labels' carries one id for the whole encoded sequence,
    while the other entries span ``max_length`` positions. Token-classification
    heads normally expect one label per position -- confirm the downstream
    training code accounts for this.
    """
    # Resolve the label first so an unmapped tag fails fast with a clear message.
    try:
        label_id = label_map[label]
    except KeyError:
        raise KeyError(f"Unknown NER label {label!r}; add it to label_map") from None

    # return_tensors='pt' asks the tokenizer for PyTorch tensors.
    # Presumably each carries a leading batch dimension of 1 for a single
    # string -- TODO confirm before collating.
    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded.get('token_type_ids', None),
        'labels': torch.tensor(label_id),  # Convert string label to numerical format
    }

# Preprocess training and validation data
# Each feature is the plain dict returned by tokenize_text. The previous
# trailing comma wrapped every dict in a 1-tuple, which did not match the
# dict-per-row lists built by the explicit loops later in the notebook.
train_features = [
    tokenize_text(text, label)
    for text, label in zip(train_data['token'], train_data['ner_label'])
]
validation_features = [
    tokenize_text(text, label)
    for text, label in zip(validation_data['token'], validation_data['ner_label'])
]


In [51]:
# Encode every (token, tag) row of the training split into a feature dict.
train_features = [
    tokenize_text(token_text, tag)
    for token_text, tag in zip(train_data['token'], train_data['ner_label'])
]

In [52]:
# Encode every (token, tag) row of the validation split into a feature dict.
validation_features = [
    tokenize_text(token_text, tag)
    for token_text, tag in zip(validation_data['token'], validation_data['ner_label'])
]

In [54]:
# Sanity check: display the container type holding the training features.
features_container_type = type(train_features)
features_container_type

list