In [None]:
# 1. Install Required Libraries
# Ensure you have the following libraries installed:
# bash
# pip install transformers datasets torch scikit-learn

In [2]:
'''
This is script prepares a dataset for Named Entity Recognition (NER) tasks by loading a CSV file, 
tokenizing the text into sentences, and splitting the data into training and validation sets.
'''
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Load the dataset
file_path = "my_dataset.csv"  # Path to your dataset
data = pd.read_csv(file_path)

# Define the custom tags and map them to integers
custom_tags = [
    "O", "I-ORG", "I-PERSON", "B-DISEASE", "B-ORG", "B-PERSON",
    "I-DISEASE", "B-CHEMICAL", "B-GPE", "I-CHEMICAL",
    "I-DATE", "B-DATE", "I-GPE", "B-LOC", "I-LOC"
]
tag_to_id = {tag: idx for idx, tag in enumerate(custom_tags)}
id_to_tag = {idx: tag for tag, idx in tag_to_id.items()}

# Map NER tags to their integer IDs
data['NER'] = data['NER'].map(tag_to_id)

# Prepare tokens and tags for sentence grouping
all_tokens = data['Token'].tolist()
all_tags = data['NER'].tolist()

# Tokenize text into sentences
text = " ".join(all_tokens)  # Combine tokens into a single string
sent_tokens = sent_tokenize(text)  # Split text into sentences

# Group tokens and tags by sentences
sentences = []
ner_tags = []
token_index = 0

for sent in sent_tokens:
    words = sent.split()  # Split sentence into words
    current_sentence = []
    current_tags = []

    for word in words:
        if token_index < len(all_tokens):  # Ensure index is within bounds
            current_sentence.append(word)
            current_tags.append(all_tags[token_index])
            token_index += 1
        else:
            break

    if current_sentence:  # Add non-empty sentences
        sentences.append(current_sentence)
        ner_tags.append(current_tags)

# Print initial dataset size
print(f"Total number of sentences: {len(sentences)}")

# Check if we have enough data to split
if len(sentences) > 1:
    # Split into training and validation sets
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        sentences, ner_tags, train_size=0.8, random_state=42)
    
    # Display dataset sizes
    print(f"Number of training sentences: {len(train_tokens)}")
    print(f"Number of validation sentences: {len(val_tokens)}")

    # Prepare dataset dictionaries
    train_data = {"tokens": train_tokens, "ner_tags": train_tags}
    val_data = {"tokens": val_tokens, "ner_tags": val_tags}

else:
    print("Dataset is too small to split. Using all data for training.")
    # Use all data for training
    train_data = {"tokens": sentences, "ner_tags": ner_tags}
    val_data = {"tokens": [], "ner_tags": []}

# Print some statistics about the data
print("\nDataset Statistics:")
print(f"Number of unique tokens: {len(set([token for sent in sentences for token in sent]))}")
print(f"Number of unique tags: {len(set([tag for tags in ner_tags for tag in tags]))}")
print(f"Average sentence length: {sum(len(sent) for sent in sentences)/len(sentences) if sentences else 0:.2f}")

Total number of sentences: 1
Dataset is too small to split. Using all data for training.

Dataset Statistics:
Number of unique tokens: 3007
Number of unique tags: 15
Average sentence length: 15777.00


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# 3. Convert to Hugging Face Dataset
# Hugging Face expects datasets in its datasets format.
'''
This script prepares a dataset for Named Entity Recognition (NER) tasks by loading a CSV file, 
tokenizing the text into sentences, splitting the data into training and validation sets, 
and converting the data into the Hugging Face Dataset format.
'''
from datasets import Dataset

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)



In [4]:
# 4. Load a Pre-Trained Transformer

from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load tokenizer and model
model_name = "bert-base-cased" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag_to_id)
)




Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# my model 
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
# 5. Tokenize and Align Labels
# tokenize input text and align the corresponding labels. 
#This is a crucial step in preparing data for training a named entity recognition (NER) model.
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)




In [None]:
# 6. Define Training Arguments
# Set hyperparameters and configurations.


from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)




In [None]:
# Import necessary modules from transformers and sklearn
from transformers import DataCollatorForTokenClassification, Trainer
from sklearn.metrics import classification_report
import numpy as np

# Data collator for token classification; handles padding sequences dynamically
data_collator = DataCollatorForTokenClassification(tokenizer)

# Function to compute evaluation metrics
def compute_metrics(pred):
    # Extract predictions and labels from the output
    predictions, labels = pred
    # Apply argmax to predictions along the last dimension to get predicted classes
    predictions = np.argmax(predictions, axis=2)

    # Remove the ignored index (-100) from labels for true evaluations
    true_labels = [[l for l in label if l != -100] for label in labels]
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]  # Align predictions with labels
        for prediction, label in zip(predictions, labels)
    ]

    # Flatten the labels and predictions for metrics calculation
    flat_labels = [item for sublist in true_labels for item in sublist]
    flat_predictions = [item for sublist in true_predictions for item in sublist]
    
    # Generate a classification report with precision, recall, and F1-score
    report = classification_report(flat_labels, flat_predictions, output_dict=True)
    return {
        "precision": report["weighted avg"]["precision"],  # Weighted average precision
        "recall": report["weighted avg"]["recall"],        # Weighted average recall
        "f1": report["weighted avg"]["f1-score"],          # Weighted average F1-score
    }

# Define the Trainer instance for model training and evaluation
trainer = Trainer(
    model=model,                         # The model to be trained
    args=training_args,                  # Training arguments (e.g., epochs, batch size)
    train_dataset=tokenized_train,       # Tokenized training dataset
    eval_dataset=tokenized_val,          # Tokenized validation dataset
    tokenizer=tokenizer,                 # Tokenizer for pre-processing
    data_collator=data_collator,         # Data collator for padding sequences
    compute_metrics=compute_metrics,     # Custom metric computation function
)


In [None]:
# 8. Train the Model
# Start fine-tuning the model.

trainer.train()

In [None]:
# 9. Evaluate the Model
# Evaluate on the validation dataset.


metrics = trainer.evaluate()
print(metrics)

In [None]:
# 10. Save the Fine-Tuned Model
# Save the model for future use.


model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")

In [None]:
# 11. Inference with Fine-Tuned Model
# Use the fine-tuned model for predictions.

from transformers import pipeline

# Load the fine-tuned model
ner_pipeline = pipeline(
    "ner", model="./fine_tuned_ner_model", tokenizer="./fine_tuned_ner_model", aggregation_strategy="simple"
)

# Predict entities
text = "Hassen diagnosed John with COVID-19 at MG"
entities = ner_pipeline(text)
print(entities)
