In [None]:

# Install necessary packages
!pip install pandas scikit-learn transformers
!pip install nltk
!python -m nltk.downloader stopwords

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, pipeline
import torch
from torch.utils.data import Dataset
import nltk
from nltk.corpus import stopwords
import string
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Database Source ( https://www.kaggle.com/datasets/rajnathpatel/ner-data/data )

In [None]:
# Sample data
df=pd.read_csv('/content/drive/MyDrive/NER/ner.csv')
df.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [None]:
# Splitting data into train, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['text'].tolist(), df['labels'].tolist(), test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

In [None]:
# Load pre-trained BERT model and tokenizer
model_name = "dslim/bert-base-NER"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

In [None]:
# Define custom dataset class
class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, label_map, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        entities = self.labels[idx].split()  # Split the labels string into a list

        # Tokenize the input text
        encoding = self.tokenizer(
            text.split(),  # Split the text into tokens
            is_split_into_words=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_offsets_mapping=True
        )

        # Initialize labels for each token
        labels_ids = [self.label_map["O"]] * len(encoding["input_ids"])

        # Update labels for each token
        word_ids = encoding.word_ids()
        for i, word_id in enumerate(word_ids):
            if word_id is None or word_id >= len(entities):
                continue
            entity_label = entities[word_id]
            labels_ids[i] = self.label_map.get(entity_label, self.label_map["O"])

        # Convert everything to tensors
        item = {key: torch.tensor(val) for key, val in encoding.items() if key != "offset_mapping"}
        item["labels"] = torch.tensor(labels_ids)

        return item


In [None]:
# Create a label map
label_map = {"O": 0, "B-geo": 1, "B-gpe": 2, "I-geo": 3, "I-gpe": 4}


In [None]:
# Create datasets
train_dataset = NERDataset(train_texts, train_labels, tokenizer, label_map)
val_dataset = NERDataset(val_texts, val_labels, tokenizer, label_map)
test_dataset = NERDataset(test_texts, test_labels, tokenizer, label_map)

In [None]:
# Function to align predictions
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:
                true_label = label_ids[i, j].item()
                pred_label = preds[i, j].item()
                out_label_list[i].append(list(label_map.keys())[list(label_map.values()).index(true_label)])
                preds_list[i].append(list(label_map.keys())[list(label_map.values()).index(pred_label)])

    return preds_list, out_label_list

# Compute metrics function
def compute_metrics(p):
    predictions, label_ids = p
    preds_list, out_label_list = align_predictions(predictions, label_ids)

    mlb = MultiLabelBinarizer(classes=list(label_map.keys()))
    out_label_list = mlb.fit_transform(out_label_list)
    preds_list = mlb.transform(preds_list)

    results = classification_report(out_label_list, preds_list, output_dict=True)
    accuracy = accuracy_score(out_label_list, preds_list)
    return {
        "accuracy": accuracy,
        "precision": results["macro avg"]["precision"],
        "recall": results["macro avg"]["recall"],
        "f1": results["macro avg"]["f1-score"],
    }

In [None]:

# Training arguments
training_args = TrainingArguments(
    output_dir='output',             # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir='logs',              # directory for storing logs
    logging_steps=200,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



In [None]:
# Train the model
trainer.train()



Step,Training Loss
200,0.0253
400,0.0146
600,0.0139
800,0.0111
1000,0.0123
1200,0.0116
1400,0.0109
1600,0.0104
1800,0.0106
2000,0.0116


Step,Training Loss
200,0.0253
400,0.0146
600,0.0139
800,0.0111
1000,0.0123
1200,0.0116
1400,0.0109
1600,0.0104
1800,0.0106
2000,0.0116


TrainOutput(global_step=14388, training_loss=0.007635824032250066, metrics={'train_runtime': 3264.4452, 'train_samples_per_second': 35.259, 'train_steps_per_second': 4.407, 'total_flos': 7519355343376128.0, 'train_loss': 0.007635824032250066, 'epoch': 3.0})

In [None]:
# Evaluate the model
trainer.evaluate()


{'eval_loss': 0.009686805307865143,
 'eval_accuracy': 0.9022101751459549,
 'eval_precision': 0.8962237487790581,
 'eval_recall': 0.8633579113574642,
 'eval_f1': 0.8767682485125826,
 'eval_runtime': 37.4333,
 'eval_samples_per_second': 128.121,
 'eval_steps_per_second': 16.029,
 'epoch': 3.0}

In [None]:
# Test the model
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[[ 11.302897  ,  -2.1067371 ,  -2.8891938 , ...,  -7.9050264 ,
          -7.2413216 ,  -7.7711067 ],
        [ 11.227863  ,  -2.2539685 ,  -3.0281923 , ...,  -7.4953756 ,
          -6.797913  ,  -7.630873  ],
        [ 11.270838  ,  -2.187447  ,  -2.9387207 , ...,  -7.5696087 ,
          -7.0407195 ,  -7.8028474 ],
        ...,
        [ 11.3654785 ,  -2.2772377 ,  -3.0328865 , ...,  -7.5567274 ,
          -6.839637  ,  -7.531323  ],
        [ 11.366277  ,  -2.266736  ,  -3.0395963 , ...,  -7.5524383 ,
          -6.8401394 ,  -7.543502  ],
        [ 11.376814  ,  -2.2292113 ,  -3.044363  , ...,  -7.6010003 ,
          -6.8977494 ,  -7.570059  ]],

       [[ 11.194849  ,  -2.0121703 ,  -2.7658405 , ...,  -7.923337  ,
          -7.2401996 ,  -7.8077226 ],
        [ 11.192759  ,  -2.1859193 ,  -2.6703866 , ...,  -7.5799127 ,
          -6.986175  ,  -7.671126  ],
        [  9.343191  ,   1.8062549 ,  -0.83995235, ..., -10.572363  ,
         -10.0675    ,

In [None]:
#save model in drive
trainer.save_model('/content/drive/MyDrive/NER/NerModel')