<a href="https://colab.research.google.com/github/Habibaatef143/Named-Entity-recognition-/blob/main/NER_task_with_different_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies (transformers, datasets, tokenizers, seqeval)

In [None]:
!pip install transformers datasets tokenizers seqeval -q

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import csv
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statistics

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig,BertTokenizerFast
import datasets
from datasets import load_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import  pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput

import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

from seqeval.metrics import f1_score, accuracy_score


# load data

In [None]:
# Load the "adsabs/WIESP2022-NER" dataset through `datasets.load_dataset`.
dataset = load_dataset("adsabs/WIESP2022-NER")
# Inspect the dataset: print the first record of the training split.
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'bibcode': '2019MNRAS.486.5558S', 'label_studio_id': 487, 'ner_ids': [62, 62, 62, 62, 62, 62, 62, 15, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 17, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 15, 62, 4, 35, 35, 35, 35, 35, 62, 62, 62, 62, 4, 35, 35, 35, 35, 35, 35, 62, 4, 35, 35, 35, 35, 35, 35, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 15, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 17, 15, 27, 62, 30, 61, 62, 62, 30, 61, 62, 62, 62, 4, 35, 35, 35, 35, 35, 35, 4, 35, 35, 35, 35, 35, 35, 62, 4, 35, 35, 35, 35, 35, 62, 62, 62, 17, 62, 15, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62,

# data preparation

In [None]:
# Display the loaded object: its splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1753
    })
    validation: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1366
    })
    test: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 2505
    })
})

In [None]:
# Collect every distinct tag string that occurs in ANY split, not only in train.
# The tokenization step later looks each tag of each split up in a map built from
# this set, so a tag that appeared only in validation/test would raise a KeyError.
# Only the "ner_tags" column is read, so the other columns are never decoded.
all_labels = set()
for split_name in dataset:
    for tags in dataset[split_name]["ner_tags"]:
        all_labels.update(tags)
print(all_labels)
num_labels = len(all_labels)
print(num_labels)

{'I-Collaboration', 'O', 'B-Grant', 'I-Model', 'B-Formula', 'I-Survey', 'B-Citation', 'B-Dataset', 'I-Telescope', 'B-EntityOfFutureInterest', 'I-Wavelength', 'B-Observatory', 'I-Database', 'I-Tag', 'B-Tag', 'B-Person', 'I-Fellowship', 'I-CelestialRegion', 'I-Mission', 'I-Observatory', 'I-URL', 'I-Dataset', 'B-CelestialObjectRegion', 'I-Archive', 'B-Collaboration', 'I-CelestialObjectRegion', 'B-Identifier', 'B-Software', 'B-ObservationalTechniques', 'I-EntityOfFutureInterest', 'I-CelestialObject', 'I-Software', 'I-Citation', 'B-ComputingFacility', 'B-Database', 'I-ObservationalTechniques', 'I-Grant', 'B-Location', 'B-URL', 'I-TextGarbage', 'B-Archive', 'B-Telescope', 'I-Identifier', 'I-Organization', 'B-CelestialObject', 'B-Wavelength', 'I-Event', 'I-Location', 'B-TextGarbage', 'I-Instrument', 'B-Proposal', 'I-Person', 'B-CelestialRegion', 'B-Event', 'B-Instrument', 'I-Formula', 'B-Survey', 'I-Proposal', 'B-Fellowship', 'B-Model', 'B-Mission', 'B-Organization', 'I-ComputingFacility'}
63

# data preprocessing (Model and Tokenizer initialization)

In [None]:
# Number the tags in sorted order. `all_labels` is a set of strings, and the
# iteration order of such a set can differ between interpreter sessions, so
# enumerating it directly would assign different ids on every fresh kernel and
# make a saved model's class indices impossible to reproduce.
label2id = {label: i for i, label in enumerate(sorted(all_labels))}
# Inverse lookup: class index -> tag string.
id2label = {i: label for label, i in label2id.items()}

In [None]:
# Fast tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Alternative that was tried and left disabled: the "dslim/bert-base-NER" checkpoint with num_labels=9.
# Token-classification model on "bert-base-uncased", sized to `num_labels` classes and
# carrying both label maps in its configuration.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=num_labels,id2label=id2label, label2id=label2id)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the first training record on its own to see how its words are split.
text= dataset["train"][0]
# "tokens" is already a list of words, hence is_split_into_words=True.
# No truncation is requested here; the recorded run warned that this example is
# 735 tokens long, above the model's 512-token maximum.
tokenized_input = tokenizer(text["tokens"], is_split_into_words=True)
# Map the produced ids back to their token strings for display.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input.input_ids)
print(tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (735 > 512). Running this sequence through the model will result in indexing errors


['[CLS]', 'whilst', 'a', 'reasonable', 'harmonic', 'fit', 'to', 'the', 'es', '##pad', '##ons', 'data', 'can', 'be', 'achieved', 'using', 'this', 'period', ',', 'it', 'does', 'not', 'produce', 'an', 'acceptable', 'ph', '##asi', '##ng', 'of', 'all', 'available', '〈', 'b', 'z', '〉', 'measurements', '.', 'figure', '1', '.', 'photo', '##metric', '(', 'top', ')', 'and', 'magnetic', '〈', 'b', 'z', '〉', '(', 'bottom', ')', 'measurements', ',', 'phased', 'with', 'periods', 'determined', 'from', '(', 'left', 'to', 'right', ')', 'k', '##2', 'photo', '##metry', ',', 'all', '〈', 'b', 'z', '〉', 'measurements', ',', 'and', 'all', 'photo', '##metric', 'measurements', '.', '〈', 'b', 'z', '〉', 'measurements', 'were', 'obtained', 'from', 'es', '##pad', '##ons', 'by', 'shu', '##lt', '##z', 'et', 'al', '.', '(', '2018', ')', 'and', 'photo', '##pol', '##ari', '##metric', 'data', 'by', 'bo', '##rra', 'et', 'al', '.', '(', '1983', ',', 'b', '##lt', '##8', '##3', ')', 'and', 'bo', '##hl', '##end', '##er', 'et'

In [None]:
# Show the raw encoding produced for the single example above.
print(tokenized_input)
# word_ids() is what the label-alignment function below iterates over; entries
# that are None are given the label -100 there.
word_ids = tokenized_input.word_ids()
print(word_ids)

{'input_ids': [101, 5819, 1037, 9608, 19452, 4906, 2000, 1996, 9686, 15455, 5644, 2951, 2064, 2022, 4719, 2478, 2023, 2558, 1010, 2009, 2515, 2025, 3965, 2019, 11701, 6887, 21369, 3070, 1997, 2035, 2800, 1637, 1038, 1062, 1638, 11702, 1012, 3275, 1015, 1012, 6302, 12589, 1006, 2327, 1007, 1998, 8060, 1637, 1038, 1062, 1638, 1006, 3953, 1007, 11702, 1010, 21718, 2007, 6993, 4340, 2013, 1006, 2187, 2000, 2157, 1007, 1047, 2475, 6302, 24327, 1010, 2035, 1637, 1038, 1062, 1638, 11702, 1010, 1998, 2035, 6302, 12589, 11702, 1012, 1637, 1038, 1062, 1638, 11702, 2020, 4663, 2013, 9686, 15455, 5644, 2011, 18454, 7096, 2480, 3802, 2632, 1012, 1006, 2760, 1007, 1998, 6302, 18155, 8486, 12589, 2951, 2011, 8945, 11335, 3802, 2632, 1012, 1006, 3172, 1010, 1038, 7096, 2620, 2509, 1007, 1998, 8945, 7317, 10497, 2121, 3802, 2632, 1012, 1006, 2857, 1010, 1038, 7096, 2683, 2509, 1007, 1012, 1996, 5024, 1998, 18198, 10543, 2265, 1010, 4414, 1010, 1996, 2190, 2117, 1011, 2344, 19452, 4906, 2000, 1996, 9686

# Tokenize the entire dataset

In [None]:
# Raw (not yet tokenized) training split; `train_dataset` is reassigned to the
# tokenized split further down.
train_dataset = dataset["train"]
# Disabled smoke test (tokenize_and_align_labels is only defined in the next cell):
#test_func= tokenize_and_align_labels(train_dataset[2:5])
#test_func

In [None]:
def tokenize_and_align_labels(examples, max_length=512):
    """Tokenize a batch of pre-split sentences and build per-token label ids.

    Relies on the module-level `tokenizer` and `label2id`.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            example) and an "ner_tags" entry (one tag string per word).
        max_length: length every example is truncated/padded to. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one list of `max_length` label ids per example.

    Label assignment per token:
        * positions with no source word (word id ``None``) get -100;
        * the first piece of a word gets the id of the word's tag;
        * further pieces of the same word get the id of the matching ``I-`` tag
          when the word's tag is ``B-...``, otherwise the word's own tag.
          Repeating the ``B-`` tag on every piece (the previous behaviour, where
          both branches were identical) marks each piece as the start of a new
          entity and breaks the BIO scheme.

    Raises:
        KeyError: if a tag is missing from `label2id`.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        is_split_into_words=True,
    )
    labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First piece of a new word keeps the word's own tag.
                label_ids.append(label2id[word_tags[word_idx]])
            else:
                # Continuation piece: an entity cannot begin again inside a word.
                tag = word_tags[word_idx]
                if tag.startswith("B-"):
                    inside_tag = "I-" + tag[2:]
                    # Fall back to the original tag if no I- counterpart exists.
                    label_ids.append(label2id.get(inside_tag, label2id[tag]))
                else:
                    label_ids.append(label2id[tag])
            previous_word_idx = word_idx
        # Defensive: keep labels the same length as the padded inputs.
        label_ids += [-100] * (max_length - len(label_ids))
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenize_and_align_labels to every split in batches; the function's
# returned entries (including "labels") are added to each split.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Map:   0%|          | 0/1366 [00:00<?, ? examples/s]

Map:   0%|          | 0/2505 [00:00<?, ? examples/s]

In [None]:
# Keep a handle on the tokenized test split BEFORE the raw text columns are
# dropped, so `test_dataset` still contains "tokens" and "ner_tags".
test_dataset = tokenized_datasets["test"]
# Drop the raw word and tag-string columns from all splits.
# NOTE(review): re-running this cell after the columns are gone will presumably
# raise because the columns no longer exist; re-run the map cell first. Confirm.
tokenized_datasets = tokenized_datasets.remove_columns(["tokens","ner_tags"])

In [None]:
from datasets import load_metric
# Collator that is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric object used by compute_metrics.
# NOTE(review): the recorded run emitted a warning on this line; load_metric
# appears to be deprecated. Confirm against the installed `datasets` version.
metric = load_metric("seqeval")


  metric = load_metric("seqeval")


In [None]:
# Tokenized splits used for training and for per-epoch evaluation.
# This rebinds `train_dataset`, which earlier pointed at the raw training split.
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]

# Train and evaluate the model

In [None]:
def compute_metrics(eval_preds, label2id, id2label, zero_division='warn'):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: pair ``(logits, label_ids)``; the predicted class is the
            argmax of the logits over axis 2.
        label2id: accepted for call compatibility; not used in the body.
        id2label: mapping from class index to tag string.
        zero_division: forwarded unchanged to ``metric.compute``.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", read
        from the metric's ``overall_*`` results.

    Positions whose reference label is -100 are skipped in both the predicted
    and the reference sequences. Uses the module-level `metric` object.
    """
    logits, gold_ids = eval_preds
    # The argmax alone decides the class, so no softmax is applied.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_gold = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(id2label[predicted_id])
            kept_gold.append(id2label[gold_id])
        predictions.append(kept_predictions)
        true_labels.append(kept_gold)

    results = metric.compute(
        predictions=predictions,
        references=true_labels,
        zero_division=zero_division,
    )
    return {
        name: results["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Training configuration: evaluation, checkpoint saving and logging all happen
# once per epoch; checkpoints are written under ./results.
args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate = 5e-5,
    save_strategy = 'epoch',
    logging_strategy = 'epoch',
    log_level = "error"
)


# Trainer wiring: the model defined above, the tokenized train/validation splits,
# the token-classification collator, and a lambda that binds the two label maps
# into compute_metrics so the Trainer can call it with a single argument.
trainer = Trainer(
   model=model,
   args=args,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=lambda x: compute_metrics(x, label2id, id2label)
)




In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in `results`.
results= trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5217,0.275806,0.726846,0.7571,0.741665,0.929071
2,0.2173,0.233277,0.785417,0.790416,0.787909,0.939719
3,0.1432,0.233925,0.767234,0.830411,0.797573,0.941308
4,0.0984,0.230609,0.800117,0.827112,0.81339,0.946121


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5217,0.275806,0.726846,0.7571,0.741665,0.929071
2,0.2173,0.233277,0.785417,0.790416,0.787909,0.939719
3,0.1432,0.233925,0.767234,0.830411,0.797573,0.941308
4,0.0984,0.230609,0.800117,0.827112,0.81339,0.946121
5,0.0694,0.241866,0.803042,0.826113,0.814414,0.946142
6,0.0503,0.249956,0.794314,0.837857,0.815505,0.945675
7,0.0374,0.265409,0.810406,0.835941,0.822975,0.947776
8,0.0271,0.268482,0.804406,0.839595,0.821624,0.9474
9,0.0209,0.278059,0.812403,0.834476,0.823292,0.948298
10,0.0167,0.281684,0.810551,0.839719,0.824877,0.948467


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
 # Evaluate on the Trainer's eval_dataset (the validation split) and display the metrics.
 trainer.evaluate()

{'eval_loss': 0.2816838026046753,
 'eval_precision': 0.8105512102314766,
 'eval_recall': 0.8397185836105066,
 'eval_f1': 0.8248771403793017,
 'eval_accuracy': 0.9484672145899574,
 'eval_runtime': 58.0613,
 'eval_samples_per_second': 23.527,
 'eval_steps_per_second': 5.89,
 'epoch': 10.0}

# saving the model and tokenizer

In [None]:
# Write the model and the tokenizer files into the same "ner_model" directory,
# which is the directory the model is later reloaded from.
model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")

('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

In [None]:
# Colab-only: mount Google Drive at /content/drive so files can be copied to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil

# Source folder in Colab (presumably the Trainer's "./results" output directory)
source_folder = '/content/results'  # Replace with your folder path

# Destination folder in Google Drive
destination_folder = '/content/drive/My Drive/my_folder/results'  # Replace with your destination path in Drive

# Copy the folder. dirs_exist_ok=True lets this cell be re-run: without it
# copytree raises FileExistsError as soon as the destination already exists.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

'/content/drive/My Drive/my_folder/results'

# prediction

In [None]:
import json

# Rewrite the label maps inside the saved model config.
# Both file handles are opened in `with` blocks so they are closed (and the
# written JSON flushed to disk) deterministically instead of being left open.
config_path = "ner_model/config.json"
with open(config_path) as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id
with open(config_path, "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Display the index -> tag mapping that was just assigned into the config dict.
config["id2label"]

{0: 'I-Collaboration',
 1: 'O',
 2: 'B-Grant',
 3: 'I-Model',
 4: 'B-Formula',
 5: 'I-Survey',
 6: 'B-Citation',
 7: 'B-Dataset',
 8: 'I-Telescope',
 9: 'B-EntityOfFutureInterest',
 10: 'I-Wavelength',
 11: 'B-Observatory',
 12: 'I-Database',
 13: 'I-Tag',
 14: 'B-Tag',
 15: 'B-Person',
 16: 'I-Fellowship',
 17: 'I-CelestialRegion',
 18: 'I-Mission',
 19: 'I-Observatory',
 20: 'I-URL',
 21: 'I-Dataset',
 22: 'B-CelestialObjectRegion',
 23: 'I-Archive',
 24: 'B-Collaboration',
 25: 'I-CelestialObjectRegion',
 26: 'B-Identifier',
 27: 'B-Software',
 28: 'B-ObservationalTechniques',
 29: 'I-EntityOfFutureInterest',
 30: 'I-CelestialObject',
 31: 'I-Software',
 32: 'I-Citation',
 33: 'B-ComputingFacility',
 34: 'B-Database',
 35: 'I-ObservationalTechniques',
 36: 'I-Grant',
 37: 'B-Location',
 38: 'B-URL',
 39: 'I-TextGarbage',
 40: 'B-Archive',
 41: 'B-Telescope',
 42: 'I-Identifier',
 43: 'I-Organization',
 44: 'B-CelestialObject',
 45: 'B-Wavelength',
 46: 'I-Event',
 47: 'I-Location',


In [None]:
# Display the tag -> index mapping that was just assigned into the config dict.
config["label2id"]

{'I-Collaboration': 0,
 'O': 1,
 'B-Grant': 2,
 'I-Model': 3,
 'B-Formula': 4,
 'I-Survey': 5,
 'B-Citation': 6,
 'B-Dataset': 7,
 'I-Telescope': 8,
 'B-EntityOfFutureInterest': 9,
 'I-Wavelength': 10,
 'B-Observatory': 11,
 'I-Database': 12,
 'I-Tag': 13,
 'B-Tag': 14,
 'B-Person': 15,
 'I-Fellowship': 16,
 'I-CelestialRegion': 17,
 'I-Mission': 18,
 'I-Observatory': 19,
 'I-URL': 20,
 'I-Dataset': 21,
 'B-CelestialObjectRegion': 22,
 'I-Archive': 23,
 'B-Collaboration': 24,
 'I-CelestialObjectRegion': 25,
 'B-Identifier': 26,
 'B-Software': 27,
 'B-ObservationalTechniques': 28,
 'I-EntityOfFutureInterest': 29,
 'I-CelestialObject': 30,
 'I-Software': 31,
 'I-Citation': 32,
 'B-ComputingFacility': 33,
 'B-Database': 34,
 'I-ObservationalTechniques': 35,
 'I-Grant': 36,
 'B-Location': 37,
 'B-URL': 38,
 'I-TextGarbage': 39,
 'B-Archive': 40,
 'B-Telescope': 41,
 'I-Identifier': 42,
 'I-Organization': 43,
 'B-CelestialObject': 44,
 'B-Wavelength': 45,
 'I-Event': 46,
 'I-Location': 47,


In [None]:
# Reload the fine-tuned weights and the edited config from the "ner_model" directory.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
# Print the words of the first test example. Index the row first and then the
# column: selecting the whole 'tokens' column just to read one row is wasteful.
print(dataset['test'][0]['tokens'])

['The', 'authors', 'would', 'like', 'to', 'thank', 'Adam', 'Burgasser,', 'Brendan', 'Bowler,', 'Kelle', 'Cruz,', 'Mike', 'Cushing,', 'Michael', 'Liu,', 'and', 'Emily', 'Rice', 'for', 'useful', 'discussions', 'on', 'benchmark', 'systems,', 'data', 'treatment,', 'and', 'various', 'data-model', 'comparison', 'approaches.', 'The', 'authors', 'thank', 'Richard', 'Freedman', 'and', 'Roxana', 'Lupu', 'for', 'providing', 'gas', 'opacities', 'and', 'Caroline', 'Morley', 'for', 'radiative', 'transfer', 'code', 'comparisons', 'and', 'helpful', 'discussions.', 'We', 'thank', 'Jacob', 'Lustig-Yeager', 'and', 'Kyle', 'Luther', 'for', 'rewriting', 'portions', 'of', 'the', 'code', 'in', 'python', 'and', 'C', 'for', 'significant', 'speed', 'improvements', 'and', 'also', 'Dan', 'Foreman-Mackey', 'for', 'making', 'EMCEE', 'available', 'to', 'the', 'community.', 'Finally,', 'we', 'thank', 'the', 'anonymous', 'referee', 'and', 'statistics', 'consultant', 'for', 'useful', 'and', 'insightful', 'comments.', '

In [None]:
from transformers import pipeline
# Token-classification pipeline built from the reloaded model and the in-memory tokenizer.
# NOTE(review): no aggregation_strategy is passed, and the recorded output lists
# one entry per word piece (e.g. 'bu', '##rga', '##sser'). Passing an
# aggregation strategy would presumably merge pieces into words, but it changes
# the result keys, so confirm against any later consumer of `ner_results`.
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
# Hard-coded text that looks like the first test example's words joined into a sentence — verify if it must match exactly.
example = "The authors would like to thank Adam Burgasser, Brendan Bowler, Kelle Cruz, Mike Cushing, Michael Liu, and Emily Rice for useful discussions on benchmark systems, data treatment, and various data-model comparison approaches. The authors thank Richard Freedman and Roxana Lupu for providing gas opacities and Caroline Morley for radiative transfer code comparisons and helpful discussions. We thank Jacob Lustig-Yeager and Kyle Luther for rewriting portions of the code in python and C for significant speed improvements and also Dan Foreman-Mackey for making EMCEE available to the community. Finally, we thank the anonymous referee and statistics consultant for useful and insightful comments. J.T. acknowledges financial support from the Carnegie Origins Postdoctoral Fellowship Program. B.B. acknowledges financial support from the European Commission in the form of a Marie Curie International Outgoing Fellowship (PIOF-GA-2013-629435). J.F. acknowledges funding support from NSF award AST-1312545  M.M. acknowledges support from the NASA Astrophysics Theory and Planetary Atmospheres programs. Facility:Magellan:Clay"
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-Person', 'score': 0.99915886, 'index': 7, 'word': 'adam', 'start': 32, 'end': 36}, {'entity': 'I-Person', 'score': 0.9991059, 'index': 8, 'word': 'bu', 'start': 37, 'end': 39}, {'entity': 'I-Person', 'score': 0.99902916, 'index': 9, 'word': '##rga', 'start': 39, 'end': 42}, {'entity': 'I-Person', 'score': 0.9990195, 'index': 10, 'word': '##sser', 'start': 42, 'end': 46}, {'entity': 'I-Person', 'score': 0.9989586, 'index': 11, 'word': ',', 'start': 46, 'end': 47}, {'entity': 'B-Person', 'score': 0.9990772, 'index': 12, 'word': 'brendan', 'start': 48, 'end': 55}, {'entity': 'I-Person', 'score': 0.9991227, 'index': 13, 'word': 'bowler', 'start': 56, 'end': 62}, {'entity': 'I-Person', 'score': 0.99904746, 'index': 14, 'word': ',', 'start': 62, 'end': 63}, {'entity': 'B-Person', 'score': 0.99904495, 'index': 15, 'word': 'ke', 'start': 64, 'end': 66}, {'entity': 'B-Person', 'score': 0.9991366, 'index': 16, 'word': '##lle', 'start': 66, 'end': 69}, {'entity': 'I-Person', 'score