In [None]:
 !pip install datasets==3.6.0
 !pip install -U spacy



In [None]:
# # Use a code cell in your notebook and add a "!" at the beginning
# !pip uninstall spacy thinc numpy -y
# !pip install --no-cache-dir -U spacy

In [None]:
import torch

# Report which CUDA device PyTorch sees; nothing is printed when CUDA is unavailable.
if torch.cuda.is_available():
    device_index = torch.cuda.current_device()
    print(f"Current device: {device_index}")
    print(f"Device name: {torch.cuda.get_device_name()}")


Current device: 0
Device name: Tesla T4


In [None]:
from datasets import load_dataset
import os
# NOTE(review): this variable is set only after `datasets` has been imported.
# The Hugging Face libraries presumably read HF_HUB_OFFLINE at import time, in
# which case the assignment has no effect on the call below — confirm, then
# either move it above the import or drop it (offline mode would presumably
# require the dataset to be in the local cache already).
os.environ['HF_HUB_OFFLINE'] = '1'
# Load the "tner/ontonotes5" dataset; the result is bound to `ontonotes_dataset`.
ontonotes_dataset = load_dataset("tner/ontonotes5" )

In [None]:
import spacy

In [None]:
# Create a blank "en" pipeline and add an "ner" component as its last pipe.
# `ner` keeps a handle on that component so labels can be registered on it.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", last=True)
# GPU activation is currently switched off (left here for reference):
# if spacy.prefer_gpu():
#     print("✅ GPU enabled for spaCy")


In [None]:
ontonotes_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 59924
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 8528
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 8262
    })
})

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; OUTPUT_DIR lives underneath that mount.
drive.mount('/content/drive')
# Directory used for the saved pipeline.
OUTPUT_DIR = "/content/drive/My Drive/ML/NER"
# Relies on `os` having been imported in an earlier cell.
os.makedirs(OUTPUT_DIR, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Bind each split of the DatasetDict to its own module-level constant.
TRAIN_DATASET, TEST_DATASET, VALIDATION_DATASET = (
    ontonotes_dataset[split_name]
    for split_name in ("train", "test", "validation")
)

In [None]:
# Integer tag id -> BIO label string. A label's position in the list IS its
# tag id, so the order below must not be changed.
int_to_label = dict(enumerate([
    "O",              # 0
    "B-CARDINAL",     # 1
    "B-DATE",         # 2
    "I-DATE",         # 3
    "B-PERSON",       # 4
    "I-PERSON",       # 5
    "B-NORP",         # 6
    "B-GPE",          # 7
    "I-GPE",          # 8
    "B-LAW",          # 9
    "I-LAW",          # 10
    "B-ORG",          # 11
    "I-ORG",          # 12
    "B-PERCENT",      # 13
    "I-PERCENT",      # 14
    "B-ORDINAL",      # 15
    "B-MONEY",        # 16
    "I-MONEY",        # 17
    "B-WORK_OF_ART",  # 18
    "I-WORK_OF_ART",  # 19
    "B-FAC",          # 20
    "B-TIME",         # 21
    "I-CARDINAL",     # 22
    "B-LOC",          # 23
    "B-QUANTITY",     # 24
    "I-QUANTITY",     # 25
    "I-NORP",         # 26
    "I-LOC",          # 27
    "B-PRODUCT",      # 28
    "I-TIME",         # 29
    "B-EVENT",        # 30
    "I-EVENT",        # 31
    "I-FAC",          # 32
    "B-LANGUAGE",     # 33
    "I-PRODUCT",      # 34
    "I-ORDINAL",      # 35
    "I-LANGUAGE",     # 36
]))

In [None]:
def _bio_to_spans(tokens, tags, id2label):
  """Merge per-token BIO tags into ``(start_char, end_char, entity_type)`` spans.

  Offsets index into ``' '.join(tokens)``: tokens are separated by exactly one
  space and ``end_char`` is exclusive.
  """
  spans = []
  open_span = None  # [start_char, end_char, entity_type] of the entity being built
  cursor = 0        # character offset at which the current token starts
  for token, tag in zip(tokens, tags):
    start, end = cursor, cursor + len(token)
    cursor = end + 1  # skip the single separating space
    label = id2label[tag]
    if label == "O":
      if open_span is not None:
        spans.append(tuple(open_span))
        open_span = None
      continue
    prefix, sep, entity_type = label.partition("-")
    if not sep:
      # Label without a B-/I- prefix: treat the whole label as the entity type.
      prefix, entity_type = "B", label
    if prefix == "I" and open_span is not None and open_span[2] == entity_type:
      # Continuation of the entity that is currently open.
      open_span[1] = end
    else:
      # "B-" tag, or an "I-" tag with no matching open entity: start a new one.
      if open_span is not None:
        spans.append(tuple(open_span))
      open_span = [start, end, entity_type]
  if open_span is not None:
    spans.append(tuple(open_span))
  return spans


def preparing_dataset(raw_dataset, id2label=None):
  """Convert token/tag rows into ``(text, {"entities": [...]})`` training tuples.

  Each row's tokens are joined with single spaces to form the text (no trailing
  space). Consecutive ``B-X``/``I-X`` tags are merged into ONE entity
  ``(start_char, end_char, "X")`` covering the whole mention, labelled with the
  bare entity type. Previously every tagged token became its own entity
  labelled with the raw BIO tag, so a model trained on the output predicted
  fragments such as "$" / B-MONEY, "1" / I-MONEY, "billion" / I-MONEY.

  Args:
    raw_dataset: iterable of rows, each exposing ``row['tokens']`` (list of
      str) and ``row['tags']`` (list of int tag ids, parallel to the tokens).
    id2label: optional mapping from tag id to BIO label string. Defaults to
      the notebook-level ``int_to_label``.

  Returns:
    A list with one ``(text, {"entities": [(start, end, label), ...]})`` tuple
    per row, in input order.

  Raises:
    KeyError: if a tag id is missing from ``id2label``.
  """
  if id2label is None:
    id2label = int_to_label
  prepared_data = []
  # Iterate rows directly so each row is materialised once, not once per field.
  for row in raw_dataset:
    tokens = row['tokens']
    raw_text = ' '.join(tokens)
    json_format = {'entities': _bio_to_spans(tokens, row['tags'], id2label)}
    prepared_data.append((raw_text, json_format))
  return prepared_data

# Convert the three splits (train, test, validation — in that order) into
# (text, annotations) tuples with `preparing_dataset`.
NEW_TRAIN_DATA, NEW_TEST_DATASET, NEW_VALIDATION_DATASET = map(
    preparing_dataset,
    (TRAIN_DATASET, TEST_DATASET, VALIDATION_DATASET),
)

In [None]:
# Preview only the first few converted examples; printing the whole list
# floods the cell output with every example in the test split.
print(NEW_TEST_DATASET[:3])



In [None]:
# Register with the NER component every label that occurs in the training
# or validation annotations (training examples first, then validation).
for split_examples in (NEW_TRAIN_DATA, NEW_VALIDATION_DATASET):
  for _text, annotations in split_examples:
    for span in annotations.get("entities"):
      ner.add_label(span[2])  # span is (start, end, label)


In [None]:
import random
from tqdm import tqdm
from spacy.training.example import Example
n_iter = 50  # number of passes over the training data made by the training loop


In [None]:
def evaluate_model(nlp,dataset):
  """Return the exact-match entity F1 of ``nlp`` over ``dataset``.

  ``dataset`` is an iterable of ``(text, annotations)`` pairs, where
  ``annotations.get("entities", [])`` yields ``(start, end, label)`` triples.
  A predicted entity counts as correct only when its start offset, end offset
  and label all equal those of a gold entity for the same text. Counts are
  pooled over all texts before precision and recall are computed.

  Returns 0 when precision + recall is 0 (for example on an empty dataset).
  """
  n_predicted = n_gold = n_matched = 0
  for text, annotations in tqdm(dataset):
    parsed = nlp(text)
    predicted_spans = {
        (ent.start_char, ent.end_char, ent.label_) for ent in parsed.ents
    }
    gold_spans = {
        (start, end, label)
        for start, end, label in annotations.get("entities", [])
    }
    n_predicted += len(predicted_spans)
    n_gold += len(gold_spans)
    n_matched += len(predicted_spans & gold_spans)

  # Guard both ratios against empty denominators.
  precision = n_matched / n_predicted if n_predicted != 0 else 0
  recall = n_matched / n_gold if n_gold != 0 else 0

  if precision + recall == 0:
    return 0
  return 2 * (precision * recall) / (precision + recall)



In [None]:
import time

In [None]:
# Train the NER component for `n_iter` epochs. After every epoch the pipeline is
# scored on the validation set; whenever the score beats the best one so far the
# pipeline is serialized in memory and written to OUTPUT_DIR.

# Names of all pipeline components except "ner"; they are disabled during training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
train_losses = []  # one entry per epoch: losses.get('ner', 0) after the last batch
val_scores = []    # one entry per epoch: value returned by evaluate_model
best_score = 0     # a model is only saved once the validation score exceeds 0
best_model = None  # bytes of the best pipeline so far (from nlp.to_bytes())

with nlp.disable_pipes(*other_pipes):  # only train NER
    # NOTE(review): `begin_training` looks like the legacy name for this step;
    # newer spaCy releases presumably prefer `nlp.initialize()` — confirm
    # against the installed spaCy version.
    optimizer = nlp.begin_training()
    start_time = time.time()

    for itn in range(n_iter):
        # Shuffles the module-level list in place, so its order changes every epoch.
        random.shuffle(NEW_TRAIN_DATA)
        # Fresh dict each epoch; the same dict is passed to every nlp.update call.
        losses = {}
        batch_size = 64  # number of examples per nlp.update call

        for i in tqdm(range(0, len(NEW_TRAIN_DATA), batch_size)):
            batch = NEW_TRAIN_DATA[i:i+batch_size]
            examples = []

            # Examples are rebuilt from the raw (text, annotations) tuples on
            # every epoch.
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)

            nlp.update(
                examples,
                drop=0.5,
                sgd=optimizer,
                losses=losses
            )

        train_losses.append(losses.get('ner', 0))

        # `itn % 1 == 0` holds for every integer, so validation runs every epoch.
        if (itn ) % 1 == 0:
            val_score = evaluate_model(nlp, NEW_VALIDATION_DATASET)
            val_scores.append(val_score)

            # Save best model: keep a copy in memory and overwrite the copy on disk.
            if val_score > best_score:
                best_score = val_score
                best_model = nlp.to_bytes()
                nlp.to_disk(OUTPUT_DIR)
                print(f"Saved to Drive {OUTPUT_DIR}")

            # Elapsed time is measured from the start of training, not per epoch.
            elapsed_time = time.time() - start_time
            print(f"Iteration {itn + 1:2d}: "
                  f"Train Loss = {losses.get('ner', 0):8.4f}, "
                  f"Val F1 = {val_score:6.4f}, "
                  f"Time = {elapsed_time:6.1f}s")

# Restore the best-scoring pipeline into `nlp` (skipped if nothing was ever saved).
if best_model:
    nlp.from_bytes(best_model)
    print(f"\n✅ Training completed! Best validation F1: {best_score:.4f}")

  4%|▎         | 33/937 [00:07<03:25,  4.40it/s]


KeyboardInterrupt: 

In [None]:
# Load a saved pipeline from Google Drive into `nlp_fine_tune`.
# NOTE(review): this literal is the same path as OUTPUT_DIR; if one of them is
# changed the other has to be updated by hand.
model_path = '/content/drive/My Drive/ML/NER'  # Your main model
nlp_fine_tune = spacy.load(model_path)

In [None]:
# Smoke test: run the loaded pipeline on one sentence and print each predicted
# entity's text together with the label the model assigned to it.
text = "Apple is looking at buying a U.K. startup for $1 billion."
doc = nlp_fine_tune(text)
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: U.K., Label: B-GPE
Entity: $, Label: B-MONEY
Entity: 1, Label: I-MONEY
Entity: billion, Label: I-MONEY
