Halyna Trush. Contact Information Phone: +380954200758 Email: frolova.galka@gmail.com LinkedIn: https://www.linkedin.com/in/halyna-trush/

This script loads the fine-tuned Mountain NER model and tokenizer from Google Drive
and runs an interactive inference session in the console. It supports both
`model.safetensors` and `pytorch_model.bin` weight files and validates the required files
(`config.json`, `tokenizer.json` or `vocab.txt`). The user can type any sentence,
and the script outputs token-level labels and recognized mountain entities.

In [1]:
# Load fine-tuned Mountain NER model from Google Drive (handles safetensors or pytorch bin)
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from pathlib import Path

# Mount Google Drive so the saved model directory is reachable from the runtime
drive.mount('/content/drive', force_remount=True)

# Path to model on Drive
MODEL_DIR = Path("/content/drive/MyDrive/mountain_ner_model")

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    if not MODEL_DIR.exists():
        raise FileNotFoundError(f"Model directory not found: {MODEL_DIR}")

    # Each entry is (candidate file names, error message). A group is satisfied
    # when at least one of its candidates exists in MODEL_DIR; groups are
    # checked in order, so the first missing group is the one reported.
    required_any = [
        (
            ("model.safetensors", "pytorch_model.bin"),  # either weight format
            "No model weight file found: expected 'model.safetensors' or 'pytorch_model.bin'.",
        ),
        (
            ("config.json",),  # must exist
            "Missing 'config.json' in model directory.",
        ),
        (
            ("tokenizer.json", "vocab.txt"),  # at least one of these must exist
            "Missing tokenizer files: need 'tokenizer.json' or 'vocab.txt'.",
        ),
    ]
    for candidates, message in required_any:
        if not any((MODEL_DIR / name).exists() for name in candidates):
            raise FileNotFoundError(message)

    # Load tokenizer and model (local files only)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR, local_files_only=True).to(device).eval()

    print(f"Model and tokenizer loaded from: {MODEL_DIR}")
    print(f"Device: {device}")
except Exception as e:
    # Top-level boundary of the script: report the problem and leave
    # model/tokenizer as None so the code below can detect the failed load.
    print("Failed to load model. Please check:")
    print(f"- Folder exists: {MODEL_DIR}")
    print("- Files expected: config.json, tokenizer.json or vocab.txt, and model.safetensors or pytorch_model.bin")
    print("Error details:", e)
    model = None
    tokenizer = None


import re

def _merge_mount_entities(tokens, labels):
    """Join each run of consecutive MOUNT-tagged tokens into one entity string."""
    entities, current = [], []
    for token, label in zip(tokens, labels):
        if label.endswith("MOUNT"):
            # A punctuation token tagged MOUNT keeps the run open but is
            # left out of the entity text.
            if not re.fullmatch(r"[^\w\s]", token):
                current.append(token)
        elif current:
            entities.append(" ".join(current))
            current = []
    if current:
        entities.append(" ".join(current))
    return entities


def ner_predict(sentence: str):
    """
    Run inference on one sentence and return (tokens, labels, entities).

    Args:
        sentence: Raw input text.

    Returns:
        A 3-tuple of lists:
        - tokens: words and punctuation marks produced by the regex split
        - labels: one predicted label string per token, in the same order
        - entities: mountain names built from consecutive MOUNT-tagged tokens

        All three lists are empty when the model or tokenizer is not loaded,
        or when nothing is left of the sentence after cleaning.

    Notes:
        - Regex tokenization splits words and punctuation separately
        - Each word takes the prediction of its first sub-token
        - Consecutive MOUNT tags (B- or I-) are merged into one entity
        - Pure punctuation is ignored inside entity spans
    """
    if model is None or tokenizer is None:
        print("Model not loaded. Cannot run inference.")
        return [], [], []

    # Clean obvious noise at ends (e.g., long dashes)
    sentence = re.sub(r"^\W+|\W+$", "", sentence)

    # Tokenize into words and punctuation separately (to mimic CoNLL-style)
    words = re.findall(r"\w+|[^\w\s]", sentence)
    if not words:
        # Empty or punctuation-only input: nothing to label, so skip the
        # tokenizer and the model entirely.
        return [], [], []

    # Encode as split-into-words to keep word_ids mapping
    enc = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    ).to(device)

    with torch.inference_mode():
        logits = model(**enc).logits

    pred_ids = logits.argmax(dim=-1)[0].cpu().tolist()
    word_ids = enc.word_ids(0)
    id2label = model.config.id2label

    # Keep one label per word: positions without a word id are skipped, and
    # only the first position seen for each word contributes its prediction.
    # Words absent from word_ids (e.g. removed by truncation) are not returned.
    tokens, labels, seen = [], [], set()
    for pos, wid in enumerate(word_ids):
        if wid is None or wid in seen:
            continue
        seen.add(wid)
        tokens.append(words[wid])
        labels.append(id2label[pred_ids[pos]])

    # Merge logic: treat any consecutive B-MOUNT / I-MOUNT as one entity
    ents = _merge_mount_entities(tokens, labels)

    return tokens, labels, ents


# Interactive console session: keep reading sentences until the user quits.
# Skipped entirely when the model failed to load.
if model is not None:
    print("Mountain NER — interactive mode. Type a sentence or 'exit' to quit.\n")
    while True:
        text = input("Enter a sentence: ").strip()

        # A blank line or an explicit exit/quit command ends the session.
        wants_to_stop = text == "" or text.lower() in ("exit", "quit")
        if wants_to_stop:
            print("Goodbye!")
            break

        tokens, labels, ents = ner_predict(text)

        # One row per token, with the token left-aligned in a 20-character column.
        print("\nTokens & Labels:")
        for token, label in zip(tokens, labels):
            print(f"{token:20s} -> {label}")

        # Show a placeholder instead of an empty list when nothing was recognized.
        print("\nEntities:", ents or "— none —")
        print("-" * 50)


Mounted at /content/drive
Model and tokenizer loaded from: /content/drive/MyDrive/mountain_ner_model
Device: cpu
Mountain NER — interactive mode. Type a sentence or 'exit' to quit.

Enter a sentence: From our camp we could clearly see Stormveil Range in the distance.

Tokens & Labels:
From                 -> O
our                  -> O
camp                 -> O
we                   -> O
could                -> O
clearly              -> O
see                  -> O
Stormveil            -> B-MOUNT
Range                -> I-MOUNT
in                   -> O
the                  -> O
distance             -> O

Entities: ['Stormveil Range']
--------------------------------------------------
Enter a sentence: The sunrise over Emerald Summit was breathtaking.

Tokens & Labels:
The                  -> O
sunrise              -> O
over                 -> O
Emerald              -> B-MOUNT
Summit               -> I-MOUNT
was                  -> O
breathtaking         -> O

Entities: ['Emerald Summit'