In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Checkpoint identifier shared by the tokenizer and the token-classification model.
MODEL_NAME = "HooshvareLab/bert-base-parsbert-ner-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Build the NER pipeline from the objects loaded above; with the "simple"
# aggregation strategy each prediction exposes `word`, `entity_group` and `score`.
ner_pipe = pipeline(task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Smoke-test sentence for the pipeline.
sentence = "این یک جمله تستی است که در آن باید بانک ملت و کد ملی تشخیص داده شود"
preds = ner_pipe(sentence)

# Show one line per predicted entity span.
print("Raw ParsBERT‐NER output:")
for entity in preds:
    print(f"  span='{entity['word']}'   label={entity['entity_group']}   score={entity['score']:.3f}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-ner-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Raw ParsBERT‐NER output:
  span='بانک ملت'   label=organization   score=0.975


## Read the CoNLL file

In [6]:
def read_conll_file(file_path):
    """Parse a CoNLL-style file into sentences of (token, NER tag) pairs.

    Every non-blank line must contain either four whitespace-separated
    columns (token, POS, chunk, NER) or three (token, POS, NER). Only the
    first column (token) and the last column (NER tag) are kept. Blank lines
    separate sentences; runs of blank lines never produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of ``(token, ner)`` tuples, in file
        order.

    Raises:
        ValueError: If a non-blank line has a column count other than 3 or 4.
    """
    sentences = []
    current_sentence = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: close the sentence collected so far (if any).
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue
            if len(parts) not in (3, 4):
                raise ValueError(f"Invalid line: {line}")
            # The token is always the first column and the NER tag the last,
            # for both the 3- and the 4-column layout.
            current_sentence.append((parts[0], parts[-1]))

    # Flush the final sentence: a file that does not end with a blank line
    # would otherwise silently lose its last sentence.
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

# Example usage: parse the labelled file given by a relative path.
data = read_conll_file("Labeled_NER.conll")
print(data[1])  # Prints the sentence at index 1, i.e. the second sentence (data[0] is the first)

[('جهت', 'O'), ('استعلام', 'B-ACTION'), ('کدملی', 'B-IDENTIFICATION_ID'), ('مربوط', 'O'), ('به', 'O'), ('مانده', 'O'), ('ایجادی', 'O'), ('می', 'O'), ('بایست', 'O'), ('از', 'O'), ('طریق', 'O'), ('سامانه', 'B-FINANCIAL_PRODUCT'), ('بک', 'B-FINANCIAL_PRODUCT'), ('آفیس»', 'B-FINANCIAL_PRODUCT'), ('پشتیبانی', 'O'), ('فنی»', 'O'), ('بازیابی', 'B-ACTION'), ('اطلاعات', 'B-ACTION'), ('شناسه', 'O'), ('مشتری', 'O'), ('تسک', 'O'), ('انتقالی', 'O'), ('اقدام', 'O'), ('گردد', 'O'), ('.', 'O')]


In [7]:
len(data)  # Number of sentences parsed from the file (shown as the cell result)

2