In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

MODEL_NAME = "HooshvareLab/bert-base-parsbert-ner-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

sentence = "این یک جمله تستی است که در آن باید بانک ملت و کد ملی تشخیص داده شود"
preds = ner_pipe(sentence)

print("Raw ParsBERT‐NER output:")
for p in preds:
    print(f"  span='{p['word']}'   label={p['entity_group']}   score={p['score']:.3f}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-ner-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Raw ParsBERT‐NER output:
  span='بانک ملت'   label=organization   score=0.975


## Read .CoNLL file

In [32]:
def read_conll_file(file_path):
    sentences = []
    current_sentence = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                parts = line.strip().split()
                if len(parts) == 4:
                    token, pos, chunk, ner = parts
                elif len(parts) == 3:
                    token, pos, ner = parts
                    chunk = "_"  # placeholder if missing
                else:
                    raise ValueError(f"Invalid line: {line}")
                current_sentence.append((token, ner))

    return sentences

# Example usage:
data = read_conll_file("Labeled_NER.conll")
print(data)  # Print first sentence

[[('-DOCSTART-', 'O'), ('در', 'O'), ('پردازش', 'B-ACTION'), ('717', 'O'), ('(', 'O'), ('تبادل', 'O'), ('مانده', 'O'), ('انتقالی', 'O'), ('کاربر', 'O'), (')', 'O'), ('پیغام', 'O'), ('خطای', 'B-ERROR'), ('"', 'O'), ('"', 'O'), ('کد', 'O'), ('ملی', 'O'), ('وارد', 'O'), ('شده', 'O'), ('مرتبط', 'O'), ('با', 'O'), ('این', 'O'), ('عملیات', 'O'), ('نمی', 'O'), ('باشد.', 'O'), ('"', 'O'), ('"', 'O'), ('دریافت', 'O'), ('می', 'O'), ('گردد.', 'O'), ('در', 'O'), ('تمامی', 'O'), ('فعالیتهایی', 'O'), ('که', 'O'), ('منجر', 'O'), ('به', 'O'), ('ایجاد', 'O'), ('مانده', 'B-ACTION'), ('انتقالی', 'B-ACTION'), ('می', 'O'), ('شوند،', 'O'), ('شناسه', 'B-CUSTOMER_ID'), ('مشتری', 'B-CUSTOMER_ID'), ('(', 'O'), ('کدملی', 'B-IDENTIFICATION_ID'), ('/', 'O'), ('شناسه', 'B-IDENTIFICATION_ID'), ('ملی', 'B-IDENTIFICATION_ID'), (')', 'O'), ('در', 'O'), ('فرم', 'O'), ('پولشویی', 'B-EVENT'), ('از', 'O'), ('اطلاعات', 'O'), ('قبلی', 'O'), ('بازیابی', 'O'), ('نشده', 'O'), ('و', 'O'), ('کاربر', 'O'), ('ملزم', 'O'), ('به', 'O'

## Generate fake data

In [33]:
import random
import copy

# Base examples (your list of sentences)
base_data = data.copy() 

# Define replacements for augmentation
synonyms = {
    'استعلام': ['درخواست', 'بررسی', 'پرس‌وجو'],
    'کدملی': ['شناسه ملی', 'کد شناسایی'],
    'سامانه': ['سیستم', 'پلتفرم'],
    'بازیابی': ['دریافت', 'بازخوانی']
}

def augment_sentence(sentence):
    new_sentence = []
    for word, label in sentence:
        if word in synonyms:
            new_word = random.choice(synonyms[word])
        else:
            new_word = word
        new_sentence.append((new_word, label))
    return new_sentence

# Generate 10,000 examples
augmented_data = []

for _ in range(1000):
    base = random.choice(base_data)
    augmented = augment_sentence(base)
    augmented_data.append(augmented)


In [34]:
label_list = ['O', 'B-ACTION', 'I-ACTION', 'B-IDENTIFICATION_ID', 'I-IDENTIFICATION_ID',
              'B-FINANCIAL_PRODUCT', 'I-FINANCIAL_PRODUCT']
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}


In [35]:
# Convert raw sentences into structured format
def convert_to_hf_format(raw_sentences, label2id):
    dataset = []
    for sentence in raw_sentences:
        tokens = [word for word, tag in sentence]
        ner_tags = [label2id.get(tag, 0) for _, tag in sentence]  # default to 'O' if unknown
        dataset.append({
            "tokens": tokens,
            "ner_tags": ner_tags
        })
    return dataset

converted_dataset = convert_to_hf_format(data, label2id)
converted_dataset = convert_to_hf_format(augmented_data, label2id)


In [37]:
len(converted_dataset)

1000

In [38]:
from datasets import Dataset

hf_dataset = Dataset.from_list(converted_dataset)


In [39]:
from transformers import AutoTokenizer

model_name = "HooshvareLab/bert-base-parsbert-ner-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    
#     labels = []
#     for i, label in enumerate(examples["ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:
#             if word_idx is None:
#                 label_ids.append(-100)  # ignored in loss
#             elif word_idx != previous_word_idx:
#                 label_ids.append(label[word_idx])  # B or O
#             else:
#                 label_ids.append(label[word_idx] if label[word_idx] % 2 == 1 else label[word_idx] + 1)  # I if needed
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

# tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=False,
        is_split_into_words=True,
        return_offsets_mapping=True  # helps alignment
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # For subwords, use I- version if available
                label_ids.append(label[word_idx] if label[word_idx] % 2 == 1 else label[word_idx] + 1)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs




In [40]:
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["tokens", "ner_tags"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3377.49 examples/s]


In [41]:
from transformers import AutoModelForTokenClassification

# model = AutoModelForTokenClassification.from_pretrained(
#     model_name,
#     num_labels=len(label_list),
#     id2label=id2label,
#     label2id=label2id
# )
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True  # ✅ this line solves missmatch problem
)


Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-ner-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-ner-uncased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([21, 768]) in the checkpoint and torch.Size([7, 768]) in the model instan

In [42]:
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.data.data_collator import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator  # ✅ required for NER!
)

trainer.train()


Step,Training Loss


TrainOutput(global_step=375, training_loss=0.03169589742024739, metrics={'train_runtime': 36.2883, 'train_samples_per_second': 82.671, 'train_steps_per_second': 10.334, 'total_flos': 209761370478000.0, 'train_loss': 0.03169589742024739, 'epoch': 3.0})

In [43]:
sample = tokenized_dataset[0]
print(len(sample['input_ids']), len(sample['labels']))  # These should match!


137 137


In [44]:
trainer.save_model("./ner_model_fakedata")
tokenizer.save_pretrained("./ner_model_fakedata")

('./ner_model_fakedata/tokenizer_config.json',
 './ner_model_fakedata/special_tokens_map.json',
 './ner_model_fakedata/vocab.txt',
 './ner_model_fakedata/added_tokens.json',
 './ner_model_fakedata/tokenizer.json')

## Test the Model

In [30]:
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# from transformers.pipelines import pipeline

# model = AutoModelForTokenClassification.from_pretrained("./ner_model")
# tokenizer = AutoTokenizer.from_pretrained("./ner_model")
# ner_pipe = pipeline(
#     "ner",
#     model=model,
#     tokenizer=tokenizer,
#     aggregation_strategy="simple"
# )


In [31]:
# # Test it
# text = "اطلاعات مشتری باید از طریق سامانه بک آفیس بازیابی گردد."
# try:
#     result = ner_pipe(text)
#     if result:
#         for entity in result:
#             if isinstance(entity, dict):
#                 print(f"{entity['word']} -> {entity['entity_group']} (score={entity['score']:.2f})")
#             else:
#                 print(f"Unexpected entity format: {entity}")
#     else:
#         print("No entities were found in the text.")
# except Exception as e:
#     print(f"Error processing text: {str(e)}")

In [47]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

model = AutoModelForTokenClassification.from_pretrained("./ner_model_fakedata")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_fakedata")

text = "اطلاعات مشتری باید از طریق سامانه بک آفیس بازیابی گردد در بانک ملت."
text = "اطلاعات مشتری در بانک مرکزی و بانک ملت باید از طریق سامانه بک آفیس بازیابی گردد."
text = "در پردازش 717 (تبادل مانده انتقالی کاربر) پیغام خطای ""کد ملی وارد شده مرتبط با این عملیات نمی باشد."" دریافت می گردد. در تمامی فعالیتهایی که منجر به ایجاد مانده انتقالی می شوند، شناسه مشتری (کدملی/ شناسه ملی) در فرم پولشویی از اطلاعات قبلی بازیابی نشده و کاربر ملزم به درج شناسه مشتری (کدملی/ شناسه ملی) در فرم پولشویی برای فعالیت بعدی می باشد. پس از درج شناسه مشتری توسط کاربر، سامانه بانکداری متمرکز کنترل می نماید که شناسه مذکور با شناسه مانده انتقالی باز کاربر همخوانی داشته باشد. به عبارتی مانده انتقالی ایجاد شده صرفا برای یک شخص با یک شماره تبادل منحصر بفرد قابل پردازش می باشد."


inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=2)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

for token, pred_id in zip(tokens, predictions[0]):
    print(f"{token:15} → {model.config.id2label[pred_id.item()]}")


[CLS]           → O
در              → O
پردازش          → B-ACTION
[UNK]           → O
(               → O
تبادل           → O
مانده           → O
انتقالی         → O
کاربر           → O
)               → O
پیغام           → O
خطای            → O
کد              → O
ملی             → O
وارد            → O
شده             → O
مرتبط           → O
با              → O
این             → O
عملیات          → O
نمی             → O
باشد            → O
.               → B-ACTION
دریافت          → O
می              → O
گردد            → O
.               → B-ACTION
در              → O
تمامی           → O
فعالیتهایی      → O
که              → O
منجر            → O
به              → O
ایجاد           → O
مانده           → B-ACTION
انتقالی         → B-ACTION
می              → O
شوند            → O
،               → B-ACTION
شناسه           → O
مشتری           → O
(               → O
کدملی           → O
/               → O
شناسه           → B-IDENTIFICATION_ID
ملی             → B-IDENTIFICATION_ID
) 

In [46]:
model.config.id2label


{0: 'O',
 1: 'B-ACTION',
 2: 'I-ACTION',
 3: 'B-IDENTIFICATION_ID',
 4: 'I-IDENTIFICATION_ID',
 5: 'B-FINANCIAL_PRODUCT',
 6: 'I-FINANCIAL_PRODUCT'}