In [6]:
import json
from datasets import Dataset

# 1. Load the raw JSON examples
with open("kyc_ner_dataset.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# 2. Collect the unique tag strings that occur in any example
all_labels = {tag for example in raw_data for tag in example["ner_tags"]}

# Sorting makes the label <-> id mapping independent of set iteration order
label_list = sorted(all_labels)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))

# 3. Replace the tag strings with their integer ids (raw_data is updated in place)
for example in raw_data:
    example["ner_tags"] = [label2id[tag] for tag in example["ner_tags"]]

# 4. Build the dataset and hold out 20% of it for evaluation.
#    A fixed seed keeps the train/test split identical across kernel restarts,
#    so metrics from different runs are comparable.
dataset = Dataset.from_list(raw_data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 5. Show the label maps (they are passed to the model later)
print("Label2ID:", label2id)
print("ID2Label:", id2label)


Label2ID: {'B-ACCOUNT': 0, 'B-CARD': 1, 'B-DATE': 2, 'B-EMAIL': 3, 'B-ID': 4, 'B-LOCATION': 5, 'B-NATIONALITY': 6, 'B-ORGANIZATION': 7, 'B-PERSON': 8, 'B-PHONE': 9, 'B-TAX_ID': 10, 'I-CARD': 11, 'I-DATE': 12, 'I-NATIONALITY': 13, 'I-ORGANIZATION': 14, 'I-PERSON': 15, 'I-PHONE': 16}
ID2Label: {0: 'B-ACCOUNT', 1: 'B-CARD', 2: 'B-DATE', 3: 'B-EMAIL', 4: 'B-ID', 5: 'B-LOCATION', 6: 'B-NATIONALITY', 7: 'B-ORGANIZATION', 8: 'B-PERSON', 9: 'B-PHONE', 10: 'B-TAX_ID', 11: 'I-CARD', 12: 'I-DATE', 13: 'I-NATIONALITY', 14: 'I-ORGANIZATION', 15: 'I-PERSON', 16: 'I-PHONE'}


In [7]:
from transformers import AutoTokenizer

# A multilingual checkpoint is a better fit for Azerbaijani text.
model_checkpoint = "bert-base-multilingual-cased"
# Tokenizer that matches the checkpoint above; used by tokenize_and_align_labels.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level tags to sub-word tokens.

    Args:
        example: Mapping with "tokens" (the words of one sentence) and
            "ner_tags" (one label id per word, indexed by word position).

    Returns:
        The tokenizer output with an extra "labels" list holding one entry per
        produced token:
          * -100 for tokens that map to no word (word id is None),
          * the word's label id for the first sub-token of each word,
          * -100 for every further sub-token of the same word.

    Only the first sub-token of a word carries the label. Repeating the word's
    tag on continuation pieces would repeat B- tags inside a single word, and
    the label set has no I- counterpart for every B- tag, so continuation
    pieces are masked instead. compute_metrics skips positions labelled -100.
    """
    tokenized_inputs = tokenizer(example["tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    ner_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # No source word, or a continuation piece of the previous word.
            labels.append(-100)
        else:
            # First sub-token of a new word keeps the word's label.
            labels.append(ner_tags[word_idx])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the alignment function to each example of every split, one at a time.
tokenized_datasets = dataset.map(
    function=tokenize_and_align_labels,
    batched=False,
)


Map: 100%|██████████| 1600/1600 [00:00<00:00, 2139.85 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 1886.75 examples/s]


In [8]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model loaded from the same checkpoint as the tokenizer;
# the label maps are attached so the model config carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

# Training configuration; checkpoints are written under ./ner-kyc-bert.
training_args = TrainingArguments(
    output_dir="./ner-kyc-bert",
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation schedule
    evaluation_strategy="epoch",
)

from seqeval.metrics import classification_report

def compute_metrics(p):
    """Compute micro-averaged F1, precision and recall for one evaluation pass.

    Args:
        p: A pair (predictions, labels). predictions is arg-maxed over its
            last axis to get one label id per position; labels holds the
            reference ids, with -100 marking positions to skip.

    Returns:
        Dict with "f1", "precision" and "recall", taken from the "micro avg"
        row of the seqeval classification report.
    """
    raw_predictions, reference_ids = p
    predicted_ids = raw_predictions.argmax(axis=-1)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions whose reference label is not masked.
        kept_pairs = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([id2label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id2label[reference] for _, reference in kept_pairs])

    report = classification_report(true_labels, true_predictions, output_dict=True)
    micro_avg = report["micro avg"]
    return {
        "f1": micro_avg["f1-score"],
        "precision": micro_avg["precision"],
        "recall": micro_avg["recall"],
    }

# Imported next to its only use so this block works on its own.
from transformers import DataCollatorForTokenClassification

# Examples are tokenized without padding, so sequences in a batch differ in
# length. This collator pads the inputs and also pads "labels" (with -100).
# The Trainer's default padding collator leaves "labels" ragged, and batches
# with different sequence lengths then cannot be turned into tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): the recorded run of this cell stopped while importing
# transformers.trainer because the TensorFlow native runtime failed to load.
# That is an environment problem and is not addressed by this change.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "C:\Users\Baku\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.