In [1]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=46eb76d7d3b80aa8c3b30cb5d380e411e9aa09232e04312fcbe3fc49e0138124
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import json

In [31]:
with open("kyc_ner_dataset_strict_bio.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# 2. Unikal etiketləri topla - Artıq `label_list` yuxarıda təyin edilib, onu istifadə edəcəyik.
# all_labels = set()
# for example in raw_data:
#     for tag in example["ner_tags"]:
#         all_labels.add(tag)

# label_list = sorted(list(all_labels))  # sıralanmış etiketlər
# `label_list` artıq hIj0cvOUHlHZ hücrəsində təyin edilib.

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# 3. Etiketləri rəqəmlərə çevir
for example in raw_data:
    # 'O' etiketinin id'sini alırıq, əgər etiket `label2id` sözlüyündə yoxdursa 0 istifadə edirik
    example["ner_tags"] = [label2id.get(tag, label2id.get('O', 0)) for tag in example["ner_tags"]]


# 4. Datasetə çevir
dataset = Dataset.from_list(raw_data)
dataset = dataset.train_test_split(test_size=0.2)

# 5. Etiket xəritələrini saxla (modeldə istifadə etmək üçün)
print("Label2ID:", label2id)
print("ID2Label:", id2label)

Label2ID: {'O': 0, 'B-PERSON': 1, 'I-PERSON': 2, 'B-DATE': 3, 'I-DATE': 4, 'B-EMAIL': 5, 'I-EMAIL': 6, 'B-ID': 7, 'I-ID': 8, 'B-NATIONALITY': 9, 'I-NATIONALITY': 10, 'B-LOCATION': 11, 'I-LOCATION': 12, 'B-ORGANIZATION': 13, 'I-ORGANIZATION': 14, 'B-PHONE': 15, 'I-PHONE': 16, 'B-ACCOUNT': 17, 'I-ACCOUNT': 18, 'B-CARD': 19, 'I-CARD': 20, 'B-TAX_ID': 21, 'I-TAX_ID': 22}
ID2Label: {0: 'O', 1: 'B-PERSON', 2: 'I-PERSON', 3: 'B-DATE', 4: 'I-DATE', 5: 'B-EMAIL', 6: 'I-EMAIL', 7: 'B-ID', 8: 'I-ID', 9: 'B-NATIONALITY', 10: 'I-NATIONALITY', 11: 'B-LOCATION', 12: 'I-LOCATION', 13: 'B-ORGANIZATION', 14: 'I-ORGANIZATION', 15: 'B-PHONE', 16: 'I-PHONE', 17: 'B-ACCOUNT', 18: 'I-ACCOUNT', 19: 'B-CARD', 20: 'I-CARD', 21: 'B-TAX_ID', 22: 'I-TAX_ID'}


In [24]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import json

model_checkpoint = "bert-base-multilingual-cased"  # Azərbaycan dili üçün multilingual daha uyğundur
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

label_list = [
    "O", "B-PERSON", "I-PERSON", "B-DATE", "I-DATE", "B-EMAIL", "I-EMAIL",
    "B-ID", "I-ID", "B-NATIONALITY", "I-NATIONALITY", "B-LOCATION", "I-LOCATION",
    "B-ORGANIZATION", "I-ORGANIZATION", "B-PHONE", "I-PHONE", "B-ACCOUNT",
    "I-ACCOUNT", "B-CARD", "I-CARD", "B-TAX_ID", "I-TAX_ID"
]


def tokenize_and_align_labels(example):
    # Tokenizer input
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
        return_attention_mask=True
    )

    # Word ID-ləri al
    word_ids = tokenized_inputs.word_ids()
    ner_tags = example["ner_tags"]

    # Label list yaradılır
    labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            tag = ner_tags[word_idx]
            if isinstance(tag, str):
                labels.append(label2id.get(tag, 0))
            else:
                labels.append(tag)
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [25]:
trainer.save_model("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

In [26]:
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [27]:
from transformers import Trainer, TrainingArguments

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch", # Changed from evaluation_strategy
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [28]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 2.5190067291259766, 'eval_model_preparation_time': 0.0046, 'eval_precision': 0.19034152708288013, 'eval_recall': 0.18742636821248795, 'eval_f1': 0.18887269980033455, 'eval_accuracy': 0.16380815446236008, 'eval_runtime': 2.853, 'eval_samples_per_second': 140.203, 'eval_steps_per_second': 8.763}


In [18]:
print(dataset['test'])

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 400
})


In [32]:
from seqeval.metrics import classification_report
import numpy as np

eval_dataset = tokenized_datasets['test']
predictions, labels, _ = trainer.predict(eval_dataset)

# If the model outputs logits, take the argmax
preds = np.argmax(predictions, axis=2)

# Remove ignored index (padding)
true_predictions = [
    [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(preds, labels)
]

true_labels = [
    [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(preds, labels)
]

# Print the classification report
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

        DATE       0.11      0.58      0.18       800
       EMAIL       0.04      0.05      0.04       800
          ID       0.17      0.17      0.17       800
    LOCATION       0.62      0.18      0.28      5200
 NATIONALITY       0.08      0.07      0.08       937
      PERSON       0.14      0.11      0.12       800

   micro avg       0.19      0.19      0.19      9337
   macro avg       0.19      0.19      0.15      9337
weighted avg       0.39      0.19      0.21      9337



In [33]:
# Save the model and tokenizer
trainer.save_model("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

Now let's try to use the trained model for inference on a new sentence.

In [34]:
from transformers import pipeline

# Load the saved model and tokenizer
ner_pipeline = pipeline("token-classification", model="./ner_model", tokenizer="./ner_model", aggregation_strategy="first")

# Example text for inference
text = "Mən Rəşad Əliyev, 10 yanvar 1990-cı il təvəllüdlüyəm. Əlaqə nömrəm +994 50 123 45 67-dir və emailim rashad.aliyev@example.com-dur. Bakıda yaşayıram və Azərbaycan vətəndaşıyam. İşlədiyim yer Holberton School-dur. Vergi nömrəm 1234567890-dır."

# Perform inference
ner_results = ner_pipeline(text)

# Print the results
print(ner_results)

Device set to use cuda:0


[{'entity_group': 'ORGANIZATION', 'score': np.float32(0.121950604), 'word': 'Mən Rəşad', 'start': 0, 'end': 9}, {'entity_group': 'PERSON', 'score': np.float32(0.11681998), 'word': 'Əliyev', 'start': 10, 'end': 16}, {'entity_group': 'PERSON', 'score': np.float32(0.10802434), 'word': ',', 'start': 16, 'end': 17}, {'entity_group': 'LOCATION', 'score': np.float32(0.14089656), 'word': '10', 'start': 18, 'end': 20}, {'entity_group': 'PERSON', 'score': np.float32(0.12134712), 'word': 'yanvar', 'start': 21, 'end': 27}, {'entity_group': 'ORGANIZATION', 'score': np.float32(0.11243919), 'word': '1990', 'start': 28, 'end': 32}, {'entity_group': 'PERSON', 'score': np.float32(0.0987414), 'word': '-', 'start': 32, 'end': 33}, {'entity_group': 'ORGANIZATION', 'score': np.float32(0.10007587), 'word': 'cı il', 'start': 33, 'end': 38}, {'entity_group': 'DATE', 'score': np.float32(0.109114476), 'word': 'təvəllüdlüyəm', 'start': 39, 'end': 52}, {'entity_group': 'PERSON', 'score': np.float32(0.09614489), 'w

In [35]:
from datasets import Dataset

# 1 nümunədən ibarət test datası yaradılır
raw_eval_data = {
    "tokens": [["Müştərinin", "adı", "Elvin", "Əliyev", "olaraq", "qeyd", "edilmişdir", "."]],
    "ner_tags": [[0, 0, 1, 2, 0, 0, 0, 0]]
}

# Hugging Face formatına çeviririk
eval_dataset = Dataset.from_dict(raw_eval_data)

# Tokenizer və label align etmə lazımdır
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label[word_idx] != 0 else 0)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Məlumatı preprocess edirik
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

# Modeli qiymətləndiririk
results = trainer.evaluate(eval_dataset)
print(results)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

{'eval_loss': 2.5329408645629883, 'eval_model_preparation_time': 0.0046, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.16666666666666666, 'eval_runtime': 0.0187, 'eval_samples_per_second': 53.468, 'eval_steps_per_second': 53.468}


In [36]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# 1. Əvvəlcə boş bir qovluq yaradılır
!mkdir -p kyc_folder

# 2. İstədiyin faylları həmin qovluğa köçürürük (məsələn, bütün .json və .txt fayllar)
# !mv *.json kyc_folder/ 2>/dev/null
# !mv *.txt kyc_folder/ 2>/dev/null
# !mv *.csv kyc_folder/ 2>/dev/null
# !mv *.py kyc_folder/ 2>/dev/null
# !mv *.pt kyc_folder/ 2>/dev/null

# Əgər xüsusi bir qovluğun varsa onu da daxil edə bilərsən:
!cp -r /content/ner_model/ kyc_folder/
!cp -r /content/results// kyc_folder/
!mv /content/kyc_ner_dataset_strict_bio.json kyc_folder/
!mv /content/kyc_ner_dataset_corrected.json.json kyc_folder/


# 3. Qovluğun ZIP faylına çevrilməsi
!zip -r kyc_project.zip kyc_folder

# 4. ZIP faylını kompüterə yükləmək üçün link
from google.colab import files
files.download("kyc_project.zip")


mv: cannot stat '/content/kyc_ner_dataset_corrected.json.json': No such file or directory
  adding: kyc_folder/ (stored 0%)
  adding: kyc_folder/ner_model/ (stored 0%)
  adding: kyc_folder/ner_model/tokenizer_config.json (deflated 75%)
  adding: kyc_folder/ner_model/tokenizer.json (deflated 67%)
  adding: kyc_folder/ner_model/vocab.txt (deflated 45%)
  adding: kyc_folder/ner_model/config.json (deflated 58%)
  adding: kyc_folder/ner_model/special_tokens_map.json (deflated 42%)
  adding: kyc_folder/ner_model/training_args.bin (deflated 51%)
  adding: kyc_folder/ner_model/model.safetensors (deflated 7%)
  adding: kyc_folder/results/ (stored 0%)
  adding: kyc_folder/results/runs/ (stored 0%)
  adding: kyc_folder/results/runs/Jul23_18-12-55_97d6f805395a/ (stored 0%)
  adding: kyc_folder/results/runs/Jul23_18-12-55_97d6f805395a/events.out.tfevents.1753294382.97d6f805395a.527.1 (deflated 46%)
  adding: kyc_folder/results/runs/Jul23_17-55-03_97d6f805395a/ (stored 0%)
  adding: kyc_folder/resul

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>