In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("erayyildiz/turkish_ner")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'domain', 'ner_tags'],
        num_rows: 532629
    })
})

In [4]:
train = dataset['train']

In [5]:
train.features['domain']

ClassLabel(names=['architecture', 'basketball', 'book', 'business', 'education', 'fictional_universe', 'film', 'food', 'geography', 'government', 'law', 'location', 'military', 'music', 'opera', 'organization', 'people', 'religion', 'royalty', 'soccer', 'sports', 'theater', 'time', 'travel', 'tv'], id=None)

In [6]:
train.features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-LOCATION', 'I-LOCATION', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
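# domain: the topic category of the sentence, i.e. where it comes from (e.g. 'government', 'soccer').
# ner_tags: the entity type of each token, and whether the token begins (B-) or continues (I-) an entity; 'O' means no entity.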

In [8]:
train[0]

{'id': '0',
 'tokens': ['Corina',
  'Casanova',
  ',',
  'İsviçre',
  'Federal',
  'Şansölyesidir',
  '.\n'],
 'domain': 9,
 'ner_tags': [1, 2, 0, 5, 0, 7, 0]}

In [9]:
# As you can see, the sentence is already separated into words,
# which means this data is already pre-tokenized.

In [10]:
from transformers import AutoTokenizer

In [11]:
# Cased Turkish BERT checkpoint. The previous id ("dbmdz/bert-base-turkish-w") was a
# truncated name; the model-loading log saved further down in this notebook reports
# "dbmdz/bert-base-turkish-cased", so the tokenizer and the model now use that id.
model_checkpoint = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenizer.is_fast

True

In [13]:
inputs = tokenizer(train[0]['tokens'], is_split_into_words=True)
inputs

{'input_ids': [2, 11317, 3930, 10628, 12398, 5477, 16, 11046, 17356, 16673, 16938, 2462, 2067, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
train[0]['tokens']

['Corina', 'Casanova', ',', 'İsviçre', 'Federal', 'Şansölyesidir', '.\n']

In [15]:
inputs.tokens()

['[CLS]',
 'Cor',
 '##ina',
 'Cas',
 '##ano',
 '##va',
 ',',
 'İsviçre',
 'Federal',
 'Şans',
 '##ölye',
 '##si',
 '##dir',
 '.',
 '[SEP]']

## Label Alignment

In [16]:
train[0]['ner_tags']

[1, 2, 0, 5, 0, 7, 0]

In [17]:
inputs.word_ids()

[None, 0, 0, 1, 1, 1, 2, 3, 4, 5, 5, 5, 5, 6, None]

In [18]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per tokenizer token.

    Args:
        labels: label id for each word of the sentence.
        word_ids: for each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        A list with one entry per token: ``-100`` for special tokens, the
        word's own label for the first token of a word, and for the remaining
        tokens of that word the same label with odd ids bumped by one (in this
        notebook's label list that turns ``B-XXX`` into ``I-XXX``).
    """
    aligned = []
    previous = None
    for idx in word_ids:
        if idx is None:
            # Special token: always masked.
            aligned.append(-100)
        elif idx != previous:
            # First token of a new word keeps the word's label unchanged.
            aligned.append(labels[idx])
        else:
            # Continuation token of the same word: odd (B-) ids become the next (I-) id.
            tag = labels[idx]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous = idx

    return aligned

In [19]:
labels = train[0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[1, 2, 0, 5, 0, 7, 0]
[-100, 1, 2, 2, 2, 2, 0, 5, 0, 7, 8, 8, 8, 0, -100]


In [20]:
def tokenize_with_align_labels(example):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Written for ``Dataset.map(..., batched=True)``: ``example['tokens']`` is a
    list of word lists and ``example['ner_tags']`` is the matching list of
    per-word label ids.

    Args:
        example: batch mapping with ``'tokens'`` and ``'ner_tags'`` entries.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry holding one label
        list per row, each as long as that row's token sequence.

    Rows whose tokens point at a word index with no label (fewer tags than
    words) cannot be aligned. They are reported with a warning and kept, with
    every position set to -100, the value this notebook treats as "ignore".
    """
    tokenize = tokenizer(example['tokens'], is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(example['ner_tags']):
        word_inds = tokenize.word_ids(i)

        valid_word_inds = [w for w in word_inds if w is not None]
        if valid_word_inds and max(valid_word_inds) >= len(labels):
            print("WARNING: word_inds out of range!", word_inds, "labels len:", len(labels))
            # Size the mask from THIS row's tokens. The earlier code used the
            # unrelated notebook-level `word_ids`, which gave the wrong length.
            new_labels.append([-100] * len(word_inds))
            continue

        new_labels.append(align_labels_with_tokens(labels, word_inds))

    tokenize['labels'] = new_labels
    return tokenize

In [21]:
# Tokenize every split and attach aligned labels.
# batched=True hands tokenize_with_align_labels a batch of rows at a time (it loops over
# the rows of example['ner_tags']); remove_columns drops the original columns so only the
# tokenizer outputs and the new 'labels' column remain.
tokenized_dataset = dataset.map(
    tokenize_with_align_labels, batched = True,
    remove_columns=dataset["train"].column_names,
)

In [22]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 532629
    })
})

In [23]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
tokenized_dataset["train"][0]

{'input_ids': [2,
  11317,
  3930,
  10628,
  12398,
  5477,
  16,
  11046,
  17356,
  16673,
  16938,
  2462,
  2067,
  18,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 1, 2, 2, 2, 2, 0, 5, 0, 7, 8, 8, 8, 0, -100]}

In [25]:
batch = data_collator([tokenized_dataset["train"][0]])
batch["labels"] 

tensor([[-100,    1,    2,    2,    2,    2,    0,    5,    0,    7,    8,    8,
            8,    0, -100]])

In [26]:
batch = data_collator([tokenized_dataset["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    1,    2,    2,    2,    2,    0,    5,    0,    7,    8,    8,
            8,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    7,    8,    8,    0,    3,    4,    4,    4,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    7,    0,
            0,    0, -100]])

# Metric

In [27]:
ner_labels = train.features['ner_tags'].feature.names

In [50]:
ner_labels

['O',
 'B-PERSON',
 'I-PERSON',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-LOCATION',
 'I-LOCATION',
 'B-MISC',
 'I-MISC']

In [28]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script: 6.34kB [00:00, 14.5MB/s]


In [29]:
labels = dataset["train"][0]["ner_tags"]
labels

[1, 2, 0, 5, 0, 7, 0]

In [30]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 532629
    })
})

In [31]:
labels = dataset["train"][0]["ner_tags"]
labels = [ner_labels[i] for i in labels]

In [32]:
labels

['B-PERSON', 'I-PERSON', 'O', 'B-LOCATION', 'O', 'B-MISC', 'O']

In [33]:
# We will check whether the metric works as expected.

In [34]:
example = labels.copy()

In [35]:
example[1] = "O"

In [36]:
labels

['B-PERSON', 'I-PERSON', 'O', 'B-LOCATION', 'O', 'B-MISC', 'O']

In [37]:
example

['B-PERSON', 'O', 'O', 'B-LOCATION', 'O', 'B-MISC', 'O']

In [38]:
metric.compute(predictions=[example],references=[labels])

{'LOCATION': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'PERSON': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8571428571428571}

In [51]:
import numpy as np


def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class of each
            token is the argmax over the last axis of ``logits``.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the ``overall_*`` entries of the metric result.

    Uses the notebook-level ``ner_labels`` (id -> tag name) and ``metric``.
    Positions whose label is -100 are dropped from both references and predictions.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tags: every labelled position, with ignored (-100) positions removed.
    reference_tags = []
    for gold_row in labels:
        reference_tags.append([ner_labels[g] for g in gold_row if g != -100])

    # Predicted tags: keep a prediction only where the reference is not ignored.
    predicted_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        predicted_tags.append(
            [ner_labels[p] for p, g in zip(pred_row, gold_row) if g != -100]
        )

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [52]:
# Two-way lookup between integer class ids and tag names, built from `ner_labels`.
id2label = dict(enumerate(ner_labels))
label2id = {name: index for index, name in id2label.items()}

In [53]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head.
# The label maps are passed in so the model config carries the tag names
# (visible later through model.config.id2label).
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
from transformers import TrainingArguments

# Training configuration shared by every Trainer run in this notebook.
args = TrainingArguments(
    "output_ner",  # checkpoints are written under this directory
    logging_strategy="steps", # report the training loss on a step schedule
    logging_steps=500, # every 500 steps; runs shorter than this show "No log" for training loss
    per_device_train_batch_size=16, # training batch size per device
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True  # half-precision training; NOTE(review): presumably needs a CUDA device, confirm
)

In [55]:
from datasets import DatasetDict
# Hold out 10% of the tokenized rows for evaluation; the fixed seed keeps the split reproducible.
split_dataset = tokenized_dataset['train'].train_test_split(test_size = 0.1, seed = 42)

In [56]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 532629
    })
})

In [57]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 479366
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 53263
    })
})

In [58]:
# Re-wrap the split so the held-out part is called 'validation' instead of 'test'.
ds = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

In [48]:
# Forward pass on one sentence as a sanity check of the model's output.
# is_split_into_words=True expects a list of words, so the sentence is split first.
# Passing the whole sentence as a single list element declared it to be one "word".
inputs = tokenizer(
    "Gokhan Ergul 8 Ülke gezdi".split(),
    return_tensors="pt",
    is_split_into_words=True,
)
outputs = model(**inputs)
print(outputs)

TokenClassifierOutput(loss=None, logits=tensor([[[-0.1449,  0.1564, -0.0080, -0.1890, -0.1102,  1.0029,  0.6703,
          -0.7343,  0.4139],
         [-0.3636, -0.9279,  0.3841,  0.4687, -0.0844,  0.3522, -0.1254,
          -0.3245,  0.7734],
         [-0.5048,  0.1473, -0.2862, -0.4492, -0.2279, -0.0546,  0.2735,
           0.0658, -0.2605],
         [-0.3211,  0.1372, -0.0402,  0.2147, -0.6757, -0.2528,  0.0072,
          -0.6849,  0.2127],
         [ 0.0730, -0.5232, -0.2662,  0.7784, -0.5404,  0.1430,  0.2655,
          -0.2674,  0.7854],
         [-0.5356, -0.2849, -0.3356,  0.1261, -0.8518,  0.4958,  0.0186,
          -0.3732,  0.6601],
         [-0.4989, -0.0532,  0.1087,  0.5134, -0.5403, -0.1699, -0.6500,
           0.1798,  0.0928],
         [-1.2003,  0.1072,  0.5625,  0.3486, -0.3021,  0.2907,  0.3294,
          -0.3006,  0.3240],
         [-0.4777, -0.0480,  0.0340, -0.1152, -0.3919,  0.5954, -0.6397,
          -1.0572,  0.4845],
         [-0.6239,  0.1282, -0.3619,  0.92

In [59]:
ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 479366
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 53263
    })
})

In [62]:
# Small slice of each split (first 5000 train rows, first 2000 validation rows)
# for a quick trial run of the training loop.
ds_simple = DatasetDict(
    {
        split_name: ds[split_name].select(range(row_count))
        for split_name, row_count in (("train", 5000), ("validation", 2000))
    }
)

In [63]:
ds_simple

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [60]:
from transformers import Trainer

In [64]:
# Trial run: train and evaluate on the down-sampled `ds_simple` splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_simple["train"],
    eval_dataset=ds_simple["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mgokhannergull[0m ([33mgokhannergull-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.493734,0.342811,0.348977,0.345866,0.808825


TrainOutput(global_step=313, training_loss=0.641147942588733, metrics={'train_runtime': 49.483, 'train_samples_per_second': 101.045, 'train_steps_per_second': 6.325, 'total_flos': 130595401352880.0, 'train_loss': 0.641147942588733, 'epoch': 1.0})

In [65]:
# Larger trial slice: first 10000 train rows and first 4000 validation rows.
# This rebinds `ds_simple`, replacing the smaller slice used before.
ds_simple = DatasetDict(
    {
        split_name: ds[split_name].select(range(row_count))
        for split_name, row_count in (("train", 10000), ("validation", 4000))
    }
)

In [66]:
# Second trial run on the current `ds_simple` splits.
# NOTE(review): `model` is the same object passed to the earlier Trainer cell and is not
# re-created in between, so this run appears to continue from already fine-tuned weights
# rather than from the pretrained checkpoint. Confirm that is intended when comparing scores.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_simple["train"],
    eval_dataset=ds_simple["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.475,0.442464,0.417219,0.448346,0.432223,0.822802


TrainOutput(global_step=625, training_loss=0.4720489440917969, metrics={'train_runtime': 93.3317, 'train_samples_per_second': 107.145, 'train_steps_per_second': 6.697, 'total_flos': 261223466867424.0, 'train_loss': 0.4720489440917969, 'epoch': 1.0})

In [70]:
import torch

In [72]:
torch.cuda.is_available()

True

In [67]:
from transformers import pipeline

In [69]:
ner_pipe = pipeline('ner',model = model, tokenizer = tokenizer, aggregation_strategy = "simple")

Device set to use cuda:0


In [75]:
sentence = "Merhaba ben Gökhan, İstanbul Kağıthane'den Hugging Face ailesine selamlar."
results = ner_pipe(sentence)
results

[{'entity_group': 'MISC',
  'score': 0.6844826,
  'word': 'Hugging Face',
  'start': 43,
  'end': 55}]

In [76]:
# Full run: train on the whole training split and evaluate on the whole validation split.
# NOTE(review): `model` is the same object used by the earlier trial runs, so it appears to
# start from their weights rather than from the pretrained checkpoint. Confirm that is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2969,0.29478,0.647345,0.66993,0.658444,0.883531


TrainOutput(global_step=29961, training_loss=0.334889830656807, metrics={'train_runtime': 4189.7839, 'train_samples_per_second': 114.413, 'train_steps_per_second': 7.151, 'total_flos': 1.258715000854818e+16, 'train_loss': 0.334889830656807, 'epoch': 1.0})

In [77]:
ner_pipe = pipeline('ner',model = model, tokenizer = tokenizer, aggregation_strategy = "simple")

Device set to use cuda:0


In [85]:
sentence = "Merhaba ben Gökhan, İstanbul'da yaŞıyorumç Hugging Face ailesine selamlar."
results = ner_pipe(sentence)
results

[{'entity_group': 'MISC',
  'score': 0.5826613,
  'word': 'ben',
  'start': 8,
  'end': 11},
 {'entity_group': 'LOCATION',
  'score': 0.6030029,
  'word': "İstanbul ' da",
  'start': 20,
  'end': 31},
 {'entity_group': 'MISC',
  'score': 0.39027178,
  'word': '##gin',
  'start': 46,
  'end': 49}]

In [87]:
print(model.config.id2label)

{0: 'O', 1: 'B-PERSON', 2: 'I-PERSON', 3: 'B-ORGANIZATION', 4: 'I-ORGANIZATION', 5: 'B-LOCATION', 6: 'I-LOCATION', 7: 'B-MISC', 8: 'I-MISC'}


In [100]:
tokenizer.tokenize("Ahmet")

['Ahmet']

In [103]:
# Rebuild the NER pipeline around the fine-tuned model, pinned to device 0
# (reported as cuda:0 in the output below).
# With aggregation_strategy="simple" the results come back as entity groups
# (dicts with 'entity_group', 'score', 'word', 'start', 'end').
ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0
)

# Turkish test sentence mentioning a person, a city and an organization.
sentence = "Merhaba ben Gökhan, İstanbul'da yaşıyorum. Hugging Face ailesine selamlar."
results = ner_pipe(sentence)

# Print one detected entity per line.
for r in results:
    print(r)

Device set to use cuda:0


{'entity_group': 'LOCATION', 'score': 0.5819567, 'word': "İstanbul ' da", 'start': 20, 'end': 31}
{'entity_group': 'MISC', 'score': 0.41383007, 'word': 'Hugging', 'start': 43, 'end': 50}


In [None]:
def postprocess(predictions, labels, label_names=None):
    """Convert batched prediction and label id tensors into lists of tag names.

    Args:
        predictions: tensor of predicted class ids, one row per sequence.
        labels: tensor of reference class ids with the same layout; positions
            equal to -100 are treated as ignored.
        label_names: optional id -> tag-name sequence. Defaults to the
            notebook-level ``ner_labels``. The earlier code read an undefined
            ``label_names`` global and would have raised ``NameError``.

    Returns:
        ``(true_labels, true_predictions)``: two lists of tag-name lists, with
        ignored positions removed from both.
    """
    if label_names is None:
        label_names = ner_labels

    # Move to CPU and copy before converting, so the arrays do not share tensor storage.
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [104]:
# Number of examples to inspect
num_examples = 5

# Take the first few rows of the validation split (a plain slice, not a random sample)
examples = ds['validation'][:num_examples]  # first 5 examples

for i in range(num_examples):
    input_ids = examples['input_ids'][i]
    # Decode the token ids back into text, dropping special tokens
    sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(f"Example {i+1}: {sentence}")


Example 1: Yönetim merkezi aynı isimli Kurgan şehri olan ilçe 2005 yılında Andhoy ilçesi'nden ayrılarak ilçe yapılmıştır.
Example 2: Bunun ardından Başbakan Ali Mohammed Ghedi tüm ülkeye silahsızlanma çağrısında bulunmuştur.
Example 3: Oxford Üniversitesi'nde siyaset doktorası yapmıştır.
Example 4: Baysungur, tam adı Gıyaseddin Baysungur, Timurlu devlet adamı ve hattat.
Example 5: 199192 NBA sezonu ABD profesyonel basketbol ligi NBA'in 46. sezonudur.


In [105]:
# Print the decoded text of each selected validation example.
# Relies on `num_examples` and `examples` from the previous cell and repeats its loop.
for i in range(num_examples):
    input_ids = examples['input_ids'][i]
    # Decode the token ids back into text, dropping special tokens
    sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(f"Example {i+1}: {sentence}")


Example 1: Yönetim merkezi aynı isimli Kurgan şehri olan ilçe 2005 yılında Andhoy ilçesi'nden ayrılarak ilçe yapılmıştır.
Example 2: Bunun ardından Başbakan Ali Mohammed Ghedi tüm ülkeye silahsızlanma çağrısında bulunmuştur.
Example 3: Oxford Üniversitesi'nde siyaset doktorası yapmıştır.
Example 4: Baysungur, tam adı Gıyaseddin Baysungur, Timurlu devlet adamı ve hattat.
Example 5: 199192 NBA sezonu ABD profesyonel basketbol ligi NBA'in 46. sezonudur.


In [106]:
# Run the NER pipeline on each selected validation example and print its entity groups.
# NOTE(review): the text fed to the pipeline is rebuilt from token ids, so it may not
# match the raw dataset sentence exactly. Confirm if the exact input matters.
for i in range(num_examples):
    input_ids = examples['input_ids'][i]
    sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
    predictions = ner_pipe(sentence)
    print(f"\nSentence {i+1}: {sentence}")
    for p in predictions:
        print(p)



Sentence 1: Yönetim merkezi aynı isimli Kurgan şehri olan ilçe 2005 yılında Andhoy ilçesi'nden ayrılarak ilçe yapılmıştır.
{'entity_group': 'LOCATION', 'score': 0.6800206, 'word': 'Kurgan', 'start': 28, 'end': 34}
{'entity_group': 'LOCATION', 'score': 0.90730876, 'word': 'Andhoy', 'start': 64, 'end': 70}
{'entity_group': 'LOCATION', 'score': 0.58469677, 'word': 'nden', 'start': 78, 'end': 82}

Sentence 2: Bunun ardından Başbakan Ali Mohammed Ghedi tüm ülkeye silahsızlanma çağrısında bulunmuştur.
{'entity_group': 'MISC', 'score': 0.87300646, 'word': 'Başbakan', 'start': 15, 'end': 23}
{'entity_group': 'PERSON', 'score': 0.44243264, 'word': 'Mohammed', 'start': 28, 'end': 36}
{'entity_group': 'PERSON', 'score': 0.4504234, 'word': '##he', 'start': 38, 'end': 40}

Sentence 3: Oxford Üniversitesi'nde siyaset doktorası yapmıştır.
{'entity_group': 'ORGANIZATION', 'score': 0.8838109, 'word': "Oxford Üniversitesi ' nde", 'start': 0, 'end': 23}
{'entity_group': 'MISC', 'score': 0.99340665, 'wor

In [117]:
import os

# Checkpoint directory written by the full training run; the step number in the
# name matches that run's global_step (29961).
output_dir = "output_ner/checkpoint-29961"

# List what the checkpoint directory contains.
files = os.listdir(output_dir)
# The printed heading is Turkish for "files in the '<dir>' folder:".
print(f"'{output_dir}' klasöründeki dosyalar:")
for f in files:
    print("-", f)


'output_ner/checkpoint-29961' klasöründeki dosyalar:
- optimizer.pt
- vocab.txt
- model.safetensors
- tokenizer.json
- rng_state.pth
- scaler.pt
- trainer_state.json
- tokenizer_config.json
- scheduler.pt
- training_args.bin
- special_tokens_map.json
- config.json
