In [11]:
from datasets import load_dataset

In [12]:
dataset = load_dataset("kde4",lang1= "en", lang2 = 'tr')
dataset

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ff4f1dda-7c84-42a1-b689-727ea8489f0e)')' thrown while requesting HEAD https://huggingface.co/datasets/kde4/resolve/main/README.md
Retrying in 1s [Retry 1/5].


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 153438
    })
})

In [13]:
split_ds = dataset['train'].train_test_split(seed=101,train_size=0.9)
split_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 138094
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 15344
    })
})

In [14]:
split_ds['train'][23]['translation']

{'en': 'Starting month of the fiscal year', 'tr': 'Mali yılın başlangıç ayı'}

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
check_point = 'Helsinki-NLP/opus-mt-tc-big-en-tr'
model = AutoModelForSeq2SeqLM.from_pretrained(check_point)
tokenizer = AutoTokenizer.from_pretrained(check_point)

In [16]:
text_to_translate = "The best way to learn is by doing"
inputs = tokenizer(text_to_translate, return_tensors = 'pt')
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
translated_text

'Öğrenmenin en iyi yolu yapmaktır.'

In [17]:
split_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 138094
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 15344
    })
})

In [18]:
split_ds['train'][0]

{'id': '103429',
 'translation': {'en': 'Chat with %1', 'tr': '% 1 ile sohbet et'}}

In [19]:
import time
import statistics
start_time = time.perf_counter()
length_list = []
for i in split_ds['train']['translation']:
    tokenized_example_en = tokenizer(i['en'])
    length = len(tokenized_example_en['input_ids'])
    length_list.append(length)

print(f"mean of the tokenized_en: {statistics.mean(length_list)} , standart_devision: {statistics.stdev(length_list)}, median:{statistics.median(length_list)}")
end_time = time.perf_counter()

duration = end_time - start_time
print(f"duration:{duration}")

Token indices sequence length is longer than the specified maximum sequence length for this model (890 > 512). Running this sequence through the model will result in indexing errors


mean of the tokenized_en: 7.422683099917448 , standart_devision: 11.69622304809656, median:5.0
duration:8.537915215999988


## for your memory use this:

In [20]:
import numpy as np
start_time = time.perf_counter()
def length_of_en_token(example):
    en_batch = [en['en'] for en in example['translation']]
    tokenized_en = tokenizer(en_batch)
    return {'length': [len(i) for i in tokenized_en['input_ids']]}

lengths_ds = split_ds.map(length_of_en_token, batched=True, num_proc=4)

length_list_fast = lengths_ds['train']['length']

print(f"Mean: {np.mean(length_list_fast)}")
print(f"Standard Deviation: {np.std(length_list_fast)}")
print(f"Median: {np.median(length_list_fast)}")

end_time = time.perf_counter()

duration = end_time - start_time
print(f"duration:{duration}")

Mean: 7.422683099917448
Standard Deviation: 11.696180699246419
Median: 5.0
duration:0.6049959889999741


# this shows us how much map important.

In [21]:
def tokenizer_fun(example):
    source_text = [ex['en'] for ex in example['translation']]
    target_text = [ex['tr'] for ex in example['translation']]
    inputs = tokenizer(source_text,max_length=128, padding='longest', truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_text,max_length=128, padding='longest', truncation=True)
        
    inputs["labels"] = labels["input_ids"]
    return inputs
        

In [22]:
tk_ds = split_ds.map(
     tokenizer_fun,
    batched=True,
    remove_columns=split_ds["train"].column_names,
)
tk_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 138094
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15344
    })
})

In [23]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

we don't need to change paddings with -100 for input_ids manually because data_collator already handles it. 

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model = model)

In [25]:
batch = data_collator([tk_ds["train"][i] for i in range(7,9)])

In [26]:
list(batch.keys())

['input_ids', 'attention_mask', 'labels', 'decoder_input_ids']

the model already knows the padding values it labeled with 57059

In [27]:
batch['labels']

tensor([[ 4698, 13254, 23479, 51664, 29453,    33, 29477, 32858, 43741, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059],
        [  151, 53200,  6806, 31375, 43741, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059

In [28]:
batch['decoder_input_ids']

tensor([[57059,  4698, 13254, 23479, 51664, 29453,    33, 29477, 32858, 43741,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059, 57059,
         57059, 57059],
        [57059,   151, 53200,  6806, 31375, 43741, 57059, 57059, 57059, 57059,
         57059, 57059, 57059

In [67]:
%pip install sacrebleu

Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorama (from sacrebleu)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Install

In [29]:
import evaluate
metric = evaluate.load('sacrebleu')

In [30]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [31]:
tokenizer.pad_token_id

57059

In [32]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]

    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": result["score"]}

In [33]:
from huggingface_hub import login

login(token="hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
print('basarili')

basarili


In [34]:
from huggingface_hub import whoami
try:
    user_info = whoami()
    print("Kullanıcı Adı:", user_info['name'])
except Exception as e:
    print(" Giriş yapılamadı", e)

Kullanıcı Adı: gokhanErgul


In [35]:
#bu degisikliklerle 3 saat 33 dakikaya indi 12 saatten.

from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="kde4-en-to-tr_with-Helsinki",
    
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    
    per_device_train_batch_size=8,

    gradient_accumulation_steps=8,
    
    # Değerlendirme batch boyutu (eğitimin 2 katı genellikle güvenlidir)
    per_device_eval_batch_size=16,
    
    # fp16, hem VRAM kullanımını azaltır hem de RTX 30 serisi kartlarda 
    fp16=True,
    
    # Veri yüklemeyi hızlandırmak için (CPU çekirdek sayınıza göre artırılabilir)
    dataloader_num_workers=8,
    
    # torch_compile=True, 
    
    # Değerlendirme ve Hub
    predict_with_generate=True,
    push_to_hub=True,
)

# Ayarları kontrol edelim
effective_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps
print(f"Kullanılacak Gerçek Batch Boyutu: {args.per_device_train_batch_size}")
print(f"Gradyan Biriktirme Adımları: {args.gradient_accumulation_steps}")
print(f"Modelin Hissedeceği Etkili Batch Boyutu: {effective_batch_size}")

Kullanılacak Gerçek Batch Boyutu: 8
Gradyan Biriktirme Adımları: 8
Modelin Hissedeceği Etkili Batch Boyutu: 64


In [79]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tk_ds["train"],
    eval_dataset=tk_ds["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [82]:
trainer.evaluate(max_length=128)

[34m[1mwandb[0m: Currently logged in as: [33mgokhannergull[0m ([33mgokhannergull-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 9.69012451171875,
 'eval_model_preparation_time': 0.0029,
 'eval_bleu': 13.689495555514489,
 'eval_runtime': 925.0187,
 'eval_samples_per_second': 16.588,
 'eval_steps_per_second': 0.259}

In [28]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tk_ds["train"],
    eval_dataset=tk_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# In here i made a mistakes: the targets value was english too instead of turkish. thats why the loss values like this: 

In [91]:
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Step,Training Loss
2500,0.0029
3000,0.0027
3500,0.0026
4000,0.0024
4500,0.0019
5000,0.0017
5500,0.0017
6000,0.0017


TrainOutput(global_step=6474, training_loss=0.0014031449419722205, metrics={'train_runtime': 11329.1515, 'train_samples_per_second': 36.568, 'train_steps_per_second': 0.571, 'total_flos': 5.735620995239117e+16, 'train_loss': 0.0014031449419722205, 'epoch': 3.0})

In [29]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mgokhannergull[0m ([33mgokhannergull-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.7166
1000,0.1136
1500,0.1073
2000,0.1022
2500,0.0937
3000,0.0906
3500,0.0894
4000,0.0889




TrainOutput(global_step=4316, training_loss=0.28476363315529246, metrics={'train_runtime': 7372.4471, 'train_samples_per_second': 37.462, 'train_steps_per_second': 0.585, 'total_flos': 3.740743108657152e+16, 'train_loss': 0.28476363315529246, 'epoch': 2.0})

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [38]:
local_model_path = "./kde4-en-to-tr_with-Helsinki"
model = AutoModelForSeq2SeqLM.from_pretrained(local_model_path)

tokenizer_path = 'Helsinki-NLP/opus-mt-tc-big-en-tr'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [39]:
save_directory = "./-en-to-tr_with-Helsinki_ALL"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Tüm dosyalar '{save_directory}' klasörüne başarıyla kaydedildi!")

Tüm dosyalar './-en-to-tr_with-Helsinki_ALL' klasörüne başarıyla kaydedildi!


In [40]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="./-en-to-tr_with-Helsinki_ALL", 
    repo_id="gokhanErgul/kde4-en-to-tr_with-Helsinki",
    repo_type="model"
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/gokhanErgul/kde4-en-to-tr_with-Helsinki/commit/7592bbe0805514df5bc4a56b6a2af66ae109395e', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7592bbe0805514df5bc4a56b6a2af66ae109395e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gokhanErgul/kde4-en-to-tr_with-Helsinki', endpoint='https://huggingface.co', repo_type='model', repo_id='gokhanErgul/kde4-en-to-tr_with-Helsinki'), pr_revision=None, pr_num=None)

In [5]:
model_path = "gokhanErgul/kde4-en-to-tr_with-Helsinki"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [36]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tk_ds["train"],
    eval_dataset=tk_ds["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [37]:
model.generation_config.max_length = 128
model.generation_config.num_beams = 4

In [38]:
evaluation_results = trainer.evaluate(max_length=128)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [39]:
evaluation_results

{'eval_loss': 9.745146751403809,
 'eval_model_preparation_time': 0.003,
 'eval_bleu': 28.943267751793897,
 'eval_runtime': 766.7728,
 'eval_samples_per_second': 20.011,
 'eval_steps_per_second': 1.251}

# Predict

In [40]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(57060, 1024, padding_idx=57059)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(57060, 1024, padding_idx=57059)
      (embed_positions): MarianSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1

In [41]:
text_to_translate = "The book is on the table."
inputs = tokenizer(text_to_translate, return_tensors='pt')
inputs = {k: v.to(device) for k,v in inputs.items()}
output_tokens= model.generate(**inputs, max_length = 128,num_beams=4)
output_to_text = tokenizer.decode(output_tokens[0],skip_special_tokens=True)
output_to_text

'Kitap masanın üzerinde.'