In [1]:
!pip install datasets transformers tokenizers accelerate --upgrade




In [2]:
from datasets import load_dataset
dataset = load_dataset("yudum/wiki-tr-50000", split="train")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
def save_dataset_to_txt(dataset, path="corpus.txt"):
    with open(path, "w", encoding="utf-8") as f:
        for example in dataset:
            f.write(example["text"] + "\n")

save_dataset_to_txt(dataset)


In [4]:
from tokenizers import ByteLevelBPETokenizer
import os

os.makedirs("turkish-tokenizer", exist_ok=True)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="corpus.txt", vocab_size=32000, min_frequency=2, special_tokens=[
    "<s>", "<pad>", "</s>", "<unk>", "<mask>"
])
tokenizer.save_model("turkish-tokenizer")


['turkish-tokenizer/vocab.json', 'turkish-tokenizer/merges.txt']

In [5]:
from transformers import GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel

tokenizer = GPT2TokenizerFast.from_pretrained("turkish-tokenizer")
tokenizer.pad_token = tokenizer.eos_token

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    n_positions=512,
    n_ctx=512,
    n_embd=256,
    n_layer=4,
    n_head=4,
)

model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))
model = model.to("cuda")


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["title", "text"])


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT modelleri için mlm kullanılmaz
)


In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./gpt2-turkish",
    eval_strategy="no",
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [9]:
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maltin-eren63[0m ([33maltin-eren63-maltepe-niversitesi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,9.5158
200,8.7117
300,8.1489
400,8.0413
500,7.809
600,7.7159
700,7.7017
800,7.5863
900,7.4963
1000,7.519


TrainOutput(global_step=25000, training_loss=6.16072908203125, metrics={'train_runtime': 917.9143, 'train_samples_per_second': 54.471, 'train_steps_per_second': 27.236, 'total_flos': 293216991141888.0, 'train_loss': 6.16072908203125, 'epoch': 1.0})

In [10]:
model.save_pretrained("./my-turkish-gpt2")
tokenizer.save_pretrained("./my-turkish-gpt2")


('./my-turkish-gpt2/tokenizer_config.json',
 './my-turkish-gpt2/special_tokens_map.json',
 './my-turkish-gpt2/vocab.json',
 './my-turkish-gpt2/merges.txt',
 './my-turkish-gpt2/added_tokens.json',
 './my-turkish-gpt2/tokenizer.json')

In [12]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,  # CUDA
)

output = generator(
    "Bir gün Nasrettin Hoca",
    max_length=100,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)
print(output[0]["generated_text"])


Device set to use cuda:0


Bir gün Nasrettin Hoca, Gac, Amerika Birleşik Devletleri'nin bir şehirdir. İlçenin nüfusu 26 Mart 1993'tir.

Kaynakça 

Pendrdae ili belediyeleri
TBMM 6.1
A.de'deki şehirler
21. dönem yerleşim birimleri
Tak'deki şehirler
O. dönem köy
18. dönem yerleşim birimleri
Türkiye'deki iller
2010'nin mahalleleri
Evrifaessa Bot tarafından oluşturulan mahalle maddeleri
Evrifaessa Bot tarafından oluşturulan mahalle maddeleri
Evrifaessa Bot tarafından oluşturulan


In [13]:
from google.colab import files
uploaded = files.upload()


Saving fikralarFinal.json to fikralarFinal.json


In [14]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="fikralarFinal.json")  # ya da .txt için uygun format


Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
!ls


corpus.txt	    gpt2-turkish  my-turkish-gpt2  turkish-tokenizer
fikralarFinal.json  logs	  sample_data	   wandb


In [21]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "./my-turkish-gpt2",  # klasör adını dosya sistemindeki haliyle yaz
    local_files_only=True
)

model = AutoModelForCausalLM.from_pretrained(
    "./my-turkish-gpt2",
    local_files_only=True
)


In [22]:
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))


Embedding(32001, 256)

In [23]:
!mkdir -p own-model


In [30]:
# 1. Tokenizasyon
def tokenize_fn(example):
    tokenized = tokenizer(
        example["icerik"],  # önceki adı "text" yerine "icerik"
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # burada labels'ı ekliyoruz
    return tokenized


tokenized_ds = dataset.map(tokenize_fn, batched=True)
split_dataset = tokenized_ds["train"].train_test_split(test_size=0.1)



# 3. TrainingArguments ayarları
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./own-model",
    eval_strategy="epoch",  # her epoch'ta eval yapsın
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch"
)

# 4. Trainer tanımı
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer
)

# 5. Eğitimi başlat
trainer.train()


Map:   0%|          | 0/1207 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,5.80743
2,5.941500,5.250755
3,5.941500,5.118674


TrainOutput(global_step=816, training_loss=5.58597235586129, metrics={'train_runtime': 24.9172, 'train_samples_per_second': 130.753, 'train_steps_per_second': 32.748, 'total_flos': 7905654079488.0, 'train_loss': 5.58597235586129, 'epoch': 3.0})

In [33]:
trainer.save_model("./own-model")
tokenizer.save_pretrained("./own-model")


('./own-model/tokenizer_config.json',
 './own-model/special_tokens_map.json',
 './own-model/vocab.json',
 './own-model/merges.txt',
 './own-model/added_tokens.json',
 './own-model/tokenizer.json')

In [34]:
import os
os.listdir("./own-model")


['training_args.bin',
 'checkpoint-544',
 'tokenizer.json',
 'vocab.json',
 'special_tokens_map.json',
 'runs',
 'merges.txt',
 'config.json',
 'model.safetensors',
 'checkpoint-272',
 'tokenizer_config.json',
 'generation_config.json',
 'added_tokens.json',
 'checkpoint-816']

In [35]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Modeli ve tokenizer'ı yükle
model = GPT2LMHeadModel.from_pretrained("./own-model")
tokenizer = GPT2Tokenizer.from_pretrained("./own-model")

# Text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Örnek giriş cümlesi
prompt = "Nasreddin Hoca bir gün"

# Fıkra üretimi
result = generator(
    prompt,
    max_length=120,            # Maksimum toplam token sayısı
    num_return_sequences=1,    # Kaç farklı çıktı dönsün
    do_sample=True,            # Sampling (random üretim)
    temperature=0.9,           # Rastgelelik seviyesi (0.7–1.0 önerilir)
    top_k=50,                  # En olası 50 token arasından seçim
    top_p=0.95,                # Nucleus sampling
    eos_token_id=tokenizer.eos_token_id,  # cümle bitiş tokenı
)

print(result[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Nasreddin Hoca bir gün üç süre birlikte karar var verir. İlk ay sonra o pidilini dür: -Diyin-Rk. Eiş. Bir: -
-B-Eis.
-His;
