In [1]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install -U datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m81.5 MB/s[0m eta [36m0:00:

In [2]:
import torch
print("GPU kullanılıyor mu?", torch.cuda.is_available())
print("GPU ismi:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Yok")

GPU kullanılıyor mu? True
GPU ismi: Tesla T4


In [None]:
from huggingface_hub import login
login("###########")

In [4]:
import pandas as pd
from datasets import load_dataset

# Dataset yolunu belirt
dataset_path = "/kaggle/input/fikradataset"

# JSON formatındaki dataset'i yükle
dataset = load_dataset("json", data_files={"train": f"{dataset_path}/dataset.json"}, split="train")

# Dataset'i pandas DataFrame'e çevir
df = pd.DataFrame(dataset)

# Her bir fıkranın başına prompt ekle
df["text"] = "<s>[INST] Fıkra yazar mısın? [/INST] " + df["text"]


# Eğer sadece textleri liste olarak almak istersen
texts = df["text"].tolist()

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# 🧠 4. Modeli ve Tokenizer’ı Yükle

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "ytu-ce-cosmos/turkish-gpt2-large"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/585k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [6]:
# 🔧 5. PEFT ile LoRA Ayarlarını Yap

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "q_attn", "v_attn"]  # GPT-2 tabanlı modellerde bu olur
)

model = get_peft_model(model, peft_config)

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForLanguageModeling

dataset = dataset.filter(lambda x: x["text"].strip() != "")

import re

def clean_text(text):
    # HTML / CSS tag'lerini kaldır
    text = re.sub(r"<[^>]+>", "", text)
    # BBCode-style şeyleri kaldır
    text = re.sub(r"\[/?[^\]]+\]", "", text)
    # Çok uzun boşlukları, satırları düzelt
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["text"] = df["text"].apply(clean_text)


# Tokenizer ayarları
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Collator → MLM olmadan klasik causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

'''
# Eğitim ayarları
training_args = TrainingArguments(
    output_dir="./fikra-GPT",
    per_device_train_batch_size=4,
    num_train_epochs=4,
    logging_steps=100,
    save_steps=200,
    learning_rate=3e-5,
    bf16=True,
    save_total_limit=2,
    report_to="none"
)
'''

training_args = TrainingArguments(
    output_dir="./fikra-GPT",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # Etkili batch size: 4x2=8
    num_train_epochs=6,
    learning_rate=3e-5,             # Daha istikrarlı öğrenme
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    warmup_steps=200,               # Isınma adımı
    weight_decay=0.01,
    lr_scheduler_type="cosine",     # Daha yumuşak düşüş
    report_to="none"
)


# SFTTrainer → Artık formatting_func YOK
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    data_collator=data_collator,
    #tokenizer=tokenizer   
)

# Eğitimi başlat
trainer.train()

Filter:   0%|          | 0/4614 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'strip'

In [None]:
# Model ve tokenizer'ı kaydet
trainer.model.save_pretrained("fine-tuned-fikra")
tokenizer.save_pretrained("fine-tuned-fikra")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_path = "fine-tuned-fikra"

model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)

In [None]:
prompt = "<s>[INST] Fıkra yazar mısın? [/INST]"

outputs = pipe(
    prompt,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_k=40,
    top_p=0.9,
    repetition_penalty=1.4,
    num_return_sequences=3
)

result = outputs[0]["generated_text"]
cleaned_result = result.split("[/INST]")[-1].strip()
print(cleaned_result)