In [14]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import re
from tqdm import tqdm

torch.backends.cudnn.benchmark = True

In [5]:
jokes_file = 'jokes_clean.txt'
train_file = 'jokes_clean.txt'

with open(jokes_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

fikralar = []
current_fikra = []
for line in lines:
    if line.strip() == '':
        if current_fikra:
            fikralar.append(' '.join(current_fikra))
            current_fikra = []
    else:
        current_fikra.append(line.strip())
if current_fikra:
    fikralar.append(' '.join(current_fikra))

with open(train_file, 'w', encoding='utf-8') as f:
    for fikra in fikralar:
        f.write(fikra + '\n\n')

In [6]:
spm.SentencePieceTrainer.train(
    input=train_file,
    model_prefix='fikra_tokenizer',
    vocab_size=10000,
    character_coverage=0.9995,
    model_type='bpe'
)
sp = spm.SentencePieceProcessor(model_file='fikra_tokenizer.model')

In [7]:
def tokenize_dataset(dataset_file):
    with open(dataset_file, 'r', encoding='utf-8') as file:
        text = file.read().strip()
    jokes = re.split(r'\n\s*\n', text)
    jokes = [re.sub(r'\s+', ' ', joke).strip() for joke in jokes if joke.strip()]
    tokenized_jokes = [sp.encode(joke, out_type=int) for joke in jokes]
    return tokenized_jokes

train_tokenized = tokenize_dataset('jokes_clean.txt')
print("Tokenized joke sayısı:", len(train_tokenized))

Tokenized joke sayısı: 777


In [8]:
# Dataset Tanımı ve tokenizasyon
class JokesDataset(Dataset):
    def __init__(self, tokenized_jokes, max_len=50, pad_token=0):
        self.sequences = []
        self.max_len = max_len
        for tokens in tokenized_jokes:
            if len(tokens) < max_len + 1:
                tokens = tokens + [pad_token] * (max_len + 1 - len(tokens))
            for i in range(len(tokens) - max_len):
                input_seq = tokens[i:i+max_len]
                target_seq = tokens[i+1:i+max_len+1]
                self.sequences.append((input_seq, target_seq))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        input_seq, target_seq = self.sequences[idx]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)


In [9]:
# PositionalEncoding: max_len burada 5000, üretim sırasında kullanılacak
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

# Model Tanımı: Eğitimde max_len=50 kullanılıyor.
class SimpleDecoderLLM(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, max_len=50):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)  # Eğitimde giriş uzunluğu 50
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        self.max_len = max_len

    def forward(self, tgt, memory):
        tgt = self.token_embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)
        tgt = tgt.transpose(0, 1)
        seq_len = tgt.size(0)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(tgt.device)
        output = self.transformer_decoder(tgt, memory, tgt_mask=tgt_mask)
        logits = self.fc_out(output)
        return logits.transpose(0, 1)


In [10]:
# Parametreler
vocab_size = 10000
train_max_len = 50            # Eğitimde kullanılan maksimum giriş uzunluğu
gen_max_length = 150       # Üretim çıktısı olarak maksimum token sayısı
batch_size = 64
epochs = 100

# Dataset ve DataLoader
train_dataset = JokesDataset(train_tokenized, max_len=train_max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(f"✅ GPU Kullanılıyor: {torch.cuda.get_device_name(0)}")
else:
    print("❌ GPU kullanılmıyor, CPU'ya geçildi!")

# Model oluşturulması (eğitimde kullanılan max_len = train_max_len)
model = SimpleDecoderLLM(vocab_size=vocab_size, d_model=256, nhead=4, num_layers=2, max_len=train_max_len)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

scaler = torch.amp.GradScaler('cuda')

✅ GPU Kullanılıyor: NVIDIA GeForce RTX 4060 Laptop GPU


In [26]:
train_losses = []
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch_inputs, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
        batch_inputs  = batch_inputs.to(device, non_blocking=True)
        batch_targets = batch_targets.to(device, non_blocking=True)
        optimizer.zero_grad()
        dummy_memory = torch.zeros(1, batch_inputs.size(0), model.d_model, device=device)
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(batch_inputs, dummy_memory)
            outputs = outputs.contiguous().view(-1, vocab_size)
            batch_targets = batch_targets.contiguous().view(-1)
            loss = criterion(outputs, batch_targets)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    scheduler.step()
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")
print("Eğitim tamamlandı!")

                                                              

Epoch 1/100 | Loss: 3.8340 | LR: 0.001000


                                                              

Epoch 2/100 | Loss: 1.1762 | LR: 0.000999


                                                              

Epoch 3/100 | Loss: 0.6422 | LR: 0.000998


                                                              

Epoch 4/100 | Loss: 0.4695 | LR: 0.000996


                                                              

Epoch 5/100 | Loss: 0.3876 | LR: 0.000994


                                                              

Epoch 6/100 | Loss: 0.3377 | LR: 0.000991


                                                              

Epoch 7/100 | Loss: 0.3012 | LR: 0.000988


                                                              

Epoch 8/100 | Loss: 0.2734 | LR: 0.000984


                                                              

Epoch 9/100 | Loss: 0.2535 | LR: 0.000980


                                                               

Epoch 10/100 | Loss: 0.2356 | LR: 0.000976


                                                               

Epoch 11/100 | Loss: 0.2194 | LR: 0.000970


                                                               

Epoch 12/100 | Loss: 0.2074 | LR: 0.000965


                                                               

Epoch 13/100 | Loss: 0.2004 | LR: 0.000959


                                                               

Epoch 14/100 | Loss: 0.1931 | LR: 0.000952


                                                               

Epoch 15/100 | Loss: 0.1860 | LR: 0.000946


                                                               

Epoch 16/100 | Loss: 0.1799 | LR: 0.000938


                                                               

Epoch 17/100 | Loss: 0.1752 | LR: 0.000930


                                                               

Epoch 18/100 | Loss: 0.1705 | LR: 0.000922


                                                               

Epoch 19/100 | Loss: 0.1676 | LR: 0.000914


                                                               

Epoch 20/100 | Loss: 0.1633 | LR: 0.000905


                                                               

Epoch 21/100 | Loss: 0.1605 | LR: 0.000895


                                                               

Epoch 22/100 | Loss: 0.1580 | LR: 0.000885


                                                               

Epoch 23/100 | Loss: 0.1545 | LR: 0.000875


                                                               

Epoch 24/100 | Loss: 0.1507 | LR: 0.000864


                                                               

Epoch 25/100 | Loss: 0.1483 | LR: 0.000854


                                                               

Epoch 26/100 | Loss: 0.1452 | LR: 0.000842


                                                               

Epoch 27/100 | Loss: 0.1432 | LR: 0.000831


                                                               

Epoch 28/100 | Loss: 0.1413 | LR: 0.000819


                                                               

Epoch 29/100 | Loss: 0.1392 | LR: 0.000806


                                                               

Epoch 30/100 | Loss: 0.1364 | LR: 0.000794


                                                               

Epoch 31/100 | Loss: 0.1344 | LR: 0.000781


                                                               

Epoch 32/100 | Loss: 0.1326 | LR: 0.000768


                                                               

Epoch 33/100 | Loss: 0.1298 | LR: 0.000755


                                                               

Epoch 34/100 | Loss: 0.1277 | LR: 0.000741


                                                               

Epoch 35/100 | Loss: 0.1252 | LR: 0.000727


                                                               

Epoch 36/100 | Loss: 0.1234 | LR: 0.000713


                                                               

Epoch 37/100 | Loss: 0.1222 | LR: 0.000699


                                                               

Epoch 38/100 | Loss: 0.1201 | LR: 0.000684


                                                               

Epoch 39/100 | Loss: 0.1187 | LR: 0.000669


                                                               

Epoch 40/100 | Loss: 0.1160 | LR: 0.000655


                                                               

Epoch 41/100 | Loss: 0.1142 | LR: 0.000639


                                                               

Epoch 42/100 | Loss: 0.1124 | LR: 0.000624


                                                               

Epoch 43/100 | Loss: 0.1113 | LR: 0.000609


                                                               

Epoch 44/100 | Loss: 0.1104 | LR: 0.000594


                                                               

Epoch 45/100 | Loss: 0.1083 | LR: 0.000578


                                                               

Epoch 46/100 | Loss: 0.1067 | LR: 0.000563


                                                               

Epoch 47/100 | Loss: 0.1058 | LR: 0.000547


                                                               

Epoch 48/100 | Loss: 0.1036 | LR: 0.000531


                                                               

Epoch 49/100 | Loss: 0.1017 | LR: 0.000516


                                                               

Epoch 50/100 | Loss: 0.1009 | LR: 0.000500


                                                               

Epoch 51/100 | Loss: 0.0993 | LR: 0.000484


                                                               

Epoch 52/100 | Loss: 0.0988 | LR: 0.000469


                                                               

Epoch 53/100 | Loss: 0.0969 | LR: 0.000453


                                                               

Epoch 54/100 | Loss: 0.0957 | LR: 0.000437


                                                               

Epoch 55/100 | Loss: 0.0943 | LR: 0.000422


                                                               

Epoch 56/100 | Loss: 0.0937 | LR: 0.000406


                                                               

Epoch 57/100 | Loss: 0.0924 | LR: 0.000391


                                                               

Epoch 58/100 | Loss: 0.0915 | LR: 0.000376


                                                               

Epoch 59/100 | Loss: 0.0902 | LR: 0.000361


                                                               

Epoch 60/100 | Loss: 0.0889 | LR: 0.000345


                                                               

Epoch 61/100 | Loss: 0.0883 | LR: 0.000331


                                                               

Epoch 62/100 | Loss: 0.0869 | LR: 0.000316


                                                               

Epoch 63/100 | Loss: 0.0861 | LR: 0.000301


                                                               

Epoch 64/100 | Loss: 0.0853 | LR: 0.000287


                                                               

Epoch 65/100 | Loss: 0.0843 | LR: 0.000273


                                                               

Epoch 66/100 | Loss: 0.0832 | LR: 0.000259


                                                               

Epoch 67/100 | Loss: 0.0826 | LR: 0.000245


                                                               

Epoch 68/100 | Loss: 0.0816 | LR: 0.000232


                                                               

Epoch 69/100 | Loss: 0.0807 | LR: 0.000219


                                                               

Epoch 70/100 | Loss: 0.0803 | LR: 0.000206


                                                               

Epoch 71/100 | Loss: 0.0792 | LR: 0.000194


                                                               

Epoch 72/100 | Loss: 0.0785 | LR: 0.000181


                                                               

Epoch 73/100 | Loss: 0.0776 | LR: 0.000169


                                                               

Epoch 74/100 | Loss: 0.0770 | LR: 0.000158


                                                               

Epoch 75/100 | Loss: 0.0763 | LR: 0.000146


                                                               

Epoch 76/100 | Loss: 0.0756 | LR: 0.000136


                                                               

Epoch 77/100 | Loss: 0.0750 | LR: 0.000125


                                                               

Epoch 78/100 | Loss: 0.0745 | LR: 0.000115


                                                               

Epoch 79/100 | Loss: 0.0738 | LR: 0.000105


                                                               

Epoch 80/100 | Loss: 0.0733 | LR: 0.000095


                                                               

Epoch 81/100 | Loss: 0.0727 | LR: 0.000086


                                                               

Epoch 82/100 | Loss: 0.0722 | LR: 0.000078


                                                               

Epoch 83/100 | Loss: 0.0714 | LR: 0.000070


                                                               

Epoch 84/100 | Loss: 0.0713 | LR: 0.000062


                                                               

Epoch 85/100 | Loss: 0.0709 | LR: 0.000054


                                                               

Epoch 86/100 | Loss: 0.0699 | LR: 0.000048


                                                               

Epoch 87/100 | Loss: 0.0701 | LR: 0.000041


                                                               

Epoch 88/100 | Loss: 0.0698 | LR: 0.000035


                                                               

Epoch 89/100 | Loss: 0.0694 | LR: 0.000030


                                                               

Epoch 90/100 | Loss: 0.0690 | LR: 0.000024


                                                               

Epoch 91/100 | Loss: 0.0686 | LR: 0.000020


                                                               

Epoch 92/100 | Loss: 0.0684 | LR: 0.000016


                                                               

Epoch 93/100 | Loss: 0.0682 | LR: 0.000012


                                                               

Epoch 94/100 | Loss: 0.0680 | LR: 0.000009


                                                               

Epoch 95/100 | Loss: 0.0679 | LR: 0.000006


                                                               

Epoch 96/100 | Loss: 0.0679 | LR: 0.000004


                                                               

Epoch 97/100 | Loss: 0.0680 | LR: 0.000002


                                                               

Epoch 98/100 | Loss: 0.0676 | LR: 0.000001


                                                               

Epoch 99/100 | Loss: 0.0676 | LR: 0.000000


                                                                

Epoch 100/100 | Loss: 0.0676 | LR: 0.000000
Eğitim tamamlandı!




In [27]:
torch.save(model.state_dict(), 'model_v10.pth')
print("Model 'model_v10.pth' olarak kaydedildi!")

Model 'model_v10.pth' olarak kaydedildi!


In [17]:
model = SimpleDecoderLLM(vocab_size=vocab_size, d_model=256, nhead=4, num_layers=2, max_len=train_max_len)
checkpoint = torch.load('model_v10.pth', weights_only=True)
if checkpoint['pos_encoder.pe'].shape != model.pos_encoder.pe.shape:
    checkpoint['pos_encoder.pe'] = model.pos_encoder.pe
model.load_state_dict(checkpoint)
model.to(device)
model.eval()
print("Model yüklendi ve kullanıma hazır")

Model yüklendi ve kullanıma hazır


In [19]:
def generate_text(model, sp, start_text, max_length=gen_max_length, temperature=1.0, top_p=0.9, min_gen_length=50):
    model.eval()
    with torch.no_grad():
        tokens = sp.encode(start_text, out_type=int)
        if len(tokens) > model.max_len:
            tokens = tokens[-model.max_len:]
        generated = tokens.copy()
        dummy_memory = torch.zeros(1, 1, model.d_model, device=device)
        for _ in range(max_length - len(tokens)):
            trimmed_input = torch.tensor(generated[-model.max_len:], dtype=torch.long, device=device).unsqueeze(0)
            outputs = model(trimmed_input, dummy_memory)
            next_token_logits = outputs[:, -1, :] / temperature

            # Nucleus (top-p) sampling:
            sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            next_token_logits[0, sorted_indices[sorted_indices_to_remove]] = -float('Inf')
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            generated.append(next_token)
            decoded = sp.decode(generated)
            if next_token == sp.eos_id():
                break
            # Eğer minimum token sayısına ulaşılmışsa ve çıktı son karakteri nokta, ünlem veya soru işareti ile bitiyorsa dur.
            if len(generated) >= min_gen_length and decoded.strip()[-1] in ['.', '!', '?']:
                break
        generated_text = sp.decode(generated)
        return generated_text

# Örnek kullanım:
start_text = "ensar hoca"  # Başlangıç metni
generated_fikra = generate_text(model, sp, start_text, max_length=gen_max_length, temperature=0.7, top_p=0.9, min_gen_length=50)
print("Üretilen Fıkra:", generated_fikra)

Üretilen Fıkra: ensar hoca bir gün pazarda dolaşırken bir adamın sürekli bedava akıl bedava akıl diye bağırdığını duymuş hoca yaklaşmış ve adama sormuş ne yapıyorsun böyle adam cevap vermiş insanlara bedava akıl dağıtıyorum bunun üzerine hoca o halde bana iki kilo ver demiş adam şaşırarak ama hoca akıl tartılmaz ki hoca gülümseyerek o zaman sen de bağırarak satma demiş adam şaşırarak ama hoca gülümseyerek o zaman sen de bağırarak satma demiş ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  tane don.


In [20]:
with open("jokes_clean.txt", "r", encoding="utf-8") as fin, open("jokes_preprocessed.txt", "w", encoding="utf-8") as fout:
    joke = []
    for line in fin:
        if line.strip() == "":
            if joke:
                fout.write(" ".join(joke).strip() + " <EOS>\n")
                joke = []
        else:
            joke.append(line.strip())
    if joke:
        fout.write(" ".join(joke).strip() + " <EOS>\n")
