# 🚀 Turkcell-LLM-7B Knowledge Distillation Pipeline
## Türkçe Eğitim Asistanı için Optimize Edilmiş Bilgi Damıtma

Bu notebook, **Turkcell-LLM-7b-v1** modelini teacher (öğretmen) model olarak kullanan, Türkçe için optimize edilmiş bir knowledge distillation (bilgi damıtma) pipeline'ı içerir.

### ✅ Özellikler:
- Turkcell-LLM-7b-v1 teacher model
- A100 GPU için optimize edilmiş
- Mixed precision training (bf16)
- Gradient checkpointing
- Curriculum learning
- Türkçe'ye özel metrikler

In [None]:
# Sistem kontrolü ve setup
import torch
import os
import sys
import warnings
warnings.filterwarnings('ignore')

banner = "=" * 70
print(banner)
print("🎯 TURKCELL-LLM-7B TEACHER MODEL SETUP")
print(banner)

# Pick a suggested batch size from the available hardware:
# 4 without a GPU, 16 on an A100, 8 on any other GPU.
if not torch.cuda.is_available():
    print("⚠️ GPU bulunamadı")
    OPTIMAL_BATCH_SIZE = 4
else:
    device_props = torch.cuda.get_device_properties(0)
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = device_props.total_memory / 1e9

    print(f"✅ GPU: {gpu_name}")
    print(f"💾 GPU Memory: {gpu_memory:.1f} GB")
    print(f"🔧 CUDA Version: {torch.version.cuda}")
    print(f"⚡ Compute Capability: {device_props.major}.{device_props.minor}")

    is_a100 = 'A100' in gpu_name
    if is_a100:
        # A100-specific tuning: allow TF32 for matmuls and cuDNN kernels.
        print("\n🚀 A100 GPU tespit edildi - Optimizasyonlar aktif:")
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("  ✓ TF32 precision aktif")
        print("  ✓ Flash Attention 2 kullanılabilir")
    OPTIMAL_BATCH_SIZE = 16 if is_a100 else 8

print(f"\n📊 Önerilen batch size: {OPTIMAL_BATCH_SIZE}")

In [None]:
# Gerekli kütüphaneleri yükle
!pip install -q transformers>=4.36.0 accelerate>=0.25.0 bitsandbytes>=0.41.3 peft>=0.7.1
!pip install -q datasets evaluate nltk rouge-score sacrebleu bert-score
!pip install -q sentencepiece protobuf ftfy langdetect
!pip install -q wandb  # Opsiyonel: Experiment tracking

print("✅ Tüm kütüphaneler yüklendi")

## 📚 1. Turkcell Teacher Model Yükleme

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from dataclasses import dataclass
import gc

@dataclass
class TurkcellTeacherConfig:
    """Settings used to load the Turkcell teacher and to drive distillation.

    The field order is part of the interface: instances may be built
    positionally.
    """
    # Hugging Face Hub repository id passed to ``from_pretrained``.
    model_id: str = "TURKCELL/Turkcell-LLM-7b-v1"
    # "4bit" or "8bit" select a bitsandbytes mode; any other value
    # (e.g. "none") makes the loader skip quantization.
    quantization: str = "8bit"
    # Ask the loader for the Flash Attention 2 kernels when a GPU is present.
    use_flash_attention: bool = True
    # Maximum sequence length for the teacher.
    # NOTE(review): no code in this notebook section reads this field - confirm.
    max_length: int = 2048
    # Softmax temperature handed to the distillation trainer.
    temperature: float = 3.0
    # Batch size; also used for the train/validation DataLoaders.
    batch_size: int = 16
    
def load_turkcell_teacher(config: TurkcellTeacherConfig):
    """Load the Turkcell teacher model and its tokenizer, frozen for inference.

    Args:
        config: Loader settings. ``config.quantization`` selects a bitsandbytes
            mode ("4bit" or "8bit"); any other value loads the weights without
            quantization. ``config.use_flash_attention`` requests Flash
            Attention 2, which is only applied when CUDA is available and the
            ``flash_attn`` package can be imported.

    Returns:
        ``(model, tokenizer)`` on success, with the model in eval mode and every
        parameter frozen. ``(None, None)`` if loading raised; the error and a
        few hints are printed instead of propagating the exception.
    """
    # Function-scope import: only needed to probe for the optional flash-attn package.
    import importlib.util

    print(f"\n📥 Turkcell Teacher Model yükleniyor...")
    print(f"   Model: {config.model_id}")
    print(f"   Quantization: {config.quantization}")

    cuda_available = torch.cuda.is_available()
    # Only probe bf16 support when a CUDA device exists; the probe queries the device.
    bf16_supported = cuda_available and torch.cuda.is_bf16_supported()
    compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

    # Quantization config
    if config.quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype
        )
        print("   ✓ 4-bit quantization aktif (En düşük bellek)")
    elif config.quantization == "8bit":
        # BitsAndBytesConfig exposes a compute dtype for 4-bit only; there is
        # no 8-bit counterpart, so nothing else is passed here.
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        print("   ✓ 8-bit quantization aktif (Dengeli kalite-bellek)")
    else:
        bnb_config = None
        print("   ✓ Full precision (Maksimum kalite)")

    # Arguments forwarded to from_pretrained
    model_kwargs = {
        "device_map": "auto",
        "trust_remote_code": True,
        "torch_dtype": compute_dtype
    }

    if bnb_config is not None:
        model_kwargs["quantization_config"] = bnb_config

    # Flash Attention 2: request it only when the kernel package is importable.
    # Requesting it without the package makes from_pretrained fail outright.
    if config.use_flash_attention and cuda_available:
        if importlib.util.find_spec("flash_attn") is not None:
            model_kwargs["attn_implementation"] = "flash_attention_2"
            print("   ✓ Flash Attention 2 aktif")
        else:
            print("   ⚠️ Flash Attention 2 kullanılamıyor")

    # Load model and tokenizer. The broad except is deliberate: this is the
    # notebook-level boundary, and failure is reported through (None, None).
    try:
        model = AutoModelForCausalLM.from_pretrained(
            config.model_id,
            **model_kwargs
        )

        tokenizer = AutoTokenizer.from_pretrained(
            config.model_id,
            trust_remote_code=True,
            use_fast=True  # Fast tokenizer
        )

        # Fall back to EOS as the padding token when none is defined
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id

        # Parameter counts, in billions, measured before freezing
        total_params = sum(p.numel() for p in model.parameters()) / 1e9
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9

        print(f"\n✅ Turkcell Teacher Model başarıyla yüklendi!")
        print(f"   Total parameters: {total_params:.2f}B")
        print(f"   Trainable parameters: {trainable_params:.2f}B")

        if cuda_available:
            memory_used = torch.cuda.memory_allocated() / 1e9
            print(f"   GPU Memory used: {memory_used:.2f}GB")

        # The teacher is only queried, never trained: eval mode, no gradients
        model.eval()
        for param in model.parameters():
            param.requires_grad = False

        print("   ✓ Model evaluation modunda ve donduruldu")

        return model, tokenizer

    except Exception as e:
        print(f"\n❌ Model yükleme hatası: {e}")
        print("\n💡 Çözüm önerileri:")
        print("   1. Quantization seviyesini artırın (8bit -> 4bit)")
        print("   2. Batch size'ı azaltın")
        print("   3. Gradient checkpointing kullanın")
        return None, None

# Load the teacher model
teacher_config = TurkcellTeacherConfig(
    quantization="8bit",  # 8-bit is sufficient on a 40 GB A100
    use_flash_attention=True,
    batch_size=OPTIMAL_BATCH_SIZE
)

teacher_model, teacher_tokenizer = load_turkcell_teacher(teacher_config)

# Memory cleanup. Collect garbage first: empty_cache() can only return blocks
# whose tensors have already been freed, so the collector has to run before it.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 🎓 2. Student Model Seçimi ve Yükleme

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

@dataclass
class StudentModelConfig:
    """Settings for loading the student model and its optional LoRA adapter.

    The field order is part of the interface: instances may be built
    positionally.
    """
    # Hugging Face Hub repository id of the student checkpoint.
    model_id: str = "ytu-ce-cosmos/turkish-gpt2-large"
    # When True the loader wraps the model with a LoRA adapter; otherwise the
    # model is returned for full fine-tuning.
    use_lora: bool = True
    # LoRA rank.
    lora_r: int = 64
    # LoRA scaling factor.
    lora_alpha: int = 128
    # Dropout applied inside the LoRA layers.
    lora_dropout: float = 0.05
    # Module names to adapt; None leaves the choice to the loader.
    target_modules: list = None
    
def load_student_model(config: StudentModelConfig):
    """Load the student model and tokenizer, optionally wrapped with LoRA.

    Args:
        config: Student settings. When ``config.use_lora`` is true a LoRA
            adapter is attached; ``config.target_modules`` names the modules to
            adapt, and ``None`` lets PEFT pick its default for the model's
            architecture.

    Returns:
        ``(student_model, student_tokenizer)``. Gradient checkpointing is
        enabled on the model when it supports it.
    """
    print(f"\n📥 Student Model yükleniyor...")
    print(f"   Model: {config.model_id}")

    # Only probe bf16 support when a CUDA device exists; the probe queries the device.
    bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Student model - the smaller one
    student_model = AutoModelForCausalLM.from_pretrained(
        config.model_id,
        torch_dtype=torch.bfloat16 if bf16_supported else torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    student_tokenizer = AutoTokenizer.from_pretrained(
        config.model_id,
        trust_remote_code=True
    )

    # Fall back to EOS as the padding token when none is defined
    if student_tokenizer.pad_token is None:
        student_tokenizer.pad_token = student_tokenizer.eos_token
        student_tokenizer.pad_token_id = student_tokenizer.eos_token_id

    if config.use_lora:
        print("\n🔧 LoRA konfigürasyonu uygulanıyor...")

        # Prepare the base model for adapter training
        student_model = prepare_model_for_kbit_training(student_model)

        lora_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            # No hard-coded fallback: "q_proj"/"v_proj" do not exist in GPT-2
            # style students (their attention projection is "c_attn"). With
            # None, PEFT resolves the default targets from the model type.
            target_modules=config.target_modules or None,
            lora_dropout=config.lora_dropout,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )

        student_model = get_peft_model(student_model, lora_config)

        # LoRA statistics
        trainable_params = sum(p.numel() for p in student_model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in student_model.parameters())

        print(f"\n✅ Student Model (LoRA) hazır!")
        print(f"   Total parameters: {all_params/1e6:.2f}M")
        print(f"   Trainable parameters: {trainable_params/1e6:.2f}M")
        print(f"   Trainable %: {100 * trainable_params / all_params:.2f}%")
    else:
        print("\n✅ Student Model (Full Fine-tuning) hazır!")

    # Gradient checkpointing
    if hasattr(student_model, 'gradient_checkpointing_enable'):
        student_model.gradient_checkpointing_enable()
        # With LoRA the base weights (embeddings included) are frozen, so the
        # inputs of the checkpointed blocks carry no gradient and the adapter
        # weights inside them would receive none either. Making the embedding
        # output require grad restores the backward path.
        if config.use_lora and hasattr(student_model, 'enable_input_require_grads'):
            student_model.enable_input_require_grads()
        print("   ✓ Gradient checkpointing aktif")

    return student_model, student_tokenizer

# Alternative student checkpoints, keyed by a short alias
STUDENT_MODEL_OPTIONS = dict([
    ("turkish-gpt2", "ytu-ce-cosmos/turkish-gpt2-large"),
    ("turkish-bert-gpt2", "redrussianarmy/turkish-bert-gpt2"),
    ("gpt2-small-turkish", "ytu-ce-cosmos/turkish-gpt2-small"),
    ("distilgpt2-turkish", "dbmdz/distilbert-base-turkish-cased"),  # Encoder model
])

print("📋 Mevcut Student Model Seçenekleri:")
for alias in STUDENT_MODEL_OPTIONS:
    print(f"   • {alias}: {STUDENT_MODEL_OPTIONS[alias]}")

# Load the selected student with a rank-64 LoRA adapter
student_config = StudentModelConfig(
    model_id=STUDENT_MODEL_OPTIONS["turkish-gpt2"],
    use_lora=True,
    lora_r=64,
    lora_alpha=128,
)
student_model, student_tokenizer = load_student_model(student_config)

# Report GPU memory (in GB) once both models are resident
if torch.cuda.is_available():
    memory_used = torch.cuda.memory_allocated() / 1e9
    memory_reserved = torch.cuda.memory_reserved() / 1e9
    print("\n".join((
        f"\n💾 GPU Bellek Durumu:",
        f"   Kullanılan: {memory_used:.2f}GB",
        f"   Rezerve: {memory_reserved:.2f}GB",
    )))

## 🔄 3. Knowledge Distillation Training Loop

In [None]:
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np

class TurkcellDistillationTrainer:
    """Knowledge-distillation trainer fitting a student LM to a frozen teacher.

    The training loss is ``alpha * KD + (1 - alpha) * CE``: KD is the
    temperature-scaled KL divergence between the teacher's and the student's
    token distributions, CE is the student's next-token cross entropy.

    Teacher and student logits are compared position by position, so both
    models must produce logits of identical shape (same tokenization and
    vocabulary); a mismatch raises ``ValueError``.
    """

    def __init__(self,
                 teacher_model,
                 student_model,
                 teacher_tokenizer,
                 student_tokenizer,
                 temperature: float = 3.0,
                 alpha: float = 0.7,
                 max_length: int = 512):
        """Store the models, tokenizers and loss hyper-parameters.

        Args:
            teacher_model: Model queried without gradients for soft targets.
            student_model: Model being trained.
            teacher_tokenizer: Tokenizer used for the teacher's inputs.
            student_tokenizer: Tokenizer used for the student's inputs.
            temperature: Softmax temperature applied to both sets of logits.
            alpha: Weight of the KD term; ``1 - alpha`` weights the CE term.
            max_length: Length every batch is padded/truncated to.
        """
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.teacher_tokenizer = teacher_tokenizer
        self.student_tokenizer = student_tokenizer
        self.temperature = temperature
        self.alpha = alpha  # KD loss weight
        self.max_length = max_length

        # Batches are moved to the device holding the student's first parameter
        self.device = next(student_model.parameters()).device

        # Mixed precision is used only when the GPU supports bf16
        self.use_amp = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        if self.use_amp:
            self.scaler = torch.cuda.amp.GradScaler()
            print("✅ Mixed precision (BF16) training aktif")

        # One summary dict per completed epoch, appended by train_epoch
        self.training_history = []

    def compute_distillation_loss(self,
                                 student_logits: torch.Tensor,
                                 teacher_logits: torch.Tensor,
                                 labels: torch.Tensor,
                                 attention_mask: torch.Tensor = None):
        """Combine the soft-target KD loss with the hard-target CE loss.

        Args:
            student_logits: Student logits, ``(..., seq, vocab)``.
            teacher_logits: Teacher logits with the same shape.
            labels: Token ids aligned with the logits, ``(..., seq)``.
            attention_mask: Optional mask with the shape of ``labels``;
                positions whose value is not 1 are excluded from both terms.

        Returns:
            Dict with ``'loss'`` (tensor, differentiable), ``'kd_loss'`` and
            ``'ce_loss'`` (Python floats).

        Raises:
            ValueError: If student and teacher logits differ in shape.
        """
        if student_logits.shape != teacher_logits.shape:
            raise ValueError(
                "Teacher and student logits must have the same shape for "
                f"token-level distillation, got {tuple(teacher_logits.shape)} "
                f"(teacher) and {tuple(student_logits.shape)} (student); "
                "the two models need a shared tokenizer/vocabulary."
            )

        vocab_size = student_logits.size(-1)

        # --- Soft targets: KL divergence on every non-masked position -------
        # Computed in float32 so low-precision logits do not distort the softmax.
        student_flat = student_logits.reshape(-1, vocab_size).float()
        teacher_flat = teacher_logits.reshape(-1, vocab_size).to(student_flat.device).float()

        if attention_mask is not None:
            keep = attention_mask.reshape(-1) == 1
            student_flat = student_flat[keep]
            teacher_flat = teacher_flat[keep]

        # The T^2 factor keeps gradient magnitudes comparable across temperatures
        kd_loss = F.kl_div(
            F.log_softmax(student_flat / self.temperature, dim=-1),
            F.softmax(teacher_flat / self.temperature, dim=-1),
            reduction='batchmean'
        ) * (self.temperature ** 2)

        # --- Hard targets: next-token cross entropy --------------------------
        # A causal LM's logits at position t predict token t + 1, so logits and
        # labels are shifted against each other before comparing them.
        shift_logits = student_logits[..., :-1, :].reshape(-1, vocab_size).float()
        shift_labels = labels[..., 1:]
        if attention_mask is not None:
            # Padding targets are ignored through the -100 sentinel
            shift_labels = shift_labels.masked_fill(attention_mask[..., 1:] != 1, -100)
        shift_labels = shift_labels.reshape(-1)

        if bool((shift_labels != -100).any()):
            ce_loss = F.cross_entropy(
                shift_logits,
                shift_labels,
                ignore_index=-100
            )
        else:
            # No valid target (e.g. one-token sequences): contribute zero, not NaN
            ce_loss = shift_logits.sum() * 0.0

        total_loss = self.alpha * kd_loss + (1 - self.alpha) * ce_loss

        return {
            'loss': total_loss,
            'kd_loss': kd_loss.item(),
            'ce_loss': ce_loss.item()
        }

    @torch.no_grad()
    def get_teacher_outputs(self, input_ids, attention_mask):
        """Run the teacher without gradients and return its logits."""
        outputs = self.teacher_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        return outputs.logits

    def _tokenize_batch(self, tokenizer, texts):
        """Tokenize ``texts`` to fixed-length tensors on the training device."""
        return tokenizer(
            texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(self.device)

    def train_step(self, batch, optimizer, scheduler=None):
        """Run one optimisation step on ``batch``.

        Args:
            batch: Either a dict with a ``'text'`` entry or the texts themselves.
            optimizer: Optimizer over the student's parameters.
            scheduler: Optional learning-rate scheduler, stepped once per call.

        Returns:
            The dict from ``compute_distillation_loss``; ``'loss'`` is returned
            detached from the autograd graph.
        """
        texts = batch['text'] if isinstance(batch, dict) else batch

        teacher_inputs = self._tokenize_batch(self.teacher_tokenizer, texts)
        student_inputs = self._tokenize_batch(self.student_tokenizer, texts)

        # Teacher logits (no grad)
        teacher_logits = self.get_teacher_outputs(
            teacher_inputs['input_ids'],
            teacher_inputs['attention_mask']
        )

        # Student forward pass, under bf16 autocast when enabled. No labels are
        # passed: the model's built-in loss would be computed and thrown away.
        with torch.cuda.amp.autocast(enabled=self.use_amp, dtype=torch.bfloat16):
            student_outputs = self.student_model(
                input_ids=student_inputs['input_ids'],
                attention_mask=student_inputs['attention_mask']
            )

            loss_dict = self.compute_distillation_loss(
                student_outputs.logits,
                teacher_logits,
                student_inputs['input_ids'],
                student_inputs['attention_mask']
            )

        # Backward pass with gradient clipping
        loss = loss_dict['loss']

        if self.use_amp:
            self.scaler.scale(loss).backward()
            # Unscale first so clipping sees the true gradient norm
            self.scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(self.student_model.parameters(), max_norm=1.0)
            self.scaler.step(optimizer)
            self.scaler.update()
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.student_model.parameters(), max_norm=1.0)
            optimizer.step()

        if scheduler:
            scheduler.step()

        optimizer.zero_grad()

        # Release cached GPU blocks after every step
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Hand back a detached loss so callers do not keep graph references alive
        loss_dict['loss'] = loss.detach()
        return loss_dict

    def train_epoch(self, dataloader, optimizer, scheduler=None, epoch=0):
        """Train for one pass over ``dataloader`` and report average losses.

        Batches that fail with a CUDA out-of-memory error are skipped; any
        other ``RuntimeError`` is re-raised.

        Returns:
            Dict with ``'epoch'`` (1-based), ``'avg_loss'``, ``'avg_kd_loss'``
            and ``'avg_ce_loss'``; the averages are NaN when every batch was
            skipped. The same dict is appended to ``self.training_history``.
        """
        self.student_model.train()
        epoch_losses = []

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

        for batch in progress_bar:
            try:
                loss_dict = self.train_step(batch, optimizer, scheduler)
            except RuntimeError as e:
                if "out of memory" not in str(e):
                    raise
                print("\n⚠️ CUDA OOM! Batch'i atlayıp devam ediliyor...")
                # Drop gradients left behind by the aborted step
                optimizer.zero_grad(set_to_none=True)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue

            # Keep plain floats: averaging tensors with NumPy fails for CUDA or
            # grad-tracking tensors, and storing them would pin GPU memory.
            step_losses = {
                'loss': float(loss_dict['loss']),
                'kd_loss': float(loss_dict['kd_loss']),
                'ce_loss': float(loss_dict['ce_loss'])
            }
            epoch_losses.append(step_losses)

            progress_bar.set_postfix({
                'loss': f"{step_losses['loss']:.4f}",
                'kd': f"{step_losses['kd_loss']:.4f}",
                'ce': f"{step_losses['ce_loss']:.4f}"
            })

        # Epoch statistics (NaN when no batch completed)
        if epoch_losses:
            avg_loss = float(np.mean([d['loss'] for d in epoch_losses]))
            avg_kd_loss = float(np.mean([d['kd_loss'] for d in epoch_losses]))
            avg_ce_loss = float(np.mean([d['ce_loss'] for d in epoch_losses]))
        else:
            avg_loss = avg_kd_loss = avg_ce_loss = float('nan')

        print(f"\n📊 Epoch {epoch+1} Özeti:")
        print(f"   Average Loss: {avg_loss:.4f}")
        print(f"   KD Loss: {avg_kd_loss:.4f}")
        print(f"   CE Loss: {avg_ce_loss:.4f}")

        summary = {
            'epoch': epoch + 1,
            'avg_loss': avg_loss,
            'avg_kd_loss': avg_kd_loss,
            'avg_ce_loss': avg_ce_loss
        }
        self.training_history.append(summary)
        return summary

# Build the distillation trainer from the loaded models and tokenizers
trainer_kwargs = dict(
    teacher_model=teacher_model,
    student_model=student_model,
    teacher_tokenizer=teacher_tokenizer,
    student_tokenizer=student_tokenizer,
    temperature=teacher_config.temperature,
    alpha=0.7,  # 70% KD, 30% CE
    max_length=512,
)
trainer = TurkcellDistillationTrainer(**trainer_kwargs)

print("\n✅ Turkcell Distillation Trainer hazır!")
for label, value in (
    ("Temperature", trainer.temperature),
    ("Alpha (KD weight)", trainer.alpha),
    ("Max length", trainer.max_length),
):
    print(f"   {label}: {value}")

## 📊 4. Türkçe Veri Yükleme ve Hazırlama

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

class TurkishEducationDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``{'text': ...}`` items for a DataLoader.

    The base class is spelled out in full on purpose: in this notebook the bare
    name ``Dataset`` is rebound by ``from datasets import ... Dataset`` to the
    Hugging Face Arrow dataset, which is not meant to be subclassed this way.
    """

    def __init__(self, data, tokenizer=None, max_length=512):
        """Keep a reference to the samples.

        Args:
            data: Indexable collection of samples; each sample is either a dict
                with a ``'text'`` or ``'content'`` entry, or any other object,
                which is converted with ``str``.
            tokenizer: Stored as-is; not used by this class.
            max_length: Stored as-is; not used by this class.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{'text': <str>}``."""
        item = self.data[idx]

        if isinstance(item, dict):
            # Prefer 'text'; fall back to 'content' when it is missing or empty
            text = item.get('text', '') or item.get('content', '')
        else:
            text = str(item)

        return {'text': text}

def load_turkish_data(source="demo", max_samples=1000):
    """Return up to ``max_samples`` Turkish text samples as a list.

    ``source`` selects the origin:
      * ``"demo"``: ten built-in sentences, repeated to fill the requested size.
      * ``"huggingface"``: the ``text`` field of a Hub dataset.
      * anything else: treated as a CSV path with a ``text`` column.

    If the Hugging Face or CSV load raises, the error is printed and the demo
    data is returned instead.
    """
    print(f"\n📥 Türkçe veri yükleniyor: {source}")

    if source == "huggingface":
        try:
            hub_split = load_dataset(
                "ytu-ce-cosmos/turkish-qa",  # Turkish Q&A dataset
                split=f"train[:{max_samples}]"
            )
            samples = [row['text'] for row in hub_split]
            print(f"✅ {len(samples)} örnek Hugging Face'den yüklendi")
        except Exception as e:
            print(f"⚠️ Hugging Face yükleme hatası: {e}")
            print("Demo veri kullanılıyor...")
            return load_turkish_data("demo", max_samples)
        return samples

    if source != "demo":
        # Any other value is interpreted as the path of a CSV file
        try:
            frame = pd.read_csv(source)
            samples = frame['text'].tolist()[:max_samples]
            print(f"✅ {len(samples)} örnek CSV'den yüklendi")
        except Exception as e:
            print(f"⚠️ CSV yükleme hatası: {e}")
            return load_turkish_data("demo", max_samples)
        return samples

    # Built-in demo sentences
    demo_texts = [
        "Yapay zeka, insan zekasını taklit eden bilgisayar sistemleridir.",
        "Python programlama dili, veri bilimi için popüler bir araçtır.",
        "Makine öğrenmesi, verilerden örüntü çıkarma sürecidir.",
        "Derin öğrenme, yapay sinir ağlarını kullanan bir makine öğrenmesi yöntemidir.",
        "Türkiye'de teknoloji eğitimi hızla gelişmektedir.",
        "Matematik, bilimin temelidir ve problem çözme becerilerini geliştirir.",
        "Fizik, doğadaki olayları açıklayan temel bilimlerden biridir.",
        "Kimya, maddenin yapısını ve özelliklerini inceler.",
        "Biyoloji, canlıların yapısını ve yaşam süreçlerini araştırır.",
        "Tarih, geçmişi anlamamıza ve geleceği şekillendirmemize yardımcı olur."
    ]

    # Repeat the sentences often enough to cover the request, then trim
    repeats = max_samples // len(demo_texts) + 1
    samples = (demo_texts * repeats)[:max_samples]

    print(f"✅ {len(samples)} demo örnek oluşturuldu")
    return samples

# Load the demo samples for both splits
train_data = load_turkish_data("demo", max_samples=100)
val_data = load_turkish_data("demo", max_samples=20)

# Wrap each split in a dataset
train_dataset, val_dataset = (
    TurkishEducationDataset(split) for split in (train_data, val_data)
)

# DataLoaders share batch size and worker settings; only training shuffles
loader_options = dict(batch_size=teacher_config.batch_size, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"\n📊 Veri Özeti:")
for label, value in (
    ("Train samples", len(train_dataset)),
    ("Val samples", len(val_dataset)),
    ("Batch size", teacher_config.batch_size),
    ("Train batches", len(train_loader)),
    ("Val batches", len(val_loader)),
):
    print(f"   {label}: {value}")

## 🚀 5. Training Execution

In [None]:
# Optimizer and scheduler setup.
# The epoch count and learning rate are defined once and reused, so the
# schedule length, the printed summary and the loop cannot drift apart.
num_epochs = 3
learning_rate = 2e-5

optimizer = AdamW(
    student_model.parameters(),
    lr=learning_rate,
    weight_decay=0.01
)

# Linear schedule with 10% warmup
total_steps = len(train_loader) * num_epochs
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("\n🎯 Training Konfigürasyonu:")
print(f"   Learning rate: {learning_rate}")
print(f"   Total steps: {total_steps}")
print(f"   Warmup steps: {warmup_steps}")
print(f"   Epochs: {num_epochs}")

# Training loop
print("\n" + "="*70)
print("🚀 TRAINING BAŞLIYOR")
print("="*70)

best_loss = float('inf')
best_save_path = None  # path of the checkpoint with the best validation loss

for epoch in range(num_epochs):
    print(f"\n📚 Epoch {epoch+1}/{num_epochs}")
    print("-" * 50)

    # Train
    train_metrics = trainer.train_epoch(
        train_loader,
        optimizer,
        scheduler,
        epoch
    )

    # Validation
    print("\n🔍 Validation...")
    student_model.eval()
    val_losses = []

    with torch.no_grad():
        for batch in val_loader:
            texts = batch['text']

            student_inputs = student_tokenizer(
                texts,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).to(trainer.device)

            # Exclude padding from the loss: every batch is padded to 512
            # tokens, so unmasked padding would dominate the average.
            val_labels = student_inputs['input_ids'].masked_fill(
                student_inputs['attention_mask'] == 0, -100
            )

            outputs = student_model(**student_inputs, labels=val_labels)
            val_losses.append(outputs.loss.item())

    avg_val_loss = np.mean(val_losses)
    print(f"   Validation Loss: {avg_val_loss:.4f}")

    # Save a checkpoint whenever validation improves
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        print(f"   🏆 Yeni en iyi model! (loss: {best_loss:.4f})")

        save_path = f"./turkcell_distilled_epoch{epoch+1}"
        student_model.save_pretrained(save_path)
        student_tokenizer.save_pretrained(save_path)
        best_save_path = save_path
        print(f"   💾 Model kaydedildi: {save_path}")

    # Memory cleanup: collect garbage first so empty_cache() can release it
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n" + "="*70)
print("✅ TRAINING TAMAMLANDI!")
print("="*70)
print(f"\n📊 Final Sonuçlar:")
print(f"   Best validation loss: {best_loss:.4f}")
# Report the checkpoint that was actually written for the best epoch, not the
# last epoch's path (which may never have been saved).
print(f"   Model saved to: {best_save_path}")

## 🧪 6. Model Test ve Değerlendirme

In [None]:
def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.8):
    """Sample a continuation of ``prompt`` and return the decoded sequence.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used for encoding the prompt and decoding the result.
        prompt: Input text; truncated to 512 tokens.
        max_length: Number of new tokens to generate (passed as ``max_new_tokens``).
        temperature: Sampling temperature.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(model.device)

    # Nucleus sampling settings
    sampling_options = {
        "max_new_tokens": max_length,
        "temperature": temperature,
        "do_sample": True,
        "top_p": 0.95,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Prompts used to compare teacher and student generations side by side
test_prompts = [
    "Yapay zeka nedir?",
    "Python programlama dilinin avantajları",
    "Matematik öğrenmenin önemi",
    "Türkiye'de teknoloji eğitimi",
]

heavy_rule = "="*70
light_rule = "-" * 50

print("\n🧪 Model Test Sonuçları")
print(heavy_rule)

for i, prompt in enumerate(test_prompts, start=1):
    print(f"\n📝 Test {i}: {prompt}")
    print(light_rule)

    # Teacher output (heading is printed before the generation starts)
    print("👨‍🏫 Teacher (Turkcell):")
    teacher_output = generate_text(
        teacher_model, teacher_tokenizer, prompt, max_length=50
    )
    print(teacher_output)

    # Student output
    print("\n👨‍🎓 Student (Distilled):")
    student_output = generate_text(
        student_model, student_tokenizer, prompt, max_length=50
    )
    print(student_output)

print("\n" + heavy_rule)
print("✅ Test tamamlandı!")

## 💾 7. Model Export ve Deployment

In [None]:
# Save the student in Hugging Face format
final_save_path = "./turkcell_distilled_final"

print("\n💾 Final model kaydediliyor...")

# Student weights first, then the tokenizer files, into the same folder
for artifact in (student_model, student_tokenizer):
    artifact.save_pretrained(final_save_path)

# Store the training configuration next to the model
import json

training_config = dict(
    teacher_model=teacher_config.model_id,
    student_model=student_config.model_id,
    temperature=teacher_config.temperature,
    alpha=0.7,
    batch_size=teacher_config.batch_size,
    epochs=num_epochs,
    best_loss=float(best_loss),
    quantization=teacher_config.quantization,
)

with open(f"{final_save_path}/training_config.json", "w") as f:
    f.write(json.dumps(training_config, indent=2))

print(f"✅ Model kaydedildi: {final_save_path}")
print("\n📦 Model içeriği:")
for content_line in ("Model weights", "Tokenizer files", "Training config"):
    print(f"   - {content_line}")

# Size of the regular files directly inside the export folder, in MB
import os
total_bytes = 0
for entry_name in os.listdir(final_save_path):
    entry_path = os.path.join(final_save_path, entry_name)
    if os.path.isfile(entry_path):
        total_bytes += os.path.getsize(entry_path)
model_size = total_bytes / (1024 * 1024)

print(f"\n📊 Model boyutu: {model_size:.2f} MB")

# Deployment suggestions
deployment_notes = (
    "\n🚀 Deployment Önerileri:",
    "   1. Hugging Face Hub'a yükleyin:",
    "      ```python",
    "      from huggingface_hub import HfApi",
    "      api = HfApi()",
    "      api.upload_folder(",
    f"          folder_path='{final_save_path}',",
    "          repo_id='username/turkcell-distilled',",
    "          repo_type='model'",
    "      )",
    "      ```",
    "   2. ONNX formatına dönüştürün (hızlı inference)",
    "   3. TorchScript ile optimize edin",
    "   4. API endpoint olarak servis edin",
)
for note in deployment_notes:
    print(note)

print("\n✅ Pipeline tamamlandı!")