# 🚀 Turkcell-LLM-7B Knowledge Distillation Pipeline
## Türkçe Eğitim Asistanı için Optimize Edilmiş - TEKNOFEST 2025

Bu notebook, **Turkcell-LLM-7B** teacher modelini kullanan, Türkçe için optimize edilmiş bir knowledge distillation pipeline'ı içerir.

### ✅ Özellikler:
- Turkcell-LLM-7B teacher model
- Türkçe özel tokenizer entegrasyonu
- Mixed precision training (bf16)
- Gradient checkpointing
- Curriculum learning
- Knowledge distillation yapılandırması
- Kapsamlı hata kurtarma

In [None]:
# GPU kontrolü ve sistem bilgisi
import torch
import os
import sys

def check_gpu_and_setup():
    """Report the available accelerator and choose a batch size for it.

    When CUDA is available, prints the GPU name, CUDA version, total memory
    and compute capability of device 0. On a device whose name contains
    'A100', TF32 is additionally enabled for matmul and cuDNN.

    Returns:
        int: 32 on an A100, 16 on any other CUDA GPU, 8 when no GPU is found.
    """
    # Guard clause: nothing to configure without CUDA.
    if not torch.cuda.is_available():
        print("⚠️ GPU bulunamadı, CPU kullanılacak")
        return 8

    device_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU Bulundu: {device_name}")

    is_a100 = 'A100' in device_name
    if is_a100:
        print("🚀 A100 GPU tespit edildi - Özel optimizasyonlar aktif")
        # Allow TF32 kernels for matmul and cuDNN on this device.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    batch_size = 32 if is_a100 else 16

    # Query the device properties once and reuse them for all three lines.
    props = torch.cuda.get_device_properties(0)
    print(f"  CUDA Version: {torch.version.cuda}")
    print(f"  GPU Memory: {props.total_memory / 1e9:.1f} GB")
    print(f"  Compute Capability: {props.major}.{props.minor}")

    return batch_size

# Module-level batch size derived from the detected hardware (32, 16 or 8).
BATCH_SIZE = check_gpu_and_setup()

In [None]:
# Gerekli kütüphaneleri yükle
!pip install -q transformers>=4.36.0 accelerate>=0.25.0 bitsandbytes>=0.41.3 peft>=0.7.1
!pip install -q datasets evaluate nltk rouge-score sacrebleu bert-score
!pip install -q sentencepiece protobuf ftfy langdetect

print("✅ Tüm kütüphaneler yüklendi")

## 📚 1. Turkcell Teacher Model Yükleme

Türkçe için özel olarak eğitilmiş Turkcell-LLM-7B modelini teacher olarak kullanacağız:

In [None]:
from dataclasses import dataclass
from typing import Optional, Dict, Any
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

@dataclass
class TeacherModelConfig:
    """Configuration record describing one candidate teacher model."""
    # Model identifier; the entries in TEACHER_MODELS use "org/name" style IDs.
    model_id: str
    # Whether the model is treated as Turkish-optimized when ranking candidates.
    turkish_optimized: bool
    # Minimum GPU memory (in GB) needed for the model to be considered.
    min_gpu_memory_gb: float
    # Batch size suggested for this teacher.
    recommended_batch_size: int
    # Softmax temperature suggested for distillation with this teacher.
    distillation_temperature: float
    # Human-readable description of the model.
    description: str

# Candidate teacher models, keyed by model ID.
# NOTE(review): select_optimal_teacher returns the Turkish-flagged entry with the
# largest min_gpu_memory_gb that fits, so with 24 GB or more it returns
# "google/mt5-xl" rather than the Turkcell model — confirm this is intended.
# NOTE(review): the descriptions mark "dbmdz/bert-base-turkish-cased" as an
# encoder model and "google/mt5-xl" as multilingual; confirm both can actually
# serve as teachers for the causal-LM imports used in this notebook.
TEACHER_MODELS = {
    "turkcell/Turkcell-LLM-7b-v1": TeacherModelConfig(
        model_id="turkcell/Turkcell-LLM-7b-v1",
        turkish_optimized=True,
        min_gpu_memory_gb=16,
        recommended_batch_size=8,
        distillation_temperature=3.0,
        description="Turkcell'in Türkçe için özel eğitilmiş 7B modeli"
    ),
    "ytu-ce-cosmos/turkish-gpt2-large": TeacherModelConfig(
        model_id="ytu-ce-cosmos/turkish-gpt2-large",
        turkish_optimized=True,
        min_gpu_memory_gb=8,
        recommended_batch_size=16,
        distillation_temperature=2.5,
        description="YTÜ Turkish GPT-2 Large - Hafif ve hızlı"
    ),
    "google/mt5-xl": TeacherModelConfig(
        model_id="google/mt5-xl",
        turkish_optimized=True,
        min_gpu_memory_gb=24,
        recommended_batch_size=4,
        distillation_temperature=3.5,
        description="Google mT5-XL - Çok dilli, Türkçe dahil"
    ),
    "dbmdz/bert-base-turkish-cased": TeacherModelConfig(
        model_id="dbmdz/bert-base-turkish-cased",
        turkish_optimized=True,
        min_gpu_memory_gb=4,
        recommended_batch_size=32,
        distillation_temperature=2.0,
        description="Turkish BERT - Encoder model, hafif"
    ),
    "Qwen/Qwen2.5-7B-Instruct": TeacherModelConfig(
        model_id="Qwen/Qwen2.5-7B-Instruct",
        turkish_optimized=False,
        min_gpu_memory_gb=16,
        recommended_batch_size=8,
        distillation_temperature=4.0,
        description="Qwen 2.5 - Güçlü çok dilli model"
    )
}

def select_optimal_teacher(gpu_memory_gb: float = 40) -> TeacherModelConfig:
    """Pick a teacher configuration that fits the given GPU memory budget.

    Args:
        gpu_memory_gb: Available GPU memory in GB.

    Returns:
        Among the models whose ``min_gpu_memory_gb`` fits the budget, the
        Turkish-optimized one with the largest memory requirement; if none of
        the fitting models is Turkish-optimized, the first fitting model; if
        nothing fits at all, the ``dbmdz/bert-base-turkish-cased`` entry.
    """
    candidates = []
    for cfg in TEACHER_MODELS.values():
        if cfg.min_gpu_memory_gb <= gpu_memory_gb:
            candidates.append(cfg)

    # Nothing fits the budget: fall back to the smallest registered model.
    if not candidates:
        return TEACHER_MODELS["dbmdz/bert-base-turkish-cased"]

    # Turkish-optimized models take priority over the rest.
    turkish_candidates = [cfg for cfg in candidates if cfg.turkish_optimized]
    if not turkish_candidates:
        return candidates[0]

    # Memory requirement is used as the proxy for "largest model".
    return max(turkish_candidates, key=lambda cfg: cfg.min_gpu_memory_gb)

# Determine the GPU memory budget (e.g. an A100 with 40 GB or 80 GB) and pick
# the teacher for it; 16 GB is assumed when no GPU is present.
gpu_memory = (
    torch.cuda.get_device_properties(0).total_memory / 1e9
    if torch.cuda.is_available()
    else 16
)

selected_teacher = select_optimal_teacher(gpu_memory)
print(
    f"\n📚 Seçilen Teacher Model: {selected_teacher.model_id}",
    f"   {selected_teacher.description}",
    f"   Önerilen batch size: {selected_teacher.recommended_batch_size}",
    f"   Distillation temperature: {selected_teacher.distillation_temperature}",
    sep="\n",
)

## 🔧 2. Advanced Data Validation & Quality Control

In [None]:
import hashlib
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass, field
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

@dataclass
class DataQualityReport:
    """Summary of one dataset validation pass."""
    # Number of training samples that were passed in.
    total_samples: int
    # Number of samples that passed every check.
    valid_samples: int
    # Samples dropped because their cleaned text had already been seen.
    duplicate_samples: int
    # Samples dropped for high word overlap with a test text.
    contaminated_samples: int
    # Samples dropped for being outside the length bounds or below the quality threshold.
    low_quality_samples: int
    # Count of valid samples per label.
    class_distribution: Dict[str, int]
    # 'mean', 'std', 'min' and 'max' of the valid samples' text lengths (characters).
    length_statistics: Dict[str, float]
    # Quality score of every sample that reached scoring, including ones then rejected.
    quality_scores: List[float]
    # Human-readable warnings (contamination hits, class imbalance).
    warnings: List[str] = field(default_factory=list)
    
class AdvancedDataValidator:
    """Clean, score, de-duplicate and contamination-check text samples.

    The duplicate filter keeps its hashes in ``seen_hashes`` for the lifetime
    of the instance, so a text seen in an earlier ``validate_dataset`` call is
    reported as a duplicate in later calls on the same validator.

    After ``validate_dataset`` returns, the accepted (cleaned) samples of that
    call are available as ``valid_samples``.
    """

    def __init__(self,
                 min_length: int = 10,
                 max_length: int = 2048,
                 min_quality_score: float = 0.7,
                 contamination_threshold: float = 0.8):
        """Store the thresholds and compile the cleaning patterns.

        Args:
            min_length: Minimum accepted text length in characters.
            max_length: Maximum accepted text length in characters.
            min_quality_score: Samples scoring below this are rejected.
            contamination_threshold: Word-overlap ratio above which a training
                text is considered contaminated by a test text.
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_quality_score = min_quality_score
        self.contamination_threshold = contamination_threshold

        # Turkish-specific letters used by the quality score.
        self.turkish_chars = set('çğıöşüÇĞİÖŞÜ')

        # Cleaning patterns.
        self.url_pattern = re.compile(r'https?://\S+')
        self.email_pattern = re.compile(r'\S+@\S+')
        self.html_pattern = re.compile(r'<[^>]+>')

        # Hashes of texts already seen (for de-duplication).
        self.seen_hashes = set()

        # Accepted samples of the most recent validate_dataset() call.
        self.valid_samples = []

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a raw 'text' field to ``str``; ``None`` becomes ''."""
        if value is None:
            return ''
        return value if isinstance(value, str) else str(value)

    def calculate_quality_score(self, text: str) -> float:
        """Return a heuristic quality score in [0, 1] for ``text``.

        The score is the mean of five components: length fit, share of
        Turkish-specific letters (2% earns full marks), word diversity, share
        of alphanumeric/whitespace characters, and sentence-ending punctuation
        relative to an expected 15 words per sentence. Empty text scores 0.0.
        """
        if not text:
            return 0.0

        text_len = len(text)
        words = text.lower().split()
        scores = []

        # 1. Length fit.
        if text_len < self.min_length:
            scores.append(text_len / self.min_length)
        elif text_len > self.max_length:
            scores.append(max(0, 1 - (text_len - self.max_length) / self.max_length))
        else:
            scores.append(1.0)

        # 2. Share of Turkish-specific letters (2% or more = full score).
        turkish_ratio = sum(1 for c in text if c in self.turkish_chars) / text_len
        scores.append(min(turkish_ratio * 50, 1.0))

        # 3. Word diversity.
        scores.append(len(set(words)) / len(words) if words else 0)

        # 4. Alphanumeric share (penalises symbol-heavy text).
        alnum_ratio = sum(1 for c in text if c.isalnum() or c.isspace()) / text_len
        scores.append(alnum_ratio)

        # 5. Sentence structure: sentence endings vs. one per 15 words.
        sentence_endings = sum(1 for c in text if c in '.!?')
        expected_sentences = len(words) / 15 if words else 0
        if expected_sentences > 0:
            scores.append(min(sentence_endings / expected_sentences, 1.0))
        else:
            scores.append(0.5)

        return np.mean(scores)

    def is_duplicate(self, text: str) -> bool:
        """Return True if ``text`` was seen before; otherwise record it.

        Note that a first-time call has the side effect of registering the
        text, so the same text is a duplicate on the next call.
        """
        text_hash = hashlib.md5(text.encode()).hexdigest()
        if text_hash in self.seen_hashes:
            return True
        self.seen_hashes.add(text_hash)
        return False

    def _overlaps_any(self, train_words, test_word_sets) -> bool:
        """Return True if ``train_words`` overlaps any test word set above the threshold."""
        for test_words in test_word_sets:
            overlap = len(train_words & test_words)
            similarity = overlap / max(len(train_words), len(test_words), 1)
            if similarity > self.contamination_threshold:
                return True
        return False

    def check_contamination(self, train_text: str, test_texts: List[str]) -> bool:
        """Return True if ``train_text`` overlaps too much with any test text.

        Similarity is the number of shared lower-cased words divided by the
        size of the larger word set.
        """
        if not test_texts:
            return False

        train_words = set(train_text.lower().split())
        # Generator keeps the early exit: later test texts are not tokenised
        # once a match has been found.
        test_word_sets = (set(t.lower().split()) for t in test_texts)
        return self._overlaps_any(train_words, test_word_sets)

    def clean_text(self, text: str) -> str:
        """Strip HTML tags, mask URLs/e-mails and collapse whitespace."""
        text = self.html_pattern.sub('', text)
        text = self.url_pattern.sub('[URL]', text)
        text = self.email_pattern.sub('[EMAIL]', text)

        # Collapse runs of whitespace into single spaces.
        return ' '.join(text.split()).strip()

    def validate_dataset(self,
                        train_data: List[Dict],
                        test_data: Optional[List[Dict]] = None) -> "DataQualityReport":
        """Validate ``train_data`` and build a quality report.

        Each sample is cleaned, then checked in order for length, duplication,
        quality score and contamination against the first 100 test samples.
        A missing or ``None`` 'text' field is treated as empty text (and thus
        rejected as too short) instead of raising.

        Args:
            train_data: Dicts with a 'text' key and an optional 'label' key.
            test_data: Optional dicts with a 'text' key, used only for the
                contamination check.

        Returns:
            DataQualityReport with the counts and statistics of this pass.
            The accepted samples themselves are stored in ``self.valid_samples``.
        """
        valid_samples = []
        duplicate_count = 0
        contaminated_count = 0
        low_quality_count = 0
        quality_scores = []
        lengths = []
        class_counts = Counter()
        # Named to avoid shadowing the imported `warnings` module.
        report_warnings = []

        # Tokenise the test probe (first 100 samples) once, not once per
        # training sample.
        test_word_sets = [
            set(self._as_text(item.get('text')).lower().split())
            for item in (test_data or [])[:100]
        ]

        for item in train_data:
            label = item.get('label', 'unknown')
            text = self.clean_text(self._as_text(item.get('text')))

            # Length check.
            if not self.min_length <= len(text) <= self.max_length:
                low_quality_count += 1
                continue

            # Duplicate check.
            if self.is_duplicate(text):
                duplicate_count += 1
                continue

            # Quality score (recorded even when the sample is then rejected).
            quality = self.calculate_quality_score(text)
            quality_scores.append(quality)

            if quality < self.min_quality_score:
                low_quality_count += 1
                continue

            # Contamination check.
            if test_word_sets and self._overlaps_any(set(text.lower().split()), test_word_sets):
                contaminated_count += 1
                report_warnings.append("Potential contamination detected in sample")
                continue

            valid_samples.append({
                'text': text,
                'label': label,
                'quality_score': quality
            })
            lengths.append(len(text))
            class_counts[label] += 1

        # Class imbalance check: largest class more than 3x the smallest.
        if class_counts:
            max_class = max(class_counts.values())
            min_class = min(class_counts.values())
            if max_class > 3 * min_class:
                report_warnings.append("Severe class imbalance detected")

        length_stats = {
            'mean': np.mean(lengths) if lengths else 0,
            'std': np.std(lengths) if lengths else 0,
            'min': min(lengths) if lengths else 0,
            'max': max(lengths) if lengths else 0
        }

        # Keep the cleaned, accepted samples reachable for the caller.
        self.valid_samples = valid_samples

        return DataQualityReport(
            total_samples=len(train_data),
            valid_samples=len(valid_samples),
            duplicate_samples=duplicate_count,
            contaminated_samples=contaminated_count,
            low_quality_samples=low_quality_count,
            class_distribution=dict(class_counts),
            length_statistics=length_stats,
            quality_scores=quality_scores,
            warnings=report_warnings
        )

# Example usage: build a validator with the default thresholds.
validator = AdvancedDataValidator()
print("✅ Advanced Data Validator hazır")

## 🎓 3. Knowledge Distillation with Unified Tokenizer

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import gc

class UnifiedTokenizerWrapper:
    """Bundle the teacher and student tokenizers behind one object.

    Both tokenizers are kept as-is; the wrapper only guarantees that each has
    a padding token and offers symmetric encode/decode entry points.
    """

    def __init__(self, teacher_tokenizer, student_tokenizer):
        """Store both tokenizers and default a missing pad token to EOS."""
        self.teacher_tokenizer = teacher_tokenizer
        self.student_tokenizer = student_tokenizer

        for tok in (self.teacher_tokenizer, self.student_tokenizer):
            if tok.pad_token is None:
                tok.pad_token = tok.eos_token

    @staticmethod
    def _encode(tokenizer, texts, max_length, return_tensors):
        """Tokenize ``texts`` padded/truncated to exactly ``max_length``."""
        return tokenizer(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors=return_tensors
        )

    def encode_for_teacher(self, texts, max_length=512, return_tensors='pt'):
        """Encode ``texts`` with the teacher tokenizer."""
        return self._encode(self.teacher_tokenizer, texts, max_length, return_tensors)

    def encode_for_student(self, texts, max_length=512, return_tensors='pt'):
        """Encode ``texts`` with the student tokenizer."""
        return self._encode(self.student_tokenizer, texts, max_length, return_tensors)

    def decode(self, token_ids, skip_special_tokens=True, use_teacher=True):
        """Decode ``token_ids`` with the teacher (default) or student tokenizer."""
        if use_teacher:
            chosen = self.teacher_tokenizer
        else:
            chosen = self.student_tokenizer
        return chosen.decode(token_ids, skip_special_tokens=skip_special_tokens)

class KnowledgeDistillationTrainer:
    """Train a student model against a frozen teacher with a KD + CE objective.

    The total loss is ``alpha * KD + (1 - alpha) * CE``, where KD is the
    temperature-scaled KL divergence between teacher and student token
    distributions and CE is the student's next-token cross-entropy.

    NOTE(review): the KD term compares logits position by position, so teacher
    and student must produce logits of the same shape (same sequence length and
    vocabulary). Two different tokenizers will generally violate this — confirm
    the chosen teacher/student pair shares a vocabulary.
    """

    def __init__(self,
                 teacher_model,
                 student_model,
                 unified_tokenizer,
                 temperature: float = 3.0,
                 alpha: float = 0.7,
                 use_gradient_checkpointing: bool = True,
                 mixed_precision: str = 'bf16'):
        """Freeze the teacher and configure the student for training.

        Args:
            teacher_model: Model providing the soft targets; put in eval mode
                and frozen here.
            student_model: Model being trained.
            unified_tokenizer: Object exposing ``encode_for_teacher`` and
                ``encode_for_student``.
            temperature: Softmax temperature of the KD term.
            alpha: Weight of the KD term; ``1 - alpha`` weights the CE term.
            use_gradient_checkpointing: Enable gradient checkpointing on the
                student.
            mixed_precision: ``'bf16'`` enables bf16 autocast when the GPU
                supports it; any other value disables autocast.
        """
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.unified_tokenizer = unified_tokenizer
        self.temperature = temperature
        self.alpha = alpha  # KD loss weight
        self.mixed_precision = mixed_precision

        # Put the teacher in eval mode and freeze it.
        self.teacher_model.eval()
        for param in self.teacher_model.parameters():
            param.requires_grad = False

        if use_gradient_checkpointing:
            self.student_model.gradient_checkpointing_enable()
            print("✅ Gradient checkpointing aktif")

        # Only ask about bf16 support when a CUDA device actually exists.
        self.use_amp = (
            mixed_precision == 'bf16'
            and torch.cuda.is_available()
            and torch.cuda.is_bf16_supported()
        )
        # bf16 keeps the fp32 exponent range, so no loss scaling is needed;
        # the attribute is kept (as None) for code that inspects it.
        self.scaler = None
        if self.use_amp:
            print("✅ BF16 mixed precision aktif")

    def compute_distillation_loss(self, student_logits, teacher_logits, labels,
                                  attention_mask=None):
        """Compute the combined distillation loss.

        Args:
            student_logits: Student logits, expected as (batch, seq, vocab).
            teacher_logits: Teacher logits of the same shape.
            labels: Unshifted token ids, (batch, seq); the shift for next-token
                prediction is done here.
            attention_mask: Optional (batch, seq) mask; positions with 0 are
                excluded from both loss terms. ``None`` means every position
                counts.

        Returns:
            Tuple ``(total_loss, kd_loss, student_loss)`` of scalar tensors.
            Both terms are per-token means, so ``alpha`` weights them on the
            same scale.

        Raises:
            ValueError: If teacher and student logits differ in shape.
        """
        if student_logits.shape != teacher_logits.shape:
            raise ValueError(
                "Teacher and student logits must have the same shape for "
                f"distillation, got teacher {tuple(teacher_logits.shape)} vs "
                f"student {tuple(student_logits.shape)}"
            )

        device = student_logits.device
        # Compute the losses in float32 regardless of the autocast dtype.
        student_f = student_logits.float()
        teacher_f = teacher_logits.to(device).float()
        labels = labels.to(device)

        if attention_mask is None:
            mask = torch.ones(labels.shape, dtype=torch.bool, device=device)
        else:
            mask = attention_mask.to(device).bool()

        # KD term: KL per position, averaged over the unmasked positions only.
        per_token_kl = F.kl_div(
            F.log_softmax(student_f / self.temperature, dim=-1),
            F.softmax(teacher_f / self.temperature, dim=-1),
            reduction='none'
        ).sum(dim=-1)
        mask_f = mask.to(per_token_kl.dtype)
        kd_loss = (per_token_kl * mask_f).sum() / mask_f.sum().clamp(min=1.0)
        kd_loss = kd_loss * (self.temperature ** 2)

        # CE term: logits at position t are scored against the token at t+1;
        # masked target positions are ignored.
        shift_logits = student_f[:, :-1, :]
        shift_labels = labels[:, 1:].masked_fill(~mask[:, 1:], -100)
        if (shift_labels != -100).any():
            student_loss = F.cross_entropy(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
                ignore_index=-100
            )
        else:
            # No valid target (e.g. sequence length 1): contribute zero
            # instead of NaN.
            student_loss = shift_logits.sum() * 0.0

        total_loss = self.alpha * kd_loss + (1 - self.alpha) * student_loss

        return total_loss, kd_loss, student_loss

    @torch.no_grad()
    def get_teacher_logits(self, input_ids, attention_mask):
        """Run the teacher without gradients and return its logits."""
        outputs = self.teacher_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        return outputs.logits

    def training_step(self, batch, optimizer):
        """Run one optimisation step on ``batch``.

        Args:
            batch: Mapping with a 'text' entry holding the raw texts.
            optimizer: Optimizer over the student parameters.

        Returns:
            Dict with the float values 'total_loss', 'kd_loss' and
            'student_loss'.
        """
        import contextlib

        # Encode the same texts for each model and move them to its device.
        teacher_inputs = self.unified_tokenizer.encode_for_teacher(
            batch['text'],
            return_tensors='pt'
        ).to(self.teacher_model.device)
        student_inputs = self.unified_tokenizer.encode_for_student(
            batch['text'],
            return_tensors='pt'
        ).to(self.student_model.device)

        # Teacher forward (no grad).
        teacher_logits = self.get_teacher_logits(
            teacher_inputs['input_ids'],
            teacher_inputs['attention_mask']
        )

        # One forward path for both modes; autocast only when bf16 is active.
        if self.use_amp:
            autocast_ctx = torch.autocast(device_type='cuda', dtype=torch.bfloat16)
        else:
            autocast_ctx = contextlib.nullcontext()

        with autocast_ctx:
            student_outputs = self.student_model(
                input_ids=student_inputs['input_ids'],
                attention_mask=student_inputs['attention_mask']
            )
            total_loss, kd_loss, student_loss = self.compute_distillation_loss(
                student_outputs.logits,
                teacher_logits,
                student_inputs['input_ids'],
                attention_mask=student_inputs['attention_mask']
            )

        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Release cached GPU memory blocks.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return {
            'total_loss': total_loss.item(),
            'kd_loss': kd_loss.item(),
            'student_loss': student_loss.item()
        }

# Status message shown once the trainer class has been defined.
print("✅ Knowledge Distillation Trainer hazır")

## 📊 4. Turkish-Specific Evaluation Metrics

In [None]:
from evaluate import load
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from rouge_score import rouge_scorer
import numpy as np
from typing import List, Dict, Tuple

# Download the NLTK 'punkt' data quietly.
nltk.download('punkt', quiet=True)

class TurkishEvaluationMetrics:
    """Text-generation metrics (BLEU, ROUGE, BERTScore) adapted for Turkish."""

    # Turkish-specific letters counted by the character-preservation metric.
    _TURKISH_LETTERS = frozenset('çğıöşüÇĞİÖŞÜ')

    class _UnicodeWordTokenizer:
        """ROUGE tokenizer that keeps non-ASCII letters.

        The stock rouge_score tokenizer keeps only ``[a-z0-9]``, which splits
        Turkish words at every ç/ğ/ı/ö/ş/ü.
        """

        def tokenize(self, text):
            import re
            # Turkish-aware lower-casing: 'İ' -> 'i' and 'I' -> 'ı'. A plain
            # str.lower() turns 'İ' into 'i' plus a combining dot, which would
            # split the word.
            lowered = text.replace('İ', 'i').replace('I', 'ı').lower()
            return re.findall(r"\w+", lowered)

    def __init__(self):
        """Load the optional metrics and build the ROUGE scorer."""
        # Each optional metric is loaded on its own so that one failure does
        # not disable the other.
        self.bertscore = self._load_optional_metric("bertscore")
        self.sacrebleu = self._load_optional_metric("sacrebleu")

        self.rouge = self._build_rouge_scorer()

        # ASCII fold table for Turkish-specific letters.
        self.turkish_chars = {
            'ı': 'i', 'ğ': 'g', 'ü': 'u', 'ş': 's', 'ö': 'o', 'ç': 'c',
            'İ': 'I', 'Ğ': 'G', 'Ü': 'U', 'Ş': 'S', 'Ö': 'O', 'Ç': 'C'
        }

    @staticmethod
    def _load_optional_metric(name):
        """Load an ``evaluate`` metric, returning None (with a message) on failure."""
        try:
            return load(name)
        except Exception as exc:  # deliberate best-effort: the metric is optional
            print(f"⚠️ Bazı metrikler yüklenemedi ({name}): {exc}")
            return None

    def _build_rouge_scorer(self):
        """Build the ROUGE scorer, with the Unicode tokenizer when supported."""
        rouge_types = ['rouge1', 'rouge2', 'rougeL']
        try:
            return rouge_scorer.RougeScorer(
                rouge_types,
                use_stemmer=False,
                tokenizer=self._UnicodeWordTokenizer()
            )
        except TypeError:
            # Older rouge_score releases have no `tokenizer` argument.
            return rouge_scorer.RougeScorer(rouge_types, use_stemmer=False)

    def normalize_turkish(self, text: str) -> str:
        """Fold Turkish-specific letters to their ASCII look-alikes (optional helper)."""
        return text.translate(str.maketrans(self.turkish_chars))

    def calculate_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Return mean sentence-level BLEU-1..BLEU-4 over whitespace tokens.

        Empty input yields 0.0 for every key.
        """
        keys = ('bleu1', 'bleu2', 'bleu3', 'bleu4')
        rows = []

        for pred, ref in zip(predictions, references):
            pred_tokens = pred.split()
            ref_tokens = [ref.split()]

            # BLEU-n uses uniform weights over the first n n-gram orders.
            rows.append([
                sentence_bleu(
                    ref_tokens,
                    pred_tokens,
                    weights=tuple([1 / n] * n + [0] * (4 - n))
                )
                for n in range(1, 5)
            ])

        if not rows:
            return {key: 0.0 for key in keys}

        table = np.array(rows)
        return {key: np.mean(table[:, col]) for col, key in enumerate(keys)}

    def calculate_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Return mean ROUGE-1/2/L F-measures; empty input yields 0.0 for every key."""
        rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

        for pred, ref in zip(predictions, references):
            # RougeScorer.score takes (target, prediction) in that order.
            scores = self.rouge.score(ref, pred)
            for key in rouge_scores:
                rouge_scores[key].append(scores[key].fmeasure)

        return {
            key: (np.mean(values) if values else 0.0)
            for key, values in rouge_scores.items()
        }

    def calculate_bertscore(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Return mean BERTScore precision/recall/F1 for Turkish.

        When the metric could not be loaded, only ``{'bertscore_f1': 0.0}`` is
        returned.
        """
        if self.bertscore is None:
            return {'bertscore_f1': 0.0}

        results = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang="tr"  # Turkish
        )

        return {
            'bertscore_precision': np.mean(results['precision']),
            'bertscore_recall': np.mean(results['recall']),
            'bertscore_f1': np.mean(results['f1'])
        }

    def calculate_turkish_specific_metrics(self,
                                          predictions: List[str],
                                          references: List[str]) -> Dict[str, float]:
        """Return Turkish-specific surface metrics.

        ``turkish_char_preservation`` is the ratio (capped at 1) of
        Turkish-specific letters in the prediction to those in the reference,
        averaged over references that contain any. ``length_similarity`` is
        ``1 - |len(pred) - len(ref)| / len(ref)`` in words, floored at 0,
        averaged over non-empty references.
        """
        letters = self._TURKISH_LETTERS
        tr_char_preservation = []
        length_similarity = []

        for pred, ref in zip(predictions, references):
            ref_tr_chars = sum(1 for c in ref if c in letters)
            if ref_tr_chars > 0:
                pred_tr_chars = sum(1 for c in pred if c in letters)
                tr_char_preservation.append(min(pred_tr_chars / ref_tr_chars, 1.0))

            ref_len = len(ref.split())
            if ref_len > 0:
                pred_len = len(pred.split())
                length_similarity.append(max(0, 1 - abs(pred_len - ref_len) / ref_len))

        return {
            'turkish_char_preservation': np.mean(tr_char_preservation) if tr_char_preservation else 0.0,
            'length_similarity': np.mean(length_similarity) if length_similarity else 0.0,
        }

    def evaluate_all(self,
                    predictions: List[str],
                    references: List[str]) -> Dict[str, float]:
        """Compute every metric and merge the results into one dict.

        BERTScore is only computed for fewer than 100 predictions because it
        can be slow on large sets.
        """
        all_metrics = {}

        all_metrics.update(self.calculate_bleu(predictions, references))
        all_metrics.update(self.calculate_rouge(predictions, references))

        if len(predictions) < 100:
            all_metrics.update(self.calculate_bertscore(predictions, references))

        all_metrics.update(self.calculate_turkish_specific_metrics(predictions, references))

        return all_metrics

# Smoke test: instantiate the evaluator.
evaluator = TurkishEvaluationMetrics()
print("✅ Turkish Evaluation Metrics hazır")

# Sample evaluation on a single prediction/reference pair.
test_preds, test_refs = ["Bu bir test cümlesidir."], ["Bu bir deneme cümlesidir."]
test_metrics = evaluator.evaluate_all(predictions=test_preds, references=test_refs)
print("\n📊 Örnek metrikler:")
for key, value in test_metrics.items():
    print(f"  {key}: {value:.4f}")

## 🎯 5. Curriculum Learning Implementation

In [None]:
import random
from typing import List, Dict, Tuple
import numpy as np
from torch.utils.data import Dataset, DataLoader

class CurriculumLearningDataset(Dataset):
    """Dataset that exposes a growing, easiest-first prefix of the data.

    Samples are scored for difficulty, sorted in ascending order, and revealed
    in ``total_phases`` phases; ``advance_curriculum`` moves to the next phase.
    """

    def __init__(self,
                 data: List[Dict],
                 difficulty_scorer=None,
                 curriculum_strategy: str = 'linear'):
        """
        Args:
            data: List of sample dicts. The list and its dicts are copied
                (shallowly), so the caller's objects are neither reordered nor
                given a 'difficulty' key.
            difficulty_scorer: Callable mapping a sample dict to a difficulty
                score; defaults to a text-based heuristic.
            curriculum_strategy: 'linear', 'exponential' or 'adaptive'
                ('adaptive' currently behaves like 'linear'); any other value
                exposes all data in every phase.
        """
        self.data = [dict(item) for item in data]
        self.curriculum_strategy = curriculum_strategy

        if difficulty_scorer is None:
            self.difficulty_scorer = self._default_difficulty_scorer
        else:
            self.difficulty_scorer = difficulty_scorer

        self._calculate_difficulties()
        self._sort_by_difficulty()

        # Curriculum state.
        self.current_phase = 0
        self.total_phases = 5
        # Nominal phase size, kept for existing readers of this attribute; the
        # actual prefix size is computed by _current_size().
        self.samples_per_phase = len(self.data) // self.total_phases

    def _default_difficulty_scorer(self, item: Dict) -> float:
        """Return the mean of four text-based difficulty factors in [0, 1].

        Factors: length (saturating at 1000 characters), unique-word ratio,
        average word length (saturating at 10) and punctuation count
        (saturating at 20).
        """
        text = item.get('text', '')
        words = text.lower().split()

        factors = [min(len(text) / 1000, 1.0)]

        if words:
            factors.append(len(set(words)) / len(words))
            factors.append(min(np.mean([len(w) for w in words]) / 10, 1.0))
        else:
            factors.extend([0, 0])

        punct_count = sum(1 for c in text if c in '.,;:!?')
        factors.append(min(punct_count / 20, 1.0))

        return np.mean(factors)

    def _calculate_difficulties(self):
        """Attach a 'difficulty' score to every sample."""
        for item in self.data:
            item['difficulty'] = self.difficulty_scorer(item)

    def _sort_by_difficulty(self):
        """Sort the samples from easiest to hardest."""
        self.data.sort(key=lambda x: x['difficulty'])

    def advance_curriculum(self):
        """Move to the next phase; return False when already in the last one."""
        if self.current_phase < self.total_phases - 1:
            self.current_phase += 1
            print(f"📚 Curriculum advanced to phase {self.current_phase + 1}/{self.total_phases}")
            return True
        return False

    def _current_size(self) -> int:
        """Return how many of the sorted samples the current phase exposes."""
        total = len(self.data)
        if total == 0:
            return 0

        phase_no = self.current_phase + 1
        if self.curriculum_strategy in ('linear', 'adaptive'):
            # Ceiling division: the first phase is never empty and the last
            # phase always covers the whole dataset.
            end = -(-total * phase_no // self.total_phases)
        elif self.curriculum_strategy == 'exponential':
            ratio = phase_no / self.total_phases
            end = max(1, int(total * (ratio ** 2)))
        else:
            end = total
        return min(end, total)

    def get_current_data(self) -> List[Dict]:
        """Return the samples available in the current phase (easiest first)."""
        return self.data[:self._current_size()]

    def __len__(self):
        return self._current_size()

    def __getitem__(self, idx):
        # Slices go through the list copy; plain indices are resolved directly
        # so a lookup does not copy the whole prefix.
        if isinstance(idx, slice):
            return self.get_current_data()[idx]
        size = self._current_size()
        if not -size <= idx < size:
            raise IndexError("list index out of range")
        return self.data[idx + size if idx < 0 else idx]

    def get_curriculum_stats(self) -> Dict:
        """Return phase counters and difficulty statistics of the current phase."""
        current_data = self.get_current_data()
        difficulties = [item['difficulty'] for item in current_data]

        return {
            'current_phase': self.current_phase + 1,
            'total_phases': self.total_phases,
            'current_samples': len(current_data),
            'total_samples': len(self.data),
            'avg_difficulty': np.mean(difficulties) if difficulties else 0,
            'min_difficulty': min(difficulties) if difficulties else 0,
            'max_difficulty': max(difficulties) if difficulties else 0,
        }

# Smoke test: three samples of increasing length and complexity.
test_data = [
    {'text': sample_text}
    for sample_text in (
        'Kısa metin.',
        'Bu biraz daha uzun bir metin örneği.',
        'Çok uzun ve karmaşık bir metin örneği ile devam ediyoruz, noktalama işaretleri de var!',
    )
]

curriculum_dataset = CurriculumLearningDataset(test_data, curriculum_strategy='linear')

print("✅ Curriculum Learning Dataset hazır")
print("\n📊 Curriculum İstatistikleri:")
# Show the statistics of the initial phase.
stats = curriculum_dataset.get_curriculum_stats()
for key, value in stats.items():
    print(f"  {key}: {value}")

## 🔄 6. Complete Training Pipeline with Error Recovery

In [None]:
import os
import json
import time
import traceback
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional
import torch
from tqdm import tqdm

@dataclass
class TrainingConfig:
    """Training configuration for the distillation pipeline."""
    # Model configs: identifiers of the teacher and student models.
    teacher_model_id: str = "turkcell/Turkcell-LLM-7b-v1"
    student_model_id: str = "ytu-ce-cosmos/turkish-gpt2-large"
    
    # Training configs
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_epochs: int = 3
    warmup_steps: int = 500
    gradient_accumulation_steps: int = 2
    
    # Distillation configs: softmax temperature and KD loss weight.
    temperature: float = 3.0
    alpha: float = 0.7
    
    # Optimization configs
    use_gradient_checkpointing: bool = True
    mixed_precision: str = "bf16"
    max_grad_norm: float = 1.0
    
    # Curriculum learning
    use_curriculum: bool = True
    curriculum_strategy: str = "linear"
    
    # Checkpointing: step intervals and output directory.
    save_steps: int = 1000
    eval_steps: int = 500
    checkpoint_dir: str = "./checkpoints"
    
    # Early stopping
    early_stopping_patience: int = 3
    early_stopping_threshold: float = 0.001

class RobustTrainingPipeline:
    """Training driver with checkpointing, early stopping and retry-based recovery.

    The pipeline owns the bookkeeping state (global step, current epoch, best
    evaluation loss, patience counter, training history) and delegates the
    per-batch work to a ``trainer`` object exposing
    ``training_step(batch, optimizer)`` that returns a dict with at least the
    keys ``'total_loss'`` and ``'kd_loss'``.
    """

    def __init__(self, config: "TrainingConfig"):
        """Store the configuration and create the checkpoint directory.

        Args:
            config: Training configuration. ``config.checkpoint_dir`` is
                created on disk (including missing parents) as a side effect.
        """
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # parents=True so nested paths such as "runs/exp1/checkpoints" work.
        self.checkpoint_dir = Path(config.checkpoint_dir)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Training state
        self.global_step = 0
        self.current_epoch = 0
        self.best_eval_loss = float('inf')
        self.patience_counter = 0

        # One summary dict per completed epoch; stored inside checkpoints.
        self.training_history = []

    @staticmethod
    def _mean(values) -> float:
        """Return the arithmetic mean of ``values`` as a plain float (NaN if empty).

        A built-in float keeps checkpointed metrics free of numpy scalar
        objects and avoids depending on a numpy import from another cell.
        """
        if not values:
            return float('nan')
        return float(sum(values) / len(values))

    @staticmethod
    def _checkpoint_step(path) -> int:
        """Return the step encoded in ``checkpoint-<step>.pt`` (-1 if not numeric)."""
        suffix = path.stem.rpartition('-')[2]
        return int(suffix) if suffix.isdecimal() else -1

    def _ordered_checkpoints(self) -> list:
        """Return regular checkpoint paths ordered from oldest to newest.

        Ordering is by modification time, with the numeric step as a
        tie-breaker. A plain lexicographic sort would place
        ``checkpoint-10000.pt`` before ``checkpoint-2000.pt``.
        """
        def sort_key(path):
            return (path.stat().st_mtime, self._checkpoint_step(path))

        return sorted(self.checkpoint_dir.glob("checkpoint-*.pt"), key=sort_key)

    def save_checkpoint(self,
                        model,
                        optimizer,
                        scheduler,
                        metrics: dict,
                        is_best: bool = False,
                        *,
                        best_only: bool = False):
        """Serialize the training state to disk with ``torch.save``.

        Args:
            model: Object exposing ``state_dict()``.
            optimizer: Optimizer whose state is stored, or ``None``.
            scheduler: Scheduler whose state is stored, or ``None``.
            metrics: Metrics stored alongside the state.
            is_best: Also write the state to ``best_model.pt``.
            best_only: Skip the regular ``checkpoint-<step>.pt`` file and the
                cleanup pass. Used by ``evaluate`` so a best-model snapshot
                without optimizer state never replaces a resumable checkpoint
                written at the same step.
        """
        checkpoint = {
            'global_step': self.global_step,
            'epoch': self.current_epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict() if optimizer is not None else None,
            'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else None,
            'metrics': metrics,
            'config': asdict(self.config),
            'training_history': self.training_history
        }

        # Regular checkpoint
        if not best_only:
            checkpoint_path = self.checkpoint_dir / f"checkpoint-{self.global_step}.pt"
            torch.save(checkpoint, checkpoint_path)
            print(f"💾 Checkpoint saved: {checkpoint_path}")

        # Best model
        if is_best:
            best_path = self.checkpoint_dir / "best_model.pt"
            torch.save(checkpoint, best_path)
            print(f"🏆 Best model saved: {best_path}")

        # Keep only the most recent regular checkpoints.
        if not best_only:
            self._cleanup_old_checkpoints()

    def load_checkpoint(self, checkpoint_path: str, model, optimizer=None, scheduler=None):
        """Restore model (and optionally optimizer/scheduler) state from a checkpoint.

        Optimizer and scheduler states are restored only when both the object
        is supplied and the checkpoint holds a non-``None`` state for it.

        Args:
            checkpoint_path: File previously written by ``save_checkpoint``.
            model: Object exposing ``load_state_dict()``.
            optimizer: Optional optimizer to restore.
            scheduler: Optional scheduler to restore.

        Returns:
            The ``metrics`` dict stored in the checkpoint (empty if absent).
        """
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        checkpoint = torch.load(checkpoint_path, map_location=self.device)

        model.load_state_dict(checkpoint['model_state_dict'])

        optimizer_state = checkpoint.get('optimizer_state_dict')
        if optimizer is not None and optimizer_state is not None:
            optimizer.load_state_dict(optimizer_state)

        scheduler_state = checkpoint.get('scheduler_state_dict')
        if scheduler is not None and scheduler_state is not None:
            scheduler.load_state_dict(scheduler_state)

        self.global_step = checkpoint.get('global_step', 0)
        self.current_epoch = checkpoint.get('epoch', 0)
        self.training_history = checkpoint.get('training_history', [])

        print(f"✅ Checkpoint loaded from: {checkpoint_path}")
        print(f"   Resuming from epoch {self.current_epoch}, step {self.global_step}")

        return checkpoint.get('metrics', {})

    def _cleanup_old_checkpoints(self, keep_last: int = 3):
        """Delete all but the ``keep_last`` most recent regular checkpoints.

        ``best_model.pt`` does not match the ``checkpoint-*.pt`` pattern and is
        never removed here.
        """
        checkpoints = self._ordered_checkpoints()
        # Explicit end index: a "[:-keep_last]" slice would delete nothing
        # when keep_last is 0.
        stale_count = max(0, len(checkpoints) - max(0, keep_last))

        for checkpoint in checkpoints[:stale_count]:
            checkpoint.unlink()
            print(f"🗑️ Deleted old checkpoint: {checkpoint}")

    def handle_cuda_oom(self):
        """React to a CUDA out-of-memory error by shrinking the configured batch.

        Frees the CUDA cache, halves ``config.batch_size`` and doubles
        ``config.gradient_accumulation_steps``.

        NOTE(review): only the configuration is changed here; an already
        constructed dataloader is not rebuilt by this method.

        Returns:
            True when the batch size was reduced, False when it is already 1
            and no further reduction is possible (the caller then re-raises).
        """
        print("\n⚠️ CUDA OOM Detected! Attempting recovery...")

        # Release cached GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        if self.config.batch_size <= 1:
            print("   Batch size is already 1; cannot reduce further")
            return False

        # Halve the batch size
        self.config.batch_size = max(1, self.config.batch_size // 2)
        print(f"   Reduced batch size to: {self.config.batch_size}")

        # Double gradient accumulation to compensate
        self.config.gradient_accumulation_steps *= 2
        print(f"   Increased gradient accumulation to: {self.config.gradient_accumulation_steps}")

        time.sleep(5)  # Give the GPU a moment to settle

        return True

    def train_with_recovery(self,
                            train_dataloader,
                            eval_dataloader,
                            model,
                            optimizer,
                            scheduler,
                            trainer):
        """Run the training loop, retrying from the latest checkpoint on failure.

        A batch that raises a CUDA out-of-memory ``RuntimeError`` is skipped
        after ``handle_cuda_oom`` succeeds. Any other exception aborts the
        current attempt; the latest checkpoint (if any) is reloaded and
        training restarts, up to three attempts in total.

        Args:
            train_dataloader: Iterable of training batches. If its ``dataset``
                has ``advance_curriculum``, it is called after every epoch.
            eval_dataloader: Iterable of evaluation batches.
            model: Model being trained.
            optimizer: Optimizer passed through to ``trainer.training_step``.
            scheduler: Scheduler stored in checkpoints; it is not stepped by
                this method.
            trainer: Object exposing ``training_step(batch, optimizer)``.

        Raises:
            Exception: Re-raises the last error once all attempts are used up.
        """
        max_retries = 3
        retry_count = 0

        while retry_count < max_retries:
            try:
                # Main training loop
                for epoch in range(self.current_epoch, self.config.num_epochs):
                    self.current_epoch = epoch
                    print(f"\n📚 Epoch {epoch + 1}/{self.config.num_epochs}")

                    # Training
                    model.train()
                    epoch_losses = []

                    progress_bar = tqdm(train_dataloader, desc="Training")
                    for batch_idx, batch in enumerate(progress_bar):
                        try:
                            # Training step
                            loss_dict = trainer.training_step(batch, optimizer)

                            epoch_losses.append(loss_dict['total_loss'])
                            self.global_step += 1

                            # Update progress
                            progress_bar.set_postfix({
                                'loss': loss_dict['total_loss'],
                                'kd_loss': loss_dict['kd_loss']
                            })

                            # Periodic checkpoint
                            if self.global_step % self.config.save_steps == 0:
                                metrics = {
                                    'train_loss': self._mean(epoch_losses),
                                    'learning_rate': optimizer.param_groups[0]['lr']
                                }
                                self.save_checkpoint(model, optimizer, scheduler, metrics)

                            # Periodic evaluation
                            if self.global_step % self.config.eval_steps == 0:
                                eval_loss = self.evaluate(model, eval_dataloader, trainer)

                                # Early stopping check
                                if self.check_early_stopping(eval_loss):
                                    print("\n🛑 Early stopping triggered!")
                                    return

                        except RuntimeError as e:
                            # Skip the batch only for a recoverable OOM;
                            # everything else goes to the retry handler below.
                            if "out of memory" in str(e) and self.handle_cuda_oom():
                                continue
                            raise

                    # End of epoch
                    avg_epoch_loss = self._mean(epoch_losses)
                    print(f"\n📊 Epoch {epoch + 1} completed. Avg loss: {avg_epoch_loss:.4f}")
                    self.training_history.append({
                        'epoch': epoch + 1,
                        'global_step': self.global_step,
                        'avg_train_loss': avg_epoch_loss
                    })

                    # Curriculum learning advancement
                    if hasattr(train_dataloader.dataset, 'advance_curriculum'):
                        train_dataloader.dataset.advance_curriculum()

                # Training finished without errors
                print("\n✅ Training completed successfully!")
                break

            except Exception as e:
                retry_count += 1
                print(f"\n❌ Training error (attempt {retry_count}/{max_retries}): {e}")
                print(traceback.format_exc())

                if retry_count < max_retries:
                    print("\n🔄 Attempting to recover from last checkpoint...")

                    # Reload the most recent checkpoint
                    latest_checkpoint = self.get_latest_checkpoint()
                    if latest_checkpoint:
                        self.load_checkpoint(latest_checkpoint, model, optimizer, scheduler)
                        time.sleep(10)  # Give the system a moment to settle
                    else:
                        # NOTE: only the counters are reset; model weights
                        # keep whatever state they had at the failure.
                        print("No checkpoint found, starting from scratch")
                        self.global_step = 0
                        self.current_epoch = 0
                else:
                    print("\n❌ Maximum retries reached. Training failed.")
                    raise

    def evaluate(self, model, eval_dataloader, trainer) -> float:
        """Compute the mean evaluation loss and update best-model/patience state.

        Any strict improvement over ``best_eval_loss`` is saved as
        ``best_model.pt``. The patience counter resets only when the
        improvement exceeds ``config.early_stopping_threshold``; otherwise it
        is incremented.

        Returns:
            The mean of ``'total_loss'`` over the evaluation batches (NaN when
            the dataloader yields nothing).
        """
        model.eval()
        eval_losses = []

        try:
            with torch.no_grad():
                for batch in tqdm(eval_dataloader, desc="Evaluating"):
                    # NOTE(review): presumably the trainer skips the optimizer
                    # update when optimizer is None -- confirm in the trainer.
                    loss_dict = trainer.training_step(batch, None)
                    eval_losses.append(loss_dict['total_loss'])
        finally:
            # Restore training mode even if evaluation fails midway.
            model.train()

        avg_eval_loss = self._mean(eval_losses)
        print(f"\n📊 Evaluation loss: {avg_eval_loss:.4f}")

        improvement = self.best_eval_loss - avg_eval_loss  # NaN/negative => no improvement
        if improvement > 0:
            self.best_eval_loss = avg_eval_loss
            metrics = {'eval_loss': avg_eval_loss}
            # best_only: do not replace the resumable checkpoint of this step.
            self.save_checkpoint(model, None, None, metrics, is_best=True, best_only=True)

        if improvement > self.config.early_stopping_threshold:
            self.patience_counter = 0
        else:
            self.patience_counter += 1

        return avg_eval_loss

    def check_early_stopping(self, eval_loss: float) -> bool:
        """Return True once patience is exhausted.

        The patience counter is maintained by ``evaluate``; this method only
        compares it with ``config.early_stopping_patience`` and does not
        modify any state.

        Args:
            eval_loss: Latest evaluation loss. Unused; kept for interface
                compatibility.
        """
        return self.patience_counter >= self.config.early_stopping_patience

    def get_latest_checkpoint(self) -> Optional[str]:
        """Return the path of the most recently written regular checkpoint, or None."""
        checkpoints = self._ordered_checkpoints()
        if checkpoints:
            return str(checkpoints[-1])
        return None

# Smoke test: build a pipeline from the default configuration.
# NOTE: constructing RobustTrainingPipeline creates the checkpoint directory on disk.
config = TrainingConfig()
pipeline = RobustTrainingPipeline(config)
print("✅ Robust Training Pipeline hazır")
print(f"\n📁 Checkpoint directory: {pipeline.checkpoint_dir}")

## 🚀 7. Main Training Execution

In [None]:
def main_training():
    """Assemble everything needed for a distillation run, without training.

    Builds a ``TrainingConfig`` (batch size from the module-level
    ``BATCH_SIZE``, temperature from ``selected_teacher``), loads the teacher
    in 4-bit NF4 quantization and the student in bfloat16, wraps both
    tokenizers in a ``UnifiedTokenizerWrapper`` and constructs the
    ``KnowledgeDistillationTrainer``. No data is loaded and no training step
    is executed here.

    Returns:
        A ``(config, kd_trainer, unified_tokenizer)`` tuple.
    """
    banner = "=" * 70

    print(banner)
    print("🚀 TURKISH NLP KNOWLEDGE DISTILLATION PIPELINE")
    print(banner)

    # 1. Configuration (batch size was chosen earlier from the detected GPU)
    config = TrainingConfig(
        batch_size=BATCH_SIZE,
        num_epochs=3,
        learning_rate=2e-5,
        temperature=selected_teacher.distillation_temperature,
    )

    print("\n📋 Configuration:")
    for field_name, field_value in asdict(config).items():
        print(f"  {field_name}: {field_value}")

    # 2. Load models
    print("\n📚 Loading models...")

    # Teacher: 4-bit NF4 weights with double quantization, bf16 compute
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    teacher_id = config.teacher_model_id
    teacher_model = AutoModelForCausalLM.from_pretrained(
        teacher_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    )
    teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id, trust_remote_code=True)

    print(f"✅ Teacher model loaded: {config.teacher_model_id}")

    # Student: full bf16 weights, placed automatically
    student_id = config.student_model_id
    student_model = AutoModelForCausalLM.from_pretrained(
        student_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    student_tokenizer = AutoTokenizer.from_pretrained(student_id)

    print(f"✅ Student model loaded: {config.student_model_id}")

    # 3. Unified tokenizer over the teacher and student vocabularies
    unified_tokenizer = UnifiedTokenizerWrapper(teacher_tokenizer, student_tokenizer)

    # 4. Distillation trainer
    trainer_settings = {
        'teacher_model': teacher_model,
        'student_model': student_model,
        'unified_tokenizer': unified_tokenizer,
        'temperature': config.temperature,
        'alpha': config.alpha,
        'use_gradient_checkpointing': config.use_gradient_checkpointing,
        'mixed_precision': config.mixed_precision,
    }
    kd_trainer = KnowledgeDistillationTrainer(**trainer_settings)

    print("\n✅ Training setup complete!")
    print("\n" + banner)
    print("Ready for training! Load your data and start the pipeline.")
    print(banner)

    return config, kd_trainer, unified_tokenizer

# Start the main pipeline when this file is executed as a script.
if __name__ == "__main__":
    config, trainer, tokenizer = main_training()