# Data Collection - Turkish Educational Datasets
## TEKNOFEST 2025 - Eğitim Teknolojileri

Bu notebook, Hugging Face üzerindeki Türkçe eğitim veri setlerini indirir, `../data/raw/` altına CSV olarak kaydeder ve temel istatistiklerini gösterir.

In [None]:
# Import the libraries this notebook depends on
import os
import sys
import pandas as pd

# Make the parent directory importable; the path is relative to the
# notebook's working directory.
project_root = '../'
sys.path.append(project_root)

print("Libraries imported successfully")

In [None]:
# Import the Hugging Face `datasets` library, installing it on demand.
try:
    from datasets import load_dataset
    print("[OK] datasets library is available")
except ImportError:
    import importlib
    import subprocess
    import sys

    print("[INFO] Installing datasets library...")
    # Install with the interpreter that runs this kernel: a bare `!pip` is
    # IPython-only syntax and may resolve to a pip that belongs to a
    # different Python environment than the one doing the import.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "datasets"])
    # Packages installed while the process is running are only picked up
    # reliably after the import system's finder caches are invalidated.
    importlib.invalidate_caches()
    from datasets import load_dataset
    print("[OK] datasets library installed and imported")

## Turkish Quiz Instruct Dataset

In [None]:
# Download the Turkish Quiz dataset from the Hugging Face Hub
print("Downloading Turkish Quiz Instruct dataset...")

quiz_dataset_id = "Kamyar-zeinalipour/Turkish-Quiz-Instruct"
try:
    dataset = load_dataset(quiz_dataset_id)
    print("[OK] Dataset loaded successfully")
    split_names = list(dataset.keys())
    print(f"Available splits: {split_names}")
except Exception as exc:
    # Any failure leaves `dataset` as None so the following cells can
    # skip their work instead of raising.
    print(f"[ERROR] Failed to load dataset: {exc}")
    dataset = None

In [None]:
# Show basic information about the loaded dataset
if dataset is not None:
    train_data = dataset['train']
    example_count = len(train_data)
    print("\nDataset Info:")
    print(f"- Number of examples: {example_count}")
    print(f"- Features: {train_data.features}")

    # Preview the first few examples; values whose string form is longer
    # than 100 characters are cut off and suffixed with "..."
    print("\nFirst 3 examples:")
    for idx in range(min(3, example_count)):
        print(f"\nExample {idx + 1}:")
        for key, value in train_data[idx].items():
            text = str(value)
            if len(text) > 100:
                print(f"  {key}: {text[:100]}...")
            else:
                print(f"  {key}: {value}")

In [None]:
# Save the training split as a CSV file
if dataset is not None:
    output_path = '../data/raw/turkish_quiz_instruct.csv'

    # Create the target directory if it does not exist yet
    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)

    # Convert the split to a DataFrame and write it out as UTF-8 CSV
    df = pd.DataFrame(dataset['train'])
    df.to_csv(output_path, index=False, encoding='utf-8')

    print(f"[OK] {len(df)} examples saved to {output_path}")
    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"File size: {size_mb:.2f} MB")

## Additional Turkish Educational Datasets

In [None]:
# Other Turkish educational datasets that could be collected as well;
# append further Hugging Face dataset IDs to this list as needed.
turkish_datasets = [
    "turkish-nlp-suite/turkish-texts",
    "ertugruldemir/TurkishNLP",
]

print("Other available Turkish datasets:")
for dataset_id in turkish_datasets:
    print("- " + dataset_id)

In [None]:
# Summary statistics for the saved CSV (skipped when the file is absent)
csv_path = '../data/raw/turkish_quiz_instruct.csv'
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)

    print("\nDataset Statistics:")
    print(f"- Total rows: {len(df)}")
    column_names = list(df.columns)
    print(f"- Total columns: {len(df.columns)}")
    print(f"- Column names: {column_names}")
    # deep=True also counts the memory held by object (e.g. string) columns
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    print(f"\nMemory usage: {memory_mb:.2f} MB")

    # Report how many null values each column contains
    print("\nNull values per column:")
    null_counts = df.isnull().sum()
    print(null_counts)