In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# **"google/mt5-small"**

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from sklearn.model_selection import train_test_split

# GPU kontrolü
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# CSV dosyasını okuma
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df

# Veri setini hazırlama ve bölme
def prepare_dataset(df):
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)
    return train_dataset, eval_dataset

# Model ve tokenizer'ı yükleme
def load_model_and_tokenizer():
    model_name = "google/mt5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    return model, tokenizer

# Veriyi tokenize etme
def tokenize_data(dataset, tokenizer):
    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples["input"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        labels = tokenizer(
            examples["response"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        model_inputs["labels"] = labels["input_ids"].to(device)
        return model_inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
    return tokenized_dataset

# Eğitim fonksiyonu
def train_model(model, train_dataset, eval_dataset, tokenizer):
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        save_total_limit=2,
        no_cuda=not torch.cuda.is_available()
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    return trainer

# Modeli kaydetme
def save_model(trainer, output_dir):
    trainer.save_model(output_dir)

# Tahmin yapma fonksiyonu
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Ana çalıştırma fonksiyonu
def main():
    file_path = "/content/drive/MyDrive/Yüksek Lisans Projeleri/verbilimi/Model İnput/499satır.csv"
    df = load_data(file_path)

    train_dataset, eval_dataset = prepare_dataset(df)
    model, tokenizer = load_model_and_tokenizer()

    tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)
    tokenized_eval_dataset = tokenize_data(eval_dataset, tokenizer)

    trainer = train_model(model, tokenized_train_dataset, tokenized_eval_dataset, tokenizer)
    save_model(trainer, "./final_model")

    # Test için örnek tahmin
    sample_text = "data nedir"
    prediction = predict(sample_text, model, tokenizer)
    print(f"Soru: {sample_text}")
    print(f"Cevap: {prediction}")

if __name__ == "__main__":
    main()

Using device: cuda




Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss


Soru: data nedir
Cevap: ... <extra_id_0>ругivöl骛entals <extra_id_3>labu <extra_id_11>ಿಪ


# **ozcangundes/mt5-small-turkish-summarization**

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from sklearn.model_selection import train_test_split

# GPU kontrolü
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# CSV dosyasını okuma
def load_data(file_path):
    df = pd.read_csv(file_path)
    # Sütun isimlerini kontrol et ve düzelt
    if 'input' not in df.columns or 'response' not in df.columns:
        df.columns = ['input', 'response']
    return df

# Veri setini hazırlama ve bölme
def prepare_dataset(df):
    # Veriyi temizle
    df['input'] = df['input'].str.strip()
    df['response'] = df['response'].str.strip()

    # Eğitim ve değerlendirme için böl
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    # Dataset objelerine dönüştür
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    return train_dataset, eval_dataset

# Model ve tokenizer'ı yükleme
def load_model_and_tokenizer():
    # Türkçe için daha uygun bir model seçelim
    model_name = "ozcangundes/mt5-small-turkish-summarization"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    return model, tokenizer

# Veriyi tokenize etme
def tokenize_data(dataset, tokenizer):
    def tokenize_function(examples):
        # Girdi metinlerine prefix ekle
        prefix = "soru: "
        inputs = [prefix + text for text in examples["input"]]

        model_inputs = tokenizer(
            inputs,
            padding="max_length",
            truncation=True,
            max_length=256,  # Daha uzun metin için
            return_tensors="pt"
        )

        labels = tokenizer(
            examples["response"],
            padding="max_length",
            truncation=True,
            max_length=256,  # Daha uzun cevaplar için
            return_tensors="pt"
        )

        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        model_inputs["labels"] = labels["input_ids"].to(device)
        return model_inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
    return tokenized_dataset

# Eğitim fonksiyonu
def train_model(model, train_dataset, eval_dataset, tokenizer):
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        num_train_epochs=5,  # Epoch sayısını artırdık
        per_device_train_batch_size=4,  # Batch size'ı düşürdük
        per_device_eval_batch_size=4,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_steps=200,
        save_total_limit=2,
        no_cuda=not torch.cuda.is_available(),
        learning_rate=2e-5  # Learning rate'i düşürdük
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    return trainer

# Modeli kaydetme
def save_model(trainer, output_dir):
    trainer.save_model(output_dir)

# Tahmin yapma fonksiyonu
def predict(text, model, tokenizer):
    # Girdi metnine prefix ekle
    text = f"soru: {text}"
    inputs = tokenizer(text,
                      return_tensors="pt",
                      padding=True,
                      truncation=True,
                      max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_length=256,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Ana çalıştırma fonksiyonu
def main():
    file_path = "/content/drive/MyDrive/Yüksek Lisans Projeleri/verbilimi/Model İnput/499satır.csv"
    df = load_data(file_path)

    train_dataset, eval_dataset = prepare_dataset(df)
    model, tokenizer = load_model_and_tokenizer()

    tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)
    tokenized_eval_dataset = tokenize_data(eval_dataset, tokenizer)

    trainer = train_model(model, tokenized_train_dataset, tokenized_eval_dataset, tokenizer)
    save_model(trainer, "./final_model")

    # Test için birkaç örnek soru
    test_questions = [
        "Data Science nedir?",
        "Python'da makine öğrenimi için hangi kütüphaneler kullanılıyor?",
        "Outlier nedir?"
    ]

    print("\nTest Sonuçları:")
    for question in test_questions:
        prediction = predict(question, model, tokenizer)
        print(f"\nSoru: {question}")
        print(f"Cevap: {prediction}")

if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
100,14.5268,12.91643
200,9.8443,8.740439
300,7.9633,7.308992
400,5.9664,4.640598
500,5.325,4.372823



Test Sonuçları:

Soru: Data Science nedir?
Cevap: 㽇 , sanayi ve teknolojiler arasında yer alması ile birlikte olarak bilinen en çok kullanılan en önemli yöntemler .

Soru: Python'da makine öğrenimi için hangi kütüphaneler kullanılıyor?
Cevap: aruoloji uzmanı Hande Kazanova , olarak bilinen ve çok sayıda öğrenci olmak üzere birçok alanda kullanılan yöntemle ilgili neler yaşanıyor ?

Soru: Outlier nedir?
Cevap: 㽇 , olarak kullanılan yöntemlerden biri olan en çok konuşulan en önemli konulardan biri olmak üzere birçok ülkenin en iyi sektörde neler yaşanacak .


# **facebook/mbart-large-cc25**

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from sklearn.model_selection import train_test_split

# GPU kontrolü
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# CSV dosyasını okuma
def load_data(file_path):
    df = pd.read_csv(file_path)
    # Sütun isimlerini kontrol et ve düzelt
    if 'input' not in df.columns or 'response' not in df.columns:
        df.columns = ['input', 'response']
    return df

# Veri setini hazırlama ve bölme
def prepare_dataset(df):
    # Veriyi temizle
    df['input'] = df['input'].str.strip()
    df['response'] = df['response'].str.strip()

    # Eğitim ve değerlendirme için böl
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    # Dataset objelerine dönüştür
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    return train_dataset, eval_dataset

# Model ve tokenizer'ı yükleme
def load_model_and_tokenizer():
    # Türkçe için daha uygun bir model seçelim
    model_name = "facebook/mbart-large-cc25"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    return model, tokenizer

# Veriyi tokenize etme
def tokenize_data(dataset, tokenizer):
    def tokenize_function(examples):
        # Girdi metinlerine prefix ekle
        prefix = "soru: "
        inputs = [prefix + text for text in examples["input"]]

        model_inputs = tokenizer(
            inputs,
            padding="max_length",
            truncation=True,
            max_length=256,  # Daha uzun metin için
            return_tensors="pt"
        )

        labels = tokenizer(
            examples["response"],
            padding="max_length",
            truncation=True,
            max_length=256,  # Daha uzun cevaplar için
            return_tensors="pt"
        )

        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        model_inputs["labels"] = labels["input_ids"].to(device)
        return model_inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
    return tokenized_dataset

# Eğitim fonksiyonu
def train_model(model, train_dataset, eval_dataset, tokenizer):
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        num_train_epochs=5,  # Epoch sayısını artırdık
        per_device_train_batch_size=4,  # Batch size'ı düşürdük
        per_device_eval_batch_size=4,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_steps=200,
        save_total_limit=2,
        no_cuda=not torch.cuda.is_available(),
        learning_rate=2e-5  # Learning rate'i düşürdük
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    trainer.train()
    return trainer

# Modeli kaydetme
def save_model(trainer, output_dir):
    trainer.save_model(output_dir)

# Tahmin yapma fonksiyonu
def predict(text, model, tokenizer):
    # Girdi metnine prefix ekle
    text = f"soru: {text}"
    inputs = tokenizer(text,
                      return_tensors="pt",
                      padding=True,
                      truncation=True,
                      max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_length=256,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Ana çalıştırma fonksiyonu
def main():
    file_path = "/content/drive/MyDrive/Yüksek Lisans Projeleri/verbilimi/Model İnput/499satır.csv"
    df = load_data(file_path)

    train_dataset, eval_dataset = prepare_dataset(df)
    model, tokenizer = load_model_and_tokenizer()

    tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)
    tokenized_eval_dataset = tokenize_data(eval_dataset, tokenizer)

    trainer = train_model(model, tokenized_train_dataset, tokenized_eval_dataset, tokenizer)
    save_model(trainer, "./final_model")

    # Test için birkaç örnek soru
    test_questions = [
        "Data Science nedir?",
        "Python'da makine öğrenimi için hangi kütüphaneler kullanılıyor?",
        "Outlier nedir?"
    ]

    print("\nTest Sonuçları:")
    for question in test_questions:
        prediction = predict(question, model, tokenizer)
        print(f"\nSoru: {question}")
        print(f"Cevap: {prediction}")

if __name__ == "__main__":
    main()

Using device: cuda


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
100,5.6183,4.549925
200,0.9237,0.82214
300,0.5675,0.668638
400,0.3512,0.509024
500,0.3122,0.512781





Test Sonuçları:

Soru: Data Science nedir?
Cevap: ››ШЫШЫҢҢÂÂ»)») Түркия Түркия Нұрсұлтан Нұрсұлтан Түркия Мәдениет Мәдениет Нұрсұлтан Мәдениет Түркия Ұлттық Нұрсұлтан Ұлттық Мәдениет Ұлттық Ұлттық ұста ұстанып Түркия ұста Түркия келі Түркия Қазақстанның Түркия жеткіз Түркия Өзбекстан ТүркияТәуелсіз Түркия Маңғыстау Түркия Ұлт Мәдениет жеткіз жеткізТәуелсізТәуелсіз Мәдениет келі келі жеткіз келі Нұрсұлтан жеткіз Нұрсұлтан Астанада Түркия Астанада МәдениетТәуелсіз НұрсұлтанТәуелсіз Қазақстанның Нұрсұлтан Қазақстанның МәдениетҮЙҮЙТәуелсізАрнайыАрнайыТәуелсізҚазақстан Нұрсұлтан Маңғыстау Нұрсұлтан Қазақстандық Мәдениет аумағы аумағы Түркия аумағы Нұрсұлтан әкімдігі Түркия әкімдігі әкімдігі Астанада Астанада Нұрсұлтан көрсетілетін көрсетілетін қызметтер қызметтер Мәдениет қызметтер жеткіз қызметтер көрсетілетін Түркия қызметтер аумағы әкімдігі Мәдениет әкімдігі аумағы қызметтер әкімдігі қызметтерімдерімдер жеткізімдер қызметтерІЛІЛ жеткізІЛТәуелсіз қызметтер Түркия көрсетілетін жеткіз әкім

# **savasy/bert-base-turkish-squad**

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
import torch
from sklearn.model_selection import train_test_split

# GPU kontrolü
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# CSV dosyasını okuma ve gereksiz sütunları kaldırma
def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df[['input', 'response']]  # Yalnızca 'input' ve 'response' sütunları alınır
    print("Dataset Columns:", df.columns)  # Yeni sütunları kontrol et
    return df

# Veri setini hazırlama ve bölme
def prepare_dataset(df):
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    def find_answer_start_end(context, answer):
        start = context.find(answer)
        end = start + len(answer)
        return start, end

    def add_answer_positions(example):
        # input ve response sütunlarını doğru şekilde kullanıyoruz
        context = example["input"]  # Doğru sütun adı
        answer = example["response"]  # Doğru sütun adı
        start, end = find_answer_start_end(context, answer)
        example["start_position"] = start
        example["end_position"] = end
        return example

    train_df = train_df.apply(add_answer_positions, axis=1)
    eval_df = eval_df.apply(add_answer_positions, axis=1)

    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)
    return train_dataset, eval_dataset

# Model ve tokenizer'ı yükleme
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

    # Eğer pad_token_id eksikse, pad_token_id'yi manuel olarak tanımlayalım
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = 0  # BERT genellikle 0'ı pad_token_id olarak kullanır

    return model, tokenizer

# Veriyi tokenize etme
def tokenize_data(dataset, tokenizer):
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["input"],  # input sütunu
            examples["response"],  # response sütunu
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Adjust start and end positions to be relative to tokenized inputs
        start_positions = []
        end_positions = []
        for i in range(len(examples['input'])):
            start = examples['start_position'][i]
            end = examples['end_position'][i]

            # Ensure that start and end positions are valid token indices
            start_token = tokenized.char_to_token(i, start) if start >= 0 else None
            end_token = tokenized.char_to_token(i, end) if end >= 0 else None

            if start_token is None or end_token is None:
                # If tokenization fails, fallback to the default strategy
                input_ids = tokenized.input_ids[i].cpu().numpy()  # Convert to numpy array
                start_token = np.where(input_ids == tokenizer.pad_token_id)[0][0]
                end_token = start_token

            start_positions.append(start_token)
            end_positions.append(end_token)

        tokenized["start_positions"] = start_positions
        tokenized["end_positions"] = end_positions
        return tokenized

    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
    return tokenized_dataset

# Eğitim fonksiyonu
def train_model(model, train_dataset, eval_dataset, tokenizer):
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        save_total_limit=2,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir="./logs",
        logging_steps=10,
        learning_rate=3e-5,
        weight_decay=0.01,
        warmup_steps=500,
        save_strategy="steps",
        load_best_model_at_end=True,
        report_to="none"
    )

    data_collator = DefaultDataCollator()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    return trainer

# Modeli kaydetme
def save_model(trainer, output_dir):
    trainer.save_model(output_dir)

# Tahmin yapma fonksiyonu
def predict(question, context, model, tokenizer):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    outputs = model(**inputs)
    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start_index:answer_end_index])
    )
    return answer

# Ana çalıştırma fonksiyonu
def main():
    file_path = "/content/drive/MyDrive/Yüksek Lisans Projeleri/verbilimi/Model İnput/499satır.csv"
    model_name = "savasy/bert-base-turkish-squad"  # Alternatif olarak: "xlm-roberta-base"

    df = load_data(file_path)
    train_dataset, eval_dataset = prepare_dataset(df)
    model, tokenizer = load_model_and_tokenizer(model_name)

    tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)
    tokenized_eval_dataset = tokenize_data(eval_dataset, tokenizer)

    trainer = train_model(model, tokenized_train_dataset, tokenized_eval_dataset, tokenizer)
    save_model(trainer, "./final_model")

    # Test için örnek tahmin
    sample_question = "Data Science nedir"
    sample_context = "Data Science, veri odaklı hesaplama ve çıkarım odaklı düşüncenin dünyayı anlamak ve sorunları çözmek için kullanılmasıdır."
    prediction = predict(sample_question, sample_context, model, tokenizer)

    print(f"Soru: {sample_question}")
    print(f"Cevap: {prediction}")

if __name__ == "__main__":
    main()


Using device: cuda
Dataset Columns: Index(['input', 'response'], dtype='object')


Some weights of the model checkpoint at savasy/bert-base-turkish-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss


Soru: Data Science nedir
Cevap: .
