In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset, random_split
from peft import get_peft_model, LoraConfig, TaskType

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = df['madde'].astype(str) + ": " + df['anlam'].astype(str)
    
    # Filter texts with length >= 15 words
    df = df[df['text'].apply(lambda x: len(x.split()) >= 15)]
    
    texts = df['text'].tolist()
    print(len(texts))
    return CustomTextDataset(texts, tokenizer, block_size)




In [2]:
# Paths and configurations
train_file_path = "tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = r"C:\Users\STJ\Desktop\final_model_and_tokenizer"
output_dir = r"C:\Users\STJ\Desktop\results2"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


7870


In [13]:
train_dataset[2]["input_ids"]

tensor([12613, 14613,    26, 10627, 17154,   423,   820,    12,  4408,   345,
        10202,    12,  4551,   736,  2635,   294, 13325,  1142,    12,  2544,
         5530,   960,  6173,   591,  2416, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257])

In [23]:

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Apply LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r     =8, 
    lora_alpha=32, 
    lora_dropout=0.1
)

model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()
trainer.save_model(output_dir)




Step,Training Loss,Validation Loss
100,4.3788,4.064568
200,4.0975,3.968937
300,4.1632,3.914739
400,4.1005,3.880788
500,4.0926,3.851108
600,3.9797,3.833823
700,4.1286,3.815324
800,4.166,3.80372
900,4.1205,3.795265
1000,4.0535,3.787807




In [27]:
import os

model_path = r"C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel"

# Check if all necessary files exist
required_files = ["config.json", "special_tokens_map.json", "tokenizer_config.json", "vocab.json", "merges.txt", "pytorch_model.bin"]
for file_name in required_files:
    file_path = os.path.join(model_path, file_name)
    if not os.path.isfile(file_path):
        print(f"Missing file: {file_path}")


Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\config.json
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\special_tokens_map.json
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\tokenizer_config.json
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\vocab.json
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\merges.txt
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\pytorch_model.bin


In [35]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Model ve tokenizer'ı yükleyin
model_path = r"C:\Users\STJ\Desktop\results2"  # Fine-tuned modelin kaydedildiği yol
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Pipeline oluşturma
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Örnek bir kelimenin anlamını bulma
def get_word_meaning(word, max_length=50):
    prompt = word + ":"
    generated_texts = text_generator(prompt, max_length=max_length, num_return_sequences=1)
    meaning = generated_texts[0]['generated_text'][len(prompt):].strip()
    return meaning

# Kelimenin anlamını almak için örnek kullanım
word = "uyumak nedemek "
meaning = get_word_meaning(word)
print(f"{word}: {meaning}")


Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.

In [31]:
from transformers import AutoTokenizer, GPT2LMHeadModel
from transformers import pipeline

model = GPT2LMHeadModel.from_pretrained(r"C:\Users\STJ\Desktop\final_model_and_tokenizer")
tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\STJ\Desktop\final_model_and_tokenizer")

text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
r = text_generator("Teknolojinin gelişimi hayatımızı önemli ölçüde etkiledi. ", max_length=100)
[{'generated_text': ""}]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': ''}]

In [32]:
r

[{'generated_text': 'Teknolojinin gelişimi hayatımızı önemli ölçüde etkiledi. "Teknoloji" deyince aklımıza ilk etapta makineler geliyor. Örneğin, televizyon gibi yeni bir teknoloji olan televizyonun mucidi James Cromwell, "dijital televizyonun dijital görüntü olarak bir televizyonun bir fotoğrafının aynı anda iki fotoğrafı olarak kullanılabileceğini söylemişti. Son olarak yine James Cromwell, TV ekranıyla ilgili bir makale yazm… Devamını oku..\nDevamı için tıklayınız. Televizyonla İlgili Kompozisyon\nDevamı için tıklayınız. Yeni Teknolojiyi Keş'}]

In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset, random_split
from peft import get_peft_model, LoraConfig, TaskType

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = "Kelime: " + df['madde'].astype(str) + ". Anlam: " + df['anlam'].astype(str)
    
    # Filter texts with length >= 15 words
    df = df[df['text'].apply(lambda x: len(x.split()) >= 6)]
    
    texts = df['text'].tolist()
    print(len(texts))
    return CustomTextDataset(texts, tokenizer, block_size)

In [None]:
# Paths and configurations
train_file_path = "tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = "turkish-gpt2"
output_dir = "results4"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from transformers import LoraConfig, TaskType

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Apply LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Custom callback for better fine-tuning control
class CustomCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training...")

    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} completed.")

    def on_train_end(self, args, state, control, **kwargs):
        print("Training finished.")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[CustomCallback()]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
!pip install peft

!git clone https://huggingface.co/ytu-ce-cosmos/turkish-gpt2

%cd turkish-gpt2

!ls

from google.colab import files
uploaded = files.upload()

!ls

import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset, random_split
from peft import get_peft_model, LoraConfig, TaskType

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = "Kelime: " + df['madde'].astype(str) + ". Anlam: " + df['anlam'].astype(str)
    
    # Filter texts with length >= 15 words
    df = df[df['text'].apply(lambda x: len(x.split()) >= 6)]
    
    texts = df['text'].tolist()
    print(len(texts))
    return CustomTextDataset(texts, tokenizer, block_size)

# Paths and configurations
train_file_path = "/content/tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = "/content/turkish-gpt2"
output_dir = "results4"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from peft import LoraConfig, TaskType

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Apply LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Custom callback for better fine-tuning control
class CustomCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training...")

    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} completed.")

    def on_train_end(self, args, state, control, **kwargs):
        print("Training finished.")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[CustomCallback()]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

!pip install --upgrade transformers

