In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = df['madde'].astype(str) + " - " + df['anlam'].astype(str)
    texts = df['text'].tolist()
    return CustomTextDataset(texts, tokenizer, block_size)

# Paths and configurations
train_file_path = "tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = r"C:\Users\Dell\OneDrive\Masaüstü\ProjectAS"
output_dir = "./results"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
train_dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()
trainer.save_model(output_dir)




Step,Training Loss,Validation Loss


In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = df['madde'].astype(str) + ": " + df['anlam'].astype(str)
    
    # Filter texts with length >= 15 words
    df = df[df['text'].apply(lambda x: len(x.split()) >= 15)]
    
    texts = df['text'].tolist()
    return CustomTextDataset(texts, tokenizer, block_size)

# Paths and configurations
train_file_path = "tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = r"C:\Users\Dell\OneDrive\Masaüstü\ProjectAS"
output_dir = r"C:\Users\Dell\OneDrive\Masaüstü\results"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
train_dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()
trainer.save_model(output_dir)




Step,Training Loss,Validation Loss


In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset as TorchDataset, random_split
from peft import get_peft_model, LoraConfig, TaskType

# Custom dataset class to handle our data
class CustomTextDataset(TorchDataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            self.examples.append(tokenized_text)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {key: torch.tensor(val) for key, val in self.examples[i].items()}

# Load and preprocess dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size=128):
    df = pd.read_csv(file_path)
    df['text'] = df['madde'].astype(str) + ": " + df['anlam'].astype(str)
    
    # Filter texts with length >= 15 words
    df = df[df['text'].apply(lambda x: len(x.split()) >= 15)]
    
    texts = df['text'].tolist()
    return CustomTextDataset(texts, tokenizer, block_size)
    
# Paths and configurations
train_file_path = "tdk_word_meaning_data.csv"  # Update with your CSV file path
model_name = r"C:\Users\Dell\OneDrive\Masaüstü\ProjectAS"
output_dir = r"C:\Users\Dell\OneDrive\Masaüstü\results"
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 10

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add pad token
special_tokens = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset
dataset = load_and_preprocess_dataset(train_file_path, tokenizer)

# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Apply LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
    fp16=True,  # Use mixed precision (fp16) if supported by the hardware for faster training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()
trainer.save_model(output_dir)




Step,Training Loss,Validation Loss
100,4.2469,4.103694
200,4.2773,4.0061
300,4.1638,3.942166
400,4.1579,3.901731
500,4.1195,3.874077
600,4.1073,3.856584
700,3.9943,3.840295
800,3.9389,3.828642
900,3.9281,3.822006
1000,4.0084,3.811811




In [8]:
import os

model_path = r"C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel"

# Check if all necessary files exist
required_files = ["config.json", "special_tokens_map.json", "tokenizer_config.json", "vocab.json", "merges.txt", "pytorch_model.bin"]
for file_name in required_files:
    file_path = os.path.join(model_path, file_name)
    if not os.path.isfile(file_path):
        print(f"Missing file: {file_path}")


Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\merges.txt
Missing file: C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel\pytorch_model.bin


In [11]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Model ve tokenizer'ı yükleyin
model_path = r"C:\Users\Dell\OneDrive\Masaüstü\FineTunedModel"  # Fine-tuned modelin kaydedildiği yol
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Pipeline oluşturma
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Örnek bir kelimenin anlamını bulma
def get_word_meaning(word, max_length=50):
    prompt = word + ":"
    generated_texts = text_generator(prompt, max_length=max_length, num_return_sequences=1)
    meaning = generated_texts[0]['generated_text'][len(prompt):].strip()
    return meaning

# Kelimenin anlamını almak için örnek kullanım
word = "uyumak nedemek "
meaning = get_word_meaning(word)
print(f"{word}: {meaning}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


uyumak nedemek : etmek eylemiyle bir araya gelip, birlikte yaşamaktan daha az acı çeken bir varlık olduğunu sanmak, aynı şey olmak, aynı yoldan geçmek, aynı şekilde geçmek, aynı şekilde geçmek, aynı yola doğru gitmek, aynı yolda yaşamak gibi
