In [13]:
!pip install python-docx



In [None]:
import re
from PyPDF2 import PdfReader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

In [3]:
def read_pdf(file_path, max_pages=30):
    """Extract the text of the leading pages of a PDF file.

    Args:
        file_path: Path of the PDF file to open.
        max_pages: Maximum number of pages to read, counted from the start of
            the document. Defaults to 30 (the limit that used to be
            hard-coded). Pass ``None`` to read every page.

    Returns:
        str: The extracted text of the selected pages, concatenated in page
        order with no separator inserted between pages.
    """
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        total_pages = len(pdf_reader.pages)
        # Clamp to the real page count: a fixed range(30) raised IndexError
        # for any document shorter than 30 pages.
        if max_pages is None:
            page_count = total_pages
        else:
            page_count = min(max(max_pages, 0), total_pages)
        # Guard against a falsy extract_text() result (e.g. a page with no
        # extractable text) so the concatenation cannot fail.
        chunks = [
            pdf_reader.pages[page_num].extract_text() or ""
            for page_num in range(page_count)
        ]
    # Extraction happens inside the `with` block while the file is still open.
    return "".join(chunks)

def clean_text(text):
    """Normalise raw extracted text.

    Runs of newlines are collapsed to a single newline and the result is
    stripped of leading/trailing whitespace. Afterwards every ``*`` and every
    ``...`` is replaced by a single space (these replacements happen after the
    strip, so spaces they introduce at the edges are kept).

    Args:
        text: The raw text to clean.

    Returns:
        str: The cleaned text.
    """
    cleaned = re.sub(r'\n+', '\n', text)
    cleaned = cleaned.strip()
    # Order matters: '*' is removed first, then literal three-dot sequences.
    for unwanted in ('*', '...'):
        cleaned = cleaned.replace(unwanted, ' ')
    return cleaned

def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.

    Args:
        text: The string to write.
        file_path: Destination path of the text file.
    """
    with open(file_path, "w", encoding="utf-8") as out_file:
        out_file.write(text)

def load_dataset(file_path, tokenizer, block_size=128):
    """Construct a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The ``TextDataset`` instance built from these arguments.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer instance forwarded to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language model on a text file and save the result.

    Args:
        train_file_path: Path of the text file used to build the training dataset.
        model_name: Identifier or path passed to ``from_pretrained`` for both
            the tokenizer and the model.
        output_dir: Directory for checkpoints, the final model and the tokenizer.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.

    Returns:
        None. The trained model and the tokenizer are written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(output_dir)
    # The Trainer was not given the tokenizer, so save it explicitly; this
    # keeps output_dir loadable as a combined model + tokenizer directory.
    tokenizer.save_pretrained(output_dir)

In [5]:

# Paths and configurations
# Source PDF that read_pdf() extracts the training text from (relative to the working directory).
train_pdf_path = "Büyük Türkçe Sözlük.pdf"
# Plain-text file the cleaned text is saved to; the same path is later given to train() as train_file_path.
cleared_text_path = "ClearedTXT.txt"
# Passed to from_pretrained() for both the tokenizer and the model inside train().
# NOTE(review): absolute, machine-specific Windows path; the notebook will not run on another machine without editing it.
model_name = r"C:\Users\STJ\Desktop\final_model_and_tokenizer" # Local path (not a hub model id); presumably a directory holding a saved model and tokenizer
# Directory handed to TrainingArguments and trainer.save_model().
output_dir = "./results"
# The four values below are forwarded unchanged to TrainingArguments by train().
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 3
# NOTE(review): a save interval of 10 steps looks very short and may produce many checkpoints; confirm it is intended.
save_steps = 10

# Read PDF and clean text
# Only the first 30 pages of the PDF are read by read_pdf(); the raw text is then normalised by clean_text().
text_data = read_pdf(train_pdf_path)
text_data = clean_text(text_data)


In [None]:

# Save cleaned text to file
# train() takes a file path rather than a string, so the cleaned text has to be on disk first.
# NOTE(review): text_data is produced by the previous cell; on a fresh kernel that cell must run before this one.
save_text_to_file(text_data, cleared_text_path)

# Train the model
# Loads tokenizer and model from model_name, trains on the saved text file and writes the result to output_dir.
train(
    train_file_path=cleared_text_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)




Step,Training Loss
