<a href="https://colab.research.google.com/github/JakeTurner616/Adonalsium/blob/LLM/trainandgenerate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [None]:
! pip install torch pandas
! pip install -U accelerate
! pip install -U transformers

Download the book

In [2]:
import requests

# Source URL of the training text (raw GitHub gist)
book_url = 'https://gist.githubusercontent.com/JakeTurner616/7c5bcb37d861ba6fdb66d12a9bb9a084/raw/63d14a3aeed74a23b7aa1dba263b42d7c2b8676e/book.txt'

# Download the book. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from silently saving an
# HTTP error page as the training corpus.
book_response = requests.get(book_url, timeout=30)
book_response.raise_for_status()
book_text = book_response.text

# Save the book text to disk: the training cell loads it by path
# ("reverted_book.txt"). Write it as UTF-8 explicitly so the result does not
# depend on the platform's default encoding.
with open('reverted_book.txt', 'w', encoding='utf-8') as f:
    f.write(book_text)

print("Downloaded and saved the book text.")

Downloaded and saved the book text.


Train the model (fine-tune the pretrained GPT-2 on the downloaded book text)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token for GPT-2 compatibility

# Prepare dataset: the saved book file is tokenized and cut into
# fixed-size blocks of 128 tokens.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="reverted_book.txt",
    block_size=128
)

# mlm=False selects the causal (next-token) language-modeling objective
# instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Start from the pretrained GPT-2 weights and fine-tune them on the book.
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,       # intermediate checkpoint every 10k optimizer steps
    save_total_limit=2,      # keep only the two most recent checkpoints
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

# Checkpoints are only written every `save_steps` steps, so a run shorter than
# that would otherwise finish without anything on disk, and longer runs would
# lose the steps after the last checkpoint. Persist the final weights (and the
# tokenizer) to `output_dir` so they can be reloaded with
# `from_pretrained('./results')`.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

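Generate text from a fine-tuned checkpoint (pass `model_path` if `./results/checkpoint-40000/` does not exist for your training run)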

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

def generate_text(prompt, model_path='./results/checkpoint-40000/'):
    """Generate a continuation of ``prompt`` with a fine-tuned GPT-2 model.

    The model weights are loaded from ``model_path``; the tokenizer is the
    stock ``'gpt2'`` tokenizer with its pad token set to the EOS token.
    Each generated text is printed and the texts are also returned.

    Args:
        prompt: Text the generation should start from.
        model_path: Directory containing the saved model to load. The default
            points at one specific checkpoint directory, which must exist.

    Returns:
        list[str]: The generated texts (one entry, since
        ``num_return_sequences=1``).
    """
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
    generated_texts = text_generator(
        prompt,
        max_length=150,
        # temperature/top_p only apply when sampling; request it explicitly
        # rather than relying on whatever the loaded config happens to say.
        do_sample=True,
        temperature=0.7,
        top_p=0.85,
        repetition_penalty=1.3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # NOTE: `early_stopping` was dropped: it is a beam-search option and
        # no beams are requested here.
    )

    texts = [generated['generated_text'] for generated in generated_texts]
    for text in texts:
        print(text)
    return texts

# Demo entry point: generate one continuation for a sample prompt using the
# default checkpoint path of generate_text.
if __name__ == '__main__':
    prompt = 'She fell to the ground dead'
    generate_text(prompt=prompt)