Install dependencies

In [None]:
! pip install torch pandas
! pip install -U accelerate
! pip install -U transformers

Download the book

In [None]:
import requests

# URLs for the data
book_url = 'https://gist.githubusercontent.com/JakeTurner616/7c5bcb37d861ba6fdb66d12a9bb9a084/raw/63d14a3aeed74a23b7aa1dba263b42d7c2b8676e/book.txt'  # Change to your book's URL

# Download the book.
# - timeout: without it requests waits indefinitely on a stalled connection.
# - raise_for_status(): fail loudly on 4xx/5xx instead of silently saving an
#   HTTP error page as the training corpus.
book_response = requests.get(book_url, timeout=60)
book_response.raise_for_status()
book_text = book_response.text

# Save the book text to a file; the training cell reads it back by this path.
# The encoding is fixed to UTF-8 so the write does not depend on the platform
# default, which can fail on non-ASCII characters such as curly quotes.
with open('reverted_book.txt', 'w', encoding='utf-8') as f:
    f.write(book_text)

print("Downloaded and saved the book text.")

Downloaded and saved the book text.


Train the model

In [None]:
from transformers import GPT2Tokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling

model_name = 'EleutherAI/gpt-neo-2.7B'  # Example: Using a 2.7B parameter model

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo also uses the eos token as a pad token

# Prepare dataset: the saved book file is tokenized and cut into fixed-size blocks.
# NOTE(review): TextDataset is flagged as deprecated in recent transformers
# releases in favour of the `datasets` library -- consider migrating.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="reverted_book.txt",
    block_size=128  # Adjust block size based on the available memory
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

model = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Might need adjustment based on GPU memory
    save_steps=10_000,       # write a checkpoint every 10k optimizer steps
    save_total_limit=2,      # keep only the two most recent checkpoints
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

# Persist the final weights explicitly. Periodic checkpoints are only written
# every `save_steps` steps, so a run whose total step count is not a multiple of
# 10_000 (or is shorter than that) would otherwise lose its final state.
trainer.save_model(training_args.output_dir)
# The Trainer was not given the tokenizer, so save it alongside the model to make
# the output directory loadable on its own with `from_pretrained`.
tokenizer.save_pretrained(training_args.output_dir)

Generate text

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def generate_text(prompt, model_path='./results/checkpoint-40000/'):
    """Load a saved checkpoint and print a sampled continuation of ``prompt``.

    Args:
        prompt: Seed text that the model continues.
        model_path: Directory passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        None. Every generated sequence is printed to stdout.
    """
    # Load the model and tokenizer based on the saved checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Create a text generation pipeline using the loaded model and tokenizer
    text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

    # Generate text based on the provided prompt
    generated_texts = text_generator(
        prompt,
        max_length=150,
        # temperature/top_p only take effect when sampling is on; request it
        # explicitly instead of relying on whatever the checkpoint config says.
        do_sample=True,
        temperature=0.7,
        top_p=0.85,
        repetition_penalty=1.3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # `early_stopping` was dropped: it only affects beam search, and no
        # `num_beams` is requested here.
        # Pass the pad token explicitly; generation otherwise falls back to the
        # eos token and logs a notice on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Print the generated texts
    for generated in generated_texts:
        print(generated['generated_text'])

# Demo run: only fires when this code is executed as the main module.
if __name__ == "__main__":
    # Seed passage handed to the model for continuation.
    prompt = "She fell to the ground dead"
    generate_text(prompt=prompt)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


She fell to the ground dead, and it was a miracle she survived. Her hands were covered in blood, her feet scratched by something sharp.
  Mistings are supposed not as dangerous as you think, Vin thought. She pulled back against the wall beside Elend. He had fallen unconscious there too. How did he survive? What if his wounds weren’t so bad that they made him sick? It didn't matter—he would die soon anyway. The only way out of this place was for them all together into one....
 Something crashed through Kelsier on top of Yomen. A man with dark brown skin. Like Ruin himself, but different. Marsh looked like someone else from what could be seen up above
