In [1]:
!pip install transformers
!pip install datasets



In [25]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load and tokenize the dataset
dataset = load_dataset('text', data_files={'train': '/kaggle/input/txt-dataset/input.txt'})

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Prepare data for language modeling
block_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

lm_datasets = tokenized_dataset.map(group_texts, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    data_collator=data_collator,
)

# Train the model
trainer.train()


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]



Step,Training Loss
500,4.6945
1000,4.3959
1500,4.264
2000,4.2332
2500,4.1952
3000,4.1411
3500,4.0933
4000,4.0547
4500,4.0714
5000,4.0342




TrainOutput(global_step=15000, training_loss=3.76825634765625, metrics={'train_runtime': 4244.7288, 'train_samples_per_second': 28.27, 'train_steps_per_second': 3.534, 'total_flos': 7838760960000000.0, 'train_loss': 3.76825634765625, 'epoch': 3.0})

In [30]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define the prompt
prompt = "First Citizen: Before we proceed any further, hear me speak."
inputs = tokenizer(prompt, return_tensors='pt')

# Move inputs to the same device as the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate text with repetition mitigation strategies
outputs = model.generate(
    inputs['input_ids'], 
    max_length=50, 
    num_return_sequences=1, 
    no_repeat_ngram_size=2,  
    top_k=50,  
    top_p=0.95, 
    temperature=0.7,  
    repetition_penalty=2.0  
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


First Citizen: Before we proceed any further, hear me speak.
I am the first citizen of this world to be born in a country where there is no law against slavery and that it has been abolished by our government for over two hundred years;


In [31]:
model.save_pretrained('./shakespeare-gpt2')
tokenizer.save_pretrained('./shakespeare-gpt2')


('./shakespeare-gpt2/tokenizer_config.json',
 './shakespeare-gpt2/special_tokens_map.json',
 './shakespeare-gpt2/vocab.json',
 './shakespeare-gpt2/merges.txt',
 './shakespeare-gpt2/added_tokens.json')

In [32]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('./shakespeare-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./shakespeare-gpt2')
