In [5]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import os

In [10]:
class CustomTextDataset(Dataset):
    """Fixed-length causal-LM dataset built from a list of raw strings.

    Every text is tokenized eagerly in ``__init__``, truncated/padded to
    ``max_length`` tokens, and kept in memory as a 1-D tensor.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length", return_tensors="pt")`` and
            returning a mapping with an ``"input_ids"`` entry of shape
            ``(1, max_length)``. An ``"attention_mask"`` entry is used when
            the tokenizer provides one.
        max_length: Number of tokens every example is padded/truncated to.

    Attributes:
        inputs: List of 1-D ``input_ids`` tensors, one per text.
        attention_masks: List of matching 1-D masks (``None`` for examples
            whose encoding carried no attention mask).
    """

    # Label value skipped by torch's cross-entropy loss (its default
    # ``ignore_index``), used here to exclude padding from the LM loss.
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.attention_masks = []

        for text in texts:
            encoding = tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt",
            )
            # squeeze(0) removes only the batch axis. A bare squeeze() would
            # also collapse the sequence axis when max_length == 1 and hand
            # back a 0-d tensor.
            self.inputs.append(encoding["input_ids"].squeeze(0))

            # Keep the mask so padding can be told apart from real tokens
            # (the pad token may be the same id as a real EOS token).
            if "attention_mask" in encoding:
                mask = encoding["attention_mask"].squeeze(0)
            else:
                mask = None
            self.attention_masks.append(mask)

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        The dict always has ``"input_ids"`` and ``"labels"``. When the
        tokenizer supplied an attention mask it also has
        ``"attention_mask"``, and label positions where the mask is 0 are
        set to ``IGNORE_INDEX`` so padding does not contribute to the loss.
        """
        input_ids = self.inputs[idx]
        # Clone so that editing labels never mutates the stored input ids.
        labels = input_ids.clone()
        item = {"input_ids": input_ids, "labels": labels}

        mask = self.attention_masks[idx]
        if mask is not None:
            labels[mask == 0] = self.IGNORE_INDEX
            item["attention_mask"] = mask

        return item

def fine_tune_gpt2():
    """Fine-tune the pretrained ``gpt2`` checkpoint on a small built-in corpus.

    Loads the ``gpt2`` tokenizer and model, wraps ten hard-coded AI-themed
    paragraphs in a ``CustomTextDataset``, and trains with the Hugging Face
    ``Trainer`` for three epochs. ``./gpt2-finetuned`` is used as the Trainer
    output directory and ``./logs`` as the logging directory.

    Returns:
        tuple: ``(model, tokenizer)`` - the model object that was trained and
        the tokenizer used to encode the corpus.
    """
    # Local import: only needed for the Trainer keyword compatibility check.
    import inspect

    custom_texts = [
        "Artificial intelligence is transforming the world in unprecedented ways. Machine learning algorithms can now process vast amounts of data.",
        "Natural language processing has made significant advances in recent years. Text generation models can create coherent and contextually relevant content.",
        "Deep learning neural networks are capable of learning complex patterns from data. These models excel at tasks like image recognition and language understanding.",
        "The future of AI holds immense potential for solving complex problems. From healthcare to climate change, AI applications are expanding rapidly.",
        "Computer vision systems can now identify objects with human-level accuracy. This technology powers autonomous vehicles and medical imaging systems.",
        "Reinforcement learning enables agents to learn optimal strategies through trial and error. This approach has achieved superhuman performance in games.",
        "Transformer architectures have revolutionized natural language processing. Models like GPT and BERT have set new benchmarks in language tasks.",
        "Data science combines statistics, programming, and domain expertise. It enables organizations to extract insights from large datasets.",
        "Cloud computing provides scalable infrastructure for AI applications. This democratizes access to powerful computational resources.",
        "Ethical AI development is crucial for responsible technology deployment. Bias mitigation and fairness are key considerations in model design.",
    ]

    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Padding to a fixed length needs a pad token; reuse EOS when the
    # tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = CustomTextDataset(custom_texts, tokenizer)

    # mlm=False selects causal (next-token) language modelling rather than
    # masked-token prediction.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    training_args = TrainingArguments(
        output_dir="./gpt2-finetuned",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=10,
        logging_steps=10,
        save_steps=50,
        eval_strategy="no",
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_dir="./logs",
        dataloader_drop_last=False,
        # The dataset yields ready-made tensors, so keep every key it returns.
        remove_unused_columns=False,
    )

    # Newer transformers releases deprecate Trainer's ``tokenizer`` keyword in
    # favour of ``processing_class``. Use the new name when this Trainer
    # accepts it, and fall back to the old one on releases that predate it.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        tokenizer_kwarg = {"processing_class": tokenizer}
    else:
        tokenizer_kwarg = {"tokenizer": tokenizer}

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        **tokenizer_kwarg,
    )

    print("Starting fine-tuning...")
    trainer.train()

    print("Fine-tuning complete.")

    return model, tokenizer

def generate_text_from_prompt(model, tokenizer, prompt, *, max_length=100, temperature=0.7):
    """Sample one continuation of ``prompt`` from ``model``, print and return it.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt
            and decode the generated ids.
        prompt: Text to continue. If empty, generation is seeded with the
            tokenizer's BOS token (or EOS token when no BOS is defined).
        max_length: Passed to ``generate()`` as ``max_length``, which bounds
            the total sequence length (prompt tokens included).
        temperature: Sampling temperature passed to ``generate()``.

    Returns:
        str: The decoded sequence with special tokens stripped.

    Raises:
        ValueError: If ``prompt`` is empty and the tokenizer defines neither
            a BOS nor an EOS token to seed generation with.
    """
    print("Generating sample text...")
    model.eval()

    if not prompt:
        # An empty string tokenizes to zero tokens, which leaves generate()
        # nothing to condition on; seed with a special token instead. It is
        # stripped again by skip_special_tokens when decoding.
        prompt = tokenizer.bos_token or tokenizer.eos_token
        if not prompt:
            raise ValueError(
                "prompt is empty and the tokenizer defines no BOS/EOS token "
                "to seed generation"
            )

    inputs = tokenizer(prompt, return_tensors="pt")
    # Move the inputs to whichever device the model's weights live on.
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nGenerated text:\n{generated_text}")

    return generated_text


# Script entry point: fine-tune the model, then generate text from a single
# prompt entered interactively.
if __name__ == "__main__":
    # Runs the full training loop before anything is prompted for.
    model, tokenizer = fine_tune_gpt2()

    # Blocks until a line is entered on stdin.
    user_prompt = input("Enter your prompt for text generation: ")
    generate_text_from_prompt(model, tokenizer, user_prompt)

  trainer = Trainer(


Starting fine-tuning...


Step,Training Loss
10,3.884


Fine-tuning complete.
Enter your prompt for text generation: medical 
Generating sample text...

Generated text:
medical ills have become more common in America. Research shows that people with these conditions are more likely than non-Hispanic whites to smoke and to consume alcohol.

According to the US Centers for Disease Control and Prevention, nearly 1 in 5 deaths from cardiovascular disease are from smoking. Since the 1990s, there has been a rise in the number of people who smoke. This is partly caused by increased use of tobacco products.

In the United States, more than half of all deaths that
