In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)

def load_and_preprocess_dataset(sample_percent=1):
    """Fetch a leading slice of the IMDb training split with newlines flattened.

    Args:
        sample_percent: Percentage of the ``train`` split to load, taken from
            the start of the split (``train[:N%]``).

    Returns:
        The loaded dataset after every ``text`` entry has had its newline
        characters replaced by single spaces.
    """
    def _flatten_newlines(examples):
        # Batched map callback: rewrite the whole ``text`` column in one go.
        examples['text'] = [
            " ".join(review.split('\n')) for review in examples['text']
        ]
        return examples

    print("Loading IMDb dataset...")
    split_spec = f"train[:{sample_percent}%]"
    reviews = load_dataset("imdb", split=split_spec)

    print("Preprocessing dataset (removing newlines)...")
    return reviews.map(_flatten_newlines, batched=True)

def tokenize_dataset(dataset, tokenizer, max_length=128):
    """Tokenize the ``text`` column and attach causal-LM labels.

    Every example is truncated/padded to exactly ``max_length`` tokens. The
    labels mirror ``input_ids`` except at padded positions, which are set to
    -100 so the language-modelling loss skips them.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches
            contain a ``text`` list.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and, normally, ``attention_mask``.
        max_length: Fixed sequence length after truncation and padding.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing callback.
    """
    print("Tokenizing dataset...")

    # Label value ignored by the cross-entropy loss used for LM training.
    ignore_index = -100

    def tokenize_function(examples):
        tokenized = tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        input_ids = tokenized["input_ids"]

        if "attention_mask" in tokenized:
            # Mask by attention_mask rather than by token id: the pad token
            # may be the same id as EOS, and genuine EOS tokens must keep
            # contributing to the loss.
            tokenized["labels"] = [
                [tok if keep else ignore_index for tok, keep in zip(ids, mask)]
                for ids, mask in zip(input_ids, tokenized["attention_mask"])
            ]
        else:
            # No mask available: fall back to a plain copy of the inputs.
            tokenized["labels"] = [list(ids) for ids in input_ids]
        return tokenized

    return dataset.map(tokenize_function, batched=True)

def prepare_model(model_name="distilgpt2"):
    """Load a pretrained causal language model together with its tokenizer.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's padding token is set
        to its end-of-sequence token.
    """
    print(f"Loading model and tokenizer: {model_name}")

    tok = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so padded batches can be built.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(model_name)
    return lm, tok

def split_dataset(dataset, train_ratio=0.8):
    """Shuffle ``dataset`` once and split it into train and eval subsets.

    Args:
        dataset: Object supporting ``len()``, ``shuffle(seed=...)`` and
            ``select(indices)``.
        train_ratio: Fraction of rows assigned to the training subset; must
            lie in the closed interval [0, 1].

    Returns:
        A ``(train_data, eval_data)`` tuple. The first
        ``int(train_ratio * len(dataset))`` shuffled rows form the training
        subset and the remaining rows form the evaluation subset.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )

    total = len(dataset)
    split_index = int(train_ratio * total)

    # Shuffle a single time with a fixed seed: both subsets are cut from the
    # same permutation, so they cannot overlap and the split is reproducible.
    shuffled = dataset.shuffle(seed=42)
    train_data = shuffled.select(range(split_index))
    eval_data = shuffled.select(range(split_index, total))
    return train_data, eval_data

def train_model(model, tokenizer, train_data, eval_data):
    """Fine-tune ``model`` for one epoch with ``Trainer`` and save the result.

    Trainer output goes to ``./results`` and logs to ``./logs``; evaluation
    runs once per epoch. After training, the model and tokenizer are both
    written to ``./fine_tuned_model``.

    Args:
        model: Model handed to ``Trainer`` and saved afterwards.
        tokenizer: Tokenizer saved alongside the model.
        train_data: Dataset passed as ``train_dataset``.
        eval_data: Dataset passed as ``eval_dataset``.
    """
    save_dir = "./fine_tuned_model"

    print("Setting training arguments...")
    training_config = {
        "output_dir": "./results",
        "eval_strategy": "epoch",
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "num_train_epochs": 1,
        "logging_dir": "./logs",
        "logging_steps": 10,
        "save_total_limit": 1,
    }
    args = TrainingArguments(**training_config)

    print("Starting training...")
    Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=eval_data,
    ).train()

    print("Saving fine-tuned model and tokenizer...")
    for artifact in (model, tokenizer):
        # Model first, then tokenizer, into the same directory.
        artifact.save_pretrained(save_dir)

def generate_text(model, tokenizer, prompt="acting", max_length=15):
    """Generate a continuation of ``prompt`` with ``model``.

    Args:
        model: Model exposing ``device`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Text to continue. An empty prompt is replaced by the
            tokenizer's BOS (or, failing that, EOS) token.
        max_length: Upper bound passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    print(f"Generating text for prompt: '{prompt}'")

    if not prompt:
        # An empty string encodes to zero tokens, leaving generate() with
        # nothing to condition on; start from a sentence-boundary token.
        prompt = tokenizer.bos_token or tokenizer.eos_token

    # The model may have been moved to an accelerator during training, so the
    # encoded prompt has to live on the same device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask and pad id explicitly: generate() cannot reliably
        # infer them when the pad token shares an id with EOS.
        attention_mask=inputs.get("attention_mask"),
        pad_token_id=tokenizer.pad_token_id,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    """Run the full pipeline: load data, fine-tune, then generate from a prompt.

    Any exception raised along the way is caught and reported as a single
    printed message.
    """
    try:
        reviews = load_and_preprocess_dataset()
        lm, tok = prepare_model()

        train_split, eval_split = split_dataset(tokenize_dataset(reviews, tok))
        train_model(lm, tok, train_split, eval_split)

        # The prompt is requested only after training has finished.
        user_prompt = input("Enter a prompt for text generation: ")
        print(f"Generated text: {generate_text(lm, tok, prompt=user_prompt)}")
    except Exception as err:
        print(f"An error occurred: {err}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()