In [None]:
pip install datasets pandas torch python-dotenv peft

In [None]:
# --------------------------------------
# 1. Load environment variables and read the Hugging Face token
# --------------------------------------
import os

from dotenv import load_dotenv

# Populate the process environment via python-dotenv before reading from it.
load_dotenv()

# Stays None when HF_TOKEN is not defined. No login call is made in this cell;
# the token is only read into a variable.
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# --------------------------------------
# 2. Import the model/tokenizer classes and build the tokenizer
# --------------------------------------
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint shared by the tokenizer here and the model loaded later.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad on the right-hand side, reusing the end-of-sequence token as the pad token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# --------------------------------------
# 3. Load and split the dataset
# --------------------------------------
# Only the first 1000 rows of the train split are used.
raw_data = load_dataset("hkustisom5240/dataset_for_movie_review", split="train[:1000]")

# 95% train / 5% test. The split shuffles, so a fixed seed keeps the partition
# identical across kernel restarts and makes results comparable between runs.
data = raw_data.train_test_split(train_size=0.95, seed=42)

In [None]:
# --------------------------------------
# 4. Preprocess: Tokenize the 'overview' field for Causal LM
# --------------------------------------
def preprocess_batch(batch):
    """Tokenize one batch of movie overviews.

    Args:
        batch: Mapping with an "overview" entry holding the texts of the batch
            (the form `map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding and no "labels" here on purpose: the language-modeling data
    # collator used for training pads each batch to its longest example and
    # derives the labels from input_ids (ignoring pad positions), so padding
    # every example to 512 tokens up front only wastes memory and compute.
    return tokenizer(batch["overview"], truncation=True, max_length=512)
# Run the tokenization over every split, four examples at a time, and drop the
# original columns so only the tokenizer outputs are kept.
tokenized_data = data.map(
    function=preprocess_batch,
    remove_columns=data["train"].column_names,
    batched=True,
    batch_size=4,
)

In [None]:
# --------------------------------------
# 5. Set up data collator (handles dynamic padding during training)
# --------------------------------------
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# --------------------------------------
# 6. Apply LoRA to reduce training memory and speed up fine-tuning
# --------------------------------------
from peft import LoraConfig, TaskType, get_peft_model

# Adapter hyperparameters for a causal language model.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Load the pretrained checkpoint and wrap it with the LoRA adapters in one step.
model = get_peft_model(AutoModelForCausalLM.from_pretrained(model_name), lora_config)

# Report how many parameters are trainable, then switch to training mode.
model.print_trainable_parameters()
model.train()

In [None]:
# --------------------------------------
# 7. Define optimizer
# --------------------------------------
from torch.optim import AdamW

# This optimizer is handed to the Trainer further down, so the learning rate
# set here (1e-4) is the one the training run uses.
optimizer = AdamW(params=model.parameters(), lr=1e-4)

In [None]:
# --------------------------------------
# 8. Define training arguments for Hugging Face Trainer
# --------------------------------------
from transformers import TrainingArguments

# NOTE: the Trainer in this notebook is given a pre-built AdamW optimizer
# (lr=1e-4). When an optimizer is supplied, the Trainer uses it as-is and does
# not build one from `learning_rate` / `weight_decay` below. `learning_rate` is
# therefore kept equal to the optimizer's value so the recorded configuration
# matches what actually runs (it previously said 1e-5, which was never applied).
training_args = TrainingArguments(
    output_dir="./output",                  # Where to save checkpoints
    save_steps=500,                         # Save model every 500 steps
    learning_rate=1e-4,                     # Keep in sync with the AdamW optimizer's lr
    weight_decay=0.04,                      # Only applied if the Trainer builds its own optimizer
    num_train_epochs=3,                     # Number of training epochs
    per_device_train_batch_size=2,          # Batch size per GPU (or CPU)
    per_device_eval_batch_size=2,           # Eval batch size
    logging_steps=50,                       # Log every 50 steps
    logging_dir="./logs",                   # Directory for logs
    resume_from_checkpoint=False,           # Don't resume unless checkpoint exists
    report_to="none"                        # Disable wandb/tensorboard for simplicity
)

In [None]:
# --------------------------------------
# 9. Initialize the Trainer and start training
# --------------------------------------
from transformers import Trainer

# The optimizer is supplied explicitly; the scheduler slot is left as None so
# the Trainer creates its own.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` - confirm against the installed version.
# NOTE(review): training_args sets no evaluation strategy, so eval_dataset is
# presumably not used during train() - confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

trainer.train()

In [None]:
# --------------------------------------
# 10. Save the fine-tuned model and tokenizer locally
# --------------------------------------
# Both artifacts are written to the same directory so they can be reloaded together.
model.save_pretrained(save_directory="gpt2-review-lora")
tokenizer.save_pretrained(save_directory="gpt2-review-lora")

In [None]:
# --------------------------------------
# 11. Inference: Generate text from a prompt
# --------------------------------------
# Import both classes used below so the cell does not rely on an earlier import.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the model and tokenizer from the directory written by the save step.
model = AutoModelForCausalLM.from_pretrained("gpt2-review-lora")
tokenizer = AutoTokenizer.from_pretrained("gpt2-review-lora")

text = "Avatar revenue."
inputs = tokenizer(text, return_tensors="pt")

# Pass the full tokenizer output (input_ids AND attention_mask) and an explicit
# pad_token_id: the pad token is the EOS token here, so generate() cannot infer
# the mask on its own and would otherwise warn and guess.
outputs = model.generate(
    **inputs,
    max_length=100,  # total length in tokens, prompt included
    pad_token_id=tokenizer.eos_token_id,
)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated text one sentence per line.
for line in decoded_output.split('. '):
    print(line.strip())