In [1]:
!pip install datasets seacrowd bitsandbytes
from transformers import AutoTokenizer, Phi3ForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import bitsandbytes



In [2]:
!pip install -U bitsandbytes
# Load pre-trained model and tokenizer
checkpoint = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = Phi3ForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for memory efficiency
    trust_remote_code=True,
    device_map="auto",  # Automatically map the model to available GPUs
    load_in_8bit=True,
)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
!pip install -U peft bitsandbytes
from peft import LoraConfig, get_peft_model, TaskType
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj"],  # Adjust based on your model architecture
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    # bnb_8bit=True  # Enable bitsandbytes 8-bit quantization
)
# Apply LoRA to the model
model = get_peft_model(model, lora_config)



In [4]:
# 1. Memuat Dataset
dataset = load_dataset("SEACrowd/liputan6")
print("Dataset loaded:", dataset)
train_dataset = dataset["train"].shuffle(seed=42).select(range(100))
print("Train dataset:", train_dataset)
val_dataset = dataset["validation"].shuffle(seed=42).select(range(100))
print("Validation dataset:", val_dataset)
test_dataset = dataset["test"].shuffle(seed=42).select(range(100))
print("Test dataset:", test_dataset)

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 193883
    })
    test: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 10972
    })
    validation: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 10972
    })
})
Train dataset: Dataset({
    features: ['document', 'id', 'summary'],
    num_rows: 100
})
Validation dataset: Dataset({
    features: ['document', 'id', 'summary'],
    num_rows: 100
})
Test dataset: Dataset({
    features: ['document', 'id', 'summary'],
    num_rows: 100
})


In [5]:
from datasets import load_dataset
!pip install datasets
# Preprocess the dataset for summarization tasks
def preprocess_function(examples):
    inputs = examples["document"]
    targets = examples["summary"]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    # Tokenize the labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding="max_length",
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
train_tokenized_datasets = train_dataset.map(preprocess_function, batched=True)
val_tokenized_datasets = val_dataset.map(preprocess_function, batched=True)




In [6]:
!pip install trl
from peft import (
        get_peft_model,
        prepare_model_for_kbit_training,
        LoraConfig
    )
from trl import SFTTrainer
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)





In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./phi3-liputan6-summarizer-lora",  # Directory for saving model checkpoints
    evaluation_strategy="epoch",
    learning_rate=5e-4,  # Higher learning rate for LoRA fine-tuning
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # Limit number of saved checkpoints
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,  # Enable mixed precision for faster training
    push_to_hub=False,
)

def formatting_func(prompt):
  output = []

  for d, s in zip(prompt["src"], prompt["tgt"]):
    op = generate_prompt(d, s)
    output.append(op)

  return output

# Define the Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_tokenized_datasets,
    eval_dataset=val_tokenized_datasets,
    peft_config=lora_config,
    formatting_func=formatting_func,
    # max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args
)

# Train the model with LoRA
trainer.train()





  trainer = SFTTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnadhiefathallahi[0m ([33mnadhiefathallahi-universitas-pendidikan-indonesia[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,2.106455
2,No log,2.100772
3,No log,2.113444




TrainOutput(global_step=75, training_loss=2.0712235514322916, metrics={'train_runtime': 571.4555, 'train_samples_per_second': 0.525, 'train_steps_per_second': 0.131, 'total_flos': 3435077409177600.0, 'train_loss': 2.0712235514322916, 'epoch': 3.0})

In [11]:
import os

model_path = "/content/phi3-liputan6-merged"
if os.path.exists(model_path):
    print("Model path exists.")
else:
    print("Model path does not exist.")

Model path exists.


In [1]:
# Save the LoRA fine-tuned model
trainer.save_model("./phi3-liputan6-lora-finetuned")


from peft import PeftModel

# peft_model_id = "/content/checkpoint-75"
# peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="/content/phi3-liputan6-summarizer-lora")
# Load the fine-tuned model for inference
fine_tuned_model = Phi3ForCausalLM.from_pretrained("/content/phi3-liputan6-merged",  local_files_only=True)

# Perform inference with the fine-tuned model
new_article = dataset["test"][1]["document"]
inputs = tokenizer(f"Summarize the following article: {new_article}", return_tensors="pt")
generate_ids = fine_tuned_model.generate(
    inputs.input_ids, max_new_tokens=200, num_beams=5, early_stopping=True
)

# Decode the generated summary
generated_summary = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print("New Article:\n", new_article)
print("\nGenerated Summary:\n", generated_summary)

NameError: name 'trainer' is not defined

In [None]:
from peft import PeftModel

# Load base model
base_model = Phi3ForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct", torch_dtype=torch.float16)

# Load LoRA adapter
lora_model = PeftModel.from_pretrained(base_model, "./phi3-liputan6-summarizer-lora/checkpoint-75")

# Merge LoRA weights into the base model
merged_model = lora_model.merge_and_unload()

# Save the full model (merged)
merged_model.save_pretrained("./phi3-liputan6-merged")
tokenizer.save_pretrained("./phi3-liputan6-merged")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Evaluate BLEU score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt-tab')

def calculate_bleu_score(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference.lower())
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    smoothing = SmoothingFunction().method4
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

reference_summary = dataset["test"][1]["summary"]
bleu_score = calculate_bleu_score(reference_summary, generated_summary)
print("\nBLEU Score:", bleu_score)