In [2]:
!pip install transformers accelerate bitsandbytes peft torch trl sentencepiece datasets protobuf



In [3]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

# ✅ 1. Model ID (switched from PhoGPT to Vistral)
model_id = "Viet-Mistral/Vistral-7B-Chat"

# ✅ 2. Configure 8-bit (LLM.int8) quantization.
# BitsAndBytesConfig has no `bnb_8bit_*` options: double quantization, quant type and
# compute dtype only exist as `bnb_4bit_*` settings for 4-bit loading. Unknown keyword
# arguments are ignored, so the former `bnb_8bit_*` values never had any effect and
# have been dropped to keep the configuration honest.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# ✅ 3. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# ✅ 4. k-bit training preparation and LoRA are applied in the next cell.
# prepare_model_for_kbit_training() is deliberately NOT called here: calling it with
# its defaults turns gradient checkpointing on, which the later call with
# use_gradient_checkpointing=False does not undo, and preparing the model twice is
# redundant.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Ready the quantized model for adapter training; this call itself does not
# switch gradient checkpointing on.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=False,
)

# LoRA hyper-parameters; adapters are attached to the q/k/v/o projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [5]:
# Report the number of trainable parameters against the total parameter count
# (sanity check that only a small fraction of the model is being trained).
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,307,538,432 || trainable%: 0.1865


In [6]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

def prepare_dataset(data_path, tokenizer=None, model_id="Viet-Mistral/Vistral-7B-Chat"):
    """Build a chat-formatted text dataset from a JSON file of prompt/response records.

    The file must contain a JSON list. Every dict item that has both a
    ``"prompt"`` and a ``"response"`` key is rendered through the tokenizer's
    chat template (user turn followed by assistant turn) and stored under the
    ``"text"`` key. Dict items carrying a ``"qa_pairs"`` key are skipped
    silently; other unusable items are skipped with a printed message.

    Args:
        data_path: Path to the UTF-8 encoded JSON file.
        tokenizer: Optional, already loaded tokenizer exposing
            ``apply_chat_template``. Passing one avoids loading the tokenizer
            again on every call. When ``None`` it is loaded from ``model_id``.
        model_id: Model identifier used to load the tokenizer when
            ``tokenizer`` is not supplied.

    Returns:
        A ``datasets.Dataset`` with a single ``"text"`` column, or ``None`` when
        the file is not valid JSON, is not a list, or yields no usable record.
    """
    # Reuse the caller's tokenizer when given; otherwise load it by model ID.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Load your JSON data
    with open(data_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON file: {e}")
            print("Please check if the JSON file is correctly formatted.")
            return None

    # Ensure the loaded data is a list
    if not isinstance(data, list):
        print("JSON data is not a list of objects. Please check the file format.")
        return None

    # Format the data according to the chat template
    formatted_data = []
    for item in data:
        if not isinstance(item, dict):
            print(f"Skipping item as it is not a dictionary: {item}")
            continue

        # Items that have qa_pairs are skipped without a message.
        if "qa_pairs" in item:
            continue

        if "prompt" not in item or "response" not in item:
            print(f"Skipping item due to missing 'prompt' or 'response' key: {item}")
            continue

        messages = [
            {"role": "user", "content": item["prompt"]},
            {"role": "assistant", "content": item["response"]}
        ]
        # add_generation_prompt=False: the assistant turn is already part of the
        # conversation, so no trailing generation prompt is requested.
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        formatted_data.append({"text": text})

    # Only build a Dataset when at least one record survived the filtering.
    if not formatted_data:
        print("No valid data found to create a dataset.")
        return None
    return Dataset.from_list(formatted_data)

# Build the chat-formatted dataset from the combined Q&A JSON file.
QA_JSON_PATH = "/content/combined_qa_dataset.json"
DATASET_DIR = "/content/sample_instruction_following_dataset"
dataset = prepare_dataset(QA_JSON_PATH)

# prepare_dataset() returns None on failure; in that case nothing is written to disk.
if dataset is not None:
    dataset.save_to_disk(DATASET_DIR)

Saving the dataset (0/1 shards):   0%|          | 0/1418 [00:00<?, ? examples/s]

In [7]:
from datasets import load_from_disk

# Load the dataset back from the directory it was saved to, rebinding `dataset`
# to the on-disk copy.
# NOTE(review): if dataset preparation failed, nothing was saved in this run, so this
# either raises or picks up a copy left by an earlier run — confirm before training.
dataset = load_from_disk("/content/sample_instruction_following_dataset")

In [8]:
# Show the first record to verify the chat template was applied to the text.
print(dataset[0])

{'text': '<s>[INST] Công tác tư vấn học tập và hướng nghiệp của Trường Đại học Quốc tế bao gồm những gì? [/INST] Công tác tư vấn học tập và hướng nghiệp bao gồm tư vấn, hỗ trợ sinh viên trong việc xây dựng kế hoạch học tập, phương pháp học tập và cung cấp thông tin về chương trình đào tạo. </s>'}


In [12]:
!pip install trl==0.7.1

Collecting trl==0.7.1
  Downloading trl-0.7.1-py3-none-any.whl.metadata (9.8 kB)
Downloading trl-0.7.1-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.4.7
    Uninstalling trl-0.4.7:
      Successfully uninstalled trl-0.4.7
Successfully installed trl-0.7.1
