In [1]:
!pip install -U transformers trl peft datasets evaluate rouge_score underthesea bitsandbytes thefuzz bert_score python-dotenv
# Tải tài nguyên NLTK cho METEOR
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"
import json
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import string
from underthesea import word_tokenize
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from dotenv import load_dotenv
import os
import torch
from peft import LoraConfig, get_peft_model

# Tải biến môi trường từ file .env
load_dotenv()

# Đọc access token từ biến môi trường
hf_token = os.getenv("HF_VISTRAL")
if not hf_token:
    raise ValueError("Không tìm thấy HF_VISTRAL trong file .env. Vui lòng thêm token vào file .env với định dạng: HF_VISTRAL=your_token")

def load_model_and_tokenizer(quantization="int8"):
    model_name = "Viet-Mistral/Vistral-7B-Chat"

    # Cấu hình quantization với bitsandbytes
    if quantization == "int8":
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
            bnb_8bit_use_double_quant=True,
        )
    elif quantization == "int4":
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    else:
        quantization_config = None

    # Tải tokenizer và mô hình với token xác thực
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token  # Sử dụng token từ biến môi trường
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=hf_token  # Sử dụng token từ biến môi trường
    )

    # Kiểm tra cấu trúc mô hình
    print("Cấu trúc mô hình Vistral-7B-Chat:")
    print(model)

    # Cấu hình LoRA với target_modules phù hợp
    peft_config = LoraConfig(
        r=32,
        lora_alpha=32,
        lora_dropout=0.5,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]  # Dựa trên kiến trúc Mistral
    )
    model = get_peft_model(model, peft_config)
    return model, tokenizer, peft_config

# Tải mô hình với INT8 quantization
model, tokenizer, peft_config = load_model_and_tokenizer(quantization="int8")

Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.53s/it]


Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4

In [5]:
def formatting_func(example):
    if not all(k in example for k in ['input', 'output']):
        print('Thiếu key trong example:', example)
        return ''
    messages = [
        {"role": "user", "content": example['input']},
        {"role": "assistant", "content": example['output']}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [6]:
import os
import json
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import string
from underthesea import word_tokenize
from thefuzz import fuzz

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    text = " ".join(tokens)
    return text

def extract_json_from_folder(folder_path):
    dataset = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    json_data = json.load(f)
                    for item in json_data:
                        if all(k in item for k in ['input', 'output']):
                            item['input'] = preprocess_text(item['input'])
                            item['output'] = preprocess_text(item['output'])
                            dataset.append(item)
                        else:
                            print(f"Thiếu trường trong {filename}: {item}")
            except json.JSONDecodeError:
                print(f"Không thể parse JSON từ {filename}")
            except Exception as e:
                print(f"Lỗi khi đọc {filename}: {e}")
    return dataset

folder_path = "/root/model/data"
dataset = extract_json_from_folder(folder_path)

df = pd.DataFrame(dataset)
print("Số lượng giá trị duy nhất trong 'input' (exact):", df['input'].nunique())
print("Số lượng giá trị duy nhất trong 'output' (exact):", df['output'].nunique())
print("Tổng số hàng:", len(df))

# Fuzzy matching để tìm các record tương tự
similarity_threshold = 90  # Ngưỡng độ tương đồng (90%)
input_pairs = []
output_pairs = []

# Tìm các cặp input và output tương tự
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        input_sim = fuzz.ratio(df['input'].iloc[i], df['input'].iloc[j])
        if input_sim >= similarity_threshold:
            input_pairs.append((i, j, input_sim))
        output_sim = fuzz.ratio(df['output'].iloc[i], df['output'].iloc[j])
        if output_sim >= similarity_threshold:
            output_pairs.append((i, j, output_sim))

print(f"Số cặp input tương tự (>{similarity_threshold}%):", len(input_pairs))
print(f"Số cặp output tương tự (>{similarity_threshold}%):", len(output_pairs))

# Loại bỏ các record có input hoặc output tương tự (giữ record đầu tiên)
indices_to_keep = set(range(len(df)))
for i, j, _ in input_pairs:
    if j in indices_to_keep:
        indices_to_keep.remove(j)
for i, j, _ in output_pairs:
    if j in indices_to_keep:
        indices_to_keep.remove(j)

df = df.iloc[list(indices_to_keep)].reset_index(drop=True)
print("Số hàng sau khi xóa record tương tự:", len(df))

# Kiểm tra lại độ unique
print("Số lượng giá trị duy nhất trong 'input' (sau xử lý):", df['input'].nunique())
print("Số lượng giá trị duy nhất trong 'output' (sau xử lý):", df['output'].nunique())

# Chia train/validation
full_dataset = Dataset.from_pandas(df[['input', 'output']])
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df[['input', 'output']])
eval_dataset = Dataset.from_pandas(eval_df[['input', 'output']])
print(train_df[['input', 'output']].head())

Số lượng giá trị duy nhất trong 'input' (exact): 1599
Số lượng giá trị duy nhất trong 'output' (exact): 1659
Tổng số hàng: 1700
Số cặp input tương tự (>90%): 320
Số cặp output tương tự (>90%): 41
Số hàng sau khi xóa record tương tự: 1472
Số lượng giá trị duy nhất trong 'input' (sau xử lý): 1472
Số lượng giá trị duy nhất trong 'output' (sau xử lý): 1472
                                                  input  \
998   tự kỷ có phải do dùng thiết bị điện tử nhiều k...   
254                     trẻ nói nhại lại lời người khác   
1074              tự kỷ có nên dùng thuốc an thần không   
643    trẻ không biết tự khen ngợi bản thân khi làm tốt   
1451                    trẻ 2 tuổi chỉ hét mà không nói   

                                                 output  
998   hiện chưa có bằng chứng khoa học cho thấy thiế...  
254   việc trẻ lặp lại lời nói người khác thường xuy...  
1074  không dùng thuốc an thần bừa bãi cho trẻ tự kỷ...  
643   không ghi nhận thành công khiến trẻ thiếu tự t...  


In [7]:
import evaluate
import numpy as np
from sentence_transformers import SentenceTransformer, util
def evaluate_metrics(predictions, references):
    # Tải các độ đo
    rouge = evaluate.load("rouge")
    meteor = evaluate.load("meteor")

    # Tính các độ đo
    rouge_results = rouge.compute(predictions=predictions, references=references)
    meteor_results = meteor.compute(predictions=predictions, references=references)

    # Tải mô hình nhúng câu để tính Cosine Similarity
    embedder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # Tính embeddings cho dự đoán và tham chiếu
    pred_embeddings = embedder.encode(predictions, convert_to_tensor=True)
    ref_embeddings = embedder.encode(references, convert_to_tensor=True)

    # Tính Cosine Similarity giữa từng cặp dự đoán-tham chiếu
    cosine_scores = util.cos_sim(pred_embeddings, ref_embeddings)
    # Lấy trung bình Cosine Similarity (chỉ lấy đường chéo chính, vì mỗi dự đoán chỉ so với tham chiếu tương ứng)
    avg_cosine_similarity = np.mean([cosine_scores[i][i].item() for i in range(len(predictions))])
    
    # Gộp kết quả
    metrics = {
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "meteor": meteor_results["meteor"],
        "cosine_similarity": avg_cosine_similarity,
    }
    return metrics

def generate_predictions(model, tokenizer, inputs, max_length=200):
    """Tạo dự đoán từ mô hình cho các đầu vào."""
    model.eval()
    predictions = []
    # Đảm bảo pad_token được thiết lập
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    for input_text in inputs:
        # Tiền xử lý input để đồng bộ với huấn luyện
        input_text = preprocess_text(input_text)
        prompt = f"<s>[INST] {input_text} [/INST]"
        inputs_encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs_encoded,
                max_length=max_length,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id
            )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = generated_text.split("[/INST]")[-1].strip()
        # Tiền xử lý dự đoán để đồng bộ với tham chiếu
        pred = preprocess_text(pred)
        predictions.append(pred)
    return predictions

In [8]:
import json
from transformers import TrainingArguments
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

# Loại bỏ cột không cần thiết để tránh cảnh báo
train_dataset = train_dataset.remove_columns(['__index_level_0__'] if '__index_level_0__' in train_dataset.column_names else [])
eval_dataset = eval_dataset.remove_columns(['__index_level_0__'] if '__index_level_0__' in eval_dataset.column_names else [])

model_name = "Viet-Mistral/Vistral-7B-Chat" 
# Cấu hình huấn luyện
training_arguments_single = TrainingArguments(
    output_dir=f"./results_single_{model_name.split('/')[-1]}",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.1,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    eval_steps=10,
    logging_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Huấn luyện Single Run
trainer_single = SFTTrainer(
    model=model,
    args=training_arguments_single,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func, 
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01,
        )
    ],
)

print(f"Train dataset size: {len(train_dataset)}, Eval dataset size: {len(eval_dataset)}")
trainer_single.train()

# Lưu mô hình Single Run
model.save_pretrained(f"./finetuned_{model_name.split('/')[-1]}_single")  
tokenizer.save_pretrained(f"./finetuned_{model_name.split('/')[-1]}_single")

# Đánh giá Single Run
test_inputs = eval_df['input'].tolist()
test_references = eval_df['output'].tolist()
predictions_single = generate_predictions(model, tokenizer, test_inputs)
metrics_single = evaluate_metrics(predictions_single, test_references)
print(f"Single Run Metrics for {model_name}:", metrics_single)

# Lưu metrics vào file
with open(f"single_run_metrics_{model_name.split('/')[-1]}.json", "w") as f:
    json.dump(metrics_single, f, indent=4)

# Kiểm tra mẫu dự đoán
for i in range(5):
    print(f"Input: {test_inputs[i]}")
    print(f"Prediction: {predictions_single[i]}")
    print(f"Reference: {test_references[i]}\n")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Applying formatting function to train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 6697.23 examples/s]
Converting train dataset to ChatML: 100%|██████████| 1177/1177 [00:00<00:00, 26164.11 examples/s]
Applying chat template to train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 27426.24 examples/s]
Tokenizing train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 3189.39 examples/s]
Truncating train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 289678.20 examples/s]
Applying formatting function to eval dataset: 100%|██████████| 295/295 [00:00<00:00, 6631.51 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 295/295 [00:00<00:00, 24957.53 examples/s]
Applying chat template to eval dataset: 100%|██████████| 295/295 [00:00<00:00, 23456.30 examples/s]
Tokenizing eval 

Train dataset size: 1177, Eval dataset size: 295




Step,Training Loss,Validation Loss
10,4.1591,4.255232
20,3.7726,3.174825
30,2.8679,2.766187
40,2.6435,2.442622
50,2.318,2.240651
60,2.1423,2.139199
70,2.0405,1.99802
80,1.7938,1.854019
90,1.7254,1.744097
100,1.6083,1.691299



Cannot access gated repo for url https://huggingface.co/Viet-Mistral/Vistral-7B-Chat/resolve/main/config.json.
Access to model Viet-Mistral/Vistral-7B-Chat is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in Viet-Mistral/Vistral-7B-Chat.

Cannot access gated repo for url https://huggingface.co/Viet-Mistral/Vistral-7B-Chat/resolve/main/config.json.
Access to model Viet-Mistral/Vistral-7B-Chat is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in Viet-Mistral/Vistral-7B-Chat.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nlt

Single Run Metrics for Viet-Mistral/Vistral-7B-Chat: {'rouge1': np.float64(0.6422481889840015), 'rouge2': np.float64(0.3082904041660043), 'rougeL': np.float64(0.41880351070431054), 'meteor': np.float64(0.35291971704127534), 'cosine_similarity': np.float64(0.7343392619642161)}
Input: trẻ không biết sử dụng hình ảnh minh họa kết quả hoạt động
Prediction: không có hình ảnh minh họa kết quả hoạt động – nên luyện vẽ sơ đồ hoặc chụp ảnh minh họa
Reference: không chụp hình hoặc chọn hình mờ nhạt – nên luyện chọn hình rõ người – rõ hành động – có cảm xúc

Input: trẻ không dùng tay để chỉ vật
Prediction: trẻ không dùng tay để chỉ vật có thể là dấu hiệu chậm phát triển kỹ năng giao tiếp – nên đánh giá khả năng giao tiếp và tương tác xã hội của trẻ
Reference: việc trẻ không dùng tay để chỉ khi muốn thể hiện nhu cầu là dấu hiệu quan trọng để nhận biết trẻ có khó khăn trong giao tiếp phi ngôn ngữ

Input: trẻ tự kỷ có học được cách tự kiểm tra sức khỏe không
Prediction: có trẻ tự kỷ có thể học cách 

In [None]:
# Cell 8: 5-Fold Cross-Validation
from sklearn.model_selection import KFold
import numpy as np
import gc

# Cấu hình KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []
fold_models = []

# Lặp qua từng fold
for fold, (train_idx, eval_idx) in enumerate(kf.split(df)):
    print(f"\nTraining Fold {fold + 1}...")

    # Tạo tập train và eval cho fold hiện tại
    train_fold = df.iloc[train_idx][['input', 'output']]
    eval_fold = df.iloc[eval_idx][['input', 'output']]
    train_fold_dataset = Dataset.from_pandas(train_fold)
    eval_fold_dataset = Dataset.from_pandas(eval_fold)

    # Loại bỏ cột không cần thiết
    train_fold_dataset = train_fold_dataset.remove_columns(['__index_level_0__'] if '__index_level_0__' in train_fold_dataset.column_names else [])
    eval_fold_dataset = eval_fold_dataset.remove_columns(['__index_level_0__'] if '__index_level_0__' in eval_fold_dataset.column_names else [])

    # Tải lại mô hình gốc với INT4 quantization
    model, tokenizer, peft_config = load_model_and_tokenizer(quantization="int4")
    print(f"Fold {fold + 1} - Train size: {len(train_fold_dataset)}, Eval size: {len(eval_fold_dataset)}")
    # Cấu hình huấn luyện cho fold
    training_arguments_fold = TrainingArguments(
        output_dir=f"./results_{model_name.split('/')[-1]}fold_{fold + 1}",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        save_steps=100,
        logging_steps=10,
        learning_rate=5e-5,
        weight_decay=0.1,
        fp16=False,
        bf16=True,
        max_grad_norm=0.3,
        warmup_ratio=0.1,
        group_by_length=True,
        lr_scheduler_type="cosine",
        eval_strategy="steps",
        eval_steps=10,
        logging_strategy="steps",
        log_level="info",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    # Huấn luyện fold
    trainer_fold = SFTTrainer(
        model=model,
        args=training_arguments_fold,
        train_dataset=train_fold_dataset,
        eval_dataset=eval_fold_dataset,
        peft_config=peft_config,
        formatting_func=formatting_func,  # Truyền model_name vào formatting_func
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=3,
                early_stopping_threshold=0.01,
            )
        ],
    )
    trainer_fold.train()

    # Lưu mô hình fold
    fold_path = f"./finetuned_vistral_fold_{fold + 1}"
    model.save_pretrained(fold_path)
    tokenizer.save_pretrained(fold_path)
    fold_models.append(fold_path)

    # Đánh giá fold
    test_inputs_fold = eval_fold['input'].tolist()
    test_references_fold = eval_fold['output'].tolist()
    predictions_fold = generate_predictions(model, tokenizer, test_inputs_fold)
    metrics_fold = evaluate_metrics(predictions_fold, test_references_fold)
    print(f"Fold {fold + 1} Metrics:", metrics_fold)
    fold_metrics.append(metrics_fold)

    # Lưu metrics của fold
    with open(f"fold_{fold + 1}_metrics_vistral.json", "w") as f:
        json.dump(metrics_fold, f, indent=4)

    # Dọn dẹp bộ nhớ
    del model, trainer_fold
    torch.cuda.empty_cache()
    gc.collect()

# Tính trung bình metrics qua các fold
avg_metrics = {
    "rouge1": np.mean([m["rouge1"] for m in fold_metrics]),
    "rouge2": np.mean([m["rouge2"] for m in fold_metrics]),
    "rougeL": np.mean([m["rougeL"] for m in fold_metrics]),
    "meteor": np.mean([m["meteor"] for m in fold_metrics]),
}
print("\nAverage Cross-Validation Metrics:", avg_metrics)

# Lưu metrics trung bình
with open("cross_validation_metrics.json", "w") as f:
    json.dump(avg_metrics, f, indent=4)


Training Fold 1...


Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.36s/it]


Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 1 - Train size: 1177, Eval size: 295


Applying formatting function to train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 5618.28 examples/s]
Converting train dataset to ChatML: 100%|██████████| 1177/1177 [00:00<00:00, 22717.51 examples/s]
Applying chat template to train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 23729.32 examples/s]
Tokenizing train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 2692.06 examples/s]
Truncating train dataset: 100%|██████████| 1177/1177 [00:00<00:00, 251666.79 examples/s]
Applying formatting function to eval dataset: 100%|██████████| 295/295 [00:00<00:00, 6060.42 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 295/295 [00:00<00:00, 20265.32 examples/s]
Applying chat template to eval dataset: 100%|██████████| 295/295 [00:00<00:00, 18033.58 examples/s]
Tokenizing eval dataset: 100%|██████████| 295/295 [00:00<00:00, 2568.24 examples/s]
Truncating eval dataset: 100%|██████████| 295/295 [00:00<00:00, 85467.96 examples/s]
Using auto half precision backend
No label_nam

Step,Training Loss,Validation Loss
10,4.2058,4.299687
20,3.8319,3.214305
30,2.7945,2.609295
40,2.4809,2.405173
50,2.223,2.197246
60,2.0389,2.066096
70,2.008,1.866504
80,1.679,1.761346
90,1.6846,1.693895
100,1.557,1.665935


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 295
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 295
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2

Fold 1 Metrics: {'rouge1': np.float64(0.6274440560507248), 'rouge2': np.float64(0.29444332949528357), 'rougeL': np.float64(0.41291968347737357), 'meteor': np.float64(0.32268665250483003), 'cosine_similarity': np.float64(0.7093233066090082)}

Training Fold 2...


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
PyTorch: setting up devices


Fold 2 - Train size: 1177, Eval size: 295


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Step,Training Loss,Validation Loss
10,4.1646,4.262856
20,3.8481,3.110157
30,2.7647,2.576733
40,2.4933,2.373812
50,2.2987,2.190441
60,2.0389,2.040157
70,1.956,1.854415
80,1.7034,1.74119
90,1.6259,1.687064
100,1.5539,1.63868


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 295
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 295
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2

Fold 2 Metrics: {'rouge1': np.float64(0.6299936661003), 'rouge2': np.float64(0.3060467380879088), 'rougeL': np.float64(0.4230915911566669), 'meteor': np.float64(0.33021067434237245), 'cosine_similarity': np.float64(0.7289658469668889)}

Training Fold 3...


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
PyTorch: setting up devices


Fold 3 - Train size: 1178, Eval size: 294


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Step,Training Loss,Validation Loss
10,4.2815,4.165266
20,3.7996,3.082685
30,2.7568,2.605698
40,2.5067,2.355263
50,2.2204,2.20685
60,2.0124,2.060964
70,1.9866,1.888332
80,1.6999,1.791562
90,1.6775,1.711529
100,1.5789,1.6734


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2

Fold 3 Metrics: {'rouge1': np.float64(0.6226426400288338), 'rouge2': np.float64(0.3031341817244409), 'rougeL': np.float64(0.41897569378635746), 'meteor': np.float64(0.3275378994904807), 'cosine_similarity': np.float64(0.7421905587927825)}

Training Fold 4...


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
PyTorch: setting up devices


Fold 4 - Train size: 1178, Eval size: 294


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Step,Training Loss,Validation Loss
10,4.283,4.302227
20,3.837,3.265931
30,2.8486,2.686669
40,2.5453,2.421843
50,2.2662,2.229098
60,2.075,2.108346
70,2.0174,1.896111
80,1.7047,1.793876
90,1.6619,1.730138
100,1.5738,1.727466


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2

Fold 4 Metrics: {'rouge1': np.float64(0.6469268566357902), 'rouge2': np.float64(0.33722182194617945), 'rougeL': np.float64(0.44669986571600806), 'meteor': np.float64(0.3605655734543626), 'cosine_similarity': np.float64(0.7409246646019877)}

Training Fold 5...


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
PyTorch: setting up devices


Fold 5 - Train size: 1178, Eval size: 294


loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in

Step,Training Loss,Validation Loss
10,4.2263,4.235795
20,3.7217,3.080836
30,2.7276,2.591502
40,2.5102,2.371832
50,2.3337,2.220725
60,2.194,2.12733
70,2.0459,1.959268
80,1.8457,1.813228
90,1.6892,1.747418
100,1.6192,1.693519


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 294
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text, output, input. If text, output, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2

Fold 5 Metrics: {'rouge1': np.float64(0.6331555412154104), 'rouge2': np.float64(0.30615200886128013), 'rougeL': np.float64(0.41627220691302647), 'meteor': np.float64(0.34571969064913927), 'cosine_similarity': np.float64(0.7398509132821544)}


KeyError: 'bleu'

In [10]:
# Tính trung bình metrics qua các fold
avg_metrics = {
    "rouge1": np.mean([m["rouge1"] for m in fold_metrics]),
    "rouge2": np.mean([m["rouge2"] for m in fold_metrics]),
    "rougeL": np.mean([m["rougeL"] for m in fold_metrics]),
    "meteor": np.mean([m["meteor"] for m in fold_metrics]),
}
print("\nAverage Cross-Validation Metrics:", avg_metrics)

# Lưu metrics trung bình
with open("cross_validation_metrics.json", "w") as f:
    json.dump(avg_metrics, f, indent=4)


Average Cross-Validation Metrics: {'rouge1': np.float64(0.6320325520062118), 'rouge2': np.float64(0.3093996160230185), 'rougeL': np.float64(0.42359180820988646), 'meteor': np.float64(0.337344098088237)}


In [14]:
import torch
import gc
import json
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import copy

# Giải phóng bộ nhớ trước
torch.cuda.empty_cache()
gc.collect()

# Đường dẫn đến các file metrics
fold_metrics_files = [
    "fold_1_metrics_vistral.json",
    "fold_2_metrics_vistral.json",
    "fold_3_metrics_vistral.json",
    "fold_4_metrics_vistral.json",
    "fold_5_metrics_vistral.json"
]

# Tải và so sánh metrics để chọn fold tốt nhất
best_fold_path = None
best_score = -float('inf')
metric_to_optimize = "cosine_similarity"  # Hoặc chọn "rougeL", "meteor", tùy bạn

for file_path in fold_metrics_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        metrics = json.load(f)
        score = metrics.get(metric_to_optimize, -float('inf'))
        if score > best_score:
            best_score = score
            best_fold_path = f"./finetuned_vistral_fold_{file_path.split('_')[1].split('.')[0]}"

if best_fold_path is None:
    raise ValueError("Không tìm thấy fold nào để merge.")

# Tải mô hình gốc ban đầu
base_model, tokenizer, _ = load_model_and_tokenizer(quantization="int8")

# Tải mô hình fine-tune tốt nhất
best_fold_model = PeftModel.from_pretrained(base_model, best_fold_path, is_trainable=False)
best_state_dict = copy.deepcopy(best_fold_model.state_dict())

# Kết hợp trọng số (50% mô hình ban đầu + 50% mô hình fine-tune tốt nhất)
base_state_dict = copy.deepcopy(base_model.state_dict())
combined_state_dict = {}
for key in base_state_dict:
    if key in best_state_dict:
        combined_state_dict[key] = 0.5 * base_state_dict[key] + 0.5 * best_state_dict[key]
    else:
        combined_state_dict[key] = base_state_dict[key]

# Tải trọng số kết hợp vào mô hình gốc
base_model.load_state_dict(combined_state_dict, strict=False)

# Lưu mô hình kết hợp
combined_model_path = "./finetuned_vistral_combined"
base_model.save_pretrained(combined_model_path)
tokenizer.save_pretrained(combined_model_path)

# Dọn dẹp bộ nhớ sau khi lưu
del base_model
del best_fold_model
del base_state_dict
del best_state_dict
del combined_state_dict
torch.cuda.empty_cache()
gc.collect()

# Tải lại mô hình kết hợp với INT8 quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16,
    bnb_8bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)
device_map = {"": 0}

combined_model = AutoModelForCausalLM.from_pretrained(
    combined_model_path,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map=device_map
)

# Đánh giá mô hình kết hợp
test_inputs = eval_df['input'].tolist()
test_references = eval_df['output'].tolist()
predictions_combined = generate_predictions(combined_model, tokenizer, test_inputs, batch_size=1)
metrics_combined = evaluate_metrics(predictions_combined, test_references)
print("Combined Model Metrics:", metrics_combined)

# Lưu metrics của mô hình kết hợp
with open("combined_model_metrics.json", "w") as f:
    json.dump(metrics_combined, f, indent=4)

# Dọn dẹp bộ nhớ sau khi đánh giá
del combined_model
torch.cuda.empty_cache()
gc.collect()

Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vistral-7B-Chat/snapshots/d331b64e61b935cc43c2b3010ae9fb4fde599b45/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Viet-Mistral--Vist

Cấu trúc mô hình Vistral-7B-Chat:
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4



OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 23.49 GiB of which 21.25 MiB is free. Including non-PyTorch memory, this process has 23.45 GiB memory in use. Of the allocated memory 23.03 GiB is allocated by PyTorch, and 98.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)