In [8]:
# Install required libraries
# pip install pandas numpy transformers torch datasets scikit-learn nltk rouge_score evaluate PyMuPDF PyPDF2 tabulate

In [9]:
import json
import pandas as pd

# Source shards that together make up the full dataset.
json_files = [
    "/root/trainModel/Data/TestModel/Seq2Seq/formatted_data_part_1.json",
    "/root/trainModel/Data/TestModel/Seq2Seq/formatted_data_part_2.json"
]

# Read every shard and append its records to one flat list.
data_combined = []
for file_name in json_files:
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data_combined.extend(data)

# One row per record: the prompt ("text") is built from the instruction and
# input fields, and the expected answer ("label") is the output field.
rows = []
for item in data_combined:
    rows.append({
        "text": f"instruction: {item['instruction']} input: {item['input']}",
        "label": item["output"],
    })
df = pd.DataFrame(rows)

print(df.head())

                                                text  \
0  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
1  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
2  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
3  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
4  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   

                                               label  
0  Xin chào, Cảm ơn bạn đã đăng truy vấn của bạn....  
1  Xin chào ... Cảm ơn bạn đã tham khảo ý kiến ​​...  
2  Xin chào, và tôi hy vọng tôi có thể giúp bạn n...  
3  CHÀO. Bạn có hai vấn đề khác nhau. Các khối u ...  
4  Cảm ơn bạn đã sử dụng bác sĩ trò chuyện. Tôi s...  


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import numpy as np

import os

model_name = "meta-llama/llama-3.2-1B-Instruct"

# SECURITY: the Hugging Face access token is read from the environment and is
# never written into the notebook. Any token that has ever been committed to a
# notebook must be treated as leaked and revoked.
# When HF_TOKEN is unset this is None, and `from_pretrained` receives token=None.
hf_token = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Add padding token if not present (LLaMA models often don't have a pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Batch tokenisation step for causal language-model fine-tuning
def preprocess_function(examples):
    """Tokenize prompt/answer pairs into fixed-length training sequences.

    Each ``text`` entry is joined to its ``label`` with a single space, and the
    joined strings are tokenized with truncation and padding to 512 tokens.

    Args:
        examples: Mapping with parallel "text" and "label" sequences.

    Returns:
        The tokenizer output with an added "labels" entry that is a copy of
        "input_ids".
    """
    prompts = examples["text"]
    answers = examples["label"]

    # One training string per example: prompt followed by its answer.
    sequences = []
    for prompt, answer in zip(prompts, answers):
        sequences.append(f"{prompt} {answer}")

    encoded = tokenizer(
        sequences,
        truncation=True,
        padding="max_length",
        max_length=512,  # keep max_length unchanged
        return_tensors="pt",
    )

    # For causal LM training the targets are the inputs themselves.
    encoded["labels"] = encoded["input_ids"].clone()
    return encoded

In [11]:
import torch
import gc
import os
from transformers import TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Callback that reports the latest logged loss whenever an epoch finishes
class LoggingCallback(TrainerCallback):
    """Print the loss from the most recent log entry at the end of each epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the last log entry's loss, or a notice when it has none.

        Only the final element of ``state.log_history`` is inspected; if the
        history is empty or that entry carries no 'loss' key, a fallback
        message is printed instead.
        """
        history = state.log_history
        latest = history[-1] if history else None

        # Guard clause: nothing logged yet, or the last entry has no loss.
        if latest is None or 'loss' not in latest:
            print(f"Epoch {state.epoch}: Loss not available in log history.")
            return

        print(f"Epoch {state.epoch}: Loss = {latest['loss']:.4f}")

Using device: cuda


In [None]:
# Function to predict and evaluate
def predict_and_evaluate(model, tokenized_val, device, fold_idx,
                         max_prompt_length=256, max_new_tokens=256):
    """Generate answers for the validation prompts and score them.

    Args:
        model: Causal language model exposing ``generate``.
        tokenized_val: Validation dataset. It must still contain the raw
            "text" (prompt) and "label" (reference answer) columns; the
            pre-tokenized "input_ids" are deliberately NOT used as the prompt
            because they hold prompt + answer padded to the training length.
        device: Device the model lives on; prompts are moved there.
        fold_idx: Fold number, used in the output directory name.
        max_prompt_length: Prompts are truncated to this many tokens.
        max_new_tokens: Maximum number of tokens generated per prompt. The
            defaults keep prompt + answer within the 512-token window used
            during training.

    Returns:
        Tuple ``(bleu, rouge1, rouge2, rougeL, meteor)`` of scores.

    Raises:
        KeyError: If ``tokenized_val`` lacks the "text" or "label" column.
    """
    # `evaluate` is used below but is not imported anywhere at notebook level.
    import evaluate

    # Fail fast (before any expensive generation) when the raw columns are gone.
    missing = [col for col in ("text", "label") if col not in tokenized_val.column_names]
    if missing:
        raise KeyError(
            f"Validation dataset is missing raw column(s) {missing}; "
            "keep 'text' and 'label' so predictions can be generated and scored."
        )

    # Read the raw columns directly so the result does not depend on which
    # columns the dataset's output format exposes during iteration.
    prompts = list(tokenized_val["text"])
    refs = list(tokenized_val["label"])

    model.eval()
    preds = []
    for prompt in prompts:
        # Tokenize only the prompt: no answer tokens and no padding.
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_prompt_length,
        ).to(device)
        with torch.no_grad():
            generated_ids = model.generate(
                **encoded,
                # max_new_tokens bounds the answer itself; max_length would
                # also count the prompt tokens.
                max_new_tokens=max_new_tokens,
                num_beams=4,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        # Slice by token count rather than by decoded-string length, which
        # breaks whenever decoding does not reproduce the prompt exactly.
        prompt_len = encoded["input_ids"].shape[1]
        pred = tokenizer.decode(generated_ids[0][prompt_len:], skip_special_tokens=True).strip()
        preds.append(pred)

    # Save sample predictions
    os.makedirs(f"./predictions_fold_{fold_idx}", exist_ok=True)
    with open(f"./predictions_fold_{fold_idx}/predictions.txt", "w", encoding="utf-8") as f:
        for pred, ref in zip(preds[:5], refs[:5]):
            f.write(f"Prediction: {pred}\nReference: {ref}\n\n")

    # Compute BLEU score. The `evaluate` BLEU metric takes raw strings and
    # tokenizes them itself; references are a list of lists of strings.
    bleu_metric = evaluate.load("bleu")
    bleu_result = bleu_metric.compute(predictions=preds,
                                      references=[[r] for r in refs])
    bleu_score = bleu_result["bleu"]

    # Compute ROUGE scores. A whitespace tokenizer is supplied because the
    # default ROUGE tokenizer keeps only [a-z0-9], which would strip the
    # accented letters of the Vietnamese text. The English-only stemmer option
    # is therefore not requested. Results are plain floats.
    rouge_metric = evaluate.load("rouge")
    rouge_result = rouge_metric.compute(predictions=preds, references=refs,
                                        tokenizer=lambda s: s.split())
    rouge1 = float(rouge_result['rouge1'])
    rouge2 = float(rouge_result['rouge2'])
    rougeL = float(rouge_result['rougeL'])

    # Compute METEOR score
    meteor_metric = evaluate.load("meteor")
    meteor_score = meteor_metric.compute(predictions=preds, references=refs)["meteor"]

    # Free up memory. The caller still holds its own reference to `model`, so
    # the weights are only released once the caller drops that reference too.
    gc.collect()
    torch.cuda.empty_cache()

    return bleu_score, rouge1, rouge2, rougeL, meteor_score

In [None]:
def train_and_evaluate_fold(train_dataset, val_dataset, model_name, device, fold_idx):
    """Fine-tune a fresh copy of the model on one cross-validation fold.

    Args:
        train_dataset: Dataset with "text" and "label" columns used for training.
        val_dataset: Dataset with "text" and "label" columns used for the
            per-epoch evaluation.
        model_name: Hub identifier of the causal language model to load.
        device: Device the model is moved to.
        fold_idx: Fold number, used in output directory and run names.

    Returns:
        Tuple ``(model, tokenized_val)``. ``tokenized_val`` holds the tokenized
        validation examples and still exposes the raw "text" and "label"
        columns, so callers can generate from the prompt and score against the
        reference answer.
    """
    # `Trainer` is used below but is not imported anywhere at notebook level.
    from transformers import Trainer

    model_columns = ["input_ids", "attention_mask", "labels"]

    # Tokenize datasets
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)

    # The Trainer must not see the raw string columns (a string "label" column
    # would otherwise be passed to the collator), so it gets stripped copies.
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_train.set_format("torch", columns=model_columns)
    trainer_val = tokenized_val.remove_columns(["text", "label"])
    trainer_val.set_format("torch", columns=model_columns)

    # The returned validation set keeps "text" and "label": tensors for the
    # model columns, plain Python values for everything else.
    tokenized_val.set_format("torch", columns=model_columns, output_all_columns=True)

    # Training float16 weights directly (no loss scaling) is numerically
    # fragile, so prefer bfloat16 (same memory footprint) when the GPU
    # supports it, and otherwise keep the float16 load.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        model_dtype = torch.bfloat16
    else:
        model_dtype = torch.float16

    # Load the model and move to device
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=model_dtype, token=hf_token
    ).to(device)

    # Define training arguments with memory optimization
    training_args = TrainingArguments(
        output_dir=f"./MentalLLaMA_fold_{fold_idx}",
        eval_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="no",
        logging_steps=10,
        logging_strategy="steps",
        seed=42,
        run_name=f"MentalLLaMA_Fold_{fold_idx}",
        report_to="none",
        gradient_checkpointing=True,
    )

    # Data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # Causal language modeling, not masked language modeling
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=trainer_val,
        data_collator=data_collator,
        callbacks=[LoggingCallback()],
    )

    # Train the model
    trainer.train()

    print(f"Training for Fold {fold_idx} completed.")

    # Drop the trainer (and with it the optimizer state) before handing the
    # model back, so that memory is available for generation.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    return model, tokenized_val


Starting Fold 1


Map: 100%|██████████| 1600/1600 [00:00<00:00, 2496.06 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 2695.28 examples/s]
Loading checkpoint shards: 100%|██████████| 6/6 [00:08<00:00,  1.46s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 24.00 GiB of which 9.86 MiB is free. Including non-PyTorch memory, this process has 22.35 GiB memory in use. Of the allocated memory 21.84 GiB is allocated by PyTorch, and 1.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from sklearn.model_selection import KFold

# Main loop for K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric values, appended in fold order.
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
meteor_scores = []

for fold_idx, (train_index, val_index) in enumerate(kf.split(df), start=1):
    print(f"\nStarting Fold {fold_idx}")

    # Split the data into training and validation sets
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Convert to Hugging Face dataset. preserve_index=False stops the
    # non-contiguous pandas index from becoming an extra dataset column.
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

    # Train and evaluate the fold
    model, tokenized_val = train_and_evaluate_fold(
        train_dataset, val_dataset, model_name, device, fold_idx
    )

    # Predict and evaluate
    bleu_score, rouge1, rouge2, rougeL, meteor_score = predict_and_evaluate(
        model, tokenized_val, device, fold_idx
    )

    # Store the results
    bleu_scores.append(bleu_score)
    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougeL_scores.append(rougeL)
    meteor_scores.append(meteor_score)

    print(f"Fold {fold_idx} completed.")
    print(f"BLEU: {bleu_score:.4f}")
    print(f"ROUGE-1: {rouge1:.4f}, ROUGE-2: {rouge2:.4f}, ROUGE-L: {rougeL:.4f}")
    print(f"METEOR: {meteor_score:.4f}")

    # Drop this fold's model before the next fold loads a new one; otherwise
    # the old weights stay referenced here and both copies occupy GPU memory.
    del model, tokenized_val
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Summarize the results
# Every metric collected by the cross-validation loop, in display order.
metric_columns = {
    "BLEU": bleu_scores,
    "ROUGE-1": rouge1_scores,
    "ROUGE-2": rouge2_scores,
    "ROUGE-L": rougeL_scores,
    "METEOR": meteor_scores,
}

# Label rows from the number of folds actually recorded, so an interrupted
# run does not fail with a column-length mismatch.
n_folds = len(bleu_scores)
table = {"Fold": [f"Fold {i+1}" for i in range(n_folds)] + ["Average"]}
for metric_name, scores in metric_columns.items():
    # Per-fold values followed by their mean in the final "Average" row.
    table[metric_name] = list(scores) + [np.mean(scores)]
results_df = pd.DataFrame(table)

# Format the results
for metric_name in metric_columns:
    results_df[metric_name] = results_df[metric_name].apply(lambda x: f"{x:.4f}")

# Print the results
print("\nCross-Validation Results:")
print(results_df.to_markdown(index=False))