# Install additional libraries

In [1]:
# pip install pandas numpy transformers torch datasets scikit-learn nltk rouge_score evaluate PyMuPDF PyPDF2 tabulate


In [None]:
import json
import pandas as pd

# Source files holding the two halves of the formatted seq2seq data.
json_files = [
    "../Data/TestModel/Seq2Seq/formatted_data_part_1.json",
    "../Data/TestModel/Seq2Seq/formatted_data_part_2.json"
]

# Read each part in order and append its records to one combined list.
data_combined = []
for json_path in json_files:
    with open(json_path, 'r', encoding='utf-8') as handle:
        data_combined += json.load(handle)

# Build one row per record: the prompt (instruction + input) becomes "text",
# the expected answer becomes "label".
records = []
for item in data_combined:
    prompt = f"instruction: {item['instruction']} input: {item['input']}"
    records.append({"text": prompt, "label": item["output"]})

df = pd.DataFrame(records)

print(df.head())

                                                text  \
0  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
1  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
2  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
3  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
4  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   

                                               label  
0  Xin chào .... điều kiện da chỉ được chẩn đoán ...  
1  Cảm ơn bạn đã chọn bác sĩ trò chuyện. Tôi rất ...  
2  Kính gửi bệnh nhân bạn đang có tất cả các dấu ...  
3  Xin chào, một cơn đau lưng cùng với sự tỏa ra ...  
4  Đau vàng nhất có thể là do nhiều nguyên nhân n...  


# Configure the input and model parameters

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

# Checkpoint identifier; the same name loads the tokenizer here and the
# seq2seq model inside each cross-validation fold.
model_name = "Tianlin668/MentalBART"
# Shared tokenizer: used by preprocess_function and to decode generated ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq fine-tuning.

    Uses the module-level `tokenizer`.

    Args:
        examples: Batch mapping with a "text" entry (encoder inputs) and a
            "label" entry (decoder targets), as passed by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry holding
        the target token ids. Sequences are truncated to 1024 tokens and are
        deliberately left unpadded.
    """
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager
    # and stores the target ids under "labels" in the returned encoding.
    #
    # No padding is applied here on purpose: padding every example to 1024 tokens
    # multiplies memory use, and padded label positions would hold pad_token_id,
    # which the loss does not ignore. The DataCollatorForSeq2Seq used for training
    # pads each batch to its longest sequence and fills label padding with -100.
    model_inputs = tokenizer(
        examples["text"],
        text_target=examples["label"],
        max_length=1024,
        truncation=True,
    )
    return model_inputs

2025-03-27 19:02:17.558987: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 19:02:17.567793: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743102137.576968 3552291 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743102137.579756 3552291 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743102137.588274 3552291 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
import torch
import numpy as np
import gc
import os
from sklearn.model_selection import KFold 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq, TrainerCallback
from datasets import Dataset
import evaluate

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Callback to print loss after each epoch
class LoggingCallback(TrainerCallback):
    """Print the training loss each time the Trainer logs one.

    The loss is reported from `on_log` rather than `on_epoch_end`: the Trainer
    fires `on_epoch_end` before it records that epoch's log entry, so reading
    `state.log_history[-1]['loss']` there fails on an empty history in the first
    epoch and on the evaluation record (which has no 'loss' key) afterwards.
    With `logging_strategy="epoch"` this prints exactly once per epoch.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Evaluation logs ("eval_loss") and the final summary ("train_loss")
        # carry no "loss" key and are skipped.
        if not logs or "loss" not in logs:
            return
        print(f"Epoch {state.epoch}: Loss = {logs['loss']:.4f}")

# Function to train and evaluate a fold
def train_and_evaluate_fold(train_dataset, val_dataset, model_name, device, fold_idx):
    """Fine-tune a fresh model on one fold and score it on the held-out split.

    Relies on the module-level `tokenizer`, `preprocess_function` and
    `LoggingCallback`.

    Args:
        train_dataset: Dataset with "text" and "label" columns used for training.
        val_dataset: Dataset with the same columns, used for per-epoch evaluation
            and for the final generation-based metrics.
        model_name: Checkpoint passed to `AutoModelForSeq2SeqLM.from_pretrained`.
        device: Torch device (or device string) the model and inputs are moved to.
        fold_idx: Fold number; used in output paths, the run name and log messages.

    Returns:
        Tuple `(bleu, rouge1, rouge2, rougeL, meteor)` for the validation split.

    Raises:
        RuntimeError: Any training error other than a CUDA out-of-memory error,
            or an out-of-memory error that persists after the single retry.
    """
    device = torch.device(device)
    # fp16 mixed precision is only meaningful on an accelerator, so request it
    # only when training on CUDA.
    use_fp16 = device.type == "cuda"

    # Check dataset for empty entries (warn only; rows are not dropped).
    def check_dataset(dataset):
        for i, example in enumerate(dataset):
            if not example["text"] or not example["label"]:
                print(f"Warning: Empty text or label at index {i}")

    def load_model():
        return AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    def build_trainer(model, train_batch_size, grad_accum_steps):
        """Create a Trainer for this fold with the given batch configuration."""
        # `logging_steps` is omitted: it has no effect with logging_strategy="epoch".
        training_args = TrainingArguments(
            output_dir=f"./MentalBART_fold_{fold_idx}",
            eval_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=grad_accum_steps,
            fp16=use_fp16,
            num_train_epochs=3,
            weight_decay=0.01,
            save_strategy="no",
            logging_strategy="epoch",
            seed=42,
            run_name=f"MentalBART_Fold_{fold_idx}",
            report_to="none",
        )
        return Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
            callbacks=[LoggingCallback()],
        )

    def rouge_f1(score):
        # `evaluate` returns plain floats; the legacy `datasets` metric returned
        # aggregate objects exposing `.mid.fmeasure`. Accept both.
        return float(score.mid.fmeasure) if hasattr(score, "mid") else float(score)

    check_dataset(train_dataset)
    check_dataset(val_dataset)

    # Tokenize the datasets
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)

    # Remove unnecessary columns and set format
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])
    tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Effective batch size is 2 * 4 = 8 examples per optimizer step.
    model = load_model()
    trainer = build_trainer(model, train_batch_size=2, grad_accum_steps=4)

    # Train with OOM handling. The retry happens outside the `except` block
    # because the live exception's traceback keeps the failed step's tensors
    # referenced, which would prevent the GPU memory from being released.
    hit_oom = False
    try:
        trainer.train()
    except RuntimeError as err:
        if "out of memory" not in str(err).lower():
            raise
        hit_oom = True

    if hit_oom:
        print("Out of Memory! Reducing batch size and retrying...")
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()
        # Restart from the pretrained weights so the fold is not trained on top
        # of a partially updated model; 1 * 8 keeps the effective batch size at 8.
        model = load_model()
        trainer = build_trainer(model, train_batch_size=1, grad_accum_steps=8)
        trainer.train()

    print(f"Training for Fold {fold_idx} completed.")

    # Predict and evaluate
    model.eval()  # make sure dropout is disabled during generation
    preds, refs = [], []
    for example in tokenized_val:
        # The "torch" dataset format already yields tensors.
        input_ids = example["input_ids"].unsqueeze(0).to(device)
        attention_mask = example["attention_mask"].unsqueeze(0).to(device)
        with torch.no_grad():
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=1024, num_beams=4)
        pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Drop any -100 loss-masking values before decoding the reference.
        label_ids = example["labels"]
        label_ids = label_ids[label_ids != -100]

        preds.append(pred)
        refs.append(tokenizer.decode(label_ids, skip_special_tokens=True))

    # Save sample predictions
    os.makedirs(f"./predictions_fold_{fold_idx}", exist_ok=True)
    with open(f"./predictions_fold_{fold_idx}/predictions.txt", "w", encoding="utf-8") as f:
        for pred, ref in zip(preds[:5], refs[:5]):
            f.write(f"Prediction: {pred}\nReference: {ref}\n\n")

    # Compute BLEU score. The `evaluate` BLEU metric tokenizes internally, so it
    # takes raw strings (one list of reference strings per prediction).
    bleu_metric = evaluate.load("bleu")
    bleu_result = bleu_metric.compute(predictions=preds,
                                      references=[[r] for r in refs])
    bleu_score = bleu_result["bleu"]

    # Compute ROUGE scores
    # NOTE(review): the default ROUGE tokenizer appears to keep only [a-z0-9],
    # which would discard Vietnamese diacritics - consider passing a whitespace
    # tokenizer; confirm before comparing scores across experiments.
    rouge_metric = evaluate.load("rouge")
    rouge_result = rouge_metric.compute(predictions=preds, references=refs, use_stemmer=True)
    rouge1 = rouge_f1(rouge_result['rouge1'])
    rouge2 = rouge_f1(rouge_result['rouge2'])
    rougeL = rouge_f1(rouge_result['rougeL'])

    # Compute METEOR score
    meteor_metric = evaluate.load("meteor")
    meteor_score = meteor_metric.compute(predictions=preds, references=refs)["meteor"]

    # Free up memory: collect Python garbage first so the cached CUDA blocks it
    # was holding can actually be returned.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    return bleu_score, rouge1, rouge2, rougeL, meteor_score

# Main loop for K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric values, appended in fold order.
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
meteor_scores = []

# Folds are numbered from 1 in logs and output directories.
for fold_idx, (train_index, val_index) in enumerate(kf.split(df), start=1):
    print(f"\nStarting Fold {fold_idx}")

    # Split the data into training and validation sets
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Convert to Hugging Face datasets. preserve_index=False keeps the pandas
    # index of the slice from being stored as an extra "__index_level_0__" column.
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

    # Train and evaluate the fold
    bleu_score, rouge1, rouge2, rougeL, meteor_score = train_and_evaluate_fold(
        train_dataset, val_dataset, model_name, device, fold_idx
    )

    # Store the results
    bleu_scores.append(bleu_score)
    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougeL_scores.append(rougeL)
    meteor_scores.append(meteor_score)

    print(f"Fold {fold_idx} completed.")
    print(f"BLEU: {bleu_score:.4f}")
    print(f"ROUGE-1: {rouge1:.4f}, ROUGE-2: {rouge2:.4f}, ROUGE-L: {rougeL:.4f}")
    print(f"METEOR: {meteor_score:.4f}")

Using device: cuda

Starting Fold 1


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]



Map:   0%|          | 0/400 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

  trainer = Trainer(


Starting training for Fold 1...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 15.58 GiB of which 17.31 MiB is free. Process 2967531 has 12.61 GiB memory in use. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 2.78 GiB is allocated by PyTorch, and 13.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Summarize the results
# Column name -> per-fold scores collected by the cross-validation loop.
metric_columns = {
    "BLEU": bleu_scores,
    "ROUGE-1": rouge1_scores,
    "ROUGE-2": rouge2_scores,
    "ROUGE-L": rougeL_scores,
    "METEOR": meteor_scores,
}

# Label only the folds that actually finished, so an interrupted run still
# produces a consistent table instead of a column-length mismatch.
n_completed = len(bleu_scores)
if n_completed == 0:
    raise ValueError("No completed folds to summarize; run the cross-validation cell first.")

results_df = pd.DataFrame({
    "Fold": [f"Fold {i+1}" for i in range(n_completed)] + ["Average"],
    **{
        name: list(scores) + [np.mean(scores)]
        for name, scores in metric_columns.items()
    },
})

# Format the results to four decimal places
for name in metric_columns:
    results_df[name] = results_df[name].apply(lambda x: f"{x:.4f}")

# Print the results
print("\nCross-Validation Results:")
print(results_df.to_markdown(index=False))
