# Install library_additional

In [1]:
# pip install pandas numpy transformers torch datasets scikit-learn nltk rouge_score evaluate tabulate


In [2]:
import json
import pandas as pd

# Load and combine JSON files
data_combined = []
json_files = [
    "/root/trainModel/Data/TestModel/Seq2Seq/formatted_data_part_1.json",
    "/root/trainModel/Data/TestModel/Seq2Seq/formatted_data_part_2.json"
]

for file_name in json_files:
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data_combined.extend(data)

# Create DataFrame with validation
cleaned_data = []
for item in data_combined:
    # Ensure all fields exist and are strings
    instruction = str(item.get("instruction", "")) if item.get("instruction") is not None else ""
    input_text = str(item.get("input", "")) if item.get("input") is not None else ""
    output = str(item.get("output", "")) if item.get("output") is not None else ""

    # Skip records where any field is empty after conversion
    if not instruction.strip() or not input_text.strip() or not output.strip():
        continue

    cleaned_data.append({
        "text": f"instruction: {instruction} input: {input_text}",
        "label": output
    })

# Create DataFrame
df = pd.DataFrame(cleaned_data)

# Additional cleaning: Remove any remaining NaN or empty strings
df = df.dropna(subset=["text", "label"])
df = df[df["text"].str.strip() != ""]
df = df[df["label"].str.strip() != ""]

print(df.head())
print(f"Total records after cleaning: {len(df)}")

                                                text  \
0  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
1  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
2  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
3  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   
4  instruction: Nếu bạn là bác sĩ, vui lòng trả l...   

                                               label  
0  Xin chào, Cảm ơn bạn đã đăng truy vấn của bạn....  
1  Xin chào ... Cảm ơn bạn đã tham khảo ý kiến ​​...  
2  Xin chào, và tôi hy vọng tôi có thể giúp bạn n...  
3  CHÀO. Bạn có hai vấn đề khác nhau. Các khối u ...  
4  Cảm ơn bạn đã sử dụng bác sĩ trò chuyện. Tôi s...  
Total records after cleaning: 2000


# Configure the input and model parameters

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

# Define model and tokenizer
model_name = "Tianlin668/MentalT5"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess function for tokenization
def preprocess_function(examples):
    # Ensure inputs are valid strings
    texts = [str(text) if text is not None else "" for text in examples["text"]]
    labels = [str(label) if label is not None else "" for label in examples["label"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        texts,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors=None,  # Let the dataset handle tensor conversion
    )

    # Tokenize labels (as target sequences)
    with tokenizer.as_target_tokenizer():
        labels_encoded = tokenizer(
            labels,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors=None,  # Let the dataset handle tensor conversion
        )

    # Replace padding token id with -100 to ignore in loss computation
    labels_encoded["input_ids"] = [
        [(label_id if label_id != tokenizer.pad_token_id else -100) for label_id in label_ids]
        for label_ids in labels_encoded["input_ids"]
    ]

    model_inputs["labels"] = labels_encoded["input_ids"]
    return model_inputs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import numpy as np
import gc
import os
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq, TrainerCallback
from datasets import Dataset
import evaluate

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check CUDA and GPU details
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

# Callback to log loss after each epoch
class LoggingCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if state.log_history and 'loss' in state.log_history[-1]:
            print(f"Epoch {state.epoch}: Loss = {state.log_history[-1]['loss']:.4f}")
        else:
            print(f"Epoch {state.epoch}: Loss not available in log history.")

# Train and evaluate a fold
def train_and_evaluate_fold(train_dataset, val_dataset, model, tokenizer, device, fold_idx):
    # Check for empty entries in dataset
    def check_dataset(dataset):
        for i, example in enumerate(dataset):
            if not example["text"] or not example["label"]:
                print(f"Warning: Empty text or label at index {i}")
    check_dataset(train_dataset)
    check_dataset(val_dataset)

    # Tokenize datasets
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)

    # Remove unnecessary columns and set format
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])
    tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./MentalBART_fold_{fold_idx}",
        eval_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=2,
        fp16=False,
        bf16=True,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="no",
        logging_steps=10,
        logging_strategy="steps",
        seed=42,
        run_name=f"MentalBART_Fold_{fold_idx}",
        report_to="none",
        gradient_checkpointing=True
    )

    # Add data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
        callbacks=[LoggingCallback()],
    )

    # Train with OOM handling
    try:
        trainer.train()
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print("Out of Memory! Reducing batch size and retrying...")
            training_args.per_device_train_batch_size = 1
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_val,
                data_collator=data_collator,
                callbacks=[LoggingCallback()],
            )
            trainer.train()
        else:
            raise e

    print(f"Training for Fold {fold_idx} completed.")

    # Predict and evaluate
    preds, refs = [], []
    for example in tokenized_val:
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)
        with torch.no_grad():
            generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=1024, num_beams=4)
        pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        preds.append(pred)
        refs.append(tokenizer.decode(example["labels"], skip_special_tokens=True))

    # Save sample predictions
    os.makedirs(f"./predictions_fold_{fold_idx}", exist_ok=True)
    with open(f"./predictions_fold_{fold_idx}/predictions.txt", "w", encoding="utf-8") as f:
        for pred, ref in zip(preds[:5], refs[:5]):
            f.write(f"Prediction: {pred}\nReference: {ref}\n\n")

    # Compute BLEU score
    bleu_metric = evaluate.load("bleu")
    bleu_result = bleu_metric.compute(predictions=[p.split() for p in preds],
                                      references=[[r.split()] for r in refs])
    bleu_score = bleu_result["bleu"]

    # Compute ROUGE scores
    rouge_metric = evaluate.load("rouge")
    rouge_result = rouge_metric.compute(predictions=preds, references=refs, use_stemmer=True)
    rouge1 = rouge_result['rouge1'].mid.fmeasure
    rouge2 = rouge_result['rouge2'].mid.fmeasure
    rougeL = rouge_result['rougeL'].mid.fmeasure

    # Compute METEOR score
    meteor_metric = evaluate.load("meteor")
    meteor_score = meteor_metric.compute(predictions=preds, references=refs)["meteor"]

    # Free up memory
    torch.cuda.empty_cache()
    gc.collect()

    return bleu_score, rouge1, rouge2, rougeL, meteor_score

# K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
meteor_scores = []

# Load model and tokenizer once before training
model_name = "Tianlin668/MentalT5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device)

fold_idx = 1

for train_index, val_index in kf.split(df):
    print(f"\nStarting Fold {fold_idx}")

    torch.cuda.empty_cache()
    gc.collect()

    # Split data into training and validation sets
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Convert to Hugging Face dataset
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # Train and evaluate
    bleu_score, rouge1, rouge2, rougeL, meteor_score = train_and_evaluate_fold(
        train_dataset, val_dataset, model, tokenizer, device, fold_idx
    )

    # Store results
    bleu_scores.append(bleu_score)
    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougeL_scores.append(rougeL)
    meteor_scores.append(meteor_score)

    print(f"Fold {fold_idx} completed.")
    print(f"BLEU: {bleu_score:.4f}")
    print(f"ROUGE-1: {rouge1:.4f}, ROUGE-2: {rouge2:.4f}, ROUGE-L: {rougeL:.4f}")
    print(f"METEOR: {meteor_score:.4f}")

    fold_idx += 1

# Save the model and tokenizer after training all folds
final_model_dir = "./final_mentalT5_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print(f"Model and tokenizer saved to {final_model_dir}")

Using device: cuda
CUDA available: True
GPU device: GRID P40-24Q
CUDA version: 12.4

Starting Fold 1


Map: 100%|██████████| 1600/1600 [00:05<00:00, 303.54 examples/s]
Map: 100%|██████████| 400/400 [00:01<00:00, 362.31 examples/s]
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,1.9317,1.87044


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1.0: Loss = 1.9317


In [None]:
# Summarize results if all folds completed
if len(bleu_scores) == 5:
    results_df = pd.DataFrame({
        "Fold": [f"Fold {i+1}" for i in range(5)] + ["Average"],
        "BLEU": bleu_scores + [np.mean(bleu_scores)],
        "ROUGE-1": rouge1_scores + [np.mean(rouge1_scores)],
        "ROUGE-2": rouge2_scores + [np.mean(rouge2_scores)],
        "ROUGE-L": rougeL_scores + [np.mean(rougeL_scores)],
    })

    # Format results
    results_df["BLEU"] = results_df["BLEU"].apply(lambda x: f"{x:.4f}")
    results_df["ROUGE-1"] = results_df["ROUGE-1"].apply(lambda x: f"{x:.4f}")
    results_df["ROUGE-2"] = results_df["ROUGE-2"].apply(lambda x: f"{x:.4f}")
    results_df["ROUGE-L"] = results_df["ROUGE-L"].apply(lambda x: f"{x:.4f}")

    # Print results
    print("\nCross-Validation Results (MentalT5):")
    print(results_df.to_markdown(index=False))
else:
    print("Not enough folds completed to summarize results.")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: All arrays must be of the same length