In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification,Trainer,TrainingArguments, logging
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd
import random
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import traceback
import warnings

In [None]:
# Session setup for Google Colab: mount Drive, work from the project folder,
# and define the paths and display options used by the rest of the notebook.
import os

from google.colab import drive

drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive/TEST")

# Set before any Trainer is built so Weights & Biases reporting stays off.
os.environ['WANDB_MODE'] = 'disabled'

# Both paths are relative to the working directory selected above.
model_name = "./MODEL/qa_model"    # checkpoint that fine-tuning starts from
output_dir = "./MODEL/qa_model2"   # destination for checkpoints and the final model

warnings.filterwarnings('ignore')

# Lift every pandas display limit (columns, rows, cell width).
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

Mounted at /content/drive


In [None]:
# Every source CSV is reduced to the same three columns before merging.
qa_columns = ["questions", "answers", "labels"]

# 1000 perfect + 100 good/bad eli5 + 100 bad gpt2 + 100 bad gpt2 finetune
df_temp1 = pd.read_csv("qa_pairs.csv")[qa_columns]

# 1200 eli5 finetune labeled by ROUGE_L; only the rows labelled 0 are kept
df_temp2 = pd.read_csv("qa_pairs2.csv")[qa_columns]
df_temp2 = df_temp2.loc[df_temp2["labels"] == 0]

# 247 eli5 from LLAMA ==> bart-large-cnn label by gpt-4o
df_temp3 = pd.read_csv("qa_pairs3_simple.csv")[qa_columns]

# ignore_index=True already gives the merged frame a fresh 0..n-1 index.
df_QA = pd.concat([df_temp1, df_temp2, df_temp3], axis=0, ignore_index=True)

# Displayed as (total rows, rows labelled 1, rows labelled 0).
len(df_QA), int((df_QA["labels"] == 1).sum()), int((df_QA["labels"] == 0).sum())

(2180, 1299, 881)

In [None]:
def train_qa_model(df_QA):
    """Fine-tune the RoBERTa sequence classifier on labelled question/answer pairs.

    The starting checkpoint is read from the module-level ``model_name`` and
    Trainer checkpoints are written under the module-level ``output_dir``.

    Args:
        df_QA: DataFrame with ``questions``, ``answers`` and ``labels`` columns.
            Questions and answers are cast to ``str``; labels are cast to
            ``int64`` and used as class ids for the 2-class head.

    Returns:
        ``(model, tokenizer, trainer)`` on success, or ``(None, None, None)``
        when loading the checkpoint, splitting the data, or training raises.
        In the failure case the error message and its traceback are printed.
    """
    print("Preparing data...")
    questions = df_QA['questions'].astype(str).tolist()
    answers = df_QA['answers'].astype(str).tolist()
    labels = df_QA['labels'].astype(np.int64).tolist()

    try:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            problem_type="single_label_classification",
            torch_dtype=torch.float32
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        traceback.print_exc()  # keep the stack so the failure can be diagnosed
        return None, None, None

    if torch.cuda.is_available():
        model = model.cuda()
        print("Model moved to GPU")

    try:
        # Stratified 80/20 split with a fixed seed so the validation set is stable.
        train_questions, val_questions, train_answers, val_answers, train_labels, val_labels = train_test_split(
            questions, answers, labels, test_size=0.2, stratify=labels, random_state=42
        )
    except Exception as e:
        print(f"Error splitting data: {e}")
        traceback.print_exc()
        return None, None, None

    class QADataset(Dataset):
        """Tokenizes "Question: ... Answer: ..." strings into fixed-length model inputs."""

        def __init__(self, questions, answers, labels, tokenizer, max_length=512):
            self.tokenizer = tokenizer
            self.questions = questions
            self.answers = answers
            self.labels = labels
            self.max_length = max_length

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            question = str(self.questions[idx])
            answer = str(self.answers[idx])
            text = f"Question: {question} Answer: {answer}"
            # Every example is padded/truncated to max_length; the loss weighting
            # below relies on this fixed length.
            encoding = self.tokenizer(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(self.labels[idx], dtype=torch.long),
                # Raw text kept for inspection; it is a str, so presumably it is
                # dropped before batching and never reaches the model - TODO confirm.
                'input_texts': text
            }

    class QAQualityLoss(nn.Module):
        """Cross-entropy in which each example is scaled by its non-padding fraction."""

        def forward(self, outputs, targets, input_texts, attention_mask):
            """Return ``(loss, stats)``; ``stats`` holds detached scalar tensors.

            ``input_texts`` is accepted for interface compatibility and is unused.
            """
            logits = outputs.logits
            per_example = F.cross_entropy(logits, targets, reduction='none')
            # Mean of the attention mask = share of real (non-pad) tokens per example.
            mask_weight = attention_mask.float().mean(dim=-1)
            weighted_loss = (per_example * mask_weight).mean()

            with torch.no_grad():
                preds = torch.argmax(logits, dim=-1)
                accuracy = (preds == targets).float().mean()

            # Tensors are returned as-is so the caller decides whether to pay for
            # the host sync that .item() implies.
            return weighted_loss, {
                'batch_loss': weighted_loss.detach(),
                'batch_accuracy': accuracy,
            }

    class WeightedTrainer(Trainer):
        """Trainer that swaps the model's built-in loss for ``QAQualityLoss``."""

        def __init__(self, quality_loss=None, **kwargs):
            super().__init__(**kwargs)
            self.quality_loss = quality_loss if quality_loss is not None else QAQualityLoss()

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            # Labels are removed so the model does not compute its own loss.
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            loss, batch_stats = self.quality_loss(
                outputs,
                labels,
                inputs.get('input_texts', []),
                inputs['attention_mask']
            )
            # Log only training batches, and not under the 'loss' key: this method
            # also runs for evaluation batches, and those values must not be mixed
            # into the log history next to the Trainer's own averaged training loss.
            if model.training:
                self.log({name: value.item() for name, value in batch_stats.items()})
            return (loss, outputs) if return_outputs else loss

    def compute_metrics(pred):
        """Weighted precision/recall/F1, accuracy and confusion counts for one evaluation."""
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # Fix the label order so the matrix is always 2x2: [[tn, fp], [fn, tp]].
        matrix = confusion_matrix(labels, preds, labels=[0, 1])
        tn, fp, fn, tp = (int(count) for count in matrix.ravel())
        # Only scalars are returned: a nested list cannot be written as a scalar
        # by the TensorBoard reporter and would be dropped with a warning.
        metrics = {
            'eval_accuracy': accuracy_score(labels, preds),
            'eval_precision': precision_score(labels, preds, average='weighted'),
            'eval_recall': recall_score(labels, preds, average='weighted'),
            'eval_f1': f1_score(labels, preds, average='weighted'),
            'eval_true_negatives': tn,
            'eval_false_positives': fp,
            'eval_false_negatives': fn,
            'eval_true_positives': tp,
        }
        print(f"Evaluation metrics: {metrics}")
        print(f"Confusion matrix [[tn, fp], [fn, tp]]: {matrix.tolist()}")
        return metrics

    train_dataset = QADataset(train_questions, train_answers, train_labels, tokenizer)
    val_dataset = QADataset(val_questions, val_answers, val_labels, tokenizer)
    # The loss module has no parameters or buffers, so it needs no device move.
    quality_loss = QAQualityLoss()

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # NOTE(review): newer transformers releases name this `eval_strategy`;
        # confirm against the installed version before upgrading.
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        # Evaluate and save on the same schedule so the best checkpoint can be restored.
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        greater_is_better=True,
        logging_steps=100,
        report_to="tensorboard",
    )
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        quality_loss=quality_loss,
        compute_metrics=compute_metrics
    )

    try:
        trainer.train()
        print("Training completed successfully")
    except Exception as e:
        print(f"Error during training: {e}")
        traceback.print_exc()
        return None, None, None

    return model, tokenizer, trainer

In [None]:
# Run fine-tuning on the merged QA frame. All three names are None when
# train_qa_model hits an error while loading, splitting or training.
model, tokenizer, trainer = train_qa_model(df_QA)

Preparing data...
Model moved to GPU


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
100,0.008388,0.016856,0.974771,0.974795,0.974771,0.974735,"[[169, 7], [4, 256]]"
200,0.000934,0.012591,0.979358,0.979527,0.979358,0.979307,"[[169, 7], [2, 258]]"
300,0.000597,0.016491,0.977064,0.977889,0.977064,0.97714,"[[175, 1], [9, 251]]"


Trainer is attempting to log a value of "[[169, 7], [4, 256]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Evaluation metrics: {'eval_accuracy': 0.9747706422018348, 'eval_precision': 0.9747946874928797, 'eval_recall': 0.9747706422018348, 'eval_f1': 0.9747345661013931, 'eval_confusion_matrix': [[169, 7], [4, 256]]}


Trainer is attempting to log a value of "[[169, 7], [2, 258]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Evaluation metrics: {'eval_accuracy': 0.9793577981651376, 'eval_precision': 0.9795265961915925, 'eval_recall': 0.9793577981651376, 'eval_f1': 0.9793073753309589, 'eval_confusion_matrix': [[169, 7], [2, 258]]}


Trainer is attempting to log a value of "[[175, 1], [9, 251]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Evaluation metrics: {'eval_accuracy': 0.9770642201834863, 'eval_precision': 0.9778888952203671, 'eval_recall': 0.9770642201834863, 'eval_f1': 0.9771398764016309, 'eval_confusion_matrix': [[175, 1], [9, 251]]}


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Confusion Matrix
100,0.008388,0.016856,0.974771,0.974795,0.974771,0.974735,"[[169, 7], [4, 256]]"
200,0.000934,0.012591,0.979358,0.979527,0.979358,0.979307,"[[169, 7], [2, 258]]"
300,0.000597,0.016491,0.977064,0.977889,0.977064,0.97714,"[[175, 1], [9, 251]]"


Training completed successfully


In [None]:
# train_qa_model returns (None, None, None) on failure; stop here with a clear
# message instead of an AttributeError raised on None.
if model is None or tokenizer is None:
    raise RuntimeError(
        "Nothing to save: train_qa_model did not return a model/tokenizer. "
        "Check the error printed by the training cell."
    )

# Persist the fine-tuned weights and the tokenizer files side by side.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./MODEL/qa_model2/tokenizer_config.json',
 './MODEL/qa_model2/special_tokens_map.json',
 './MODEL/qa_model2/vocab.json',
 './MODEL/qa_model2/merges.txt',
 './MODEL/qa_model2/added_tokens.json')