In [2]:
from typing import List, Dict, Optional
import pandas as pd
import numpy as np
from pathlib import Path
import json
import torch
import faiss
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from src.utils.enums import QuestionRecommendConfig

In [3]:
class QuestionDataProcessor:
    def __init__(
        self,
        data_dir: str = QuestionRecommendConfig.FINE_TUNE_DATA_DIR / "CancerQA.csv",
        output_dir: str = QuestionRecommendConfig.FINE_TUNE_DATA_DIR,
        embedding_dim: int = 768,
    ):
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.embedding_dim = embedding_dim

        # Initialize BERT model for embeddings
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def load_datasets(self) -> pd.DataFrame:
        file_path = self.data_dir
        print(f"file_path: {file_path}")
        df = pd.read_csv(Path(file_path))
        print(df.head())
        df["source"] = Path(file_path).stem
        return df

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and preprocess the data."""
        df["cleaned_question"] = df["Question"]
        df = df.drop_duplicates(subset=["cleaned_question"])
        df = df.dropna(subset=["cleaned_question"])

        self.output_dir.mkdir(parents=True, exist_ok=True)
        df.to_csv(self.output_dir / "cleaned_dataset.csv", index=False)

        return df

    def create_embeddings(self, questions: List[str]) -> List[float]:
        """Create embeddings for questions using BERT."""
        embeddings = []
        for question in tqdm(questions, desc="Creating embeddings"):
            # Tokenize and create embedding
            inputs = self.tokenizer(
                question, return_tensors="pt", padding=True, truncation=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                # Use [CLS] token embedding
                embedding = (
                    outputs.last_hidden_state[:, 0, :]
                    .cpu()
                    .numpy()[0]
                    .astype("float32")
                )

                # if len(embedding) != self.embedding_dim:
                #     if len(embedding) < self.embedding_dim:
                #         embedding = np.pad(embedding, (0, self.embedding_dim - len(embedding)))
                #     else:
                #         embedding = embedding[:self.embedding_dim]
                if embedding.size != self.embedding_dim:
                    raise ValueError(f"Expected {self.embedding_dim}, got {emb.size}")

                embeddings.append(embedding)

        return np.stack(embeddings, axis=0)

    def build_faiss_index(self, questions: List[str], embeddings: List[List[float]]):
        """Build FAISS index for question retrieval."""
        embeddings_array = np.array(embeddings).astype("float32")

        faiss_index = faiss.IndexFlatL2(self.embedding_dim)
        faiss_index.add(embeddings_array)

        return faiss_index

    def process_datasets(self) -> Dict:
        """Process all datasets and prepare for training."""
        print("Loading datasets...")
        combined_df = self.load_datasets()

        print("Preprocessing data...")
        processed_df = self.preprocess_data(combined_df)

        print("Creating embeddings...")
        questions = processed_df["cleaned_question"].tolist()
        embeddings = self.create_embeddings(questions)

        print("Building FAISS index...")
        faiss_index = self.build_faiss_index(questions, embeddings)

        # Save processed data
        questions_mapping = {i: q for i, q in enumerate(questions)}
        # Write out to disk
        with open(
            f"{self.output_dir}/questions_mapping.json", "w", encoding="utf-8"
        ) as f:
            json.dump(questions_mapping, f, ensure_ascii=False, indent=2)

        return {
            "questions": questions,
            "embeddings": embeddings,
            "faiss_index": faiss_index,
            "questions_mapping": questions_mapping,
            "metadata": {
                "num_questions": len(questions),
                "embedding_dim": self.embedding_dim,
            },
        }

In [4]:
class QuestionGenerator:
    def __init__(
        self,
        faiss_index,
        questions_mapping: Dict[str, str],
        output_dir: str = QuestionRecommendConfig.PROCESSED_DATA_DIR,
        temperature: float = 0.7,
    ):
        self.output_dir = Path(output_dir)
        self.faiss_index = faiss_index
        self.questions_mapping = questions_mapping
        self.model_name = model_name
        self.temperature = temperature

    def get_similar_questions(self, query_embedding, k: int = 5) -> List[str]:
        """Retrieve similar questions using FAISS."""
        if isinstance(query_embedding, list):
            query_embedding = np.array(query_embedding)

        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)

        distances, indices = self.faiss_index.search(
            query_embedding.astype("float32"), k
        )
        # Get questions from mapping
        updated_indices = indices[0][1:]
        similar_questions = [
            self.questions_mapping[idx]
            for idx in updated_indices
            if idx in self.questions_mapping
        ]

        # Remove duplicates while preserving order
        results = []
        for q in similar_questions:
            if q not in results:
                results.append(q)

        return results[:k]

    def generate_follow_up_questions(
        self, question_embedding: np.ndarray, num_questions: int = 4
    ) -> List[str]:
        """Generate follow-up questions using FAISS."""
        similar_questions = self.get_similar_questions(
            question_embedding, k=num_questions
        )
        return similar_questions[:num_questions]

In [13]:
question_map_path = "/home/jiso/Documents/EPITA/action-learning/rag_medical/src/data/processed/questions_mapping.json"
with open(question_map_path, "r", encoding="utf-8") as f:
    question_map = json.load(f)

print(question_map[str(600)])

How many people are affected by congenital hepatic fibrosis ?


In [5]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers.tokenization_utils_base import PreTrainedTokenizerBase


@dataclass
class CustomDataCollatorForSeq2Seq:
    """
    Custom data collator that handles T5 input/output format properly
    """

    tokenizer: PreTrainedTokenizerBase
    model: Any = None
    padding: Union[bool, str] = True
    max_length: int = None
    pad_to_multiple_of: int = None
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Separate inputs and labels
        input_ids = [f["input_ids"] for f in features]
        attention_mask = [f["attention_mask"] for f in features]
        labels = [f["labels"] for f in features]

        # Convert to tensors
        batch = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

        return batch

In [6]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset


class FineTuningPipeline:
    def __init__(
        self,
        model_name: str = "google/flan-t5-base",
        data_dir: str = QuestionRecommendConfig.FINE_TUNE_DATA_DIR,
        output_dir: str = QuestionRecommendConfig.MODEL_DIR,
        max_length: int = 256,
        batch_size: int = 2,
        learning_rate: float = 2e-5,
        num_epochs: int = 3,
    ):
        self.model_name = model_name
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.max_length = max_length
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Initialize components
        self.data_processor = QuestionDataProcessor(data_dir=data_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def prepare_training_data(self) -> Dict[str, Dataset]:
        """Prepare the dataset for training."""
        # Process datasets
        processed_data = self.data_processor.process_datasets()

        # Initialize question generator
        question_generator = QuestionGenerator(
            faiss_index=processed_data["faiss_index"],
            questions_mapping=processed_data["questions_mapping"],
        )

        # Generate training pairs
        training_data = []
        for idx, question in tqdm(
            enumerate(processed_data["questions"]),
            total=len(processed_data["questions"]),
            desc="Generating training data",
        ):
            question_embedding = processed_data["embeddings"][idx]

            follow_up_questions = question_generator.generate_follow_up_questions(
                question_embedding, num_questions=4
            )

            training_data.append(
                {
                    "input": question,
                    "output": follow_up_questions,
                    "follow_up_combined": " | ".join(follow_up_questions),
                }
            )

        print(f"Sample training data:")
        for i in range(min(3, len(training_data))):
            print(f"Input: {training_data[i]['input']}")
            print(f"Output: {training_data[i]['output']}")
            print("---")

        # Convert to DataFrame for inspection
        df = pd.DataFrame(training_data)

        # Save to CSV for manual inspection
        csv_path = self.output_dir / "training_data.csv"
        df.to_csv(csv_path, index=False)

        # Convert to dataset
        dataset = Dataset.from_list(training_data)
        split_dataset = dataset.train_test_split(test_size=0.1)

        # Tokenize both splits
        tokenized_dataset = {
            "train": split_dataset["train"].map(
                self.tokenize_function,
                batched=True,
                remove_columns=dataset.column_names,
            ),
            "validation": split_dataset["test"].map(
                self.tokenize_function,
                batched=True,
                remove_columns=dataset.column_names,
            ),
        }

        print("Sample tokenized data:")
        sample = tokenized_dataset["train"][0]
        print(f"Input IDs shape: {len(sample['input_ids'])}")
        print(f"Labels shape: {len(sample['labels'])}")
        print(f"First few input IDs: {sample['input_ids'][:10]}")
        print(f"First few labels: {sample['labels'][:10]}")

        labels_flat = [
            label for labels in tokenized_dataset["train"]["labels"] for label in labels
        ]
        num_ignored = sum(1 for label in labels_flat if label == -100)
        print(f"Number of ignored tokens (-100): {num_ignored}/{len(labels_flat)}")

        return tokenized_dataset

    def tokenize_function(self, examples):
        """Tokenize the input and output sequences."""
        model_inputs = self.tokenizer(
            examples["input"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        # Join the list of questions with a separator
        formatted_outputs = [" | ".join(questions) for questions in examples["output"]]

        # Tokenize targets
        # with self.tokenizer.as_target_tokenizer():
        #     labels = self.tokenizer(
        #         formatted_outputs,
        #         max_length=self.max_length,
        #         padding="max_length",
        #         truncation=True
        #     )

        labels = self.tokenizer(
            text_target=formatted_outputs,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        labels_input_ids = labels["input_ids"].copy()
        labels_input_ids = [
            [
                (label if label != self.tokenizer.pad_token_id else -100)
                for label in label_seq
            ]
            for label_seq in labels_input_ids
        ]

        model_inputs["labels"] = labels_input_ids
        return model_inputs

    def debug_batch(self, datasets):
        """Debug what's actually being fed to the model"""
        from torch.utils.data import DataLoader

        # Create a simple data loader
        train_loader = DataLoader(
            datasets["train"], batch_size=2, collate_fn=lambda x: x
        )

        # Get one batch
        raw_batch = next(iter(train_loader))
        print("Raw batch structure:")
        for i, item in enumerate(raw_batch[:2]):  # First 2 items
            print(f"Item {i}:")
            print(f"  Input IDs length: {len(item['input_ids'])}")
            print(f"  Labels length: {len(item['labels'])}")
            print(f"  Input IDs sample: {item['input_ids'][:10]}")
            print(f"  Labels sample: {item['labels'][:10]}")
            print(f"  Labels has -100: {-100 in item['labels']}")
            print()

        # Test with custom collator
        data_collator = CustomDataCollatorForSeq2Seq(
            tokenizer=self.tokenizer, model=self.model, label_pad_token_id=-100
        )

        collated_batch = data_collator(raw_batch)
        print("Collated batch:")
        print(f"Input IDs shape: {collated_batch['input_ids'].shape}")
        print(f"Labels shape: {collated_batch['labels'].shape}")
        print(
            f"Labels contains -100: {(collated_batch['labels'] == -100).any().item()}"
        )

        # Test forward pass with this batch
        self.model.eval()
        with torch.no_grad():
            try:
                # Move to device
                batch_device = {k: v.to(self.device) for k, v in collated_batch.items()}
                outputs = self.model(**batch_device)
                print(f"Forward pass loss: {outputs.loss.item()}")
            except Exception as e:
                print(f"Forward pass failed: {e}")

    def train(self):
        """Train the model."""
        # Prepare dataset
        datasets = self.prepare_training_data()

        print("=== DEBUGGING BATCHES ===")
        self.debug_batch(datasets)
        print("=== END DEBUGGING ===")

        # Training arguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=str(self.output_dir),
            # evaluation_strategy="epoch",
            eval_steps=100,
            do_eval=True,
            do_train=True,
            save_steps=100,
            eval_strategy="steps",
            save_strategy="steps",
            logging_strategy="steps",
            logging_steps=10,
            learning_rate=self.learning_rate,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.num_epochs,
            weight_decay=0.01,
            save_total_limit=3,
            predict_with_generate=False,
            fp16=True,
            logging_dir=str(self.output_dir / "logs"),
            load_best_model_at_end=True,
            log_level="info",
            report_to="none",
            gradient_accumulation_steps=1,
            max_grad_norm=1.0,
            warmup_steps=100,
            dataloader_pin_memory=False,
        )

        # Data collator
        data_collator = DataCollatorForSeq2Seq(
            self.tokenizer, model=self.model, label_pad_token_id=-100
        )

        # Initialize trainer
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=datasets["train"],
            eval_dataset=datasets["validation"],
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()

        metrics = trainer.evaluate()
        print(f"Final validation loss: {metrics['eval_loss']:.4f}")

        # Save the model
        trainer.save_model(str(self.output_dir))
        self.tokenizer.save_pretrained(str(self.output_dir))

AttributeError: type object 'QuestionRecommendConfig' has no attribute 'MODEL_DIR'

In [26]:
pipeline = FineTuningPipeline(
    model_name="google/flan-t5-base",
    data_dir="/home/jiso/Documents/EPITA/action-learning/rag_medical/src/data/fine_tune_dataset/CancerQA.csv",
    output_dir="/home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5",
    batch_size=4,
    num_epochs=3,
)

In [27]:
# Train the model
pipeline.train()

Loading datasets...
file_path: /home/jiso/Documents/EPITA/action-learning/rag_medical/src/data/fine_tune_dataset/CancerQA.csv
                                            Question  \
0         What is (are) Non-Small Cell Lung Cancer ?   
1   Who is at risk for Non-Small Cell Lung Cancer? ?   
2  What are the symptoms of Non-Small Cell Lung C...   
3       How to diagnose Non-Small Cell Lung Cancer ?   
4  What is the outlook for Non-Small Cell Lung Ca...   

                                              Answer   topic  split  
0  Key Points\n                    - Non-small ce...  cancer  train  
1  Smoking is the major risk factor for non-small...  cancer  train  
2  Signs of non-small cell lung cancer include a ...  cancer   test  
3  Tests that examine the lungs are used to detec...  cancer  train  
4  Certain factors affect prognosis (chance of re...  cancer  train  
Preprocessing data...
Creating embeddings...


Creating embeddings: 100%|██████████| 683/683 [00:03<00:00, 194.59it/s]


Building FAISS index...


Generating training data: 100%|██████████| 683/683 [00:00<00:00, 19296.17it/s]


Sample training data:
Input: What is (are) Non-Small Cell Lung Cancer ?
Output: ['What is (are) Small Cell Lung Cancer ?', 'What is (are) Renal Cell Cancer ?', 'What is (are) Hypopharyngeal Cancer ?']
---
Input: Who is at risk for Non-Small Cell Lung Cancer? ?
Output: ['Who is at risk for Endometrial Cancer? ?', 'Who is at risk for Small Cell Lung Cancer? ?', 'Who is at risk for Adult Primary Liver Cancer? ?']
---
Input: What are the symptoms of Non-Small Cell Lung Cancer ?
Output: ['What are the symptoms of Small Cell Lung Cancer ?', 'What are the symptoms of Endometrial Cancer ?', 'What are the symptoms of Pancreatic Cancer ?']
---


Map: 100%|██████████| 614/614 [00:00<00:00, 1242.29 examples/s]
Map: 100%|██████████| 69/69 [00:00<00:00, 1233.67 examples/s]
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  trainer = Seq2SeqTrainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Using auto half precision backend


Sample tokenized data:
Input IDs shape: 256
Labels shape: 256
First few input IDs: [2645, 19, 44, 1020, 21, 30588, 11759, 332, 13159, 9422]
First few labels: [2645, 19, 44, 1020, 21, 12318, 11759, 332, 13159, 9422]
Number of ignored tokens (-100): 126671/157184
=== DEBUGGING BATCHES ===
Raw batch structure:
Item 0:
  Input IDs length: 256
  Labels length: 256
  Input IDs sample: [2645, 19, 44, 1020, 21, 30588, 11759, 332, 13159, 9422]
  Labels sample: [2645, 19, 44, 1020, 21, 12318, 11759, 332, 13159, 9422]
  Labels has -100: True

Item 1:
  Input IDs length: 256
  Labels length: 256
  Input IDs sample: [363, 33, 8, 5872, 21, 30588, 8505, 2935, 7419, 5744]
  Labels sample: [363, 33, 8, 5872, 21, 30588, 2808, 22450, 1162, 2149]
  Labels has -100: True

Collated batch:
Input IDs shape: torch.Size([2, 256])
Labels shape: torch.Size([2, 256])
Labels contains -100: True
Forward pass loss: 1.9573472738265991
=== END DEBUGGING ===


***** Running training *****
  Num examples = 614
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 462
  Number of trainable parameters = 247,577,856


Step,Training Loss,Validation Loss
100,0.0,
200,0.0,
300,0.0,
400,0.0,



***** Running Evaluation *****
  Num examples = 69
  Batch size = 4
Saving model checkpoint to /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100
Configuration saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100/config.json
Configuration saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100/generation_config.json
Model weights saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100/model.safetensors
tokenizer config file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100/tokenizer_config.json
Special tokens file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100/special_tokens_map.json
Deleting older checkpoint [/home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/checkpoint-100] due to args

Saving model checkpoint to /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5
Configuration saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/config.json
Configuration saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/generation_config.json


Final validation loss: nan


Model weights saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/model.safetensors
tokenizer config file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/tokenizer_config.json
Special tokens file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/special_tokens_map.json
tokenizer config file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/tokenizer_config.json
Special tokens file saved in /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/special_tokens_map.json


In [32]:
def generate_recommendations(
    model, tokenizer, question: str, max_length: int = 256
) -> List[str]:
    """Generate question recommendations for a given medical question."""
    # Tokenize input
    inputs = tokenizer(
        question,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate recommendations
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=4,
        temperature=0.7,
        no_repeat_ngram_size=2,
    )

    # Decode and split recommendations
    recommendations = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recommendations.split(" | ")


question = "What causes cancer?"
# Example usage
prompt = (
    "Generate exactly four medically relevant follow-up questions based on the patient’s input question."
    "Each follow-up question must be concise, end with a question mark, and explore a different aspect of the topic "
    "(e.g., diagnosis, treatment, risk factors, prognosis, prevention, or causes). "
    "The follow-up questions must not repeat the patient’s question or use its exact wording. "
    "Format the output as a numbered list (e.g., '1. Question?\n2. Question?\n3. Question?\n4. Question?').\n\n"
    "Example 1:\n"
    'Patient: "What are the symptoms of diabetes?"\n'
    "Follow-Up Questions:\n"
    "1. How is diabetes diagnosed?\n"
    "2. What are the treatment options for diabetes?\n"
    "3. Who is at risk for developing diabetes?\n"
    "4. What complications can arise from diabetes?\n\n"
    "Example 2:\n"
    'Patient: "What is breast cancer?"\n'
    "Follow-Up Questions:\n"
    "1. What are the main risk factors for breast cancer?\n"
    "2. How is breast cancer diagnosed?\n"
    "3. What treatments are available for breast cancer?\n"
    "4. What is the prognosis for breast cancer patients?\n\n"
    f'Input Question: "{question}"\n'
    "Follow-Up Questions:".format(question)
)
recommendations = generate_recommendations(pipeline.model, pipeline.tokenizer, prompt)
print(f"Original question: {question}")
print("\nRecommended follow-up questions:")
for i, rec in enumerate(recommendations, 1):
    print(f"{rec}")

Original question: What causes cancer?

Recommended follow-up questions:
1. What causes cancer? 2. What is the main cause? 3. What are the causes? 4. What does it mean to be a cancer patient?


In [10]:
import numpy as np
import evaluate
import pandas as pd
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset

# 1) Load metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
bertscore_metric = evaluate.load("bertscore")


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # 2) Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 3) Exact-match
    pred_lists = [p.split(" | ") for p in decoded_preds]
    label_lists = [l.split(" | ") for l in decoded_labels]
    em_scores = [
        sum(1 for p in pred if p in gold) / max(len(gold), 1)
        for pred, gold in zip(pred_lists, label_lists)
    ]
    exact_match = np.mean(em_scores)

    # 4) ROUGE-L
    rouge_preds = ["\n".join(p) for p in pred_lists]
    rouge_labels = ["\n".join(l) for l in label_lists]
    rouge_res = rouge_metric.compute(predictions=rouge_preds, references=rouge_labels)
    rougeL_f1 = rouge_res["rougeL"]  # already float

    # 5) BLEU (string inputs)
    # Provide list of hypothesis strings and list of reference strings
    bleu_res = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = bleu_res["bleu"]

    # 6) BERTScore
    bert_res = bertscore_metric.compute(
        predictions=decoded_preds, references=decoded_labels, lang="en"
    )
    bert_f1 = np.mean(bert_res["f1"])

    return {
        "exact_match": exact_match,
        "rougeL_f1": rougeL_f1,
        "bleu": bleu_score,
        "bertscore_f1": bert_f1,
    }


# 7) Load your fine-tuned model & tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(pipeline.output_dir)
tokenizer = AutoTokenizer.from_pretrained(pipeline.output_dir)

# 8) Rebuild your eval_dataset from the saved CSV
df = pd.read_csv(f"{pipeline.output_dir}/training_data.csv")
val_df = df.sample(frac=0.1, random_state=42)
eval_dataset = Dataset.from_pandas(val_df)


def preprocess(ex):
    enc = tokenizer(ex["input"], truncation=True, padding="max_length", max_length=256)
    labs = tokenizer(
        text_target=ex["follow_up_combined"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )["input_ids"]
    # mask pads
    labs = [lab if lab != tokenizer.pad_token_id else -100 for lab in labs]
    enc["labels"] = labs
    return enc


eval_dataset = eval_dataset.map(
    preprocess, batched=False, remove_columns=eval_dataset.column_names
)

# 9) Create the Trainer and run evaluation
trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(
        output_dir=str(pipeline.output_dir),
        per_device_eval_batch_size=8,
        predict_with_generate=True,
        report_to="none",
    ),
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, model=model, label_pad_token_id=-100
    ),
    compute_metrics=compute_metrics,
)

metrics = trainer.evaluate(eval_dataset)
print("Evaluation results:", metrics)

loading configuration file /home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/flant5/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "s

loading configuration file config.json from cache at /home/jiso/.cache/huggingface/hub/models--roberta-large/snapshots/722cf37b1afa9454edce342e7895e588b6ff1d59/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/jiso/.cache/huggingface/hub/models--roberta-large/snapshots/722cf37b1afa9454edce342e7895e588b6ff1d59/vocab.json
loading file merges.txt from cache

Evaluation results: {'eval_loss': nan, 'eval_model_preparation_time': 0.0038, 'eval_exact_match': 0.0, 'eval_rougeL_f1': 0.1179912991081328, 'eval_bleu': 0.002034812795709879, 'eval_bertscore_f1': 0.8239430899129194, 'eval_runtime': 6.5132, 'eval_samples_per_second': 10.44, 'eval_steps_per_second': 1.382}


In [8]:
model_path = "google/flan-t5-base"
weights_path = "/home/jiso/Documents/EPITA/action-learning/rag_medical/src/ml_models/model_files/flant5_diabetes.pth"

# Load model architecture
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Load saved weights
state_dict = torch.load(weights_path, map_location="cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(state_dict)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)


def recommend_questions(input_text: str, max_length=128, num_return_sequences=1):
    # Tokenize input
    # prompt = f"Input: {input_text}\nInstruction: Recommend 3 follow-up medical questions."
    prompt = f"The response must be medically releavent question for the question: {input_text}"

#     prompt = (
#     "Generate exactly four medically relevant follow-up questions based on the patient’s input question."
#     "Each follow-up question must be concise, end with a question mark, and explore a different aspect of the topic "
#     "(e.g., diagnosis, treatment, risk factors, prognosis, prevention, or causes). "
#     "The follow-up questions must not repeat the patient’s question or use its exact wording. "
#     "Format the output as a numbered list (e.g., '1. Question?\n2. Question?\n3. Question?\n4. Question?').\n\n"
#     "Example 1:\n"
#     'Patient: "What are the symptoms of diabetes?"\n'
#     "Follow-Up Questions:\n"
#     "1. How is diabetes diagnosed?\n"
#     "2. What are the treatment options for diabetes?\n"
#     "3. Who is at risk for developing diabetes?\n"
#     "4. What complications can arise from diabetes?\n\n"
#     "Example 2:\n"
#     'Patient: "What is breast cancer?"\n'
#     "Follow-Up Questions:\n"
#     "1. What are the main risk factors for breast cancer?\n"
#     "2. How is breast cancer diagnosed?\n"
#     "3. What treatments are available for breast cancer?\n"
#     "4. What is the prognosis for breast cancer patients?\n\n"
#     f'Input Question: "{input_text}"\n'
#     "Follow-Up Questions:".format(input_text)
# )
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    ).to(device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=False,  # change to True to sample diverse results
        )

    # Decode output
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


# input_q = "Tell me about cancer?"
input_q = "What are the symptoms of diabetes?"
recommendations = recommend_questions(input_q)

print("Recommended follow-up questions:")
for q in recommendations:
    print("-", q)

Recommended follow-up questions:
- What are the symptoms of diabetes? | What are the symptoms of diabetes? | What are the symptoms of diabetes?


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_question(topic: str) -> str:
    prompt = f"Generate one medical question about {topic}."

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=64,
            num_return_sequences=1,
            do_sample=True,           # 🔁 Enable randomness
            top_k=50,                 # Top-k sampling
            top_p=0.9,                # Or use nucleus sampling
            temperature=0.8           # Add creativity
        )

    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question.strip()

    
topic = "cancer"
question = generate_question(topic)
print("Generated Question:", question)

Generated Question: When was a chemo therapy called chemotherapy?


In [3]:
!pip install deep-translator

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [6]:
from deep_translator import GoogleTranslator

doc = """The timeline for receiving cancer treatment can vary based on several factors, including:

Diagnosis: After a cancer diagnosis, treatment typically begins within a few weeks, depending on the type and stage of cancer.
Treatment plan: Oncologists will develop a personalized treatment plan, which may include surgery, chemotherapy, radiation, or targeted therapies.
Insurance and logistics: Availability of treatment facilities, insurance approvals, and scheduling can affect the start date.
Patient health: Overall health and any necessary pre-treatment evaluations can also influence timing.
It's essential to discuss specific timelines with your healthcare provider for the most accurate information."""

# Translate to English
text_en = GoogleTranslator(source='auto', target='en').translate("Quelle est la cause du diabète?")
print(text_en)  # "What is the cause of diabetes?"

# Translate back to French
response_fr = GoogleTranslator(source='en', target='fr').translate(doc)
print(response_fr)

What is the cause of diabetes?
Le calendrier pour recevoir un traitement contre le cancer peut varier en fonction de plusieurs facteurs, notamment:

Diagnostic: Après un diagnostic de cancer, le traitement commence généralement en quelques semaines, selon le type et le stade du cancer.
Plan de traitement: les oncologues élaboreront un plan de traitement personnalisé, qui peut inclure la chirurgie, la chimiothérapie, les radiations ou les thérapies ciblées.
Assurance et logistique: la disponibilité des installations de traitement, des approbations d'assurance et de la planification peut affecter la date de début.
Santé des patients: la santé globale et toute évaluation de prétraitement nécessaire peuvent également influencer le moment.
Il est essentiel de discuter des délais spécifiques avec votre fournisseur de soins de santé pour les informations les plus précises.
