# Re-initialize Environment and Trainer

### Subtask:
Re-initialize all necessary components, including tokenizer, datasets, base model, PEFT configuration, and the Trainer object, ensuring all dependencies are met and the `remove_unused_columns` argument in `TrainingArguments` is correctly set to `True` to avoid `ValueError` during training. For this specific run, `max_steps` will be set to 1200, and `num_train_epochs` will be ignored or set to a large value to allow `max_steps` to control training duration.


## Generate Fine-Tuned Model Responses

### Subtask:
Use the fine-tuned `TinyLlama-1.1B-Chat-v1.0` model to generate responses for the entire test dataset. These responses will be used to evaluate the fine-tuned model's performance.

## Generate Baseline Model Responses

### Subtask:
Use the pre-trained `TinyLlama-1.1B-Chat-v1.0` model to generate responses for the entire test dataset to establish a performance baseline against which fine-tuned models will be compared.


In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import pandas as pd
import os
import kagglehub

# --- Start of re-included dependencies for train_val_test_dataset and tokenizer ---

# Load the tokenizer (from cell c403bba4)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token instead of registering a new '[PAD]'
    # token: a new token enlarges the tokenizer vocabulary, and nothing in
    # this cell resizes the model's embedding matrix to match. This also
    # agrees with generate() below, which pads with eos_token_id.
    tokenizer.pad_token = tokenizer.eos_token

# Load intronhealth/afrimedqa_v2 dataset (from cell 5a8f8fc5)
afrimedqa_dataset = load_dataset('intronhealth/afrimedqa_v2')

# Download MedQuAD dataset files (from cell 66260b14)
# The returned path is used below as the directory that contains medquad.csv.
medquad_download_path = kagglehub.dataset_download("pythonafroz/medquad-medical-question-answer-for-ai-research")

# Load MedQuAD dataset from CSV into pandas DataFrame (from cell 4e3fafdf)
medquad_csv_path = os.path.join(medquad_download_path, 'medquad.csv')
medquad_df = pd.read_csv(medquad_csv_path)

# Convert MedQuAD to unified QA format (from cell 70257e3b)
# Both sources are mapped onto the same two columns: 'instruction' and 'response'.
# NOTE(review): astype(str) turns a missing value into the literal string 'nan',
# which the emptiness filter further down does not remove — confirm whether
# medquad.csv contains missing questions/answers.
medquad_qa_df = medquad_df.copy()
medquad_qa_df['instruction'] = medquad_qa_df['question'].astype(str)
medquad_qa_df['response'] = medquad_qa_df['answer'].astype(str)
medquad_qa_dataset = Dataset.from_pandas(medquad_qa_df[['instruction', 'response']])

# Convert afrimedqa_v2 to unified QA format (from cell ce5a4b86)
# Missing rationales become '' here so that the emptiness filter below drops them.
afrimedqa_df = afrimedqa_dataset['train'].to_pandas()
afrimedqa_df['instruction'] = afrimedqa_df['question'].astype(str)
afrimedqa_df['response'] = afrimedqa_df['answer_rationale'].fillna('').astype(str)
afrimedqa_qa_dataset = Dataset.from_pandas(afrimedqa_df[['instruction', 'response']])

# Concatenate unified datasets (from cell 07758188)
unified_dataset = concatenate_datasets([medquad_qa_dataset, afrimedqa_qa_dataset])

# Initial cleaning and split into training, validation, and test sets (from cell 2c29636f)
# Rows whose instruction or response is empty/whitespace-only are removed first.
unified_dataset = unified_dataset.filter(lambda example: example['instruction'].strip() != '' and example['response'].strip() != '')
# Every random step is seeded with 42 so the same split can be rebuilt later.
shuffled_dataset = unified_dataset.shuffle(seed=42)
# 80% train; the remaining 20% is halved into validation and test (10% each).
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)
val_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
train_val_test_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})
# --- End of re-included dependencies ---

# 1. Load the base model
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device for baseline model: {device}")

# Define base keyword arguments for model loading
# These are shared by the quantized attempt and by every unquantized fallback below.
model_load_kwargs_base = {
    "pretrained_model_name_or_path": base_model_name,
    "device_map": "auto",
    "dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
}

base_model = None
try:
    if torch.cuda.is_available():
        # 4-bit quantization is only attempted when a CUDA device is present.
        print("Attempting to load model with 4-bit quantization.")
        quantization_config_baseline = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=False,
        )
        # Create a copy and add quantization_config
        # (the copy keeps model_load_kwargs_base reusable for the fallbacks)
        model_load_kwargs_quant = model_load_kwargs_base.copy()
        model_load_kwargs_quant["quantization_config"] = quantization_config_baseline
        base_model = AutoModelForCausalLM.from_pretrained(**model_load_kwargs_quant)
        print("Model loaded successfully with 4-bit quantization.")
    else:
        print("CUDA not available. Loading model in full precision.")
        base_model = AutoModelForCausalLM.from_pretrained(**model_load_kwargs_base)

except ImportError as e:
    # Handler order matters: ImportError is checked before the generic handler,
    # so only bitsandbytes-related import failures fall back here.
    if "bitsandbytes" in str(e):
        print(f"Warning: Bitsandbytes ImportError encountered: {e}. Falling back to loading model in full precision.")
        base_model = AutoModelForCausalLM.from_pretrained(**model_load_kwargs_base)
    else:
        raise # Re-raise if it's not a bitsandbytes-related ImportError
except Exception as e:
    # Best-effort fallback: any other failure triggers one retry without quantization.
    # If that retry fails too, its exception propagates.
    print(f"An unexpected error occurred during model loading: {e}. Attempting to load in full precision.")
    base_model = AutoModelForCausalLM.from_pretrained(**model_load_kwargs_base)


base_model.eval() # Set model to evaluation mode

# 2. Create an empty list to store the results
baseline_results = []
print("Generating baseline responses for the test set...")

# Get the raw test dataset split (not the tokenized one, to easily access instruction and response)
test_dataset_raw = train_val_test_dataset['test']

# Inputs must live on the same device as the model; ask the model itself
# instead of relying on a separately computed device variable.
generation_device = base_model.device

# 3-6. For every test example: build the prompt, generate greedily, decode
# only the newly generated tokens, and record the result.
# Iterating the dataset directly reads each row once (indexing by position
# twice per iteration fetched every row two times).
for example in test_dataset_raw:
    original_instruction = str(example['instruction'])
    reference_response = str(example['response'])

    # Construct the prompt for generation (only the instruction part)
    instruction_prompt = f"### Instruction:\n{original_instruction}\n\n### Response:"

    # Tokenize the prompt; prompts longer than 512 tokens are truncated
    encoded_input = tokenizer(
        instruction_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )
    input_ids = encoded_input['input_ids'].to(generation_device)
    attention_mask = encoded_input['attention_mask'].to(generation_device)

    # Generate response using greedy decoding for baseline consistency
    with torch.no_grad():
        output_ids = base_model.generate(
            input_ids,
            attention_mask=attention_mask,  # Pass attention mask
            max_new_tokens=256,  # Limit generated response length
            do_sample=False,  # Greedy decoding
            pad_token_id=tokenizer.eos_token_id,  # Use eos_token_id as pad_token_id
        )

    # generate() returns prompt + continuation; slice off the prompt tokens
    prompt_length = input_ids.shape[1]
    generated_text = tokenizer.decode(
        output_ids[0][prompt_length:], skip_special_tokens=True
    ).strip()

    baseline_results.append({
        'instruction': original_instruction,
        'reference_response': reference_response,
        'generated_response': generated_text
    })

print(f"Generated {len(baseline_results)} baseline responses.")

# Tabulate the collected records so they can be inspected and scored later
baseline_responses_df = pd.DataFrame(baseline_results)

# 7. Show up to five generations next to their references as a sanity check
print("\n--- Sample Baseline Responses (First 5) ---")
separator = "-" * 20
for sample_number, (_, sample) in enumerate(baseline_responses_df.head(5).iterrows(), start=1):
    print(f"\n--- Sample {sample_number} ---")
    print(f"Instruction: {sample['instruction']}")
    print(f"Reference: {sample['reference_response']}")
    print(f"Generated: {sample['generated_response']}")
    print(separator)

print(f"\nBaseline responses DataFrame shape: {baseline_responses_df.shape}")

Using Colab cache for faster access to the 'medquad-medical-question-answer-for-ai-research' dataset.


Filter:   0%|          | 0/31687 [00:00<?, ? examples/s]

Using device for baseline model: cuda
Attempting to load model with 4-bit quantization.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Generating baseline responses for the test set...
Generated 1813 baseline responses.

--- Sample Baseline Responses (First 5) ---

--- Sample 1 ---
Instruction: What are the treatments for Hypotrichosis simplex ?
Reference: Is there treatment for hypotrichosis simplex? Is there hope for hair growth in the future? Individuals with hypotrichosis simplex experience a gradual loss of scalp hair that begins during the middle of the first decade and results in almost complete loss of hair by the third decade. A few sparse, fine, short hairs may remain in some individuals. There is currently no treatment for hypotrichosis simplex.
Generated: Hypotrichosis simplex is a genetic disorder characterized by excessive hair growth on the scalp. Treatment options for hypotrichosis simplex may include:

1. Hair transplantation: This is the most common treatment for hypotrichosis simplex. A hair transplant involves removing excess hair from the scalp and replacing it with healthy hair from the donor are

## Evaluate Baseline Model (Automatic Metrics)

### Subtask:
Compute and record the automatic evaluation metrics (ROUGE, BLEU, F1-score, Exact Match, and readability indices) for the baseline model's responses against the test set's reference answers. Populate the experiment tracking table with these baseline results.


**Reasoning**:
I need to import the necessary libraries for the evaluation metrics, including `rouge_score`, `nltk.translate.bleu_score`, and `textstat`. I also need to download the NLTK tokenizer data (`punkt` and `punkt_tab`) that is used to tokenize text for the BLEU calculation. Finally, I will check that `bitsandbytes` and `accelerate` are available, installing them if they are missing.



In [None]:
import subprocess
import sys

# Each section below follows the same pattern: try the import and, if it
# fails, install the package with pip through the current interpreter.

# bitsandbytes is only checked/installed here; it is not re-imported after installation.
try:
    import bitsandbytes
except ImportError:
    print("bitsandbytes not found, installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "bitsandbytes>=0.46.1"])
    print("bitsandbytes installed successfully.")

# Also ensure accelerate is installed for device_map="auto"
try:
    import accelerate
except ImportError:
    print("accelerate not found, installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate"])
    print("accelerate installed successfully.")

# rouge_score: note the pip distribution name uses a hyphen ("rouge-score").
try:
    from rouge_score import rouge_scorer
except ImportError:
    print("rouge_score not found, installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rouge-score"])
    from rouge_score import rouge_scorer
    print("rouge_score installed successfully.")

# NLTK: besides the package itself, the 'punkt' and 'punkt_tab' tokenizer data
# must be present; each is downloaded only when nltk.data.find() cannot locate it.
try:
    import nltk
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.tokenize import word_tokenize
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print("NLTK 'punkt' tokenizer not found, downloading...")
        nltk.download('punkt')
        print("NLTK 'punkt' downloaded successfully.")
    # Add check and download for punkt_tab
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        print("NLTK 'punkt_tab' tokenizer not found, downloading...")
        nltk.download('punkt_tab')
        print("NLTK 'punkt_tab' downloaded successfully.")

except ImportError:
    # NLTK itself was missing: install it, then download both data packages unconditionally.
    print("NLTK not found, installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.tokenize import word_tokenize
    print("NLTK installed successfully. Downloading 'punkt' and 'punkt_tab'...")
    nltk.download('punkt')
    nltk.download('punkt_tab') # Also download punkt_tab here
    print("NLTK 'punkt' and 'punkt_tab' downloaded successfully.")

# textstat supplies the readability indices.
try:
    import textstat
except ImportError:
    print("textstat not found, installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "textstat"])
    import textstat
    print("textstat installed successfully.")

# Standard-library modules used for text normalization in the metric helpers.
import re
import string

print("All necessary evaluation libraries and NLTK data are ready.")

NLTK 'punkt_tab' tokenizer not found, downloading...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK 'punkt_tab' downloaded successfully.
All necessary evaluation libraries and NLTK data are ready.


**Reasoning**:
Now that all necessary libraries are installed and NLTK data is ready, I will define helper functions for calculating ROUGE, BLEU, Exact Match, F1 Score, and readability metrics. Then, I will iterate through the `baseline_responses_df` to compute these metrics for each generated response and store them in a list of dictionaries, which will be converted into a DataFrame for the experiment tracking table.



In [None]:
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import textstat
import re
import string
import pandas as pd

# --- Helper Functions for Metrics ---

def calculate_rouge_scores(reference, hypothesis):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures for one text pair.

    Args:
        reference: The reference (target) text.
        hypothesis: The generated text that is scored against the reference.

    Returns:
        A dict with the keys 'rouge1_fmeasure', 'rouge2_fmeasure' and
        'rougeL_fmeasure'.
    """
    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    # Stemming is enabled, so the scorer matches tokens on their stems.
    result = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True).score(reference, hypothesis)
    return {f'{rouge_type}_fmeasure': result[rouge_type].fmeasure for rouge_type in rouge_types}

def calculate_bleu_score(reference, hypothesis):
    """Return the smoothed sentence-level BLEU score of `hypothesis`.

    Both texts are lower-cased and tokenized with NLTK's `word_tokenize`.
    Smoothing is applied because unsmoothed sentence BLEU evaluates to 0 (and
    NLTK emits a warning) whenever any n-gram order has no overlap, which
    happens often for single sentence pairs.

    Args:
        reference: The reference (target) text.
        hypothesis: The generated text to score.

    Returns:
        The BLEU score as a float, or 0.0 when either text has no tokens.
    """
    # Local import keeps this helper self-contained within the cell.
    from nltk.translate.bleu_score import SmoothingFunction

    reference_tokens = word_tokenize(reference.lower())
    hypothesis_tokens = word_tokenize(hypothesis.lower())

    # Nothing to compare: without this guard, smoothing would turn an empty
    # comparison into a tiny non-zero score.
    if not reference_tokens or not hypothesis_tokens:
        return 0.0

    # sentence_bleu expects a list of tokenized references and one tokenized hypothesis.
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_exact_match(reference, hypothesis):
    """Return 1.0 if both texts are equal after normalization, otherwise 0.0.

    Normalization lower-cases the text, deletes every character listed in
    `string.punctuation`, and strips leading/trailing whitespace. Whitespace
    inside the text is left as it is.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def canonical(text):
        return text.lower().translate(strip_punctuation).strip()

    if canonical(reference) == canonical(hypothesis):
        return 1.0
    return 0.0

def calculate_f1_score(reference, hypothesis):
    """Return the token-overlap F1 score between `reference` and `hypothesis`.

    Texts are lower-cased, stripped of punctuation, and split on whitespace.
    The overlap is counted as a multiset intersection, so a token that occurs
    k times in both texts contributes k matches. Counting the overlap on sets
    while dividing by the full token counts would penalize repeated words:
    two identical texts containing a repeated token would score below 1.0.

    Args:
        reference: The reference (target) text.
        hypothesis: The generated text to score.

    Returns:
        F1 in the range [0.0, 1.0]; 0.0 when either text has no tokens or
        when the texts share no token.
    """
    from collections import Counter

    def normalize_text(text):
        text = text.lower()
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        return text.strip().split()

    reference_tokens = normalize_text(reference)
    hypothesis_tokens = normalize_text(hypothesis)

    if not reference_tokens or not hypothesis_tokens:
        return 0.0

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(reference_tokens) & Counter(hypothesis_tokens)
    common = sum(overlap.values())
    if common == 0:
        return 0.0

    precision = common / len(hypothesis_tokens)
    recall = common / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)

def calculate_readability(text):
    """Return readability indices for `text`.

    Returns:
        A dict with the keys 'flesch_kincaid_grade' and 'smog_index'. Both are
        0.0 when `text` is empty or whitespace-only, in which case textstat is
        not called.
    """
    if text.strip():
        grade = textstat.flesch_kincaid_grade(text)
        smog = textstat.smog_index(text)
        return {'flesch_kincaid_grade': grade, 'smog_index': smog}
    # Blank input: report zeros for both indices.
    return {'flesch_kincaid_grade': 0.0, 'smog_index': 0.0}

# --- Evaluation Loop ---

all_evaluation_results = []

print("Calculating metrics for baseline responses...")

# Assuming baseline_responses_df is available from the previous step (cell df4f0b29)
# Walk the three text columns in lockstep, one test example per iteration.
for instruction, reference, hypothesis in zip(
    baseline_responses_df['instruction'],
    baseline_responses_df['reference_response'],
    baseline_responses_df['generated_response'],
):
    if reference.strip() and hypothesis.strip():
        rouge_scores = calculate_rouge_scores(reference, hypothesis)
        bleu = calculate_bleu_score(reference, hypothesis)
        em = calculate_exact_match(reference, hypothesis)
        f1 = calculate_f1_score(reference, hypothesis)
        # Readability is measured on the generated text only.
        readability_scores = calculate_readability(hypothesis)
    else:
        # A blank reference or hypothesis scores zero on every metric.
        rouge_scores = {'rouge1_fmeasure': 0.0, 'rouge2_fmeasure': 0.0, 'rougeL_fmeasure': 0.0}
        bleu = 0.0
        em = 0.0
        f1 = 0.0
        readability_scores = {'flesch_kincaid_grade': 0.0, 'smog_index': 0.0}

    record = {
        'Experiment_ID': 'Baseline_Model',
        'Instruction': instruction,
        'Reference_Response': reference,
        'Generated_Response': hypothesis,
        'ROUGE-1_F': rouge_scores['rouge1_fmeasure'],
        'ROUGE-2_F': rouge_scores['rouge2_fmeasure'],
        'ROUGE-L_F': rouge_scores['rougeL_fmeasure'],
        'BLEU': bleu,
        'Exact_Match': em,
        'F1_Score': f1,
        'Flesch_Kincaid_Grade': readability_scores['flesch_kincaid_grade'],
        'SMOG_Index': readability_scores['smog_index'],
    }
    all_evaluation_results.append(record)

# One row per test example, one column per metric
baseline_metrics_df = pd.DataFrame(all_evaluation_results)

# Report the mean of every numeric metric across the test set
metric_columns = [
    'ROUGE-1_F', 'ROUGE-2_F', 'ROUGE-L_F', 'BLEU',
    'Exact_Match', 'F1_Score', 'Flesch_Kincaid_Grade', 'SMOG_Index',
]
print("\nAggregated Baseline Model Metrics:")
print(baseline_metrics_df[metric_columns].mean())

print("\nSample of detailed baseline metrics (first 5 entries):")
print(baseline_metrics_df.head())


Calculating metrics for baseline responses...


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Aggregated Baseline Model Metrics:
ROUGE-1_F                0.285507
ROUGE-2_F                0.083173
ROUGE-L_F                0.178051
BLEU                     0.029740
Exact_Match              0.000000
F1_Score                 0.156921
Flesch_Kincaid_Grade    12.228807
SMOG_Index              14.006848
dtype: float64

Sample of detailed baseline metrics (first 5 entries):
    Experiment_ID                                        Instruction  \
0  Baseline_Model  What are the treatments for Hypotrichosis simp...   
1  Baseline_Model                           What causes Trisomy 18 ?   
2  Baseline_Model  How to prevent Sexual and Urologic Problems of...   
3  Baseline_Model  Do you have information about Fluid and Electr...   
4  Baseline_Model  what research (or clinical trials) is being do...   

                                  Reference_Response  \
0  Is there treatment for hypotrichosis simplex? ...   
1  What causes Trisomy 18? In most cases, trisomy...   
2  People with diabe

**Reasoning**:
The previous steps successfully calculated and aggregated all the required automatic evaluation metrics for the baseline model, storing them in `baseline_metrics_df`. The next logical step is to acknowledge the completion of this subtask and confirm that the baseline results are now ready for the experiment tracking table.



In [None]:
# Persist the generated baseline responses so they survive a runtime restart
baseline_pickle_path = "baseline_responses.pkl"
baseline_responses_df.to_pickle(baseline_pickle_path)

# Restore them from disk (pandas is imported here so this part also works in a fresh session).
# Pickle files can execute code when loaded: only load files created by this notebook.
import pandas as pd
baseline_responses_df = pd.read_pickle(baseline_pickle_path)
print(f"Loaded {len(baseline_responses_df)} baseline responses.")


Loaded 1813 baseline responses.


In [None]:
import subprocess
import sys

print("Re-installing bitsandbytes and accelerate to ensure optimizer compatibility...")

# Upgrade both packages to their latest release, one after the other. A failed
# upgrade is reported but does not prevent the next package from being attempted.
for package_name in ("bitsandbytes", "accelerate"):
    pip_command = [sys.executable, "-m", "pip", "install", "-U", package_name]
    try:
        subprocess.check_call(pip_command)
        print(f"{package_name} re-installed successfully.")
    except Exception as e:
        print(f"Error re-installing {package_name}: {e}")

print("Please RESTART THE RUNTIME (e.g., in Colab: Runtime -> Restart runtime) after this cell finishes execution, then run all cells again starting from the beginning.")


Re-installing bitsandbytes and accelerate to ensure optimizer compatibility...
bitsandbytes re-installed successfully.
accelerate re-installed successfully.
Please RESTART THE RUNTIME (e.g., in Colab: Runtime -> Restart runtime) after this cell finishes execution, then run all cells again starting from the beginning.


In [None]:
import time
import torch
import os

# Re-import necessary libraries for the full setup
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import pandas as pd
import kagglehub

# --- Start: Re-creating `tokenizer` and `train_val_test_dataset` (from previous steps like df4f0b29 and 2c29636f) ---

# Load the tokenizer (from cell c403bba4)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token instead of registering a new '[PAD]'
    # token. A new token would receive an id beyond the model's existing
    # embedding matrix, and this cell never resizes the embeddings, so the
    # padded training batches built below would contain an id the model
    # cannot embed.
    tokenizer.pad_token = tokenizer.eos_token

# Load intronhealth/afrimedqa_v2 dataset (from cell 5a8f8fc5)
afrimedqa_dataset = load_dataset('intronhealth/afrimedqa_v2')

# Download MedQuAD dataset files (from cell 66260b14)
# kagglehub.dataset_download is efficient and uses caching, so no need for 'if not in locals()'
medquad_download_path = kagglehub.dataset_download("pythonafroz/medquad-medical-question-answer-for-ai-research")

# Load MedQuAD dataset from CSV into pandas DataFrame (from cell 4e3fafdf)
medquad_csv_path = os.path.join(medquad_download_path, 'medquad.csv')
medquad_df = pd.read_csv(medquad_csv_path)

# Convert MedQuAD to unified QA format (from cell 70257e3b)
# Both sources are mapped onto the same two columns: 'instruction' and 'response'.
# NOTE(review): astype(str) turns a missing value into the literal string 'nan',
# which the emptiness filter further down does not remove — confirm whether
# medquad.csv contains missing questions/answers.
medquad_qa_df = medquad_df.copy()
medquad_qa_df['instruction'] = medquad_qa_df['question'].astype(str)
medquad_qa_df['response'] = medquad_qa_df['answer'].astype(str)
medquad_qa_dataset = Dataset.from_pandas(medquad_qa_df[['instruction', 'response']])

# Convert afrimedqa_v2 to unified QA format (from cell ce5a4b86)
# Missing rationales become '' here so that the emptiness filter below drops them.
afrimedqa_df = afrimedqa_dataset['train'].to_pandas()
afrimedqa_df['instruction'] = afrimedqa_df['question'].astype(str)
afrimedqa_df['response'] = afrimedqa_df['answer_rationale'].fillna('').astype(str)
afrimedqa_qa_dataset = Dataset.from_pandas(afrimedqa_df[['instruction', 'response']])

# Concatenate unified datasets (from cell 07758188)
unified_dataset = concatenate_datasets([medquad_qa_dataset, afrimedqa_qa_dataset])

# Initial cleaning and split into training, validation, and test sets (from cell 2c29636f)
# Row counts before and after filtering are kept only to report how many rows were dropped.
initial_num_rows = len(unified_dataset)
unified_dataset = unified_dataset.filter(lambda example: example['instruction'].strip() != '' and example['response'].strip() != '')
cleaned_num_rows = len(unified_dataset)
print(f"Removed {initial_num_rows - cleaned_num_rows} entries due to empty instruction or response.")
print(f"Unified dataset after cleaning: {unified_dataset}")

# Every random step is seeded with 42 so the same split can be rebuilt later.
shuffled_dataset = unified_dataset.shuffle(seed=42)
# 80% train; the remaining 20% is halved into validation and test (10% each).
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)
val_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
train_val_test_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})
print("\nUnified dataset split into training, validation, and test sets successfully:")
print(train_val_test_dataset)

# Define a function to format the dataset (from cell c403bba4)
def format_prompt(example):
    """Render one QA example as a single instruction/response training text.

    Args:
        example: A mapping with the keys 'instruction' and 'response'.

    Returns:
        A dict with one key, 'text', holding the instruction under an
        '### Instruction:' header followed by the response under a
        '### Response:' header.
    """
    sections = (
        "### Instruction:",
        str(example['instruction']),
        "",
        "### Response:",
        str(example['response']),
    )
    return {'text': "\n".join(sections)}

# Apply the formatting function
# The original 'instruction'/'response' columns are dropped; only 'text' remains.
formatted_dataset = train_val_test_dataset.map(format_prompt, remove_columns=['instruction', 'response'])

# Tokenization function (from cell c403bba4)
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts.

    Reads the 'text' field of `examples` and passes it to the module-level
    `tokenizer`. Every sequence is truncated to at most 512 tokens and padded
    up to exactly 512 tokens (`padding='max_length'`).
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Apply tokenization
# batched=True hands the function several rows at once; 'text' is dropped afterwards.
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
print("\nDataset tokenized successfully.")

# --- End: Re-creating `tokenizer` and `train_val_test_dataset` ---

# --- Start: Re-creating `trainer` and `peft_model` (from cell eb72ebc5) ---

# 1. Load the base model and move it to a GPU if available
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device for training: {device}")

model = None
quantization_config = None

# Loading strategy:
#   CUDA available  -> try 4-bit (bitsandbytes); on any failure load unquantized bfloat16
#   CUDA unavailable -> load unquantized float32
if torch.cuda.is_available():
    try:
        # Ensure bitsandbytes is explicitly imported for validation
        import bitsandbytes
        print("Attempting to load model with 4-bit quantization using bitsandbytes.")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=False,
        )
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=quantization_config,
            dtype=torch.bfloat16, # Use bfloat16 for quantized model on CUDA
            device_map="auto"
        )
        print("Model loaded successfully with 4-bit quantization.")
    except Exception as e: # Catch any exception related to bitsandbytes loading
        # Best-effort fallback: the handler catches every Exception, not only
        # bitsandbytes errors. quantization_config may still hold the 4-bit
        # config afterwards even though the model below is loaded without it.
        print(f"Warning: 4-bit quantization failed ({e}). Falling back to loading model in full bfloat16 precision.")
        # If 4-bit fails, load without quantization, but still use bfloat16 if CUDA is available
        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            dtype=torch.bfloat16,
            device_map="auto"
        )
else:
    print("CUDA not available. Loading model in full float32 precision.")
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        dtype=torch.float32,
        device_map="auto"
    )

# 2. Configure LoRA parameters
# Names of the sub-modules that receive LoRA adapters
target_modules = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Collect the hyper-parameters in one place before building the config
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": target_modules,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
}
lora_config = LoraConfig(**lora_settings)

# 3. Wrap the base model with the LoRA adapters and report the trainable share
peft_model = get_peft_model(model, lora_config)
print("\nPEFT model summary:")
peft_model.print_trainable_parameters()

# Initialize the `TrainingArguments`
training_args = TrainingArguments(
    output_dir="./tinyllama_medqa_finetuned",
    num_train_epochs=1, # Not the limiting factor: a positive max_steps overrides it
    max_steps=1200, # Use max_steps to control total training steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8, # 4 x 8 = 32 examples per optimizer step (per device)
    optim="adamw_torch",
    learning_rate=2e-4,
    logging_steps=50,
    save_steps=200,
    # eval_steps only takes effect when the evaluation strategy is "steps".
    # The default strategy is "no", which silently skipped evaluation on the
    # validation set even though eval_dataset is passed to the Trainer.
    eval_strategy="steps",
    eval_steps=200,
    seed=42,
    fp16=False,
    bf16=model.dtype == torch.bfloat16, # Enable bf16 only when the model was loaded in bfloat16
    gradient_checkpointing=True,
    report_to="none",
    remove_unused_columns=True, # Set to True to prevent passing 'text' column
    save_total_limit=2 # Keep at most two checkpoints on disk
)

# Define the DataCollatorForLanguageModeling
# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
)
print("\nTrainer initialized successfully.")

# --- End: Re-creating `trainer` and `peft_model` ---


Using Colab cache for faster access to the 'medquad-medical-question-answer-for-ai-research' dataset.


Filter:   0%|          | 0/31687 [00:00<?, ? examples/s]

Removed 13565 entries due to empty instruction or response.
Unified dataset after cleaning: Dataset({
    features: ['instruction', 'response'],
    num_rows: 18122
})

Unified dataset split into training, validation, and test sets successfully:
DatasetDict({
    train: Dataset({
        features: ['instruction', 'response'],
        num_rows: 14497
    })
    validation: Dataset({
        features: ['instruction', 'response'],
        num_rows: 1812
    })
    test: Dataset({
        features: ['instruction', 'response'],
        num_rows: 1813
    })
})


Map:   0%|          | 0/14497 [00:00<?, ? examples/s]

Map:   0%|          | 0/1812 [00:00<?, ? examples/s]

Map:   0%|          | 0/1813 [00:00<?, ? examples/s]

Map:   0%|          | 0/14497 [00:00<?, ? examples/s]

Map:   0%|          | 0/1812 [00:00<?, ? examples/s]

Map:   0%|          | 0/1813 [00:00<?, ? examples/s]


Dataset tokenized successfully.
Using device for training: cuda
Attempting to load model with 4-bit quantization using bitsandbytes.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Model loaded successfully with 4-bit quantization.

PEFT model summary:
trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701

Trainer initialized successfully.


In [None]:
import subprocess
import sys

print("Re-installing bitsandbytes and accelerate to ensure optimizer compatibility...")

# Upgrade both packages to their latest release, one after the other. A failed
# upgrade is reported but does not prevent the next package from being attempted.
for package_name in ("bitsandbytes", "accelerate"):
    pip_command = [sys.executable, "-m", "pip", "install", "-U", package_name]
    try:
        subprocess.check_call(pip_command)
        print(f"{package_name} re-installed successfully.")
    except Exception as e:
        print(f"Error re-installing {package_name}: {e}")

print("Please RESTART THE RUNTIME (e.g., in Colab: Runtime -> Restart runtime) after this cell finishes execution, then run all cells again starting from the beginning.")


Re-installing bitsandbytes and accelerate to ensure optimizer compatibility...
bitsandbytes re-installed successfully.
accelerate re-installed successfully.
Please RESTART THE RUNTIME (e.g., in Colab: Runtime -> Restart runtime) after this cell finishes execution, then run all cells again starting from the beginning.


In [None]:
# =========================
# STEP-BASED TRAINING (908 steps / ~2 epochs)
# =========================

import os, time, torch, pandas as pd
from google.colab import drive
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType
import kagglehub

# -------- Experiment config --------
# Module-level knobs for this run. They are read further down when building
# TrainingArguments and when printing the experiment summary.
EXPERIMENT_ID = "Exp_002"  # also used as the per-experiment folder name on Drive
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Hub id of the model to fine-tune

MAX_STEPS = 908 # Approximately 2 epochs (14497 samples / 32 effective batch size = ~453.03 steps per epoch)
LR = 5e-5  # learning rate passed to TrainingArguments
BATCH_SIZE = 4  # per-device train batch size
GRAD_ACC = 8  # gradient accumulation steps; BATCH_SIZE * GRAD_ACC = 32 effective

# -------- Mount Drive --------
# Drive must be mounted before OUTPUT_DIR is created, since it lives under it.
drive.mount("/content/drive")

# Checkpoints and the final LoRA adapters are written below this directory.
OUTPUT_DIR = f"/content/drive/MyDrive/llm_experiments/{EXPERIMENT_ID}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------- Tokenizer --------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# If the tokenizer ships without a pad token, reuse EOS so padding is possible
# without adding a new vocabulary entry.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------- Load datasets --------
# Both sources are reduced to the same two string columns, "instruction" and
# "response", so they can be concatenated into a single corpus.
afrimedqa = load_dataset("intronhealth/afrimedqa_v2")

medquad_path = kagglehub.dataset_download(
    "pythonafroz/medquad-medical-question-answer-for-ai-research"
)
medquad_df = pd.read_csv(os.path.join(medquad_path, "medquad.csv"))

medquad_ds = Dataset.from_pandas(pd.DataFrame({
    "instruction": medquad_df["question"].astype(str),
    "response": medquad_df["answer"].astype(str)
}))

afrimedqa_df = afrimedqa["train"].to_pandas()
afrimedqa_ds = Dataset.from_pandas(pd.DataFrame({
    "instruction": afrimedqa_df["question"].astype(str),
    # Missing rationales become "" so the filter below drops those rows.
    "response": afrimedqa_df["answer_rationale"].fillna("").astype(str)
}))

dataset = concatenate_datasets([medquad_ds, afrimedqa_ds])
# Keep only rows where both fields contain non-whitespace text. The predicate
# returns an explicit bool rather than the (truthy) stripped string.
dataset = dataset.filter(lambda x: bool(x["instruction"].strip() and x["response"].strip()))

dataset = dataset.shuffle(seed=42)
# 80/10/10 train/validation/test split. Both calls are seeded because
# train_test_split shuffles again by default: without a seed the membership of
# each split changes on every run (including when resuming from a checkpoint),
# and it cannot match the evaluation cell, which rebuilds the split with
# seed=42. Keep the preprocessing above in lockstep with that cell, otherwise
# the two splits diverge and test rows can leak into training.
split1 = dataset.train_test_split(test_size=0.2, seed=42)
split2 = split1["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": split1["train"],
    "validation": split2["train"],
    "test": split2["test"]
})

# -------- Prompt formatting --------
def format_prompt(x):
    """Render one instruction/response record as a single training string.

    Args:
        x: Mapping with "instruction" and "response" entries.

    Returns:
        A dict with a single "text" key holding the instruction under a
        "### Instruction:" header followed by the response under a
        "### Response:" header.
    """
    instruction = x["instruction"]
    response = x["response"]
    text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": text}

# Replace the raw instruction/response columns with the formatted "text" column.
dataset = dataset.map(format_prompt, remove_columns=["instruction", "response"])

def tokenize(x):
    """Tokenize a batch of prompt strings, truncating each to 512 tokens.

    No padding is applied here. With `padding=True` inside a batched `map`,
    every row is padded to the longest row of its map batch and that padding
    is stored in the dataset, so training batches are not padded dynamically.
    Padding is left to the DataCollatorForLanguageModeling handed to the
    Trainer, which pads each training batch to its own longest sequence.

    Args:
        x: Batch mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(x["text"], truncation=True, max_length=512)

# Tokenize in batches and drop the now-redundant "text" column.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# -------- Model loading (4-bit if CUDA) --------
# A 4-bit NF4 quantization config is only built when a CUDA device is present;
# otherwise None is passed through and the model loads unquantized.
quant_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    if torch.cuda.is_available()
    else None
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=quant_config,
)

# -------- LoRA --------
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
lora = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report the trainable-parameter count.
model = get_peft_model(model, lora)
model.print_trainable_parameters()

# -------- TrainingArguments (STEP-BASED) --------
# Training length is governed by max_steps; checkpointing, evaluation and
# logging are all configured with a 50-step interval.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,          # ✅ ONLY THIS CONTROLS TRAINING
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    optim="adamw_torch",
    save_steps=50,
    # The old `evaluation_strategy` keyword raised TypeError and had been
    # dropped, which left eval_steps without effect (no evaluation during
    # training). The argument is named `eval_strategy` in current
    # transformers releases; setting it makes eval_steps take effect again.
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    save_total_limit=2,           # keep only the two most recent checkpoints
    bf16=torch.cuda.is_available(),
    fp16=False,
    report_to="none",
    remove_unused_columns=True,
    seed=42
)

# -------- Trainer --------
# mlm=False selects causal-LM collation; the collator also pads each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# -------- Resume if checkpoint exists --------
# Pick the `checkpoint-<step>` directory with the highest step under
# OUTPUT_DIR, or leave `checkpoint` as None to start from scratch.
checkpoint = None
if os.path.isdir(OUTPUT_DIR):
    # Only accept entries named exactly `checkpoint-<digits>`: anything else
    # (e.g. a renamed backup such as `checkpoint-500-old`) would make the
    # int() conversion used for sorting raise ValueError.
    ckpts = [
        os.path.join(OUTPUT_DIR, d)
        for d in os.listdir(OUTPUT_DIR)
        if d.startswith("checkpoint-") and d[len("checkpoint-"):].isdecimal()
    ]
    # Ignore stray files that happen to match the naming pattern.
    ckpts = [p for p in ckpts if os.path.isdir(p)]
    if ckpts:
        # Sort numerically so checkpoint-1000 ranks after checkpoint-500.
        ckpts.sort(key=lambda p: int(p.rsplit("-", 1)[-1]))
        checkpoint = ckpts[-1]
        print(f"Resuming from {checkpoint}")

# -------- Train --------
start_time = time.time()
# Clear CUDA cache and reset peak memory stats if GPU is available before training
peak_gpu_memory_gb = 0.0  # stays 0.0 on CPU-only runs so the summary can still print it
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    initial_gpu_memory_allocated = torch.cuda.memory_allocated()
    print(f"Initial GPU memory allocated: {initial_gpu_memory_allocated / (1024**3):.2f} GB")

print(f"\nStarting training for {EXPERIMENT_ID} for {MAX_STEPS} steps...")
# `checkpoint` is None for a fresh run, or a checkpoint directory to resume from.
trainer.train(resume_from_checkpoint=checkpoint)
print(f"\nTraining for {EXPERIMENT_ID} completed.")
end_time = time.time()

# Wall-clock time of this session only (a resumed run does not include the
# time spent before the checkpoint was written).
training_duration_seconds = end_time - start_time
# Format HH:MM:SS by hand: strftime("%H:%M:%S", gmtime(...)) wraps the hour
# field at 24, so a run of 25 h would be reported as 01:00:00.
_hours, _remainder = divmod(int(training_duration_seconds), 3600)
_minutes, _seconds = divmod(_remainder, 60)
training_duration_hms = f"{_hours:02d}:{_minutes:02d}:{_seconds:02d}"

# -------- Evaluate --------
metrics = trainer.evaluate()
final_validation_loss = metrics['eval_loss']
print(f"Validation loss: {final_validation_loss:.4f}")

# -------- Retrieve peak GPU memory allocated since the reset above (if CUDA was available) --------
# Read after trainer.evaluate(), so the peak covers both training and this evaluation pass.
if torch.cuda.is_available():
    peak_gpu_memory_bytes = torch.cuda.max_memory_allocated()
    peak_gpu_memory_gb = peak_gpu_memory_bytes / (1024**3)
    print(f"Peak GPU memory allocated: {peak_gpu_memory_gb:.2f} GB")

# -------- Save LoRA --------
# Persist the trained model via save_pretrained next to the checkpoints.
output_adapter_dir = f"{OUTPUT_DIR}/lora_adapters"
model.save_pretrained(output_adapter_dir)
print(f"\nFine-tuned LoRA adapters for {EXPERIMENT_ID} saved to: {output_adapter_dir}")

# -------- Print Experiment Summary --------
# One line per reported setting/metric, emitted in a single pass.
summary_lines = [
    f"\n--- Experiment Summary ({EXPERIMENT_ID} - {MAX_STEPS} Steps) ---",
    f"Experiment ID: {EXPERIMENT_ID}",
    f"Learning Rate: {LR}",
    f"Per Device Train Batch Size: {BATCH_SIZE}",
    f"Gradient Accumulation Steps: {GRAD_ACC}",
    f"Effective Batch Size: {BATCH_SIZE * GRAD_ACC}",
    f"Total Training Steps: {MAX_STEPS}",
    "Optimizer: adamw_torch",  # Explicitly stated for consistency
    f"Peak GPU Memory: {peak_gpu_memory_gb:.2f} GB",
    f"Training Time: {training_duration_hms}",
    f"Final Validation Loss: {final_validation_loss:.4f}",
    "--------------------------------------------------",
]
for summary_line in summary_lines:
    print(summary_line)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using Colab cache for faster access to the 'medquad-medical-question-answer-for-ai-research' dataset.


Filter:   0%|          | 0/31687 [00:00<?, ? examples/s]

Map:   0%|          | 0/14497 [00:00<?, ? examples/s]

Map:   0%|          | 0/1812 [00:00<?, ? examples/s]

Map:   0%|          | 0/1813 [00:00<?, ? examples/s]

Map:   0%|          | 0/14497 [00:00<?, ? examples/s]

Map:   0%|          | 0/1812 [00:00<?, ? examples/s]

Map:   0%|          | 0/1813 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701
Resuming from /content/drive/MyDrive/llm_experiments/Exp_002/checkpoint-500
Initial GPU memory allocated: 0.79 GB

Starting training for Exp_002 for 908 steps...


Step,Training Loss
550,1.061116
600,1.041444
650,1.048976
700,1.064952
750,1.051109
800,1.039883
850,1.022911
900,1.024441



Training for Exp_002 completed.


Validation loss: 1.0414
Peak GPU memory allocated: 9.78 GB

Fine-tuned LoRA adapters for Exp_002 saved to: /content/drive/MyDrive/llm_experiments/Exp_002/lora_adapters

--- Experiment Summary (Exp_002 - 908 Steps) ---
Experiment ID: Exp_002
Learning Rate: 5e-05
Per Device Train Batch Size: 4
Gradient Accumulation Steps: 8
Effective Batch Size: 32
Total Training Steps: 908
Optimizer: adamw_torch
Peak GPU Memory: 9.78 GB
Training Time: 04:05:30
Final Validation Loss: 1.0414
--------------------------------------------------


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import pandas as pd
import os
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import kagglehub
from google.colab import drive # Import drive

drive.mount("/content/drive") # Mount Google Drive

# --- Start of re-included dependencies for train_val_test_dataset and tokenizer ---

# Load the tokenizer (from cell c403bba4)
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token, exactly as the training cell does. Adding a
    # brand-new '[PAD]' token would create a token id with no embedding row,
    # because this cell never resizes the model's embeddings.
    tokenizer.pad_token = tokenizer.eos_token

# Columns shared by both sources once they are in the unified QA format.
qa_columns = ['instruction', 'response']

# Load intronhealth/afrimedqa_v2 dataset (from cell 5a8f8fc5)
afrimedqa_dataset = load_dataset('intronhealth/afrimedqa_v2')

# Download MedQuAD dataset files (from cell 66260b14) and read the CSV
# into a pandas DataFrame (from cell 4e3fafdf)
medquad_download_path = kagglehub.dataset_download("pythonafroz/medquad-medical-question-answer-for-ai-research")
medquad_csv_path = os.path.join(medquad_download_path, 'medquad.csv')
medquad_df = pd.read_csv(medquad_csv_path)

# Convert MedQuAD to unified QA format (from cell 70257e3b).
# assign() returns a new frame, so medquad_df itself is left untouched.
medquad_qa_df = medquad_df.assign(
    instruction=medquad_df['question'].astype(str),
    response=medquad_df['answer'].astype(str),
)
medquad_qa_dataset = Dataset.from_pandas(medquad_qa_df[qa_columns])

# Convert afrimedqa_v2 to unified QA format (from cell ce5a4b86).
# Missing rationales become '' so the cleaning step below removes them.
afrimedqa_df = afrimedqa_dataset['train'].to_pandas()
afrimedqa_df = afrimedqa_df.assign(
    instruction=afrimedqa_df['question'].astype(str),
    response=afrimedqa_df['answer_rationale'].fillna('').astype(str),
)
afrimedqa_qa_dataset = Dataset.from_pandas(afrimedqa_df[qa_columns])

# Concatenate unified datasets (from cell 07758188)
unified_dataset = concatenate_datasets([medquad_qa_dataset, afrimedqa_qa_dataset])


def _has_instruction_and_response(example):
    """Return True when neither field is empty after stripping whitespace."""
    return not (example['instruction'].strip() == '' or example['response'].strip() == '')


# Initial cleaning and split into training, validation, and test sets (from cell 2c29636f).
# Every random step is seeded with 42: an 80/20 cut first, then the 20% is halved.
unified_dataset = unified_dataset.filter(_has_instruction_and_response)
shuffled_dataset = unified_dataset.shuffle(seed=42)
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)
val_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
train_val_test_dataset = DatasetDict(
    train=train_test_split['train'],
    validation=val_test_split['train'],
    test=val_test_split['test'],
)
# --- End of re-included dependencies ---

# Load the base model without quantization for inference (or with if preferred and compatible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device for fine-tuned model inference: {device}")

# The adapter was saved to <OUTPUT_DIR>/lora_adapters by the training cell.
# OUTPUT_DIR is set explicitly here so this cell also works after a runtime
# restart; update it if the experiment folder is moved on Drive.
OUTPUT_DIR = "/content/drive/MyDrive/llm_experiments exp1/Exp_002" # Corrected OUTPUT_DIR per user's latest path
output_adapter_dir = f"{OUTPUT_DIR}/lora_adapters"

# Load the base model (bfloat16 on GPU, float32 on CPU)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

# Load the PEFT adapter.
# Fail loudly if the adapter cannot be loaded: silently continuing with the
# bare base model would generate base-model outputs and then save them as
# "fine-tuned" responses, corrupting the evaluation.
try:
    fine_tuned_model = PeftModel.from_pretrained(model, output_adapter_dir)
except Exception as e:
    print(f"Error loading PEFT adapter: {e}")
    raise RuntimeError(
        f"Could not load LoRA adapter from {output_adapter_dir}; "
        "re-check the path before generating fine-tuned responses."
    ) from e
print(f"Successfully loaded fine-tuned model from {output_adapter_dir}")

fine_tuned_model.eval() # Set model to evaluation mode

# The held-out test split built above (raw instruction/response text).
test_dataset_raw = train_val_test_dataset['test']

# One dict per test example: instruction, reference answer, model output.
fine_tuned_results = []
print("Generating fine-tuned responses for the test set...")

# Generation settings shared by every example: greedy decoding (for
# consistency with the baseline), at most 256 new tokens, EOS used as pad id.
generation_kwargs = dict(
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

for example in test_dataset_raw:
    original_instruction = str(example['instruction'])
    reference_response = str(example['response'])

    # Same layout as the training text, stopping right after the response header.
    instruction_prompt = f"""### Instruction:\n{original_instruction}\n\n### Response:"""

    # Tokenize the single prompt (truncated to 512 tokens) and move it to the device.
    encoded_input = tokenizer(
        instruction_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        output_ids = fine_tuned_model.generate(
            input_ids,
            attention_mask=attention_mask,
            **generation_kwargs,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = input_ids.shape[1]
    continuation_ids = output_ids[0][prompt_length:]
    generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

    fine_tuned_results.append({
        'instruction': original_instruction,
        'reference_response': reference_response,
        'generated_response': generated_text,
    })

print(f"Generated {len(fine_tuned_results)} fine-tuned responses.")

# Tabulate the collected results: one row per test example.
fine_tuned_responses_df = pd.DataFrame(fine_tuned_results)

# Show up to five rows as a quick sanity check of the generations.
print("\n--- Sample Fine-Tuned Responses (First 5) ---")
preview_rows = fine_tuned_responses_df.head(5).iterrows()
for sample_number, (_, sample) in enumerate(preview_rows, start=1):
    print(f"\n--- Sample {sample_number} ---")
    print(f"Instruction: {sample['instruction']}")
    print(f"Reference: {sample['reference_response']}")
    print(f"Generated: {sample['generated_response']}")
    print("-" * 20)

print(f"\nFine-tuned responses DataFrame shape: {fine_tuned_responses_df.shape}")

# Pickle the DataFrame into the experiment folder on Google Drive for later use.
fine_tuned_responses_path = os.path.join(OUTPUT_DIR, "fine_tuned_responses.pkl")
fine_tuned_responses_df.to_pickle(fine_tuned_responses_path)
print(f"Fine-tuned responses saved to {fine_tuned_responses_path}")

Mounted at /content/drive


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

afri_med_qa_15k_v2.4_phase_2_15275.csv:   0%|          | 0.00/8.64M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15275 [00:00<?, ? examples/s]

Using Colab cache for faster access to the 'medquad-medical-question-answer-for-ai-research' dataset.


Filter:   0%|          | 0/31687 [00:00<?, ? examples/s]

Using device for fine-tuned model inference: cuda


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Successfully loaded fine-tuned model from /content/drive/MyDrive/llm_experiments exp1/Exp_002/lora_adapters
Generating fine-tuned responses for the test set...
