In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel
import os
import bitsandbytes as bnb  # For 8-bit quantization
from evaluate import load
from tqdm import tqdm
import torch
import pandas as pd
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
pd.options.display.max_colwidth = None
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
hf_auth_token = os.getenv("HF_AUTH_TOKEN")



### Only before fine-tuning or testing original model ###

In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,  # Enable 8-bit quantization
#     llm_int8_threshold=6.0  # Adjust threshold for higher precision on sensitive layers
# )

# tokenizer = AutoTokenizer.from_pretrained(
#     "meta-llama/Meta-Llama-3-8B-Instruct",
#     use_auth_token=hf_auth_token,
#     cache_dir="/fs03/yu60/kojitanaka/model_cache"
# )

# model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Meta-Llama-3-8B-Instruct",
#     use_auth_token=hf_auth_token,
#     cache_dir="/fs03/yu60/kojitanaka/model_cache",
#     device_map="auto",  # Automatically maps layers to GPU
#     quantization_config=bnb_config,
# )

### Only testing fine-tuned ###

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0  # Adjust threshold for higher precision on sensitive layers
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("llama-lora_aqua/aqua_lora24_model_6000")

# Load the base model (this should match the model architecture used in fine-tuning)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache",
    device_map="auto",  # Automatically maps layers to GPU
    quantization_config=bnb_config,
)

# Apply the LoRA adapters
model = PeftModel.from_pretrained(base_model, "llama-lora_aqua/aqua_lora24_model_6000")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
aqua_rat = load_dataset("deepmind/aqua_rat", "raw")
aqua_df = pd.DataFrame(aqua_rat['train'])

In [9]:
aqua_df.columns
def preprocess_aqua(example):
    return {
        "text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please strictly follow this format: "
            "Explanation: [Your explanation], Answer: [A/B/C/D/E]."
            # f"Question: {example['question']}. "
            # f"Options: {example['options']}. "
            # "Please provide your answer in the exactly the same format as the example 1. Please keep the explanation short."
            # "Format is Explanation: your explanation, Answer: [A/B/C/D/E]"
            # "Example 1:"
            # "'Explanation: [1 + 1 = 2. Therefore, the answer is A], Answer: [A]'<|eot_id|>."
            # "Please note that you have to say Answer: before <|eot_id|>"
            "Your turn "
            "Explanation: "
        )
    }

# Apply preprocessing
tokenized_aqua = aqua_rat.map(preprocess_aqua)

In [None]:
# tokenized_aqua['train']
# sample_text = tokenized_aqua['train']['text'][0]
# print(tokenizer.eos_token_id)
eos_token_id = tokenizer.eos_token_id
eos_token = tokenizer.decode([eos_token_id])
print(f"Decoded EOS Token: {eos_token}")

In [11]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

def evaluate_explanations(predicted_explanations, correct_explanations):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores = []

    for pred, correct in zip(predicted_explanations, correct_explanations):
        if not pred or not correct:
            continue  # Skip if either explanation is missing

        # Calculate ROUGE scores
        rouge = scorer.score(correct, pred)
        rouge_scores['rouge1'].append(rouge['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(rouge['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(rouge['rougeL'].fmeasure)

        # Calculate BLEU score
        reference = correct.split()
        candidate = pred.split()
        bleu_scores.append(sentence_bleu([reference], candidate))

    # Average Scores
    avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1']) if rouge_scores['rouge1'] else 0
    avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2']) if rouge_scores['rouge2'] else 0
    avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL']) if rouge_scores['rougeL'] else 0
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    return avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu

def extract_last_answer(text):
    # Regex pattern to match "Answer:" and capture the alphabet in different formats
    pattern = r"Answer:\s*[\[\(]?([A-E])[\]\)]?[.)]?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)  # Extract the captured alphabet
    return None 

def extract_explanation(prediction_text):
    # Find the first occurrence of 'Answer:'
    answer_index = prediction_text.find("Answer:")
    if answer_index != -1:
        return prediction_text[:answer_index].strip()  # Return everything before 'Answer:'
    
    # If 'Answer:' is not found, return the full text
    return prediction_text.strip()

def remove_repetition(text, question):
    # Normalize whitespace and ensure consistent formatting
    question = question.strip()
    text = text.strip()
    # Remove the question if it appears in the generated text
    cleaned_text = text.replace(question, "").strip()
    return cleaned_text

def calculate_exact_match(predictions, correct_answers):
    correct = sum([1 for pred, correct in zip(predictions, correct_answers) if pred == correct])
    return (correct / len(correct_answers)) * 100

In [None]:
sample_texts = tokenized_aqua['test']['text']

size = 100
for i in range(0, size):
    sample_text = sample_texts[i]
    inputs = tokenizer(sample_text, return_tensors="pt")
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=400, eos_token_id=tokenizer.eos_token_id, top_k=1)
        prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        print(f"Prediction text: {prediction_text}")
        answer_only = remove_repetition(prediction_text, sample_text)
        answer = extract_last_answer(answer_only)
        print(f"Extracted Answer: {answer_only}")
        explanation = extract_explanation(answer_only)

        aqua_df.loc[i, 'prediction'] = answer
        aqua_df.loc[i, 'predicted_explanation'] = explanation
        
        print(f"Progress: {i} / {size}")
        print(f"Prediction: {answer} Answer: {aqua_df.loc[i]['correct']}")
    
predictions = aqua_df['prediction'][0:size].tolist()  # Your model predictions
correct_answers = aqua_df['correct'][0:size].tolist()  # Ground truth answers
print(f"predictions: {predictions}")
print(f"Correct answers: {correct_answers}")

# Evaluate
accuracy = calculate_exact_match(predictions, correct_answers)
print(f"Accuracy (Exact Match): {accuracy:.2f}%")


In [8]:
# Without fine-tuning: 26.00%
# Accuracy fine-tuned (500): 26.00%
# Accuracy fine-tuned (5000): 9.00% mostly just None
# Fine-tuned 24 with projection layer(1000): 26.00%
# Accuracy fine-tuned (6000): 9.00% mostly just None

In [None]:
predicted_explanations = aqua_df['predicted_explanation'][0:size].tolist()  # Predicted explanations
correct_explanations = aqua_df['rationale'][0:size].tolist()
# Example Usage
rouge1, rouge2, rougeL, bleu = evaluate_explanations(predicted_explanations, correct_explanations)
print(f"ROUGE-1: {rouge1:.2f}, ROUGE-2: {rouge2:.2f}, ROUGE-L: {rougeL:.2f}, BLEU: {bleu:.2f}")


In [None]:
from transformers import TrainingArguments, Trainer
from datasets import DatasetDict
# Prepare train and validation datasets
def preprocess_aqua(example):
    return {
        "input_text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please strictly follow this format: "
            "Explanation: [Your explanation], Answer: [A/B/C/D/E]."
        ),
        "target_text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please strictly follow this format: "
            "Explanation: [Your explanation], Answer: [A/B/C/D/E]."
            "Response:"
            f"Explanation: {example['rationale']}, Answer: [{example['correct']}]. "
        ),
    }

# Apply preprocessing
processed_aqua = aqua_rat.map(preprocess_aqua)

In [None]:
def tokenize_for_finetuning(example):
    input_encodings = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    target_encodings = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=512)
    
    # Combine inputs and labels
    input_encodings["labels"] = target_encodings["input_ids"]
    return input_encodings

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = processed_aqua.map(tokenize_for_finetuning, batched=True)

tokenized_datasets.save_to_disk("tokenized_aqua")
# Remove unnecessary columns
# tokenized_datasets = tokenized_datasets.remove_columns(["input_text", "target_text", "rationale", "correct"])

In [None]:
from transformers import TrainingArguments, Trainer
from datasets import load_from_disk

# Load the preprocessed dataset
tokenized_datasets = load_from_disk("tokenized_aqua")
train_subset = tokenized_datasets["train"].select(range(2000))
# train_subset = tokenized_datasets["train"]

# Replace the original train dataset with the subset (optional)
tokenized_datasets["train"] = train_subset


lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=16,  # Reduce scaling for stable updates
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none",  # Bias type ("none", "all", or "lora_only")
    task_type="CAUSAL_LM",  # Task type for the model
)

# Add LoRA adapters to the model
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-llama-lora",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=5,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=500,  # Adjust logging frequency for smaller dataset
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # Enable mixed precision
    label_smoothing_factor=0.1,
)


# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("llama-lora_aqua/aqua_lora16_model_6000")
tokenizer.save_pretrained("llama-lora_aqua/aqua_lora16_model_6000")