In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel
import os
import bitsandbytes as bnb  # For 8-bit quantization
from evaluate import load
from tqdm import tqdm
import torch
import pandas as pd
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
pd.options.display.max_colwidth = None
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
hf_auth_token = os.getenv("HF_AUTH_TOKEN")



In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0  # Adjust threshold for higher precision on sensitive layers
)


tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_auth_token,
    cache_dir="/fs03/yu60/kojitanaka/model_cache",
    device_map="auto",  # Automatically maps layers to GPU
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
print(torch.cuda.is_available())

True


In [4]:
dataset = load_dataset("timdettmers/openassistant-guanaco")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
# Language processing
squad_dataset = load_dataset("squad_v2")
# Reasoning
gsm8k_dataset = load_dataset("gsm8k", 'main')
# Memory Expert
nq_dataset = load_dataset("natural_questions")
# Decision Expert
arc_dataset = load_dataset("ai2_arc", "ARC-Easy")

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/235 [00:00<?, ?it/s]

## Language Processing SQUAD ##

In [6]:
def preprocess_squad(example):
    return {
        "text": f"Question: {example['question']}. Context: {example['context']}. "
                f"Please follow the following format and answer one or few words: Answer: [your answer] Explanation: [your explanation]. Answer:"
    }
# Apply preprocessing
tokenized_squad = squad_dataset.map(preprocess_squad)

In [None]:
squad_dataset['validation']['answers']
# small_validation_set['answers']

In [8]:
tokenized_squad
small_validation_set = tokenized_squad["validation"].select(range(50))

In [9]:

# Load SQuAD evaluation metric
squad_metric = load("squad")

In [None]:
# tokenizer('hello world!     a')
tokenizer.decode([128000, 15339, 1917, 0, 257, 264], skip_special_tokens=True)

In [11]:
def flatten(nested_list):
    flat_set = set()
    
    def helper(sublist):
        for item in sublist:
            if isinstance(item, list):
                helper(item)
            else:
                flat_set.add(item)
    
    helper(nested_list)
    return flat_set

def clean_and_tokenize(text):
    """
    Tokenizes a text into a set of unique words, removes punctuation, and returns the set.
    """
    import string

    # Remove punctuation and split into words
    tokens = text.translate(str.maketrans("", "", string.punctuation)).split()
    return set(tokens)

def calculate_f1_and_exact(predictions, references):
    f1_scores = []
    exact_matches = []
    df = pd.DataFrame(columns = ["prediction", "label"])

    for i, (pred, ref) in enumerate(zip(predictions, references)):
        # Extract tokens directly from the predicted answer and reference
        pred_tokens = clean_and_tokenize(pred["prediction_text"])
        ref_tokens = ref["answers"]["text"]  # Already a set from `evaluate_squad_llama`

        df.loc[i] = [pred_tokens, ref_tokens]
        # Exact Match: Check if predicted tokens exactly match reference tokens
        is_exact_match = pred_tokens == ref_tokens
        exact_matches.append(is_exact_match)

        # F1 Score: Calculate based on token overlap
        common_tokens = pred_tokens & ref_tokens  # Intersection
        if len(common_tokens) == 0:
            f1_scores.append(0)
            print("F1 Score: 0 (No common tokens)")
        else:
            precision = len(common_tokens) / len(pred_tokens) if len(pred_tokens) > 0 else 0
            recall = len(common_tokens) / len(ref_tokens) if len(ref_tokens) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            f1_scores.append(f1)
            print(f"Common Tokens: {common_tokens}")
            print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

        print("-" * 40)

    # Calculate averages
    avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
    exact_match_rate = sum(exact_matches) / len(exact_matches) if exact_matches else 0

    print(f"Average Exact Match: {exact_match_rate * 100:.2f}")
    print(f"Average F1 Score: {avg_f1 * 100:.2f}")
    return {"exact_match": exact_match_rate * 100, "f1": avg_f1 * 100, 'df': df}

In [12]:


def evaluate_squad_llama(model, tokenizer, dataset, split="validation"):
    model.eval()
    predictions = []
    references = []

    for i, sample in enumerate(dataset[split]):
        # Tokenize input for the model
        if len(sample["answers"]["text"]) == 0:
            print(f"Skipping unanswerable question ID: {sample['id']}")
            continue
        inputs = tokenizer(
            sample["text"], 
            return_tensors="pt", 
            truncation=True, 
            max_length=512
        )
        inputs = {key: value.to(model.device) for key, value in inputs.items()}

        # Generate predictions
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,  # Allow enough tokens for both answer and explanation
                temperature=0.7,
                num_beams=3,
                early_stopping=True,
                eos_token_id=tokenizer.eos_token_id
            )
            prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        print(f"Sample {i + 1}:")
        # print(f"Input Text: {sample['text']}")
        # print(f"Generated Output: {prediction_text}")

        # Extract the content after the second "Answer:" and before "Explanation:"
        parts = prediction_text.split("Answer:")
        if len(parts) > 2:
            # Extract from the second "Answer:"
            answer_part = parts[2].strip()
            end_idx = answer_part.find("Explanation:")
            predicted_answer = answer_part[:end_idx].strip() if end_idx != -1 else answer_part
        else:
            # Fallback if the format is not followed
            predicted_answer = prediction_text.strip()

        tokenized_prediction = clean_and_tokenize(predicted_answer)
        print(f"Extracted Predicted Answer: {tokenized_prediction}")

        # Store prediction
        predictions.append({"id": sample["id"], "prediction_text": predicted_answer})

        # Extract ground truth answers (deduplicated)
        nested_list = [answer.split() for answer in sample["answers"]["text"]]
        flat_set = flatten(nested_list)
        references.append({
            "id": sample["id"],
            "answers": {
                # Split each answer into words, then store unique words as a set
                "text": flat_set
            }
        })
        print('answers: ', flat_set)

    # Compute metrics
    results = calculate_f1_and_exact(predictions, references)
    print(f"Final Exact Match (EM): {results['exact_match']:.2f}")
    print(f"Final F1 Score: {results['f1']:.2f}")
    return results

In [29]:
# results = evaluate_squad_llama(model, tokenizer, {"validation": small_validation_set}, split="validation")

In [None]:
df = results['df']
df.head(50)

### Math Reasoning Aquarat ###

In [6]:
aqua_rat = load_dataset("deepmind/aqua_rat", "raw")
aqua_df = pd.DataFrame(aqua_rat['train'])

In [23]:
aqua_df.columns

Index(['question', 'options', 'rationale', 'correct', 'prediction',
       'predicted_explanation'],
      dtype='object')

In [7]:
aqua_df.columns
def preprocess_aqua(example):
    return {
        "text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please provide your answer in the exactly the same format as the example 1. Please keep the explanation short."
            "Format is Explanation: your explanation, Answer: [A/B/C/D/E]"
            "Example 1:"
            "'Explanation: [1 + 1 = 2. Therefore, the answer is A], Answer: [A]'<|eot_id|>."
            "Your turn "
            "Explanation: "
        )
    }

# Apply preprocessing
tokenized_aqua = aqua_rat.map(preprocess_aqua)

In [8]:
# tokenized_aqua['train']
# sample_text = tokenized_aqua['train']['text'][0]
# print(tokenizer.eos_token_id)
eos_token_id = tokenizer.eos_token_id
eos_token = tokenizer.decode([eos_token_id])
print(f"Decoded EOS Token: {eos_token}")

Decoded EOS Token: <|eot_id|>


In [9]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu


In [10]:
def extract_last_answer(text):
    # Regex pattern to match "Answer:" and capture the alphabet in different formats
    pattern = r"Answer:\s*[\[\(]?([A-E])[\]\)]?[.)]?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)  # Extract the captured alphabet
    return None 

def extract_explanation(prediction_text):
    # Find the first occurrence of 'Answer:'
    answer_index = prediction_text.find("Answer:")
    if answer_index != -1:
        return prediction_text[:answer_index].strip()  # Return everything before 'Answer:'
    
    # If 'Answer:' is not found, return the full text
    return prediction_text.strip()

def remove_repetition(text, question):
    # Normalize whitespace and ensure consistent formatting
    question = question.strip()
    text = text.strip()
    # Remove the question if it appears in the generated text
    cleaned_text = text.replace(question, "").strip()
    return cleaned_text

def calculate_exact_match(predictions, correct_answers):
    correct = sum([1 for pred, correct in zip(predictions, correct_answers) if pred == correct])
    return (correct / len(correct_answers)) * 100

In [11]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

def evaluate_explanations(predicted_explanations, correct_explanations):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    bleu_scores = []

    for pred, correct in zip(predicted_explanations, correct_explanations):
        if not pred or not correct:
            continue  # Skip if either explanation is missing

        # Calculate ROUGE scores
        rouge = scorer.score(correct, pred)
        rouge_scores['rouge1'].append(rouge['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(rouge['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(rouge['rougeL'].fmeasure)

        # Calculate BLEU score
        reference = correct.split()
        candidate = pred.split()
        bleu_scores.append(sentence_bleu([reference], candidate))

    # Average Scores
    avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1']) if rouge_scores['rouge1'] else 0
    avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2']) if rouge_scores['rouge2'] else 0
    avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL']) if rouge_scores['rougeL'] else 0
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

    return avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu

In [18]:
sample_texts = tokenized_aqua['train']['text']
size = 5
for i in range(0, size):
    sample_text = sample_texts[i]
    inputs = tokenizer(sample_text, return_tensors="pt")
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=400, eos_token_id=tokenizer.eos_token_id, top_k=1)
        prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        answer_only = remove_repetition(prediction_text, sample_text)
        answer = extract_last_answer(answer_only)
        print("answer only", answer_only)
        explanation = extract_explanation(answer_only)

        aqua_df.loc[i, 'prediction'] = answer
        aqua_df.loc[i, 'predicted_explanation'] = explanation
        
        print(f"Progress: {i} / {size}")
        print(f"Prediction: {answer} Answer: {aqua_df.loc[i]['correct']}")
    
predictions = aqua_df['prediction'][0:size].tolist()  # Your model predictions
correct_answers = aqua_df['correct'][0:size].tolist()  # Ground truth answers
print(f"predictions: {predictions}")
print(f"Correct answers: {correct_answers}")

# Evaluate
accuracy = calculate_exact_match(predictions, correct_answers)
print(f"Accuracy (Exact Match): {accuracy:.2f}%")


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


answer only <|begin_of_text|> 43 km is the total distance. Since they are walking towards each other, the distance between them will decrease by 15% of Friend Q's rate. Let Friend Q's rate be x. Then, Friend P's rate is x + 15% of x = 1.15x. The relative speed is x + x = 2x. So, the distance between them will decrease by 2x km every hour. Since the total distance is 43 km, the time taken to meet is 43 / (2x) hours. In this time, Friend P will have walked 1.15x * (43 / (2x)) = 20.525 km. Answer: [B]<|eot_id|>
Progress: 0 / 5
Prediction: B Answer: E


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


answer only <|begin_of_text|> 5 is the x-coordinate of the point where the line crosses the x-axis, so x = 5. The slope is 1/5, so the y-coordinate of the point where the line crosses the y-axis is y = 1. Therefore, the line passes through the points (5, 1) and (0, 1). Answer: [C]<|eot_id|>
Progress: 1 / 5
Prediction: C Answer: C


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


answer only <|begin_of_text|> 1. x@y = x^2 - xy. If xy = 0, then x@y = 0. 2. (xy)@y = (xy)^2 - (xy)y = x^2y^2 - x^2y = x^2y(x - y). If xy ≠ 0, then x - y ≠ 0. Therefore, (xy)@y ≠ 0. 3. x@(x + y) = x^2 - x(x + y) = x^2 - x^2 - xy = -xy. If xy ≠ 0, then -xy ≠ 0. Therefore, x@(x + y) ≠ 0. Answer: [D]<|eot_id|>
Progress: 2 / 5
Prediction: D Answer: B


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


answer only <|begin_of_text|> 4% quarterly is 1% monthly, which is 0.01 monthly. So, the interest paid in a year is 0.01 * 12 = 0.12. This is 12% of $10,000, which is $1200. However, this is not the correct answer. The interest is compounded quarterly, so we need to calculate the interest on the interest. The interest in the first quarter is $200. In the second quarter, the interest is calculated on $10,200, which is $204. In the third quarter, the interest is calculated on $10,404, which is $208.08. In the fourth quarter, the interest is calculated on $10,612.08, which is $212.25. The total interest paid in a year is $200 + $204 + $208.08 + $212.25 = $824.33. This is approximately $850. Answer: [C]<|eot_id|>
Progress: 3 / 5
Prediction: C Answer: A
answer only <|begin_of_text|> 80 meters is equal to 0.08 km. So, the total speed downstream is 25 + 11 = 36 kmph. Time = Distance / Speed = 0.08 / 36 = 0.00222 hours. 1 hour = 3600 seconds. So, 0.00222 hours = 8 seconds. Answer: [D]<|eot_id|

In [16]:
predicted_explanations = aqua_df['predicted_explanation'][0:size].tolist()  # Predicted explanations
correct_explanations = aqua_df['rationale'][0:size].tolist()
# Example Usage
rouge1, rouge2, rougeL, bleu = evaluate_explanations(predicted_explanations, correct_explanations)
print(f"ROUGE-1: {rouge1:.2f}, ROUGE-2: {rouge2:.2f}, ROUGE-L: {rougeL:.2f}, BLEU: {bleu:.2f}")


ROUGE-1: 0.35, ROUGE-2: 0.11, ROUGE-L: 0.24, BLEU: 0.00


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [19]:
aqua_df.head(2)
# sample_text = aqua_df['answer'][0]

Unnamed: 0,question,options,rationale,correct,prediction,predicted_explanation
0,"Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?","[A)21, B)21.5, C)22, D)22.5, E)23]","If Q complete x kilometers, then P completes 1.15x kilometers.\nx + 1.15x = 43\n2.15x=43\nx = 43/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is E.",E,B,"<|begin_of_text|> 43 km is the total distance. Since they are walking towards each other, the distance between them will decrease by 15% of Friend Q's rate. Let Friend Q's rate be x. Then, Friend P's rate is x + 15% of x = 1.15x. The relative speed is x + x = 2x. So, the distance between them will decrease by 2x km every hour. Since the total distance is 43 km, the time taken to meet is 43 / (2x) hours. In this time, Friend P will have walked 1.15x * (43 / (2x)) = 20.525 km."
1,"In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively?","[A)4 and 1, B)1 and 5, C)5 and 1, D)3 and 5, E)5 and 3]","Line k passes through the origin and has slope 1/5 means that its equation is y=1/5*x.\nThus: (x, 1)=(5, 1) and (5, y) = (5,1) -->x=5 and y=1\nAnswer: C",C,C,"<|begin_of_text|> 5 is the x-coordinate of the point where the line crosses the x-axis, so x = 5. The slope is 1/5, so the y-coordinate of the point where the line crosses the y-axis is y = 1. Therefore, the line passes through the points (5, 1) and (0, 1)."


In [24]:
from transformers import TrainingArguments, Trainer
from datasets import DatasetDict
# Prepare train and validation datasets
def preprocess_aqua(example):
    return {
        "input_text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please provide your answer in the exactly the same format as the example 1. Please keep the explanation short. "
            "Format is Explanation: your explanation, Answer: [A/B/C/D/E]. "
            "Example 1: "
            "'Explanation: [1 + 1 = 2. Therefore, the answer is A], Answer: [A]'<|eot_id|>. "
            "Your turn "
            "Explanation: "
        ),
        "target_text": (
            f"Question: {example['question']}. "
            f"Options: {example['options']}. "
            "Please provide your answer in the exactly the same format as the example 1. Please keep the explanation short. "
            "Format is Explanation: your explanation, Answer: [A/B/C/D/E]. "
            "Example 1: "
            "'Explanation: [1 + 1 = 2. Therefore, the answer is A], Answer: [A]'<|eot_id|>. "
            "Your turn "
            f"Explanation: {example['rationale']}, Answer: [{example['correct']}]"
        ),
    }

# Apply preprocessing
processed_aqua = aqua_rat.map(preprocess_aqua)

Map:   0%|          | 0/97467 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

In [41]:
def tokenize_for_finetuning(example):
    input_encodings = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    target_encodings = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=512)
    
    # Combine inputs and labels
    input_encodings["labels"] = target_encodings["input_ids"]
    return input_encodings

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = processed_aqua.map(tokenize_for_finetuning, batched=True)

tokenized_datasets.save_to_disk("tokenized_aqua")
# Remove unnecessary columns
# tokenized_datasets = tokenized_datasets.remove_columns(["input_text", "target_text", "rationale", "correct"])

Map:   0%|          | 0/97467 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/97467 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/254 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/254 [00:00<?, ? examples/s]

In [44]:
from transformers import TrainingArguments, Trainer
from datasets import load_from_disk

# Load the preprocessed dataset
tokenized_datasets = load_from_disk("tokenized_aqua")
train_subset = tokenized_datasets["train"].select(range(500))

# Replace the original train dataset with the subset (optional)
tokenized_datasets["train"] = train_subset


lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=16,  # Reduce scaling for stable updates
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none",  # Bias type ("none", "all", or "lora_only")
    task_type="CAUSAL_LM",  # Task type for the model
)

# Add LoRA adapters to the model
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-llama-lora",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=5,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=500,  # Adjust logging frequency for smaller dataset
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # Enable mixed precision
    label_smoothing_factor=0.1,
)


# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-llama-lora/aqua_lora_model")
tokenizer.save_pretrained("fine-tuned-llama-lora/aua_lora_model")

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


  trainer = Trainer(


Step,Training Loss
500,3.5364
1000,2.6175
1500,2.5472
2000,2.516
2500,2.4753
3000,2.5257
3500,2.4915


KeyboardInterrupt: 

In [42]:
# tokenized_datasets["train"]
tokenized_datasets["test"]

Dataset({
    features: ['question', 'options', 'rationale', 'correct', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 254
})

## Reasoning GSM8k ##

In [21]:
def preprocess_gsm8k(example):
    return {
        "text": f"question: {example['question']} solution: {example['answer']}"
    }

tokenized_gsm8k = gsm8k_dataset.map(preprocess_gsm8k)

## Memory Expert: Natural Questions ##

In [None]:
math_df = pd.DataFrame(tokenized_gsm8k['train'])
math_df['answer'][0]
math_df

## Decision Expert: AI2 Artc ##

In [23]:
def preprocess_arc(example):
    # Combine choices into a single string with labels
    choices_text = " ".join(
        [f"({label}) {text}" for text, label in zip(example["choices"]["text"], example["choices"]["label"])]
    )
    return {
        "text": f"question: {example['question']} choices: {choices_text} answer: {example['answerKey']}"
    }

tokenized_arc = arc_dataset.map(preprocess_arc)

In [24]:
import math
from tqdm import tqdm

def evaluate_model(model, tokenizer, dataset, split="test"):
    model.eval()
    total_loss = 0
    count = 0

    for sample in tqdm(dataset[split]):
        inputs = tokenizer(sample["text"], return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: value.to(model.device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            total_loss += outputs.loss.item()
            count += 1

    avg_loss = total_loss / count
    perplexity = math.exp(avg_loss)
    print(f"Perplexity on {split} set: {perplexity}")
    return perplexity



In [None]:
# Evaluate the model
evaluate_model(model, tokenizer, dataset) # 7.275754489173488

In [None]:
print(dataset)

In [27]:
from transformers import DataCollatorForLanguageModeling

# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)
# Assign the eos_token as the pad_token
tokenizer.pad_token = tokenizer.eos_token
# Apply preprocessing to both train and test splits
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["text"])

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Not using masked language modeling
)


In [None]:
from transformers import TrainingArguments, Trainer
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Training arguments
training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    bf16=True,  # Use bf16 if your GPU supports it
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.1,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("output/lora_finetuned_model")
tokenizer.save_pretrained("output/lora_finetuned_model")

In [None]:
# Load the fine-tuned model and tokenizer
model_path = "output/lora_finetuned_model"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Evaluate the fine-tuned model on the test dataset
evaluate_model(model, tokenizer, dataset, split="test") # 4.153494606255717

In [3]:
dataset = load_dataset("boolq")

In [None]:
# Extract texts from the evaluation dataset
eval_texts = [item["text"] for item in eval_dataset]  # Adjust the key based on your dataset
len(eval_texts)

In [None]:
# Calculate perplexity
perplexity = calculate_perplexity(model, tokenizer, eval_texts)
print(f"Perplexity: {perplexity}")

In [4]:
def create_prompt(question):
    prompt = f"""
    Please answer the following question with either "True" or "False" only. Do not provide explanations or repeat the question.
    Question: {question}
    Answer:"""
    return prompt


In [None]:
correct = 0
total = 0

# Iterate through dataset examples
for i in range(100):  # Adjust range for your dataset
    record = dataset['train'][i]
    question = record['question']
    answer = str(record['answer'])  # Ensure the answer is clean

    # Create the input prompt
    input_text = create_prompt(question=question)

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate output
    output = model.generate(
        **inputs,
        max_new_tokens=2,  # Limit output to 2 tokens
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and post-process the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    if "True" in generated_text:
        generated_text = "True"
    elif "False" in generated_text:
        generated_text = "False"
    else:
        generated_text = "Unknown"  # Handle unexpected outputs

    # Increment counters
    total += 1
    if generated_text == answer:  # Check if the generated answer matches the expected answer
        correct += 1

    # Print the result for debugging (optional)
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_text}")
    print(f"Expected Answer: {answer}")

# Final accuracy report
print(f"Total Cases: {total}")
print(f"Correct Cases: {correct}")
accuracy = (correct / total) * 100 if total > 0 else 0
print(f"Accuracy: {accuracy:.2f}%")

In [12]:
torch.cuda.empty_cache()

In [None]:
# Select only the first 100 records
train_dataset = dataset["train"].select(range(100))

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
# Tokenization function with padding
def tokenize_function(examples):
    inputs = [f"Question: {q}\nAnswer:" for q in examples["question"]]
    outputs = [f" {str(a)}" for a in examples["answer"]]  # Ensure space before label
    
    # Tokenize inputs and outputs together
    tokenized_inputs = tokenizer(inputs, text_target=outputs, padding="max_length", truncation=True, max_length=512)
    return tokenized_inputs

# Tokenize dataset
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["passage"])
tokenized_train

In [None]:
# Prepare LoRA configuration
lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=16,  # Reduce scaling for stable updates
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none",  # Bias type ("none", "all", or "lora_only")
    task_type="CAUSAL_LM",  # Task type for the model
)

# Add LoRA adapters to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters to confirm LoRA is applied
model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-llama-lora",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=5,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,  # Adjust logging frequency for smaller dataset
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # Enable mixed precision
    label_smoothing_factor=0.1,
)

# Define the Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
)

# Start training
trainer.train()

# Save the fine-tuned model with LoRA adapters
model.save_pretrained("./fine-tuned-llama-lora")
tokenizer.save_pretrained("./fine-tuned-llama-lora")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-llama-lora")

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Apply the LoRA adapters
model = PeftModel.from_pretrained(base_model, "./fine-tuned-llama-lora")

In [None]:
correct = 0
total = 0

# Iterate through dataset examples
for i in range(100):  # Adjust range for your dataset
    record = dataset['train'][i]
    question = record['question']
    answer = str(record['answer'])  # Ensure the answer is clean

    # Create the input prompt
    input_text = create_prompt(question=question)

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate output
    output = model.generate(
        **inputs,
        max_new_tokens=2,  # Limit output to 2 tokens
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and post-process the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    if "True" in generated_text:
        generated_text = "True"
    elif "False" in generated_text:
        generated_text = "False"
    else:
        generated_text = "Unknown"  # Handle unexpected outputs

    # Increment counters
    total += 1
    if generated_text == answer:  # Check if the generated answer matches the expected answer
        correct += 1

    # Print the result for debugging (optional)
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_text}")
    print(f"Expected Answer: {answer}")

# Final accuracy report
print(f"Total Cases: {total}")
print(f"Correct Cases: {correct}")
accuracy = (correct / total) * 100 if total > 0 else 0
print(f"Accuracy: {accuracy:.2f}%")