### Importing Dependencies

In [None]:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from tokenizers import Tokenizer
from tqdm.auto import tqdm
import numpy as np
import json
from ModelArchitecture import Transformer, ModelConfig

### Device 

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

### Load Model and Tokenizer

In [None]:
# Tokenizer shared by every benchmark below.
tokenizer = Tokenizer.from_file("LumenTokenizer.json")
vocab_size = tokenizer.get_vocab_size()
print(f"Vocab size: {vocab_size}")

# The model's vocabulary size is fixed by the checkpoint, so it is declared
# here rather than taken from the tokenizer.
model_vocab_size = 32000

# A tokenizer that can emit ids >= model_vocab_size would index past the
# model's vocabulary. The scoring helpers catch such errors and return
# float('inf'), which would silently corrupt accuracy, so surface it up front.
if vocab_size > model_vocab_size:
    print(
        f"WARNING: tokenizer vocab size ({vocab_size}) exceeds the model's "
        f"vocab_size ({model_vocab_size}); token ids outside the model "
        "vocabulary may be produced."
    )

# Architecture hyper-parameters; these have to match the checkpoint loaded below.
config = ModelConfig(
    vocab_size=model_vocab_size,
    hidden_size=768,
    n_heads=12,
    n_kv_heads=4,
    n_kv_groups=3,
    head_dim=64,
    n_layers=12,
    attention_bias=False,
    intermediate_size=3072,
    mlp_bias=False,
    eps=1e-5,
    dropout=0.0,
    max_position_embeddings=2048,
    pre_norm=True,
    tie_weights=True,
    max_seq_len=2048,
)

# Initialize model
model = Transformer(config).to(device)

# Load checkpoint
checkpoint_path = "../Models/best_model_params_80k.pt"
# The loaded object is passed straight to load_state_dict, so only tensors are
# needed; weights_only=True stops torch.load from unpickling arbitrary objects.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint)
# Inference mode for evaluation.
model.eval()

print(f"Model loaded from {checkpoint_path}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

### HellaSwag

#### Import Dataset

In [None]:
# Load the HellaSwag validation split via `datasets.load_dataset`.
# NOTE: the PIQA section later rebinds the name `dataset`, so re-run this cell
# before re-running the HellaSwag benchmark.
dataset = load_dataset("Rowan/hellaswag", split="validation")
print(f"Loaded {len(dataset)} examples from HellaSwag validation set")

#### Evaluation Functions


In [None]:
def preprocess_text(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    return text.strip()


def encode_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer``; return the ids as a ``torch.long`` tensor."""
    token_ids = tokenizer.encode(text).ids
    return torch.tensor(token_ids, dtype=torch.long)


def compute_perplexity(model, input_ids):
    """
    Score a token sequence by its mean per-token negative log-likelihood.

    Despite the name, the returned value is the average NLL (natural log),
    i.e. the logarithm of the perplexity rather than the perplexity itself.
    The two are monotonically related, so ranking candidates by this score is
    equivalent to ranking them by perplexity.
    Lower score = higher likelihood = better continuation.

    Args:
        model: Callable mapping a ``(1, seq_len)`` tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        input_ids: 1-D tensor of token ids (no batch dimension).

    Returns:
        float: mean NLL over the ``seq_len - 1`` predicted tokens, or
        ``float('inf')`` when the sequence has fewer than two tokens.
    """
    # With fewer than two tokens there is no next-token target. The mean of an
    # empty tensor is NaN, and np.argmin (used by the callers) would select a
    # NaN entry as the "best" score, so report the worst score instead.
    if input_ids.numel() < 2:
        return float('inf')

    with torch.no_grad():
        input_ids = input_ids.unsqueeze(0).to(device)  # Add batch dimension
        logits = model(input_ids)

        # Shift logits and labels for next token prediction:
        # position t predicts token t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()

        # Calculate log probabilities
        log_probs = F.log_softmax(shift_logits, dim=-1)

        # Gather the log probabilities of the actual tokens
        token_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)

        # Average negative log likelihood
        avg_nll = -token_log_probs.mean().item()

        return avg_nll


def compute_continuation_score(model, context, continuation):
    """
    Score ``continuation`` as a follow-up to ``context`` (lower is better).

    The two strings are joined with a single space and the whole sequence is
    scored with ``compute_perplexity``. Sequences longer than
    ``config.max_seq_len`` tokens, and any sequence whose processing raises,
    get ``float('inf')`` (the worst score).
    """
    joined = " ".join((context, continuation))

    try:
        token_ids = encode_text(joined)

        if len(token_ids) > config.max_seq_len:
            # Too long for the model: worst score instead of truncating.
            return float('inf')

        return compute_perplexity(model, token_ids)
    except Exception as e:
        # Best effort: report the problem and rank this candidate last.
        print(f"Error processing text: {e}")
        return float('inf')


def evaluate_hellaswag_example(model, example):
    """
    Score every ending of one HellaSwag example and pick the best one.

    Returns a 4-tuple ``(is_correct, predicted_label, correct_label, scores)``
    where ``scores`` holds one score per ending, in order, and the prediction
    is the index of the lowest score.
    """
    ctx = preprocess_text(example['ctx'])
    candidates = [preprocess_text(text) for text in example['endings']]
    gold = int(example['label'])

    # One score per candidate ending; lower means a better fit.
    scores = [compute_continuation_score(model, ctx, candidate) for candidate in candidates]

    best = np.argmin(scores)

    return best == gold, best, gold, scores

#### Run Benchmark

In [None]:
def run_hellaswag_benchmark(model, dataset, num_examples=None, save_results=True):
    """
    Run the HellaSwag benchmark over the first ``num_examples`` examples.

    Args:
        model: The language model to evaluate
        dataset: HellaSwag dataset; must support ``len()`` and integer indexing
        num_examples: Number of examples to evaluate (None = all). The value
            is clamped to the range ``[0, len(dataset)]``.
        save_results: Whether to save detailed results to
            'hellaswag_benchmark_results.json' in the working directory

    Returns:
        Dictionary with 'accuracy' (percentage; 0.0 when nothing was
        evaluated), 'correct', 'total', 'num_examples' and
        'detailed_results' (one entry per evaluated example)
    """
    if num_examples is None:
        num_examples = len(dataset)
    else:
        # Clamp so a negative request becomes an empty run instead of being
        # reported verbatim as the number of evaluated examples.
        num_examples = max(0, min(num_examples, len(dataset)))

    correct = 0
    total = 0
    results = []

    print(f"Evaluating on {num_examples} examples...")

    for i in tqdm(range(num_examples)):
        example = dataset[i]
        is_correct, pred_label, true_label, scores = evaluate_hellaswag_example(model, example)

        if is_correct:
            correct += 1
        total += 1

        # Convert numpy scalars to built-in types so the record is JSON serializable.
        result = {
            'index': i,
            'context': example['ctx'],
            'endings': example['endings'],
            'predicted': int(pred_label),
            'correct': int(true_label),
            'is_correct': bool(is_correct),
            'scores': [float(s) for s in scores]
        }
        results.append(result)

        # Print progress every 100 examples
        if (i + 1) % 100 == 0:
            current_acc = correct / total * 100
            print(f"Progress: {i+1}/{num_examples} | Current Accuracy: {current_acc:.2f}%")

    # An empty run (empty dataset or num_examples <= 0) would otherwise
    # raise ZeroDivisionError here.
    accuracy = correct / total * 100 if total else 0.0

    benchmark_results = {
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'num_examples': num_examples,
        'detailed_results': results
    }

    # Save results to file
    if save_results:
        with open('hellaswag_benchmark_results.json', 'w', encoding='utf-8') as f:
            json.dump(benchmark_results, f, indent=2)
        print(f"\nResults saved to 'hellaswag_benchmark_results.json'")

    return benchmark_results


# Run the benchmark on the first `num_examples` validation examples.
# Pass num_examples=None to evaluate on the full dataset.
results = run_hellaswag_benchmark(
    model=model,
    dataset=dataset,
    num_examples=1024,  # subset size; set to None for full evaluation
    save_results=True
)

# Summary of the run above.
print(f"\n{'='*50}")
print(f"HELLASWAG BENCHMARK RESULTS")
print(f"{'='*50}")
print(f"Accuracy: {results['accuracy']:.2f}%")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"{'='*50}")

#### Analyze Results

In [None]:
# Show up to three correct and three incorrect predictions.
# Expects `results` to be the dictionary returned by run_hellaswag_benchmark:
# each entry of results['detailed_results'] carries 'context', 'endings',
# 'predicted', 'correct', 'is_correct' and 'scores'.
print("=" * 80)
print("SAMPLE CORRECT PREDICTIONS")
print("=" * 80)

correct_examples = [r for r in results['detailed_results'] if r['is_correct']]
for i, example in enumerate(correct_examples[:3]):
    print(f"\nExample {i+1}:")
    # Context and ending are truncated to keep the printout short.
    print(f"Context: {example['context'][:100]}...")
    print(f"Predicted ending ({example['predicted']}): {example['endings'][example['predicted']][:80]}...")
    print(f"Correct answer: {example['correct']}")
    print(f"Scores (lower=better): {[f'{s:.3f}' for s in example['scores']]}")

print("\n" + "=" * 80)
print("SAMPLE INCORRECT PREDICTIONS")
print("=" * 80)

incorrect_examples = [r for r in results['detailed_results'] if not r['is_correct']]
for i, example in enumerate(incorrect_examples[:3]):
    print(f"\nExample {i+1}:")
    print(f"Context: {example['context'][:100]}...")
    # For misses, show both the chosen ending and the labelled one.
    print(f"Predicted ending ({example['predicted']}): {example['endings'][example['predicted']][:80]}...")
    print(f"Correct ending ({example['correct']}): {example['endings'][example['correct']][:80]}...")
    print(f"Scores (lower=better): {[f'{s:.3f}' for s in example['scores']]}")

#### Comparison with Baselines

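For reference, here are typical HellaSwag accuracy scores (approximate; this notebook scores each ending by the mean negative log-likelihood of the full context + ending sequence, which may differ from the protocol behind these figures, so they are not strictly comparable):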
- **Random Chance**: 25% (4 choices)
- **GPT-2 (117M)**: ~29-31%
- **GPT-2 (1.5B)**: ~40-43%
- **GPT-3 (175B)**: ~78-79%
- **Human Performance**: ~95%

Your model's performance will depend on:
1. Model size and architecture
2. Training data quality and quantity
3. Training duration
4. Whether the model has seen similar reasoning tasks during training

### ARC 

#### Importing Dataset

In [None]:
# Both ARC subsets come from the same dataset; only the configuration name differs.
arc_easy, arc_challenge = (
    load_dataset("allenai/ai2_arc", subset, split="test")
    for subset in ("ARC-Easy", "ARC-Challenge")
)

print(f"Loaded {len(arc_easy)} examples from ARC-Easy test set")
print(f"Loaded {len(arc_challenge)} examples from ARC-Challenge test set")

#### Evaluation Functions

In [None]:
def preprocess_text(text):
    """Strip leading and trailing whitespace from an ARC question or choice."""
    return text.strip()


def encode_text(text):
    """Return the token ids of ``text`` (from the module-level ``tokenizer``) as a ``torch.long`` tensor."""
    encoding = tokenizer.encode(text)
    ids = encoding.ids
    return torch.tensor(ids, dtype=torch.long)


def compute_perplexity(model, input_ids):
    """
    Score a token sequence by its mean per-token negative log-likelihood.

    Despite the name, the returned value is the average NLL (natural log),
    i.e. the logarithm of the perplexity rather than the perplexity itself.
    The two are monotonically related, so ranking candidates by this score is
    equivalent to ranking them by perplexity.
    Lower score = higher likelihood = better answer.

    Args:
        model: Callable mapping a ``(1, seq_len)`` tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        input_ids: 1-D tensor of token ids (no batch dimension).

    Returns:
        float: mean NLL over the ``seq_len - 1`` predicted tokens, or
        ``float('inf')`` when the sequence has fewer than two tokens.
    """
    # With fewer than two tokens there is no next-token target. The mean of an
    # empty tensor is NaN, and np.argmin (used by the callers) would select a
    # NaN entry as the "best" score, so report the worst score instead.
    if input_ids.numel() < 2:
        return float('inf')

    with torch.no_grad():
        input_ids = input_ids.unsqueeze(0).to(device)  # Add batch dimension
        logits = model(input_ids)

        # Shift logits and labels for next token prediction:
        # position t predicts token t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()

        # Calculate log probabilities
        log_probs = F.log_softmax(shift_logits, dim=-1)

        # Gather the log probabilities of the actual tokens
        token_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)

        # Average negative log likelihood
        avg_nll = -token_log_probs.mean().item()

        return avg_nll


def compute_answer_score(model, question, answer):
    """
    Score ``answer`` as a reply to ``question`` (lower is better).

    The pair is rendered as "Question: ... Answer: ..." and the whole sequence
    is scored with ``compute_perplexity``. Sequences longer than
    ``config.max_seq_len`` tokens, and any sequence whose processing raises,
    get ``float('inf')`` (the worst score).
    """
    prompt = f"Question: {question} Answer: {answer}"

    try:
        token_ids = encode_text(prompt)

        if len(token_ids) > config.max_seq_len:
            # Too long for the model: worst score instead of truncating.
            return float('inf')

        return compute_perplexity(model, token_ids)
    except Exception as e:
        # Best effort: report the problem and rank this candidate last.
        print(f"Error processing text: {e}")
        return float('inf')


def evaluate_arc_example(model, example):
    """
    Score every answer choice of one ARC example and pick the best one.

    Returns a 5-tuple ``(is_correct, predicted_label, correct_answer, scores,
    choice_labels)``. The prediction is the label of the choice with the
    lowest score; ``scores`` is aligned with ``choice_labels``.
    """
    question = preprocess_text(example['question'])
    options = example['choices']

    option_texts = [preprocess_text(text) for text in options['text']]
    labels = options['label']
    gold = example['answerKey']

    # One score per choice; lower means a better fit.
    scores = [compute_answer_score(model, question, text) for text in option_texts]

    # Map the winning position back to the example's own label.
    best_idx = np.argmin(scores)
    picked = labels[best_idx]

    return picked == gold, picked, gold, scores, labels

#### Run Benchmark

In [None]:
def run_arc_benchmark(model, dataset, dataset_name, num_examples=None, save_results=True):
    """
    Run the ARC benchmark over the first ``num_examples`` examples.

    Args:
        model: The language model to evaluate
        dataset: ARC dataset (Easy or Challenge); must support ``len()`` and
            integer indexing
        dataset_name: Name of the dataset for reporting; also used to build
            the output file name
        num_examples: Number of examples to evaluate (None = all). The value
            is clamped to the range ``[0, len(dataset)]``.
        save_results: Whether to save detailed results to
            'arc_<dataset_name>_results.json' (name lower-cased, '-' -> '_')

    Returns:
        Dictionary with 'dataset_name', 'accuracy' (percentage; 0.0 when
        nothing was evaluated), 'correct', 'total', 'num_examples' and
        'detailed_results' (one entry per evaluated example)
    """
    if num_examples is None:
        num_examples = len(dataset)
    else:
        # Clamp so a negative request becomes an empty run instead of being
        # reported verbatim as the number of evaluated examples.
        num_examples = max(0, min(num_examples, len(dataset)))

    correct = 0
    total = 0
    results = []

    print(f"Evaluating {dataset_name} on {num_examples} examples...")

    for i in tqdm(range(num_examples)):
        example = dataset[i]
        is_correct, pred_label, true_label, scores, choice_labels = evaluate_arc_example(model, example)

        if is_correct:
            correct += 1
        total += 1

        # Store detailed results
        result = {
            'index': i,
            'question': example['question'],
            'choices': {
                'text': example['choices']['text'],
                'label': example['choices']['label']
            },
            'predicted': pred_label,
            'correct': true_label,
            'is_correct': bool(is_correct),
            'scores': [float(s) for s in scores]
        }
        results.append(result)

        # Print progress every 50 examples
        if (i + 1) % 50 == 0:
            current_acc = correct / total * 100
            print(f"Progress: {i+1}/{num_examples} | Current Accuracy: {current_acc:.2f}%")

    # An empty run (empty dataset or num_examples <= 0) would otherwise
    # raise ZeroDivisionError here.
    accuracy = correct / total * 100 if total else 0.0

    benchmark_results = {
        'dataset_name': dataset_name,
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'num_examples': num_examples,
        'detailed_results': results
    }

    # Save results to file
    if save_results:
        filename = f'arc_{dataset_name.lower().replace("-", "_")}_results.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(benchmark_results, f, indent=2)
        print(f"\nResults saved to '{filename}'")

    return benchmark_results

#### Evaluate ARC-Easy

In [None]:
# num_examples=None evaluates the full ARC-Easy test set; pass an integer to
# evaluate only the first N examples.
easy_results = run_arc_benchmark(
    model=model,
    dataset=arc_easy,
    dataset_name="ARC-Easy",
    num_examples=None,  # full evaluation; set to e.g. 100 for a quick run
    save_results=True
)

# Summary of the run above.
print(f"\n{'='*50}")
print(f"ARC-EASY BENCHMARK RESULTS")
print(f"{'='*50}")
print(f"Accuracy: {easy_results['accuracy']:.2f}%")
print(f"Correct: {easy_results['correct']}/{easy_results['total']}")
print(f"{'='*50}")

#### Evaluate ARC-Challenge

In [None]:
# num_examples=None evaluates the full ARC-Challenge test set; pass an integer
# to evaluate only the first N examples.
challenge_results = run_arc_benchmark(
    model=model,
    dataset=arc_challenge,
    dataset_name="ARC-Challenge",
    num_examples=None,  # full evaluation; set to e.g. 100 for a quick run
    save_results=True
)

# Summary of the run above.
print(f"\n{'='*50}")
print(f"ARC-CHALLENGE BENCHMARK RESULTS")
print(f"{'='*50}")
print(f"Accuracy: {challenge_results['accuracy']:.2f}%")
print(f"Correct: {challenge_results['correct']}/{challenge_results['total']}")
print(f"{'='*50}")

#### Comparison with Baselines

For reference, here are typical ARC accuracy scores:

#### ARC-Easy:
- **Random Chance**: ~25% (4 choices on average)
- **GPT-2 (117M)**: ~40-45%
- **GPT-2 (1.5B)**: ~55-60%
- **GPT-3 (175B)**: ~70-75%
- **Human Performance**: ~90%

#### ARC-Challenge:
- **Random Chance**: ~25% (4 choices on average)
- **GPT-2 (117M)**: ~20-25%
- **GPT-2 (1.5B)**: ~30-35%
- **GPT-3 (175B)**: ~50-55%
- **Human Performance**: ~85%

#### What affects performance:
1. **Model size**: Larger models typically perform better on reasoning tasks
2. **Training data**: Models trained on more diverse scientific/educational content perform better
3. **Architecture**: Better attention mechanisms help with understanding context
4. **Training duration**: More training generally improves reasoning capabilities

The ARC-Challenge set is significantly harder than ARC-Easy, requiring deeper reasoning and scientific knowledge.

### PIQA

#### Import Dataset

In [None]:
# Load the PIQA validation split via `datasets.load_dataset`.
# NOTE: this rebinds `dataset`, which held the HellaSwag split in the
# HellaSwag section above.
dataset = load_dataset("ybisk/piqa", split="validation")
print(f"Loaded {len(dataset)} examples from PIQA validation set")

#### Evaluation Functions

In [None]:
def preprocess_text(text):
    """Strip leading and trailing whitespace from a PIQA goal or solution."""
    return text.strip()


def encode_text(text):
    """Encode ``text`` with the module-level ``tokenizer`` and wrap the ids in a ``torch.long`` tensor."""
    return torch.tensor(tokenizer.encode(text).ids, dtype=torch.long)


def compute_perplexity(model, input_ids):
    """
    Score a token sequence by its mean per-token negative log-likelihood.

    Despite the name, the returned value is the average NLL (natural log),
    i.e. the logarithm of the perplexity rather than the perplexity itself.
    The two are monotonically related, so ranking candidates by this score is
    equivalent to ranking them by perplexity.
    Lower score = higher likelihood = better solution.

    Args:
        model: Callable mapping a ``(1, seq_len)`` tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        input_ids: 1-D tensor of token ids (no batch dimension).

    Returns:
        float: mean NLL over the ``seq_len - 1`` predicted tokens, or
        ``float('inf')`` when the sequence has fewer than two tokens.
    """
    # With fewer than two tokens there is no next-token target. The mean of an
    # empty tensor is NaN, and np.argmin (used by the callers) would select a
    # NaN entry as the "best" score, so report the worst score instead.
    if input_ids.numel() < 2:
        return float('inf')

    with torch.no_grad():
        input_ids = input_ids.unsqueeze(0).to(device)  # Add batch dimension
        logits = model(input_ids)

        # Shift logits and labels for next token prediction:
        # position t predicts token t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()

        # Calculate log probabilities
        log_probs = F.log_softmax(shift_logits, dim=-1)

        # Gather the log probabilities of the actual tokens
        token_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)

        # Average negative log likelihood
        avg_nll = -token_log_probs.mean().item()

        return avg_nll


def compute_solution_score(model, goal, solution):
    """
    Score ``solution`` as a way to achieve ``goal`` (lower is better).

    The pair is rendered as "Goal: ... Solution: ..." and the whole sequence
    is scored with ``compute_perplexity``. Sequences longer than
    ``config.max_seq_len`` tokens, and any sequence whose processing raises,
    get ``float('inf')`` (the worst score).
    """
    prompt = f"Goal: {goal} Solution: {solution}"

    try:
        token_ids = encode_text(prompt)

        if len(token_ids) > config.max_seq_len:
            # Too long for the model: worst score instead of truncating.
            return float('inf')

        return compute_perplexity(model, token_ids)
    except Exception as e:
        # Best effort: report the problem and rank this candidate last.
        print(f"Error processing text: {e}")
        return float('inf')


def evaluate_piqa_example(model, example):
    """
    Score both solutions of one PIQA example and pick the better one.

    Returns a 4-tuple ``(is_correct, predicted_label, correct_label, scores)``
    where ``scores`` is ``[score_for_sol1, score_for_sol2]`` and the
    prediction is the index of the lower score.
    """
    goal = preprocess_text(example['goal'])
    candidates = [preprocess_text(example[key]) for key in ('sol1', 'sol2')]
    gold = int(example['label'])

    # One score per candidate solution; lower means a better fit.
    scores = [compute_solution_score(model, goal, candidate) for candidate in candidates]

    best = np.argmin(scores)

    return best == gold, best, gold, scores

#### Run Benchmark

In [None]:
def run_piqa_benchmark(model, dataset, num_examples=None, save_results=True):
    """
    Run the PIQA benchmark over the first ``num_examples`` examples.

    Args:
        model: The language model to evaluate
        dataset: PIQA dataset; must support ``len()`` and integer indexing
        num_examples: Number of examples to evaluate (None = all). The value
            is clamped to the range ``[0, len(dataset)]``.
        save_results: Whether to save detailed results to
            'piqa_benchmark_results.json' in the working directory

    Returns:
        Dictionary with 'accuracy' (percentage; 0.0 when nothing was
        evaluated), 'correct', 'total', 'num_examples' and
        'detailed_results' (one entry per evaluated example)
    """
    if num_examples is None:
        num_examples = len(dataset)
    else:
        # Clamp so a negative request becomes an empty run instead of being
        # reported verbatim as the number of evaluated examples.
        num_examples = max(0, min(num_examples, len(dataset)))

    correct = 0
    total = 0
    results = []

    print(f"Evaluating on {num_examples} examples...")

    for i in tqdm(range(num_examples)):
        example = dataset[i]
        is_correct, pred_label, true_label, scores = evaluate_piqa_example(model, example)

        if is_correct:
            correct += 1
        total += 1

        # Store detailed results; numpy scalars are converted to built-in
        # types so the record is JSON serializable.
        result = {
            'index': i,
            'goal': example['goal'],
            'solutions': [example['sol1'], example['sol2']],
            'predicted': int(pred_label),
            'correct': int(true_label),
            'is_correct': bool(is_correct),
            'scores': [float(s) for s in scores]
        }
        results.append(result)

        # Print progress every 100 examples
        if (i + 1) % 100 == 0:
            current_acc = correct / total * 100
            print(f"Progress: {i+1}/{num_examples} | Current Accuracy: {current_acc:.2f}%")

    # An empty run (empty dataset or num_examples <= 0) would otherwise
    # raise ZeroDivisionError here.
    accuracy = correct / total * 100 if total else 0.0

    benchmark_results = {
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'num_examples': num_examples,
        'detailed_results': results
    }

    # Save results to file
    if save_results:
        with open('piqa_benchmark_results.json', 'w', encoding='utf-8') as f:
            json.dump(benchmark_results, f, indent=2)
        print(f"\nResults saved to 'piqa_benchmark_results.json'")

    return benchmark_results


# Run the benchmark on a subset first (for testing).
# Pass num_examples=None to evaluate on the full dataset.
# NOTE: this rebinds `results`, which the HellaSwag analysis cell reads; the
# PIQA entries use different keys ('goal', 'solutions').
results = run_piqa_benchmark(
    model=model,
    dataset=dataset,
    num_examples=100,  # Start with 100 examples, set to None for full evaluation
    save_results=True
)

# Summary of the run above.
print(f"\n{'='*50}")
print(f"PIQA BENCHMARK RESULTS")
print(f"{'='*50}")
print(f"Accuracy: {results['accuracy']:.2f}%")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"{'='*50}")

#### Comparison with Baselines

For reference, here are typical PIQA accuracy scores:

- **Random Chance**: 50% (2 choices)
- **GPT-2 (117M)**: ~63-65%
- **GPT-2 (1.5B)**: ~70-73%
- **GPT-3 (175B)**: ~81-82%
- **RoBERTa Large**: ~79%
- **Human Performance**: ~95%

#### What is PIQA?

PIQA (Physical Interaction QA) tests a model's understanding of physical commonsense. Each question presents:
- A **goal** (e.g., "To separate egg whites from the yolk")
- Two **solutions** (e.g., using different physical methods)

The model must choose which solution is physically plausible and practical.

#### What affects performance:

1. **Physical Commonsense**: Understanding how objects interact in the real world
2. **Practical Knowledge**: Knowing everyday solutions to common problems
3. **Model Size**: Larger models tend to have better physical reasoning
4. **Training Data**: Models trained on diverse practical/how-to content perform better
5. **Reasoning Ability**: Understanding cause and effect in physical scenarios

PIQA is particularly challenging because it requires:
- Understanding of physics and material properties
- Practical experience with everyday objects
- Ability to reason about physical interactions