# Evaluation & Inference Notebook for Fine-tuned Mistral-7B

This notebook contains steps 5-8 from the main training notebook:
- **5. Evaluation Metrics** - Comprehensive evaluation of the fine-tuned model
- **6. Inference Pipeline** - Grade new student answers
- **7. Visualization & Analysis** - Visualize results and analyze performance
- **8. Optional Enhancements** - Future enhancements (RAG, Chain-of-Thought, Ensemble)

## Usage:
1. Set your checkpoint path in the configuration cell
2. Run all cells to evaluate your trained model
3. Use the inference functions to grade new answers
4. Visualize and analyze the results

**Note:** This notebook places the whole model on a single device with `model.to(device)` and deliberately does NOT use `device_map='auto'`, which avoids tensors ending up on different devices.


## Setup & Configuration


In [None]:
# Install required libraries if needed
!pip install -q transformers>=4.35.0 peft>=0.6.0 bitsandbytes>=0.41.0 datasets>=2.14.0 accelerate>=0.24.0 scikit-learn nltk rouge-score bert-score torch matplotlib seaborn --upgrade


In [None]:
import os
import json
import pandas as pd
import numpy as np
import torch
import re
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import PeftModel
from sklearn.metrics import (
    cohen_kappa_score,
    accuracy_score,
    confusion_matrix,
    classification_report
)
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Fix the PyTorch and NumPy seeds so stochastic steps are repeatable
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    # Seed the generator of every visible GPU as well
    torch.cuda.manual_seed_all(42)

print("Libraries imported successfully!")


In [None]:
# Report the available accelerators and pick the device used by later cells
print(f"CUDA Available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    device = 'cpu'
    print("Warning: No GPU detected. Evaluation will be slow on CPU.")
else:
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print("\nGPU Details:")
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name}")
        print(f"    Total Memory: {props.total_memory / 1e9:.2f} GB")
    # Everything runs on the first GPU, even when several are present
    device = 'cuda:0'

print(f"\nUsing device: {device}")


In [None]:
# Configuration - MODIFY THESE PATHS
# Single place for every path and knob that later cells read.
CONFIG = {
    # --- Model locations (UPDATE THESE!) ---
    'checkpoint_path': './output/checkpoints',  # fine-tuned LoRA checkpoint directory
    'base_model_name': 'mistralai/Mistral-7B-v0.1',  # base model the adapters were trained on

    # --- Quantization (should match the training configuration) ---
    'use_4bit': True,
    'bnb_4bit_compute_dtype': 'float16',
    'bnb_4bit_quant_type': 'nf4',
    'use_nested_quant': False,

    # --- Data ---
    'dataset_dir': 'EngSAF dataset',  # directory holding the EngSAF CSV splits

    # --- Evaluation / generation ---
    'max_length': 1024,
    'max_samples': None,  # None evaluates every sample; an integer limits the run
    'temperature': 0.7,
    'top_p': 0.9,
    'max_new_tokens': 256,
}

print("Configuration:")
print("\n".join(f"  {key}: {value}" for key, value in CONFIG.items()))


## Load Fine-tuned Model

**IMPORTANT:** This notebook loads the model WITHOUT `device_map='auto'` and uses `model.to(device)` instead to avoid device conflicts.


In [None]:
# Load fine-tuned model (base + LoRA weights)
# NO device_map='auto' - we use model.to(device) instead

print(f"Loading fine-tuned model from: {CONFIG['checkpoint_path']}")
print(f"Base model: {CONFIG['base_model_name']}")

# Configure quantization (values come from CONFIG and should mirror training)
if CONFIG['use_4bit']:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=CONFIG['bnb_4bit_quant_type'],
        # CONFIG stores the dtype by name; resolve it to the torch dtype object
        bnb_4bit_compute_dtype=getattr(torch, CONFIG['bnb_4bit_compute_dtype']),
        bnb_4bit_use_double_quant=CONFIG['use_nested_quant'],
    )
    print("4-bit quantization configured")
else:
    bnb_config = None
    print("Using full precision")

# Load base model (NO device_map='auto')
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; it is presumably unnecessary for this base model - confirm and drop.
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    CONFIG['base_model_name'],
    quantization_config=bnb_config,
    trust_remote_code=True,
    torch_dtype=torch.float16 if CONFIG['use_4bit'] else torch.float32
)

# Attach the LoRA adapters stored in the checkpoint to the base model
print("Loading LoRA adapters...")
model = PeftModel.from_pretrained(base_model, CONFIG['checkpoint_path'])

# Move model to device (single GPU, no device_map issues)
# NOTE(review): quantized weights may already be on the GPU after loading;
# confirm this call is still required for the 4-bit path.
print(f"Moving model to {device}...")
model = model.to(device)

# Load tokenizer from the checkpoint; fall back to EOS as padding token
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['checkpoint_path'])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# The check mark was previously stored as mis-decoded UTF-8 bytes
print("\n✅ Model loaded successfully!")
print(f"Model device: {next(model.parameters()).device}")


## Load Dataset


In [None]:
# Load EngSAF dataset
def load_engsaf_split(dataset_dir=None, split='train'):
    """Load one split of the EngSAF dataset as a cleaned DataFrame.

    Args:
        dataset_dir: Directory containing the CSV files. Defaults to
            CONFIG['dataset_dir'] when None.
        split: One of 'train', 'val', 'validation', 'unseen_question',
            'unseen_answers' or 'test' ('test' is an alias of 'unseen_question').

    Returns:
        DataFrame with the columns renamed to 'question', 'student_answer',
        'reference_answer', 'score' and 'feedback' (where present). Rows with a
        missing or blank question/answer or a missing score are removed,
        'score' is cast to int, and missing feedback is replaced by ''.

    Raises:
        ValueError: If `split` is not a known split name.
        FileNotFoundError: If the CSV file for the split does not exist.
        KeyError: If the CSV lacks one of the required columns.
    """
    if dataset_dir is None:
        dataset_dir = CONFIG['dataset_dir']

    split_files = {
        'train': 'train.csv',
        'val': 'val.csv',
        'validation': 'val.csv',
        'unseen_question': 'unseen_question.csv',
        'unseen_answers': 'unseen_answers.csv',
        'test': 'unseen_question.csv'
    }

    if split not in split_files:
        raise ValueError(f"Unknown split: {split}")

    file_path = os.path.join(dataset_dir, split_files[split])

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Could not find {file_path}")

    df = pd.read_csv(file_path)

    # Map the dataset's column names onto the names used in this notebook
    column_mapping = {
        'Question': 'question',
        'Student Answer': 'student_answer',
        'Correct Answer': 'reference_answer',
        'output_label': 'score',
        'feedback': 'feedback',
    }

    df = df.rename(columns=column_mapping)

    # Fail with an explicit message instead of an opaque KeyError from dropna
    required_columns = ['question', 'student_answer', 'score']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_path} is missing required column(s): {missing_columns}")

    # Clean data; copy so the assignments below never touch a view
    df = df.dropna(subset=required_columns).copy()
    df['score'] = df['score'].astype(int)
    # astype(str) keeps the blank-text filter working for non-string cells
    df = df[df['question'].astype(str).str.strip() != '']
    df = df[df['student_answer'].astype(str).str.strip() != '']

    # Missing feedback would otherwise travel downstream as a float NaN, which
    # renders as "nan" in prompts and breaks string operations
    if 'feedback' in df.columns:
        df = df.assign(feedback=df['feedback'].fillna('').astype(str))

    print(f"Loaded {split} split: {len(df)} samples")
    print(f"Score distribution:\n{df['score'].value_counts().sort_index()}")

    return df

# Primary test set: the split made of questions not seen during training
test_df = load_engsaf_split(dataset_dir=CONFIG['dataset_dir'], split='unseen_question')
print(f"\nTest dataset ready: {len(test_df)} samples")


## 5. Evaluation Metrics {#evaluation-metrics}


In [None]:
# Prompt template functions
# Grading instructions used as the system prompt when no rubric is supplied
DEFAULT_RUBRIC = (
    "You are an expert grader evaluating student answers. Consider:\n"
    "1. Accuracy: Is the answer factually correct?\n"
    "2. Completeness: Does it address all parts of the question?\n"
    "3. Clarity: Is the answer well-structured and clear?\n"
    "4. Depth: Does it demonstrate understanding beyond surface level?\n"
    "\n"
    "Provide a score (0-5) and constructive feedback."
)

def create_prompt_template(question, student_answer, rubric=None):
    """Build the (system_prompt, user_prompt) pair for one grading request.

    Args:
        question: Text of the question that was asked.
        student_answer: The student's answer to grade.
        rubric: Optional grading instructions; DEFAULT_RUBRIC is used when None.

    Returns:
        Tuple (system_prompt, user_prompt) of strings.
    """
    system_prompt = DEFAULT_RUBRIC if rubric is None else rubric
    user_prompt = (
        f"Question: {question}\n"
        f"Student Answer: {student_answer}\n"
        "\n"
        "Please grade this answer and provide feedback."
    )
    return system_prompt, user_prompt

def format_instruction(system_prompt, user_prompt, assistant_response=None):
    """Wrap the prompts in the [INST] ... [/INST] instruction format.

    Without `assistant_response` the result ends right after "[/INST]" so the
    model can continue it; with a response, the response and a closing "</s>"
    are appended.
    """
    instruction = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"
    if assistant_response is not None:
        return f"{instruction} {assistant_response}</s>"
    return instruction

print("Prompt template functions defined.")


In [None]:
# Evaluation utility functions

def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculate Quadratic Weighted Kappa (QWK) score.
    QWK is the standard metric for automated essay scoring.

    The score range is taken from the smallest to the largest value found in
    either input, and disagreements are penalised by the squared score
    distance normalised by the squared range.

    Args:
        y_true: Sequence of integer reference scores.
        y_pred: Sequence of integer predicted scores, same length as y_true.

    Returns:
        float: 1.0 for perfect agreement, 0.0 for chance-level agreement,
        negative for worse-than-chance agreement.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true, dtype=int).ravel()
    y_pred = np.asarray(y_pred, dtype=int).ravel()
    if y_true.size == 0 or y_true.size != y_pred.size:
        raise ValueError("y_true and y_pred must be non-empty and of equal length")

    min_score = int(min(y_true.min(), y_pred.min()))
    max_score = int(max(y_true.max(), y_pred.max()))
    n_ratings = max_score - min_score + 1

    # A single distinct score means every pair agrees; the weight
    # normalisation below would otherwise divide by zero.
    if n_ratings == 1:
        return 1.0

    # observed[i, j] = number of samples with true score i and predicted score j
    observed = np.zeros((n_ratings, n_ratings))
    np.add.at(observed, (y_true - min_score, y_pred - min_score), 1)

    # Quadratic disagreement weights over the full score range
    grid = np.arange(n_ratings)
    weights = (grid[:, None] - grid[None, :]) ** 2 / (n_ratings - 1) ** 2

    # Counts expected if true and predicted scores were independent
    expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / y_true.size

    # The denominator is positive whenever at least two distinct scores occur
    return float(1.0 - (weights * observed).sum() / (weights * expected).sum())

def extract_score_from_response(response_text):
    """Pull an integer score out of a model response.

    Labelled forms ("Score: N", "Score N", "N out of", "Grade: N") are tried
    in that order, case-insensitively. If none matches, the first number
    anywhere in the text is used. The value is clamped to the range 0-5.

    Returns:
        int in [0, 5], or None when the text contains no usable number.
    """
    def _clamped(digits):
        # None tells the caller that int() rejected the digit run
        try:
            return min(5, max(0, int(digits)))
        except ValueError:
            return None

    labelled_forms = (
        r'Score:\s*(\d+)',
        r'score:\s*(\d+)',
        r'Score\s*(\d+)',
        r'(\d+)\s*out\s*of',
        r'Grade:\s*(\d+)',
    )

    for regex in labelled_forms:
        found = re.search(regex, response_text, flags=re.IGNORECASE)
        if found is None:
            continue
        score = _clamped(found.group(1))
        if score is not None:
            return score

    # Last resort: the first run of digits anywhere in the response
    first_number = re.search(r'\d+', response_text)
    return _clamped(first_number.group()) if first_number else None

def extract_feedback_from_response(response_text):
    """Pull the feedback text out of a model response.

    The text following a "Feedback" label (matched case-insensitively) is
    returned, up to the next newline. Without such a label, everything after
    the first line is returned; a single-line response is returned stripped.
    """
    for regex in (
        r'Feedback:\s*(.+?)(?:\n|$)',
        r'feedback:\s*(.+?)(?:\n|$)',
        r'Feedback\s*(.+?)(?:\n|$)',
    ):
        found = re.search(regex, response_text, re.IGNORECASE | re.DOTALL)
        if found:
            return found.group(1).strip()

    # No label: treat the first line as the score line and keep the rest
    _, newline, remainder = response_text.partition('\n')
    if newline:
        return remainder.strip()

    return response_text.strip()

print("Evaluation utility functions defined.")


In [None]:
# Evaluation function for feedback quality

def evaluate_feedback_quality(predicted_feedback, reference_feedback):
    """Score generated feedback against reference feedback.

    Computes smoothed sentence-level BLEU on whitespace tokens, ROUGE-1/2/L
    F-measures (with stemming) and the BERTScore F1.

    Returns:
        dict with keys 'bleu', 'rouge1', 'rouge2', 'rougeL', 'bertscore_f1',
        or None if anything fails (the error is printed, not raised).
    """
    try:
        # Imported here so a missing optional package is reported as an
        # evaluation error instead of breaking the cell
        from rouge_score import rouge_scorer
        from bert_score import score as bert_score_fn
        import nltk

        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)

        from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

        # BLEU on whitespace-split tokens, smoothed for short texts
        smoothing = SmoothingFunction().method1
        reference_tokens = reference_feedback.split()
        candidate_tokens = predicted_feedback.split()
        bleu_value = sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            smoothing_function=smoothing
        )

        # ROUGE n-gram / longest-common-subsequence overlap
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge = scorer.score(reference_feedback, predicted_feedback)

        # BERTScore; only the F1 component is reported
        _, _, bert_f1 = bert_score_fn(
            [predicted_feedback],
            [reference_feedback],
            lang='en',
            verbose=False
        )

        quality = {'bleu': bleu_value}
        for rouge_name in ('rouge1', 'rouge2', 'rougeL'):
            quality[rouge_name] = rouge[rouge_name].fmeasure
        quality['bertscore_f1'] = bert_f1.item()
        return quality
    except Exception as e:
        print(f"Error evaluating feedback: {e}")
        return None

print("Feedback evaluation functions defined.")


In [None]:
# Create dataset class for evaluation
from torch.utils.data import Dataset

class GradingDataset(Dataset):
    """Torch dataset that turns graded answers into tokenized instruction text.

    Each row of `df` becomes one training-style example: the grading prompt
    followed by the target "Score: ...\\nFeedback: ..." response.

    Args:
        df: DataFrame with 'question', 'student_answer' and 'score' columns
            and optionally 'feedback'.
        tokenizer: Callable tokenizer; called with truncation and
            max-length padding and expected to return 'input_ids' and
            'attention_mask' tensors.
        max_length: Token length every example is padded/truncated to.
        rubric: Optional rubric passed to create_prompt_template.
    """

    def __init__(self, df, tokenizer, max_length=1024, rubric=None):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.rubric = rubric

        # Build the full prompt + response text once, up front
        self.data = []
        for idx in range(len(self.df)):
            row = self.df.iloc[idx]

            sys_prompt, user_prompt = create_prompt_template(
                row['question'],
                row['student_answer'],
                rubric=self.rubric
            )

            feedback = row.get('feedback', 'No feedback available.')
            # A missing cell is NaN, which would be rendered as "Feedback: nan"
            if not isinstance(feedback, str) and pd.isna(feedback):
                feedback = 'No feedback available.'
            assistant_resp = f"Score: {row['score']}\nFeedback: {feedback}"
            full_text = format_instruction(sys_prompt, user_prompt, assistant_resp)

            self.data.append({
                'text': full_text,
                'score': row['score'],
                'question': row['question'],
                'student_answer': row['student_answer']
            })

    def __len__(self):
        """Number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example `idx`.

        Returns:
            dict with 1-D 'input_ids', 'attention_mask' and 'labels' tensors
            plus the integer 'score'. 'labels' is an independent copy of
            'input_ids' in which padded positions are set to -100, the value
            ignored by the language-modelling loss.
        """
        item = self.data[idx]
        encoding = self.tokenizer(
            item['text'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # clone() so masking the labels cannot modify input_ids
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
            'score': item['score']
        }

# Wrap the test split in the dataset class used by the evaluation loop
test_dataset = GradingDataset(
    test_df,
    tokenizer,
    max_length=CONFIG['max_length'],
)
print(f"Test dataset created: {len(test_dataset)} samples")


In [None]:
# Comprehensive evaluation function

def evaluate_model(model, tokenizer, test_dataset, device='cuda', max_samples=None):
    """
    Comprehensive evaluation of the model on test set.

    For every sample the grading prompt is built, a response is sampled from
    the model, and the score and feedback are parsed from the generated
    continuation only (the prompt itself is excluded from parsing).

    Args:
        model: Model exposing `eval()` and `generate()`.
        tokenizer: Tokenizer matching the model.
        test_dataset: Dataset exposing `__len__` and a `.df` DataFrame with
            'question', 'student_answer', 'score' and optionally 'feedback'.
        device: Device the tokenized inputs are moved to.
        max_samples: Evaluate only the first `max_samples` rows; None or 0
            evaluates all rows.

    Returns:
        (results, predictions). `results` holds 'qwk', 'cohen_kappa',
        'accuracy', 'confusion_matrix' and optionally 'feedback_metrics'; it
        is empty when no score could be parsed. `predictions` is a list with
        one dict per evaluated sample.
    """
    model.eval()

    print(f"Using device: {device} for evaluation")

    predictions = []
    true_scores = []
    predicted_scores = []
    predicted_feedbacks = []
    reference_feedbacks = []

    # Limit samples if specified
    eval_indices = range(len(test_dataset))
    if max_samples:
        eval_indices = eval_indices[:max_samples]

    with torch.no_grad():
        for idx in tqdm(eval_indices, desc="Evaluating"):
            # Read the raw row; the tokenized training-style item is not needed
            row = test_dataset.df.iloc[idx]
            question = row['question']
            student_answer = row['student_answer']
            true_score = int(row['score'])
            reference_feedback = row.get('feedback', '')
            # Missing feedback is NaN; normalise it so text metrics get strings
            if not isinstance(reference_feedback, str):
                reference_feedback = '' if pd.isna(reference_feedback) else str(reference_feedback)

            # Create prompt
            sys_prompt, user_prompt = create_prompt_template(question, student_answer)
            prompt = format_instruction(sys_prompt, user_prompt)

            # Tokenize and move to device
            inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=CONFIG['max_length'])
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate
            outputs = model.generate(
                **inputs,
                max_new_tokens=CONFIG['max_new_tokens'],
                temperature=CONFIG['temperature'],
                top_p=CONFIG['top_p'],
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

            # Decode only the newly generated tokens. The output sequence
            # starts with the prompt, whose rubric text ("1. Accuracy ...")
            # would otherwise be picked up by the score/feedback parsers.
            prompt_length = inputs['input_ids'].shape[1]
            generated_text = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            # Extract score and feedback
            predicted_score = extract_score_from_response(generated_text)
            predicted_feedback = extract_feedback_from_response(generated_text)

            predictions.append({
                'question': question,
                'student_answer': student_answer,
                'true_score': true_score,
                'predicted_score': predicted_score,
                'reference_feedback': reference_feedback,
                'predicted_feedback': predicted_feedback,
                'full_response': generated_text
            })

            # Only samples with a parsable score contribute to the metrics
            if predicted_score is not None:
                true_scores.append(true_score)
                predicted_scores.append(predicted_score)
                predicted_feedbacks.append(predicted_feedback)
                reference_feedbacks.append(reference_feedback)

    # Calculate metrics
    results = {}

    if len(true_scores) > 0:
        # Score metrics
        results['qwk'] = quadratic_weighted_kappa(true_scores, predicted_scores)
        results['cohen_kappa'] = cohen_kappa_score(true_scores, predicted_scores)
        results['accuracy'] = accuracy_score(true_scores, predicted_scores)
        results['confusion_matrix'] = confusion_matrix(true_scores, predicted_scores)

        # Feedback metrics (sample-based for efficiency)
        if len(predicted_feedbacks) > 0:
            sample_size = min(50, len(predicted_feedbacks))
            sample_indices = np.random.choice(len(predicted_feedbacks), sample_size, replace=False)

            feedback_metrics = []
            for sample_idx in sample_indices:
                metrics = evaluate_feedback_quality(
                    predicted_feedbacks[sample_idx],
                    reference_feedbacks[sample_idx]
                )
                if metrics:
                    feedback_metrics.append(metrics)

            if feedback_metrics:
                results['feedback_metrics'] = {
                    name: np.mean([m[name] for m in feedback_metrics])
                    for name in ('bleu', 'rouge1', 'rouge2', 'rougeL', 'bertscore_f1')
                }

    return results, predictions

print("Evaluation function defined.")


In [None]:
# Run evaluation
print("Starting evaluation...")
print(f"Device: {device}")
print(f"Max samples: {CONFIG['max_samples'] if CONFIG['max_samples'] else 'All'}")
print("\n" + "="*80)

results, predictions = evaluate_model(
    model,
    tokenizer,
    test_dataset,
    device=device,
    max_samples=CONFIG['max_samples']
)

print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
# evaluate_model returns an empty dict when no score could be parsed, so the
# score metrics are only printed when they exist
if 'qwk' in results:
    print(f"Quadratic Weighted Kappa (QWK): {results['qwk']:.4f}")
    print(f"Cohen's Kappa: {results['cohen_kappa']:.4f}")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
else:
    print("No scores could be parsed from the model responses; score metrics are unavailable.")

if 'feedback_metrics' in results:
    print("\nFeedback Metrics:")
    for metric, value in results['feedback_metrics'].items():
        print(f"  {metric}: {value:.4f}")

print("\n" + "="*80)


## 6. Inference Pipeline {#inference-pipeline}


In [None]:
# Inference function for grading new answers

def grade_answer(
    model,
    tokenizer,
    question,
    student_answer,
    rubric=None,
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=256,
    device='cuda'
):
    """
    Grade a student answer and generate feedback.

    Args:
        model: Fine-tuned model
        tokenizer: Tokenizer
        question: The question text
        student_answer: Student's answer
        rubric: Optional custom rubric
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        max_new_tokens: Maximum tokens to generate
        device: Device to run inference on

    Returns:
        dict with 'score' (int, or None when no score could be parsed),
        'feedback' and 'full_response' (the generated continuation, without
        the prompt)
    """
    model.eval()

    # Create prompt
    sys_prompt, user_prompt = create_prompt_template(question, student_answer, rubric=rubric)
    prompt = format_instruction(sys_prompt, user_prompt)

    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=CONFIG['max_length'])
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. The output sequence starts with
    # the prompt, and parsing it would pick up numbers and the word "feedback"
    # from the rubric instead of the model's answer.
    prompt_length = inputs['input_ids'].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Extract score and feedback
    score = extract_score_from_response(generated_text)
    feedback = extract_feedback_from_response(generated_text)

    return {
        'score': score,
        'feedback': feedback,
        'full_response': generated_text
    }

print("Inference function defined.")


In [None]:
# Example usage
example_question = "Explain the process of photosynthesis."
example_answer = "Photosynthesis is when plants use sunlight to make food."

# Use the same generation settings as the evaluation run, including the
# response length limit from CONFIG
result = grade_answer(
    model,
    tokenizer,
    example_question,
    example_answer,
    temperature=CONFIG['temperature'],
    top_p=CONFIG['top_p'],
    max_new_tokens=CONFIG['max_new_tokens'],
    device=device
)

print("Example Grading:")
print("="*80)
print(f"Question: {example_question}")
print(f"Answer: {example_answer}")
print(f"\nPredicted Score: {result['score']}")
print(f"\nGenerated Feedback:\n{result['feedback']}")
print("="*80)


## 7. Visualization & Analysis {#visualization}


In [None]:
# Plot confusion matrix

def plot_confusion_matrix(y_true, y_pred, save_path=None):
    """Plot confusion matrix for score predictions.

    Args:
        y_true: Sequence of reference scores.
        y_pred: Sequence of predicted scores.
        save_path: Optional file path; when given the figure is also saved.
    """
    # Label the axes with the scores that actually occur. The matrix only has
    # rows/columns for observed values, so positional labels 0..k-1 would be
    # wrong whenever a score value is absent.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    tick_labels = [str(label) for label in labels]

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels
    )
    plt.xlabel('Predicted Score')
    plt.ylabel('True Score')
    plt.title('Confusion Matrix - Score Predictions')

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Draw the matrix from every prediction whose score could be parsed
if any(p['predicted_score'] is not None for p in predictions):
    true_scores, pred_scores = map(list, zip(*(
        (p['true_score'], p['predicted_score'])
        for p in predictions
        if p['predicted_score'] is not None
    )))
    plot_confusion_matrix(true_scores, pred_scores, save_path='confusion_matrix.png')


In [None]:
# Display example predictions

def display_examples(predictions, n_examples=5, show_good=True, show_bad=True):
    """Display example predictions, both good and bad cases.

    Args:
        predictions: List of prediction dicts with 'question',
            'student_answer', 'true_score', 'predicted_score',
            'reference_feedback' and 'predicted_feedback'.
        n_examples: Maximum number of examples drawn per category.
        show_good: Include exact-match predictions.
        show_bad: Include predictions that are off by 2 or more.
    """
    def _snippet(text, limit=200):
        # Feedback can be missing (None or a float NaN), which cannot be sliced
        if not isinstance(text, str):
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = '' if is_missing else str(text)
        return text[:limit]

    examples = []

    if show_good:
        good_examples = [
            p for p in predictions
            if p['predicted_score'] == p['true_score'] and p['predicted_score'] is not None
        ]
        if good_examples:
            examples.extend(np.random.choice(good_examples, min(n_examples, len(good_examples)), replace=False))

    if show_bad:
        bad_examples = [
            p for p in predictions
            if p['predicted_score'] is not None
            and abs(p['predicted_score'] - p['true_score']) >= 2
        ]
        if bad_examples:
            examples.extend(np.random.choice(bad_examples, min(n_examples, len(bad_examples)), replace=False))

    for i, ex in enumerate(examples[:n_examples * 2], 1):
        print(f"\n{'='*80}")
        print(f"Example {i}")
        print(f"{'='*80}")
        print(f"Question: {ex['question']}")
        print(f"\nStudent Answer: {ex['student_answer']}")
        print(f"\nTrue Score: {ex['true_score']}")
        print(f"Predicted Score: {ex['predicted_score']}")
        print(f"\nReference Feedback: {_snippet(ex['reference_feedback'])}...")
        print(f"\nGenerated Feedback: {_snippet(ex['predicted_feedback'])}...")

# Show up to three exact matches and three large misses
display_examples(predictions=predictions, n_examples=3)


In [None]:
# Per-score precision/recall plus the true vs. predicted score distributions,
# restricted to predictions whose score could be parsed
if any(p['predicted_score'] is not None for p in predictions):
    true_scores, pred_scores = map(list, zip(*(
        (p['true_score'], p['predicted_score'])
        for p in predictions
        if p['predicted_score'] is not None
    )))

    print("Classification Report:")
    print(classification_report(true_scores, pred_scores))

    print("\nScore Distribution:")
    true_distribution = pd.Series(true_scores).value_counts().sort_index()
    print(f"True scores: {true_distribution}")
    pred_distribution = pd.Series(pred_scores).value_counts().sort_index()
    print(f"Predicted scores: {pred_distribution}")


## 8. Optional Enhancements {#optional-enhancements}


### 8.1 RAG Integration Placeholder

For future enhancement: Integrate Retrieval-Augmented Generation (RAG) to retrieve relevant course materials when grading answers.


In [None]:
# RAG Integration Placeholder
def retrieve_course_materials(question, top_k=3):
    """
    Stub for a future RAG retrieval step; currently always returns an empty list.

    Planned behaviour:
    - Embed the question and search the course content
    - Keep the top-k most relevant passages
    - Add them to the grading prompt as context
    """
    retrieved_passages = list()
    return retrieved_passages

print("RAG placeholder defined.")


### 8.2 Chain-of-Thought Verification

For future enhancement: Add Chain-of-Thought reasoning to make grading decisions more transparent.


In [None]:
# Chain-of-Thought Verification Placeholder

def grade_with_cot(model, tokenizer, question, answer, device='cuda'):
    """
    Stub for Chain-of-Thought grading; currently delegates to grade_answer.

    Planned behaviour:
    - Produce reasoning steps before the final score
    - Check the reasoning for consistency
    - Let the reasoning inform the predicted score
    """
    return grade_answer(
        model=model,
        tokenizer=tokenizer,
        question=question,
        student_answer=answer,
        device=device,
    )

print("Chain-of-Thought placeholder defined.")


### 8.3 Ensemble with RoBERTa Baseline

For future enhancement: Create ensemble model combining Mistral-7B with RoBERTa-based scoring model.


In [None]:
# Ensemble Placeholder

def ensemble_grade(mistral_result, roberta_result, weights=[0.7, 0.3]):
    """
    Stub for ensemble grading; currently returns the Mistral result unchanged.

    Planned behaviour:
    - Score with a fine-tuned RoBERTa model as well
    - Blend both predictions using `weights`
    - Report the blended score and feedback
    """
    # roberta_result and weights are accepted but not used yet
    final_result = mistral_result
    return final_result

print("Ensemble placeholder defined.")


## Save Results (Optional)


In [None]:
# Save results to CSV (one row per evaluated sample)
results_df = pd.DataFrame(predictions)
output_file = 'evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

# Save metrics. `results` is empty when no score could be parsed, so only the
# metrics that were actually computed are written.
metrics = {
    name: float(results[name])
    for name in ('qwk', 'cohen_kappa', 'accuracy')
    if name in results
}

if 'confusion_matrix' in results:
    # ndarray is not JSON-serializable; store it as nested lists
    metrics['confusion_matrix'] = results['confusion_matrix'].tolist()

if 'feedback_metrics' in results:
    metrics['feedback_metrics'] = {k: float(v) for k, v in results['feedback_metrics'].items()}

with open('evaluation_metrics.json', 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print("Metrics saved to evaluation_metrics.json")
