In [None]:
import os
import re
import json
from sentence_transformers import SentenceTransformer
from langchain_mistralai import ChatMistralAI
from langchain.schema import HumanMessage, SystemMessage

# Initialize Mistral AI with API Key
# Prefer the MISTRAL_API_KEY environment variable so a real key never has to be
# committed with the source. The "APIKEY" placeholder remains the fallback, so
# behavior is unchanged when the variable is unset or empty.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY") or "APIKEY"
llm = ChatMistralAI(model="mistral-large-latest", temperature=0, api_key=MISTRAL_API_KEY)

# Initialize Embedding Model
# NOTE(review): embedding_model is not referenced by any function visible in
# this section — confirm it is used elsewhere before removing it.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


# 🔹 Assign Dynamic Weights to Evaluation Traits
# Traits every weight dictionary must contain.
_REQUIRED_TRAITS = ("content_accuracy", "coherence", "vocabulary", "grammar")


def _parse_trait_weights(raw_text):
    """Extract, validate and normalize the trait-weight JSON found in *raw_text*.

    Returns a dict holding exactly the four required traits, scaled so the
    weights sum to (approximately, after rounding to 2 decimals) 100.

    Raises:
        ValueError: no JSON object is present, the JSON is malformed
            (json.JSONDecodeError is a ValueError), a trait is missing, a
            weight is not a non-negative number, or the weights do not sum
            to a positive finite value.
        TypeError: *raw_text* is not a string.
    """
    # Greedy match from the first "{" to the last "}" so replies wrapped in
    # code fences or surrounded by stray prose still parse.
    match = re.search(r"\{.*\}", raw_text, re.DOTALL)
    if not match:
        raise ValueError("No valid JSON found in response.")

    parsed = json.loads(match.group(0))

    missing = [trait for trait in _REQUIRED_TRAITS if trait not in parsed]
    if missing:
        raise ValueError(f"Missing trait weights: {', '.join(missing)}")

    # Keep only the known traits so unexpected extra keys cannot skew the total.
    weights = {trait: parsed[trait] for trait in _REQUIRED_TRAITS}
    for trait, value in weights.items():
        # bool is a subclass of int; JSON true/false is not a valid weight.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError(f"Weight for '{trait}' is not a number: {value!r}")
        # Written as "not >=" so NaN is rejected as well.
        if not value >= 0:
            raise ValueError(f"Weight for '{trait}' must be non-negative: {value!r}")

    total = sum(weights.values())
    # Guards the division below (all-zero weights) and rejects infinities.
    if not 0 < total < float("inf"):
        raise ValueError("Trait weights must sum to a positive, finite number.")

    # Normalize weights to sum to 100 if needed.
    if abs(total - 100) > 1e-9:
        weights = {
            trait: round((value / total) * 100, 2)
            for trait, value in weights.items()
        }
    return weights


def assign_trait_weights(question):
    """Uses Mistral AI to dynamically assign weightage to grading traits based on question complexity.

    Args:
        question: The question text sent to the LLM.

    Returns:
        dict mapping "content_accuracy", "coherence", "vocabulary" and
        "grammar" to numeric weights normalized to a total of 100.

    Raises:
        ValueError: the LLM reply does not contain a usable weight object.
            The underlying parsing error is chained as ``__cause__``.
    """

    weightage_prompt = (
        "You are an expert evaluator. Analyze the given question and assign appropriate weightage (out of 100%) "
        "to the following grading traits:\n"
        "1. **Content Accuracy**: Importance of factual correctness and completeness.\n"
        "2. **Coherence**: Importance of logical flow and clarity.\n"
        "3. **Vocabulary**: Importance of precise wording and expression.\n"
        "4. **Grammar**: Importance of correct grammar and readability.\n\n"
        "### **STRICT OUTPUT FORMAT:**\n"
        "- Provide output **ONLY** in JSON format (without explanation).\n"
        "- Ensure the weights sum **exactly** to 100.\n"
        "- Example:\n"
        '{ "content_accuracy": 40, "coherence": 30, "vocabulary": 20, "grammar": 10 }\n'
        "Output should contain **no extra text**—only the JSON object."
    )

    system_message = SystemMessage(content=weightage_prompt)
    user_message = HumanMessage(content=f"Question: {question}")

    # .invoke() is the supported entry point; calling the model object
    # directly is deprecated in langchain-core.
    response = llm.invoke([system_message, user_message])

    try:
        return _parse_trait_weights(response.content)
    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
        print(f"Error parsing weights: {e}")
        print(f"LLM Response: {response.content}")
        raise ValueError("Invalid weight format received from LLM.") from e


# 🔹 Evaluate Answer with Score Extraction
# Matches "Overall Score: X.X/10" while tolerating markdown emphasis and spacing
# around the label and the number (e.g. "**Overall Score:** 7.5/10",
# "Overall Score: **7.5/10**"). The trailing \b rejects "/100".
_OVERALL_SCORE_RE = re.compile(
    r"Overall Score[\s*_]*:[\s*_]*(\d+\.?\d*)[\s*_]*/\s*10\b",
    re.IGNORECASE,
)


def _extract_overall_score(feedback_text):
    """Return the final 'Overall Score: X/10' value in *feedback_text* as a float.

    Raises:
        ValueError: no score line is present, or the score exceeds 10.
    """
    found = _OVERALL_SCORE_RE.findall(feedback_text)
    if not found:
        raise ValueError("Failed to parse score from evaluation response")

    # The prompt requires the score to be the last line, so if the label
    # appears more than once the last occurrence is the authoritative one.
    score = float(found[-1])
    if score > 10:
        raise ValueError(f"Overall score out of range (0-10): {score}")
    return score


def evaluate_answer_dynamic(question, student_answer, model_answer):
    """Evaluates a student's answer and returns both feedback and numerical score.

    Args:
        question: The question being answered.
        student_answer: The answer to grade.
        model_answer: The reference answer the student answer is compared to.

    Returns:
        dict with "feedback" (the full LLM evaluation text) and "score"
        (float parsed from the "Overall Score: X.X/10" line).

    Raises:
        ValueError: trait weights could not be obtained, or no valid overall
            score could be parsed from the evaluation.
    """

    weights = assign_trait_weights(question)

    evaluation_prompt = (
        "You are an expert answer evaluator. Compare the student's answer with the model answer based on:\n"
        f"1. **Content Accuracy ({weights['content_accuracy']}%)**: Factual correctness and completeness.\n"
        f"2. **Coherence ({weights['coherence']}%)**: Logical flow and clarity.\n"
        f"3. **Vocabulary ({weights['vocabulary']}%)**: Word choice and expression.\n"
        f"4. **Grammar ({weights['grammar']}%)**: Grammar and sentence structure.\n\n"
        "### Evaluation Requirements:\n"
        "- For each category, highlight errors and provide specific improvement suggestions\n"
        "- Calculate weighted score considering the trait weights\n"
        "### Strict Output Format:\n"
        "- End your evaluation with: 'Overall Score: X.X/10' where X.X is between 0-10\n"
        "- Example: 'Overall Score: 7.5/10'\n"
        "- The score must be the last line of your response"
    )

    system_message = SystemMessage(content=evaluation_prompt)
    user_message = HumanMessage(
        content=f"Question: {question}\n\nModel Answer: {model_answer}\n\nStudent Answer: {student_answer}"
    )

    # .invoke() is the supported entry point; calling the model object
    # directly is deprecated in langchain-core.
    response = llm.invoke([system_message, user_message])

    return {
        "feedback": response.content,
        "score": _extract_overall_score(response.content),
    }


import numpy as np
from sklearn.metrics import cohen_kappa_score

def calculate_qwk(model_scores, human_scores, bins=(0, 4, 7, 10)):
    """
    Compute Quadratic Weighted Kappa (QWK) between model and human scores.

    Scores are first bucketed into ordinal categories using ``bins`` as
    edges: category ``i`` covers ``(bins[i], bins[i + 1]]``. Scores at or
    below the first edge go into the first category, and scores above the
    last edge go into the last category.

    Args:
        model_scores: Sequence of numeric scores produced by the model.
        human_scores: Sequence of numeric scores given by human raters,
            aligned index-by-index with ``model_scores``.
        bins: Increasing sequence of at least two bin edges.

    Returns:
        The QWK as a float, or NaN when fewer than 5 samples are supplied or
        when both raters put every sample into a single bin.

    Raises:
        ValueError: the two inputs differ in length, or ``bins`` has fewer
            than two edges.
    """
    # Input validation
    if len(model_scores) != len(human_scores):
        raise ValueError("Input arrays must have the same length")

    if len(model_scores) < 5:
        print("Warning: QWK requires at least 5 samples for meaningful results")
        return float('nan')

    n_categories = len(bins) - 1
    if n_categories < 1:
        raise ValueError("bins must contain at least two edges")

    # Binning function with edge handling
    def bin_scores(scores):
        raw = np.digitize(scores, bins, right=True) - 1
        # With right=True a score equal to the first edge maps to -1 and a
        # score above the last edge maps to n_categories; clip both into the
        # valid range so they do not create spurious extra categories.
        return np.clip(raw, 0, n_categories - 1)

    model_binned = bin_scores(np.array(model_scores))
    human_binned = bin_scores(np.array(human_scores))

    # Check for uniform distributions
    if len(np.unique(human_binned)) == 1 and len(np.unique(model_binned)) == 1:
        print("All scores in same bin - QWK undefined")
        return float('nan')

    # Calculate QWK over every possible category, not just the observed ones.
    # The quadratic weights are derived from label positions, so omitting an
    # unobserved middle category would shrink the distance between the
    # categories on either side of it.
    all_labels = np.arange(n_categories)
    return cohen_kappa_score(human_binned, model_binned,
                             labels=all_labels,
                             weights='quadratic')

# Smoke checks: each (model_scores, human_scores) pair is scored and printed.
_qwk_demo_cases = (
    # 1 sample: below the 5-sample minimum -> warning + nan
    ([7], [7]),
    # 3 identical samples: also stops at the 5-sample check -> warning + nan
    ([7, 7, 7], [7, 7, 7]),
    # 5 samples spread over two bins: the only case that reaches cohen_kappa_score
    ([5, 6, 7, 8, 9], [6, 7, 8, 9, 10]),
)
for _demo_model, _demo_human in _qwk_demo_cases:
    print(calculate_qwk(_demo_model, _demo_human))