## Technical

### Make the criteria

In [None]:
import os
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate, LLMChain

# ─── Helper: robust JSON extraction ─────────────────────────────────────────────
def extract_json(text: str):
    """Return the first JSON object or array embedded in ``text``.

    Every ``{`` / ``[`` is tried, in order, as the possible start of a JSON
    value. Parsing is delegated to ``json.JSONDecoder.raw_decode`` so that
    brackets inside string values do not end the match early, and a bracket
    that does not start valid JSON (for example ``[note]``) is skipped rather
    than aborting the whole search.

    Args:
        text: Raw text that is expected to contain a JSON object or array.

    Returns:
        The parsed JSON value (``dict`` or ``list``).

    Raises:
        ValueError: If no position in ``text`` starts a valid JSON value.
    """
    decoder = json.JSONDecoder()
    for start_idx, ch in enumerate(text):
        if ch not in ("{", "["):
            continue
        try:
            parsed, _end = decoder.raw_decode(text, start_idx)
        except json.JSONDecodeError:
            # Not valid JSON from this bracket; try the next candidate.
            continue
        return parsed
    raise ValueError(f"No complete JSON object/array found in:\n{text}")


# ─── 1. Load credentials ───────────────────────────────────────────────────────
load_dotenv()
# Fail fast with a readable message: os.getenv() returns None for an unset
# variable, and writing None back into os.environ raises a cryptic TypeError.
# (Re-assigning a variable that is already set would be a no-op, so only the
# presence check is kept.)
for _required in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"):
    if os.getenv(_required) is None:
        raise RuntimeError(
            f"{_required} is not set; define it in the environment or in the .env file"
        )
os.environ["OPENAI_API_TYPE"] = "Azure"

# Chat model shared by the functions below.
llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)


# ─── 2. Rubric generation ─────────────────────────────────────────────────────
def generate_rubric(csv_path: str, max_examples: int = 50) -> list:
    """Ask the LLM to propose evaluation criteria from sample Q/A pairs.

    Reads ``csv_path``, drops rows missing "question" or "answer", and feeds
    the first ``max_examples`` remaining pairs to the prompt as context.

    Returns:
        Whatever JSON ``extract_json`` parses out of the model reply (the
        prompt asks for an array of {"name", "description"} objects).
    """
    frame = pd.read_csv(csv_path)
    frame = frame.dropna(subset=["question", "answer"])
    head = frame.head(max_examples)

    pair_blocks = []
    for q_text, a_text in zip(head["question"], head["answer"]):
        pair_blocks.append(f"Q: {q_text}\nA: {a_text}")
    context = "\n\n".join(pair_blocks)

    template_text = """
You are an expert evaluator. Here are sample Q/A pairs:
{context}

Generate a JSON **array** of 5–8 evaluation criteria. Each criterion must be an object:
- "name": short title
- "description": one-sentence explanation

**Only output the JSON array**, without any extra text.
"""
    rubric_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context"],
    )
    raw = LLMChain(llm=llm, prompt=rubric_prompt).run(context=context)
    return extract_json(raw)


# ─── 3. Save & Load functions ───────────────────────────────────────────────────
def save_rubric(rubric: list, filepath: str):
    """Write the rubric to ``filepath`` as indented UTF-8 JSON and print a confirmation."""
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        # Non-ASCII characters are written as-is rather than \u-escaped.
        out_file.write(json.dumps(rubric, indent=2, ensure_ascii=False))
    print(f"Rubric saved to {filepath}")

def load_rubric(filepath: str) -> list:
    """Read ``filepath`` as UTF-8 and return the parsed JSON content."""
    with open(filepath, mode="r", encoding="utf-8") as in_file:
        payload = in_file.read()
    return json.loads(payload)


# ─── 4. Evaluation function ────────────────────────────────────────────────────
def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """Score one question/answer pair against ``rubric`` using the LLM.

    The rubric is serialized to indented JSON and embedded in the prompt; the
    model is asked for per-criterion scores plus an "overall_score".

    Returns:
        The JSON value that ``extract_json`` parses from the model reply.
    """
    template_text = """
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"  (same as criterion)
- "score" (integer 0–100)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

**Return only the final JSON object** with keys "scores" and "overall_score".
"""
    scoring_prompt = PromptTemplate(
        template=template_text,
        input_variables=["rubric", "question", "answer"],
    )
    scoring_chain = LLMChain(llm=llm, prompt=scoring_prompt)
    reply = scoring_chain.run(
        rubric=json.dumps(rubric, indent=2),
        question=question,
        answer=answer,
    )
    return extract_json(reply)


# ─── 5. Example usage ─────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Raw strings keep the Windows-style backslashes literal. In a plain
    # literal the "\r" in "...\rubric.json" is a carriage return (so the file
    # would be written under a different name), and "\T" / "\d" are invalid
    # escape sequences.
    dataset_path = r"Data\Technical\dataset.csv"
    rubric_path = r"Data\Technical\rubric.json"

    # Generate and save rubric
    rubric = generate_rubric(dataset_path)
    save_rubric(rubric, rubric_path)

    # Later on, or in another script, simply:
    rubric = load_rubric(rubric_path)

    # Evaluate a new Q/A
    new_q = "How would you optimize a Python loop over large datasets?"
    new_a = "You can vectorize with NumPy, leverage Cython for hotspots, or use multiprocessing for parallel batches."
    report = evaluate_answer(new_q, new_a, rubric)
    print(json.dumps(report, indent=2))


  llm = AzureChatOpenAI(
  chain = LLMChain(llm=llm, prompt=prompt)
  raw = chain.run(context=context)


Rubric saved to rubric.json
{
  "scores": [
    {
      "name": "Clarity",
      "score": 85,
      "explanation": "The answer is generally clear but uses technical terms that might not be accessible to all readers."
    },
    {
      "name": "Accuracy",
      "score": 90,
      "explanation": "The methods mentioned are accurate for optimizing loops in Python but lack specific examples."
    },
    {
      "name": "Completeness",
      "score": 75,
      "explanation": "The answer mentions key optimization methods but doesn't cover all possible approaches."
    },
    {
      "name": "Relevance",
      "score": 95,
      "explanation": "The answer directly addresses the question by providing relevant techniques."
    },
    {
      "name": "Depth",
      "score": 70,
      "explanation": "The answer is somewhat superficial and doesn't delve deeply into any of the methods."
    },
    {
      "name": "Conciseness",
      "score": 90,
      "explanation": "The answer is succinct and avo

### run criteria on the dataset

In [None]:
import os
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate, LLMChain

# ─── Helper: robust JSON extraction ─────────────────────────────────────────────
def extract_json(text: str):
    """
    Parse the JSON object or array that starts at the first `{` or `[` in `text`.

    Only the first bracket is considered. The value is decoded with
    `json.JSONDecoder.raw_decode`, so brackets that appear inside JSON string
    values no longer cut the payload short, and any text after the JSON value
    is ignored.

    Returns the parsed JSON. Raises ValueError if `text` has no bracket or if
    the text starting at the first bracket is not valid JSON.
    """
    for start_idx, ch in enumerate(text):
        if ch in ("{", "["):
            try:
                parsed, _end = json.JSONDecoder().raw_decode(text, start_idx)
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"No complete JSON object/array found in LLM output:\n{text}"
                ) from err
            return parsed
    raise ValueError(f"No complete JSON object/array found in LLM output:\n{text}")

# ─── LLM setup ─────────────────────────────────────────────────────────────────
load_dotenv()
# os.getenv() returns None when a variable is unset, and assigning None into
# os.environ raises "TypeError: str expected, not NoneType". Check explicitly
# so the failure names the missing setting.
_missing = [
    name
    for name in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")
    if os.getenv(name) is None
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )
os.environ["OPENAI_API_TYPE"] = "Azure"

llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)

# ─── Evaluation function ───────────────────────────────────────────────────────
def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """
    Score a single question/answer pair against `rubric` with the LLM.

    `rubric` is dumped to indented JSON and placed in the prompt together with
    the question and answer. The prompt requests, per criterion, a "name", an
    integer "score" between 0 and 100 and a one-sentence "explanation", plus an
    "overall_score" averaged over the criteria.

    Returns the JSON value parsed from the reply by `extract_json`.
    """
    template_text = """
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"      (same as criterion)
- "score"     (integer between 0 and 100)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

Return **only** the final JSON object with keys "scores" and "overall_score".
"""
    scoring_prompt = PromptTemplate(
        template=template_text,
        input_variables=["rubric", "question", "answer"],
    )
    reply = LLMChain(llm=llm, prompt=scoring_prompt).run(
        rubric=json.dumps(rubric, indent=2),
        question=question,
        answer=answer,
    )
    return extract_json(reply)

# ─── 1. Load rubric from JSON file ─────────────────────────────────────────────
# Raw strings: in a plain literal the "\r" in "\rubric.json" is a carriage
# return, so open() would look for a differently named file; "\T" and "\d"
# are invalid escape sequences.
with open(r"Data\Technical\rubric.json", "r", encoding="utf-8") as f:
    rubric = json.load(f)

# ─── 2. Load dataset of Q/A pairs ─────────────────────────────────────────────
# Rows without a question or an answer cannot be evaluated, so drop them.
df = pd.read_csv(r"Data\Technical\dataset.csv").dropna(subset=["question", "answer"])

# ─── 3. Iterate and evaluate ─────────────────────────────────────────────────
# One entry per dataset row: the pair itself plus either the LLM's evaluation
# or, when the call or the JSON parsing fails, a dict holding the error text.
results = []
for _, record in df.iterrows():
    q, a = record["question"], record["answer"]

    try:
        report = evaluate_answer(q, a, rubric)
    except Exception as e:
        # Keep going: a failed row is recorded instead of stopping the run.
        report = {"error": str(e)}

    results.append({"question": q, "answer": a, "evaluation": report})

# ─── 4. Save full JSON report ─────────────────────────────────────────────────
# Raw string: "\e" is an invalid escape sequence in a plain literal.
with open(r"Data\Technical\evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

# ─── 5. Flatten for CSV ────────────────────────────────────────────────────────
# We'll make one row per criterion per Q/A, plus the overall score.
# The evaluation comes from an LLM, so its shape is not guaranteed: anything
# that is not a dict with a "scores" list is written as an error row, and
# missing per-criterion keys become empty cells instead of raising KeyError.
rows = []
for entry in results:
    evaluation = entry.get("evaluation")
    scores = evaluation.get("scores") if isinstance(evaluation, dict) else None
    if isinstance(scores, list):
        for crit in scores:
            if not isinstance(crit, dict):
                continue
            rows.append({
                "question": entry["question"],
                "answer": entry["answer"],
                "criterion": crit.get("name"),
                "score": crit.get("score"),
                "explanation": crit.get("explanation"),
                "overall_score": evaluation.get("overall_score"),
            })
    else:
        # if an error occurred, record it
        if isinstance(evaluation, dict):
            error = evaluation.get("error")
        else:
            error = f"Unexpected evaluation payload: {evaluation!r}"
        rows.append({
            "question": entry["question"],
            "answer": entry["answer"],
            "criterion": None,
            "score": None,
            "explanation": None,
            "overall_score": None,
            "error": error,
        })

df_flat = pd.DataFrame(rows)
df_flat.to_csv(r"Data\Technical\evaluation_results.csv", index=False)

print("Done! Results written to evaluation_results.json and evaluation_results.csv")

Done! Results written to evaluation_results.json and evaluation_results.csv


### Evaluate a New Incoming Question

- First, we take the user's question.
- Then we check whether it already exists in the dataset.
- If the question exists, weight the score 70% from that question and its answer in the dataset and 30% from the criteria.
    * This only applies when the dataset question itself scored above 70% against these criteria; those scores are stored in "evaluation_results_dataset.csv".
- If the question does not exist:
- Look at the top relevant questions and check whether they are actually close to it.
- If they are close, use them together with the criteria: 30% from them and 70% from the criteria.
- If the top retrieved questions are not relevant:
- Just use the criteria alone.

In [2]:
import os
import re
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain

# ─── Helpers ───────────────────────────────────────────────────────────────────
def extract_json(text: str):
    """Return the first JSON object or array found in ``text``.

    Each ``{`` / ``[`` is tried in turn as the start of a JSON value and
    decoded with ``json.JSONDecoder.raw_decode``. Unlike plain bracket
    counting, this is not confused by brackets inside string values, and a
    bracket that does not begin valid JSON is skipped instead of raising.

    Raises:
        ValueError: If nothing in ``text`` parses as a JSON object or array.
    """
    decoder = json.JSONDecoder()
    starts = (i for i, ch in enumerate(text) if ch in ("{", "["))
    for i in starts:
        try:
            value, _end = decoder.raw_decode(text, i)
        except json.JSONDecodeError:
            continue
        return value
    raise ValueError("No valid JSON in LLM output.")

def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """Score ``answer`` to ``question`` against ``rubric`` using the LLM.

    The rubric is embedded in the prompt as indented JSON. The prompt asks for
    a JSON object holding per-criterion "scores" and an "overall_score".

    Returns:
        The JSON value that ``extract_json`` parses from the model reply.
    """
    template_text = """
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"       
- "score"      (0–100 integer)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

Return **only** the JSON:
{{ "scores":[…], "overall_score":… }}
"""
    scoring_prompt = PromptTemplate(
        template=template_text,
        input_variables=["rubric", "question", "answer"],
    )
    rubric_json = json.dumps(rubric, indent=2)
    scoring_chain = LLMChain(llm=llm, prompt=scoring_prompt)
    reply = scoring_chain.run(rubric=rubric_json, question=question, answer=answer)
    return extract_json(reply)


# ─── Setup ─────────────────────────────────────────────────────────────────────
load_dotenv()
# os.getenv() returns None for an unset variable; assigning that back into
# os.environ raises a TypeError that does not say which setting is missing.
# Check both up front (they are also passed to the embeddings client below).
for _required in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"):
    if os.getenv(_required) is None:
        raise RuntimeError(
            f"{_required} is not set; define it in the environment or in the .env file"
        )
os.environ["OPENAI_API_TYPE"] = "Azure"

# LLM & Embeddings
llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

# Load dataset & build FAISS index on questions
# Raw string: "\T" and "\d" are invalid escape sequences in a plain literal.
# Rows without a question or answer are dropped, as in the other cells of this
# notebook; otherwise astype(str) would index a missing question as the text
# "nan", and a retrieved question could have no answer to evaluate.
df = pd.read_csv(r"Data\Technical\dataset.csv").dropna(subset=["question", "answer"])
questions = df["question"].astype(str).tolist()
vectorstore = FAISS.from_texts(questions, embeddings)
# Retrieve the 3 nearest dataset questions for each query.
retriever  = vectorstore.as_retriever(search_kwargs={"k":3})

# Chain to decide if a new question exactly exists
# Prompt text for the exact-match check; the reply is either `YES: "<question>"` or `NO`.
_MATCH_TEMPLATE = """
You are an assistant that determines if a new question exactly matches one in the dataset.
From these retrieved questions:
{context}

New Question:
{question}

Respond with **exactly**:
- YES: "<matched question>"  
- NO
"""
match_prompt = PromptTemplate(
    template=_MATCH_TEMPLATE,
    input_variables=["context", "question"],
)
match_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=match_prompt),
)

# Prompt text asking which retrieved candidates are really relevant; the reply
# is expected to contain a JSON array of the relevant candidate strings.
_RELEVANCE_TEMPLATE = """
New question:
{new_question}

Here are 3 candidate questions retrieved from the dataset:
{candidates}

For each candidate, respond with YES or NO if it's truly relevant to the new question.
Return a JSON array of only those candidate strings that are relevant.
Example output: ["Q1 text", "Q3 text"]
"""
relevance_prompt = PromptTemplate(
    template=_RELEVANCE_TEMPLATE,
    input_variables=["new_question", "candidates"],
)
relevance_chain = LLMChain(prompt=relevance_prompt, llm=llm)

# Load rubric (for evaluation)
with open(r"Data\Technical\rubric.json", mode="r", encoding="utf-8") as rubric_file:
    rubric = json.loads(rubric_file.read())


# ─── Main logic ────────────────────────────────────────────────────────────────
def main():
    """Interactively score one question/answer pair read from stdin.

    Flow:
      1. Ask ``match_chain`` whether the new question already exists in the
         dataset.
      2. If it does and its stored overall score (from the precomputed
         evaluation_results.json) is above 70, print a blend of 70% stored
         score and 30% fresh rubric score; otherwise print the fresh rubric
         evaluation only.
      3. If it does not, ask ``relevance_chain`` which of the retrieved
         questions are relevant. When at least one of them is found in the
         dataset, print a blend of 70% fresh rubric score and 30% average
         score of those dataset pairs; otherwise print the rubric evaluation
         only.
    """
    q_new = input("Enter your new question:\n> ").strip()
    a_new = input("\nEnter the answer to evaluate:\n> ").strip()

    # 1) Exact-match check
    docs = retriever.get_relevant_documents(q_new)
    match_out = match_chain.run(q_new).strip()

    if match_out.upper().startswith("YES"):
        # Found exact Q in dataset. The prefix test above ignores case, so the
        # capture must too, or a "Yes: ..." reply would lose its question.
        m = re.search(r'YES:\s*"(.*)"', match_out, flags=re.IGNORECASE)
        q_match = m.group(1) if m else None
        print(f"\n→ Exact match in dataset: “{q_match}”")

        # get its past overall score if available
        # (assumes you've pre-computed evaluation_results.json)
        with open(r"Data\Technical\evaluation_results.json", "r", encoding="utf-8") as f:
            past = json.load(f)
        past_scores = {}
        for e in past:
            evaluation = e.get("evaluation")
            # Entries without a usable evaluation dict are skipped; a missing
            # or null overall score counts as 0.
            if isinstance(evaluation, dict):
                past_scores[e["question"]] = evaluation.get("overall_score") or 0
        ds_score = past_scores.get(q_match, 0.0)
        print(f"Dataset Q/A overall score: {ds_score:.2f}/100")

        if ds_score > 70:
            print("Combining 70% dataset score + 30% fresh rubric evaluation…")
            new_eval = evaluate_answer(q_new, a_new, rubric)
            rub_score = new_eval["overall_score"]
            combined = 0.7*ds_score + 0.3*rub_score
            print(f"Rubric eval: {rub_score:.2f}/100  → Combined: {combined:.2f}/100")
            print("\nFull rubric breakdown:")
            print(json.dumps(new_eval, indent=2))
        else:
            print("Dataset score ≤70%, using criteria only:")
            new_eval = evaluate_answer(q_new, a_new, rubric)
            print(json.dumps(new_eval, indent=2))
        return

    # 2) No exact match → check relevance of top-3
    print("\n→ No exact match. Checking relevance of top-3 retrieved questions…")
    candidates = json.dumps([d.page_content for d in docs], indent=2)
    rel_raw = relevance_chain.run(new_question=q_new, candidates=candidates)
    relevant = extract_json(rel_raw)
    if not isinstance(relevant, list):
        # The prompt asks for an array; treat any other JSON shape as "none".
        relevant = []
    print(f"Relevant retrieved questions: {relevant}")

    # 3) Evaluate each relevant Q/A from dataset
    scores = []
    for q_old in relevant:
        answers = df.loc[df.question == q_old, "answer"]
        if answers.empty:
            # The LLM may return a question that is not verbatim in the
            # dataset; skip it instead of failing on .iloc[0].
            continue
        eval_old = evaluate_answer(q_old, answers.iloc[0], rubric)
        scores.append(eval_old["overall_score"])

    if not scores:
        print("None relevant → using criteria only:")
        new_eval = evaluate_answer(q_new, a_new, rubric)
        print(json.dumps(new_eval, indent=2))
        return

    avg_old = sum(scores)/len(scores)

    # 4) Fresh rubric score for new answer
    new_eval = evaluate_answer(q_new, a_new, rubric)
    rub_score = new_eval["overall_score"]

    # 5) Combine 70% rubric + 30% avg relevant
    combined = 0.7*rub_score + 0.3*avg_old
    print(f"\nRubric eval (70%): {rub_score:.2f}/100")
    print(f"Avg relevant Q/A eval (30%): {avg_old:.2f}/100")
    print(f"Combined final score: {combined:.2f}/100")
    print("\nFull rubric breakdown of your answer:")
    print(json.dumps(new_eval, indent=2))


if __name__ == "__main__":
    main()


  docs = retriever.get_relevant_documents(q_new)
  match_out = match_chain.run(q_new).strip()



→ No exact match. Checking relevance of top-3 retrieved questions…
Relevant retrieved questions: []
None relevant → using criteria only:
{
  "scores": [
    {
      "name": "Clarity",
      "score": 80,
      "explanation": "The answer is clear and easy to understand."
    },
    {
      "name": "Accuracy",
      "score": 100,
      "explanation": "The answer is factually correct, providing the name asked for."
    },
    {
      "name": "Completeness",
      "score": 50,
      "explanation": "The answer provides the name but lacks any additional context or detail."
    },
    {
      "name": "Relevance",
      "score": 100,
      "explanation": "The answer directly addresses the question asked."
    },
    {
      "name": "Depth",
      "score": 30,
      "explanation": "The answer lacks detail beyond a basic response."
    },
    {
      "name": "Conciseness",
      "score": 100,
      "explanation": "The answer is succinct and avoids unnecessary verbosity."
    },
    {
      "name

## HR

### Create a concrete‐answer HR dataset


In [12]:
import os
import json
import pandas as pd
import re
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate, LLMChain

# ─── Helper: robust JSON extraction ─────────────────────────────────────────────
def extract_json(text: str):
    """Return the first JSON object or array embedded in ``text``.

    Candidate start positions are every ``{`` and ``[`` in order. Each one is
    decoded with ``json.JSONDecoder.raw_decode``, which understands JSON
    strings, so a ``}`` or ``]`` inside a string value does not truncate the
    payload; a candidate that is not valid JSON is skipped.

    Raises:
        ValueError: If no candidate position yields a valid JSON value.
    """
    decoder = json.JSONDecoder()
    for start_idx, ch in enumerate(text):
        if ch in ("{", "["):
            try:
                return decoder.raw_decode(text, start_idx)[0]
            except json.JSONDecodeError:
                # Not JSON from here; move on to the next bracket.
                pass
    raise ValueError(f"No complete JSON object/array found in:\n{text}")

# ─── Helper: detect “instructional” style answers ──────────────────────────────
def is_instructional(text: str) -> bool:
    """
    Heuristically decide whether ``text`` reads like advice on how to answer
    rather than an answer itself.

    Returns True when the text opens (after optional whitespace) with one of
    a fixed list of coaching phrases, or contains one of a few directive
    words/phrases as whole words. Matching ignores case. Adjust the patterns
    as needed.
    """
    patterns = (
        r"^\s*(Start with|Remember that|BEST ANSWERS?|Best strategy|Example:|Remember, you|To answer this question|If you want to|The only right answer|To cover both|Many executives)",
        r"\b(you should|you must|always|never|exercise)\b",
    )
    return any(
        re.search(pattern, text, flags=re.IGNORECASE) is not None
        for pattern in patterns
    )

# ─── Helper: convert “instructions” → a concrete sample answer ────────────────
def convert_to_sample_answer(question: str, instructions: str, llm) -> str:
    """
    Turn answering instructions into one concrete example answer.

    Sends ``question`` and ``instructions`` to ``llm`` with a prompt asking
    for a brief example answer that applies the instructions, and returns the
    reply with surrounding whitespace stripped.
    """
    template_text = """
Here is an HR interview question:
{question}

Here are some instructions about *how* to answer that question in a perfectly effective way:
{instructions}

Please write a brief (1–2 paragraph) example answer to the question, 
directly implementing those instructions.
**Do not repeat the instructions.** Produce only the final, concrete answer.
"""
    sample_prompt = PromptTemplate(
        template=template_text,
        input_variables=["question", "instructions"],
    )
    reply = LLMChain(llm=llm, prompt=sample_prompt).run(
        question=question,
        instructions=instructions,
    )
    return reply.strip()

# ─── 1. Load credentials ───────────────────────────────────────────────────────
load_dotenv()
# os.getenv() returns None when a variable is unset, and assigning None into
# os.environ raises "TypeError: str expected, not NoneType". Report the
# missing setting by name instead.
for _required in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"):
    if os.getenv(_required) is None:
        raise RuntimeError(
            f"{_required} is not set; define it in the environment or in the .env file"
        )
os.environ["OPENAI_API_TYPE"] = "Azure"

llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)

# ─── 2. Load original HR Q/A CSV and preprocess to concrete answers ────────────
ORIGINAL_CSV = r"Data\HR\interview_best_answers_cleaned.csv"
SAMPLE_CSV   = r"Data\HR\interview_best_answers_samples.csv"

df_orig = pd.read_csv(ORIGINAL_CSV).dropna(subset=["question", "answer"])
df_samples = df_orig.copy()

# Replace instruction-style answers with an LLM-written example answer. When a
# conversion fails the original text is kept, but the failure is reported so
# that leftover instruction-style rows do not go unnoticed.
failed_rows = []
for idx, row in df_samples.iterrows():
    ans = row["answer"]
    if not is_instructional(ans):
        continue
    try:
        concrete = convert_to_sample_answer(row["question"], ans, llm)
    except Exception as exc:
        failed_rows.append(idx)
        print(f"Warning: could not convert row {idx}; keeping original answer ({exc})")
        continue
    df_samples.at[idx, "answer"] = concrete

if failed_rows:
    print(f"{len(failed_rows)} instructional answer(s) were left unconverted.")

# ─── 3. Save the new “samples” CSV ──────────────────────────────────────────────
# os.path.dirname() returns "" when the path has no directory component, and
# os.makedirs("") raises, so only create the directory when there is one.
_out_dir = os.path.dirname(SAMPLE_CSV)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)
df_samples.to_csv(SAMPLE_CSV, index=False, encoding="utf-8")
print(f"Concrete‐answer dataset saved to {SAMPLE_CSV}")


Concrete‐answer dataset saved to Data\HR\interview_best_answers_samples.csv


### Make the criteria (for HR Q/A)

In [13]:
import os
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate, LLMChain

# ─── Helper: robust JSON extraction ─────────────────────────────────────────────
def extract_json(text: str):
    """Return the first JSON object or array embedded in ``text``.

    Scans for ``{`` / ``[`` and lets ``json.JSONDecoder.raw_decode`` parse from
    each one. This handles brackets inside string values correctly and skips
    bracketed text that is not JSON, instead of failing on the first
    bracket-balanced span.

    Raises:
        ValueError: If no JSON object or array can be parsed from ``text``.
    """
    decoder = json.JSONDecoder()
    for start_idx, ch in enumerate(text):
        if ch != "{" and ch != "[":
            continue
        try:
            parsed, _end = decoder.raw_decode(text, start_idx)
        except json.JSONDecodeError:
            continue
        return parsed
    raise ValueError(f"No complete JSON object/array found in:\n{text}")

# ─── 1. Load credentials ───────────────────────────────────────────────────────
load_dotenv()
# Assigning os.getenv()'s None (unset variable) back into os.environ raises a
# TypeError with no hint about which setting is missing; check by name first.
_missing = [
    name
    for name in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")
    if os.getenv(name) is None
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )
os.environ["OPENAI_API_TYPE"] = "Azure"

llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)

# ─── 2. Rubric generation ─────────────────────────────────────────────────────
def generate_rubric(csv_path: str, max_examples: int = 50) -> list:
    """
    Ask the LLM for HR evaluation criteria based on sample Q/A pairs.

    Loads ``csv_path`` (columns "question" and "answer"), discards rows missing
    either column, and passes the first ``max_examples`` pairs to the prompt.
    Returns the JSON value that ``extract_json`` parses from the reply.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.dropna(subset=["question", "answer"])
    head = frame.head(max_examples)

    pair_blocks = [
        f"Q: {q_text}\nA: {a_text}"
        for q_text, a_text in zip(head["question"], head["answer"])
    ]
    context = "\n\n".join(pair_blocks)

    template_text = """
You are an expert evaluator. Here are sample HR Q/A pairs:
{context}

Generate a JSON **array** of 5–8 evaluation criteria tailored for HR question/answer quality. 
Each criterion must be an object with:
- "name": short title
- "description": one-sentence explanation

**Output only the JSON array**, without any extra commentary.
"""
    rubric_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context"],
    )
    rubric_chain = LLMChain(llm=llm, prompt=rubric_prompt)
    return extract_json(rubric_chain.run(context=context))

# ─── 3. Save & Load functions ───────────────────────────────────────────────────
def save_rubric(rubric: list, filepath: str):
    """Serialize ``rubric`` as indented UTF-8 JSON into ``filepath`` and print a notice."""
    with open(filepath, mode="w", encoding="utf-8") as handle:
        text = json.dumps(rubric, indent=2, ensure_ascii=False)
        handle.write(text)
    print(f"Rubric saved to {filepath}")

def load_rubric(filepath: str) -> list:
    """Parse and return the JSON stored in the UTF-8 file at ``filepath``."""
    with open(filepath, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# ─── 4. Evaluation function ────────────────────────────────────────────────────
def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """
    Score an HR answer against ``rubric`` with the LLM.

    The rubric is placed in the prompt as indented JSON alongside the question
    and answer. The prompt requests a JSON object with "scores" (one
    {name, score, explanation} per criterion) and "overall_score".

    Returns the JSON value that ``extract_json`` parses from the reply.
    """
    template_text = """
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this HR response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"       (same as criterion name)
- "score"      (integer 0–100)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

Return **only** the final JSON object with keys "scores" and "overall_score".
"""
    scoring_prompt = PromptTemplate(
        template=template_text,
        input_variables=["rubric", "question", "answer"],
    )
    reply = LLMChain(llm=llm, prompt=scoring_prompt).run(
        rubric=json.dumps(rubric, indent=2),
        question=question,
        answer=answer,
    )
    return extract_json(reply)

# ─── 5. Example usage ─────────────────────────────────────────────────────────
if __name__ == "__main__":
    SAMPLE_CSV = r"Data\HR\interview_best_answers_samples.csv"
    rubric_path = r"Data\HR\hr_rubric.json"

    # Build the rubric from the concrete-answer HR dataset and persist it.
    save_rubric(generate_rubric(SAMPLE_CSV), rubric_path)

    # A later run (or another script) only needs to load it back.
    rubric = load_rubric(rubric_path)

    # Score one sample HR question/answer pair and print the report.
    new_q = "How do you handle conflict in a team setting?"
    new_a = "I first listen actively to both sides, encourage open communication, and then work with them to find a compromise that aligns with our shared goals."
    report = evaluate_answer(new_q, new_a, rubric)
    print(json.dumps(report, indent=2))


Rubric saved to Data\HR\hr_rubric.json
{
  "scores": [
    {
      "name": "Relevance",
      "score": 90,
      "explanation": "The answer directly addresses the question by outlining a clear approach to handling conflict."
    },
    {
      "name": "Clarity",
      "score": 85,
      "explanation": "The response is clear and concise, detailing a straightforward process."
    },
    {
      "name": "Professionalism",
      "score": 80,
      "explanation": "The tone is professional and appropriate for an HR interview."
    },
    {
      "name": "Depth of Insight",
      "score": 75,
      "explanation": "While the response offers a basic strategy, it lacks detailed examples or insights."
    },
    {
      "name": "Positivity",
      "score": 85,
      "explanation": "The answer is positively framed, focusing on collaboration and shared goals."
    },
    {
      "name": "Engagement",
      "score": 70,
      "explanation": "The response does not explicitly invite further discussion

### Run criteria on the HR dataset


In [14]:
import os
import re
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate, LLMChain

# ─── Helper: robust JSON extraction ─────────────────────────────────────────────
def extract_json(text: str):
    """
    Parse the JSON object or array that begins at the first `{` or `[` in `text`.

    Only that first bracket is tried. Decoding uses
    `json.JSONDecoder.raw_decode`, which is aware of JSON strings, so a closing
    bracket inside a string value does not end the payload early; any text
    following the JSON value is ignored.

    Returns the parsed JSON. Raises ValueError if `text` contains no bracket or
    the text from the first bracket onward is not valid JSON.
    """
    message = f"No complete JSON object/array found in LLM output:\n{text}"
    start_idx = next(
        (i for i, ch in enumerate(text) if ch in ("{", "[")),
        None,
    )
    if start_idx is None:
        raise ValueError(message)
    try:
        parsed, _end = json.JSONDecoder().raw_decode(text, start_idx)
    except json.JSONDecodeError as err:
        raise ValueError(message) from err
    return parsed

# ─── New helper: detect “instructional” style answers ──────────────────────────
def is_instructional(text: str) -> bool:
    """
    Heuristic check for answers that are really instructions on how to answer.

    True when ``text`` starts (ignoring leading whitespace) with one of a
    fixed set of coaching phrases, or contains one of a few directive
    words/phrases as whole words; both checks ignore case. Adjust the
    patterns as needed.
    """
    opening_phrases = r"^\s*(Start with|Remember that|BEST ANSWERS?|Best strategy|Example:|Remember, you|To answer this question|If you want to|The only right answer|To cover both|Many executives)"
    directive_words = r"\b(you should|you must|always|never|exercise)\b"

    if re.search(opening_phrases, text, flags=re.IGNORECASE) is not None:
        return True
    return re.search(directive_words, text, flags=re.IGNORECASE) is not None

# ─── New helper: convert “instructions” → a concrete sample answer ─────────────
def convert_to_sample_answer(question: str, instructions: str, llm) -> str:
    """
    Produce one concrete example answer from a block of answering instructions.

    ``question`` and ``instructions`` are inserted into a prompt that asks
    ``llm`` for a brief example answer applying the instructions. The reply is
    returned with leading/trailing whitespace removed.
    """
    template_text = """
Here is an HR interview question:
{question}

Here are some instructions about *how* to answer that question in a perfectly effective way:
{instructions}

Please write a brief (1–2 paragraph) example answer to the question, 
directly implementing those instructions.
**Do not repeat the instructions.** Produce only the final, concrete answer.
"""
    sample_chain = LLMChain(
        llm=llm,
        prompt=PromptTemplate(
            template=template_text,
            input_variables=["question", "instructions"],
        ),
    )
    generated = sample_chain.run(question=question, instructions=instructions)
    return generated.strip()

# ─── LLM setup ─────────────────────────────────────────────────────────────────
load_dotenv()
# os.getenv() returns None for an unset variable, and assigning None into
# os.environ raises an unhelpful TypeError; name the missing setting instead.
for _required in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"):
    if os.getenv(_required) is None:
        raise RuntimeError(
            f"{_required} is not set; define it in the environment or in the .env file"
        )
os.environ["OPENAI_API_TYPE"] = "Azure"

llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)

# ─── Evaluation function ───────────────────────────────────────────────────────
def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """
    Score one HR question/answer pair against a rubric using the LLM.

    Args:
        question: The interview question.
        answer: The answer text to assess.
        rubric: List of criteria, each a dict with "name" and "description".

    Returns:
        The JSON value parsed from the model's reply, normally
        ``{"scores": [{"name", "score", "explanation"}, …], "overall_score": float}``.
        When every entry in "scores" carries a numeric score, "overall_score"
        is recomputed locally as their arithmetic mean.

    Raises:
        Whatever ``extract_json`` raises when the reply contains no parsable JSON.
    """
    rubric_json = json.dumps(rubric, indent=2)
    prompt = PromptTemplate(
        input_variables=["rubric", "question", "answer"],
        template="""
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this HR response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"       (same as criterion)
- "score"      (integer between 0 and 100)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

Return **only** the final JSON object with keys "scores" and "overall_score".
""",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    raw = chain.run(rubric=rubric_json, question=question, answer=answer)
    report = extract_json(raw)

    # The model is asked to average the scores itself, but its arithmetic is
    # not guaranteed to be correct. Recompute the mean whenever every
    # criterion has a numeric score; otherwise leave the reply untouched.
    if isinstance(report, dict) and isinstance(report.get("scores"), list):
        values = [
            item.get("score") if isinstance(item, dict) else None
            for item in report["scores"]
        ]
        if values and all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in values
        ):
            report["overall_score"] = sum(values) / len(values)
    return report

# ─── 1. Load rubric from JSON file ─────────────────────────────────────────────
# The rubric is the list of criteria passed to evaluate_answer for every row.
with open(r"Data\HR\hr_rubric.json", mode="r", encoding="utf-8") as f:
    rubric = json.loads(f.read())

# ─── 2. Load concrete‐answer HR dataset of Q/A pairs ───────────────────────────
# Rows missing either the question or the answer cannot be evaluated; drop them.
df = pd.read_csv(r"Data\HR\interview_best_answers_samples.csv")
df = df.dropna(subset=["question", "answer"])

# ─── 3. Iterate, convert if needed, and evaluate ──────────────────────────────
# Each result keeps the raw dataset answer, the text that was actually scored,
# and either the evaluation report or an {"error": ...} placeholder.
results = []
for q, raw_ans in zip(df["question"], df["answer"]):
    # Default to scoring the dataset text as-is.
    concrete_ans = raw_ans

    # If the “answer” is really a set of instructions, convert to a sample answer
    if is_instructional(raw_ans):
        try:
            concrete_ans = convert_to_sample_answer(q, raw_ans, llm)
        except Exception as exc:
            # Best-effort: keep the raw text, but make the fallback visible
            # instead of silently scoring instructions as if they were an answer.
            print(f"⚠️ Warning: could not convert instructions for {q!r}: {exc}")

    try:
        report = evaluate_answer(q, concrete_ans, rubric)
    except Exception as e:
        # One failed evaluation must not abort the whole batch.
        report = {"error": str(e)}

    results.append({
        "question": q,
        "raw_answer": raw_ans,
        "concrete_answer_used": concrete_ans,
        "evaluation": report
    })

# ─── 4. Save full JSON report ─────────────────────────────────────────────────
with open(r"Data\HR\hr_evaluation_results_with_samples.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

# ─── 5. Flatten for CSV ────────────────────────────────────────────────────────
# One CSV row per (question, criterion). Entries whose evaluation failed or is
# malformed produce a single row carrying an "error" value instead.
rows = []
for entry in results:
    base = {
        "question": entry["question"],
        "raw_answer": entry["raw_answer"],
        "concrete_answer": entry["concrete_answer_used"],
    }
    eval_block = entry["evaluation"]

    if isinstance(eval_block, dict) and isinstance(eval_block.get("scores"), list):
        for crit in eval_block["scores"]:
            # The report comes from an LLM, so individual criteria may be
            # missing fields or not be objects at all; tolerate that.
            crit = crit if isinstance(crit, dict) else {}
            rows.append({
                **base,
                "criterion": crit.get("name"),
                "score": crit.get("score"),
                "explanation": crit.get("explanation"),
                "overall_score": eval_block.get("overall_score"),
            })
    else:
        if isinstance(eval_block, dict):
            error = eval_block.get("error")
            if error is None and "scores" in eval_block:
                error = "Malformed 'scores' field in evaluation"
        else:
            # e.g. the model returned a bare JSON array instead of an object.
            error = f"Unexpected evaluation format: {type(eval_block).__name__}"
        rows.append({
            **base,
            "criterion": None,
            "score": None,
            "explanation": None,
            "overall_score": None,
            "error": error,
        })

df_flat = pd.DataFrame(rows)
df_flat.to_csv(r"Data\HR\hr_evaluation_results_with_samples.csv", index=False)

print("Done! HR results written to hr_evaluation_results_with_samples.json and hr_evaluation_results_with_samples.csv")


Done! HR results written to hr_evaluation_results_with_samples.json and hr_evaluation_results_with_samples.csv


### Evaluate a New HR Question Using Past Dataset + Criteria


In [15]:
import os
import re
import json
import pandas as pd
from dotenv import load_dotenv

from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain

# ─── Helpers ───────────────────────────────────────────────────────────────────
def extract_json(text: str):
    """
    Return the first JSON object or array embedded in `text`.

    Every "{" or "[" is tried as a potential start and handed to the JSON
    decoder, which consumes exactly one value from that position. Letting the
    decoder find the end (rather than counting brackets) keeps brackets inside
    string literals from confusing the search, and a candidate that turns out
    not to be valid JSON is skipped instead of aborting the search.

    Args:
        text: Arbitrary text, typically raw LLM output.

    Returns:
        The parsed JSON value (dict or list).

    Raises:
        ValueError: If no position in `text` starts a valid JSON object/array.
    """
    decoder = json.JSONDecoder()
    for start, ch in enumerate(text):
        if ch not in ("{", "["):
            continue
        try:
            value, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from here; try the next opening bracket.
            continue
        return value
    raise ValueError("No valid JSON in LLM output.")

def is_instructional(text: str) -> bool:
    """
    Heuristically decide whether an HR "answer" is guidance, not a real answer.

    The text is trimmed and lower-cased, then flagged when it opens with
    "best answers:" or mentions "instructions" or "strategy" anywhere.
    """
    normalized = text.strip().lower()
    if normalized.startswith("best answers:"):
        return True
    return any(marker in normalized for marker in ("instructions", "strategy"))

def convert_to_sample_answer(question: str, instruction: str, llm) -> str:
    """
    Ask the LLM for a concrete sample answer that follows `instruction`.

    `instruction` is the guidance text stored in the dataset for `question`;
    the model's reply is returned with surrounding whitespace stripped.
    """
    template_text = """
You are a helpful assistant. Based on these instructions, generate a concrete, polished
sample answer to the HR interview question.

Instructions:
{instruction}

Question:
{question}

Provide the resulting sample answer (1–3 paragraphs) that follows the instructions.
"""
    sample_prompt = PromptTemplate(
        template=template_text,
        input_variables=["instruction", "question"],
    )
    reply = LLMChain(llm=llm, prompt=sample_prompt).run(
        instruction=instruction,
        question=question,
    )
    return reply.strip()

# ─── Setup ─────────────────────────────────────────────────────────────────────
# Bring any values from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message. The previous form,
# `os.environ[name] = os.getenv(name)`, raised an opaque
# "str expected, not NoneType" TypeError when a variable was missing, and was
# a no-op otherwise (os.getenv already reads from os.environ).
_required_env = ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")
_missing_env = [name for name in _required_env if os.getenv(name) is None]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

os.environ["OPENAI_API_TYPE"] = "Azure"

# Chat model used by the matching, relevance and evaluation chains.
llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",
    azure_deployment="GPT-4O-50-1",
)
# Embedding model used to build the question index.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

# ─── Load HR dataset & build FAISS index on questions ─────────────────────────
HR_CSV_PATH = r"Data\HR\interview_best_answers_samples.csv"
# Drop incomplete rows, as the batch-evaluation cell does: otherwise a missing
# question would be indexed as the literal string "nan", and a missing answer
# would later be passed to is_instructional as a float.
df = pd.read_csv(HR_CSV_PATH).dropna(subset=["question", "answer"])
questions = df["question"].astype(str).tolist()
vectorstore = FAISS.from_texts(questions, embeddings)
# Retrieve the 3 nearest dataset questions for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# ─── Chain: EXACT‐MATCH check ──────────────────────────────────────────────────
# Prompt asking whether the new question is identical to one of the retrieved
# dataset questions. The requested reply is either `YES: "<matched question>"`
# or `NO`, which main() parses.
match_prompt = PromptTemplate(
    template="""
You are an assistant that determines if a new HR question exactly matches one in the dataset.
From these retrieved questions:
{context}

New Question:
{question}

Respond with exactly:
- YES: "<matched question>"
- NO
""",
    input_variables=["context", "question"],
)

# Retrieval-backed chain: it is given `retriever` and is invoked in main()
# with only the new question, so it fetches its own {context}.
match_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type_kwargs={"prompt": match_prompt},
    chain_type="stuff",
    llm=llm,
)

# ─── Chain: TOP‐3 RELEVANCE check ───────────────────────────────────────────────
# Asks the model which of the retrieved dataset questions are relevant to the
# new one. The prompt requests a single output format (a JSON array) and exact
# copies of the candidate strings, because main() parses the reply as JSON and
# looks each returned question up in the dataset by exact string equality.
relevance_prompt = PromptTemplate(
    input_variables=["new_question", "candidates"],
    template="""
New question:
{new_question}

Here are 3 candidate questions retrieved from the HR dataset:
{candidates}

Decide for each candidate whether it is truly relevant to the new question.
Return **only** a JSON array containing the relevant candidate strings, copied exactly as given above.
If none of the candidates is relevant, return an empty array: []
Example output: ["Tell me about yourself.", "What are your strengths?"]
""",
)
relevance_chain = LLMChain(llm=llm, prompt=relevance_prompt)

# ─── Load HR rubric & past evaluation results ─────────────────────────────────
RUBRIC_PATH = r"Data\HR\hr_rubric.json"
EVAL_JSON   = r"Data\HR\hr_evaluation_results_with_samples.json"

# Criteria used for every fresh evaluation.
with open(RUBRIC_PATH, mode="r", encoding="utf-8") as f:
    rubric = json.loads(f.read())

# Results of the earlier batch run over the dataset.
with open(EVAL_JSON, mode="r", encoding="utf-8") as f:
    past = json.loads(f.read())

# Precompute a map: question → old overall_score
# (entries without an overall score, e.g. failed evaluations, count as 0.0).
past_scores = dict(
    (record["question"], record["evaluation"].get("overall_score", 0.0))
    for record in past
)

# ─── Helpers (reused) ──────────────────────────────────────────────────────────
def evaluate_answer(question: str, answer: str, rubric: list) -> dict:
    """
    Score one HR question/answer pair against a rubric using the LLM.

    Args:
        question: The interview question.
        answer: The answer text to assess.
        rubric: List of criteria, each a dict with "name" and "description".

    Returns:
        The JSON value parsed from the model's reply, normally
        ``{"scores": […], "overall_score": …}``. When every entry in "scores"
        carries a numeric score, "overall_score" is recomputed locally as
        their arithmetic mean.

    Raises:
        ValueError: If the reply contains no parsable JSON (from extract_json).
    """
    prompt = PromptTemplate(
        input_variables=["rubric", "question", "answer"],
        template="""
You are an objective assessor. Here is a rubric (JSON array):
{rubric}

Now evaluate this HR response.

Question:
{question}

Answer:
{answer}

For each rubric item, produce an object with:
- "name"       (same as criterion)
- "score"      (0–100 integer)
- "explanation" (one-sentence rationale)

Then compute "overall_score" as the average of all scores.

Return **only** the JSON:
{{ "scores":[…], "overall_score":… }}
""",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    raw = chain.run(
        rubric=json.dumps(rubric, indent=2),
        question=question,
        answer=answer
    )
    report = extract_json(raw)

    # The model is asked to average the scores itself, but its arithmetic is
    # not guaranteed to be correct. Recompute the mean whenever every
    # criterion has a numeric score; otherwise leave the reply untouched.
    if isinstance(report, dict) and isinstance(report.get("scores"), list):
        values = [
            item.get("score") if isinstance(item, dict) else None
            for item in report["scores"]
        ]
        if values and all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in values
        ):
            report["overall_score"] = sum(values) / len(values)
    return report

# ─── Main interactive loop ────────────────────────────────────────────────────
def main():
    """
    Interactively score a new HR question/answer pair.

    Reads a question and an answer from stdin, then:

    1. Asks ``match_chain`` whether the question exactly matches a dataset
       question. If so, the stored overall score for that question is looked
       up in ``past_scores``; when it exceeds 70 the reported result is
       ``0.7 * stored + 0.3 * fresh rubric score``, otherwise only the fresh
       rubric evaluation is reported.
    2. Otherwise asks ``relevance_chain`` which retrieved questions are
       relevant, re-evaluates each one's dataset answer, and reports
       ``0.7 * fresh rubric score + 0.3 * mean(old scores)``. If no relevant
       question can be scored, only the rubric evaluation is reported.

    Everything is printed; nothing is returned.
    """
    q_new = input("Enter your new HR question:\n> ").strip()
    a_new = input("\nEnter the answer to evaluate:\n> ").strip()

    # 1) EXACT‐MATCH check (match_chain performs its own retrieval).
    match_out = match_chain.run(q_new).strip()

    # The prompt shows the expected reply as a bullet ('- YES: "…"'), so
    # tolerate leading bullet characters/whitespace before the verdict.
    said_yes = re.match(r"[\s\-*•]*YES\b", match_out, flags=re.IGNORECASE) is not None

    if said_yes:
        # Found an exact match in the HR dataset
        m = re.search(r'YES:\s*"(.*)"', match_out, flags=re.IGNORECASE)
        q_match = m.group(1) if m else None
        print(f"\n→ Exact match in HR dataset: “{q_match}”")

        # Retrieve its old overall score; an unknown question or a null
        # score in the stored results counts as 0.
        ds_score = past_scores.get(q_match, 0.0)
        if isinstance(ds_score, bool) or not isinstance(ds_score, (int, float)):
            ds_score = 0.0
        print(f"Dataset Q/A overall score: {ds_score:.2f}/100")

        if ds_score > 70:
            print("Combining 70% dataset score + 30% fresh rubric evaluation…")
            new_eval = evaluate_answer(q_new, a_new, rubric)
            rub_score = new_eval["overall_score"]
            combined = 0.7 * ds_score + 0.3 * rub_score
            print(f" • Rubric eval: {rub_score:.2f}/100")
            print(f" • Combined:    {combined:.2f}/100\n")
            print("Full rubric breakdown:")
        else:
            print("Dataset score ≤ 70 → using only fresh rubric evaluation:")
            new_eval = evaluate_answer(q_new, a_new, rubric)
        print(json.dumps(new_eval, indent=2))
        return

    # 2) NO EXACT MATCH → do TOP‐3 RELEVANCE
    print("\n→ No exact match. Checking relevance of top‐3 retrieved questions…")
    docs = retriever.get_relevant_documents(q_new)
    candidates = json.dumps([d.page_content for d in docs], indent=2)

    rel_raw = relevance_chain.run(new_question=q_new, candidates=candidates)

    try:
        relevant = extract_json(rel_raw)
    except ValueError:
        # LLM did not return valid JSON → treat as “no relevant questions”
        print("⚠️ Warning: could not parse relevance output as JSON.")
        relevant = []

    # Only a list of strings is usable; anything else means "none relevant".
    if isinstance(relevant, list):
        relevant = [item for item in relevant if isinstance(item, str)]
    else:
        relevant = []

    print(f"Relevant retrieved questions: {relevant}")

    # 3) Re-evaluate the dataset answer of each relevant question.
    old_scores = []
    for q_old in relevant:
        # The model may paraphrase a candidate, and a dataset answer may be
        # missing (NaN); skip such entries rather than crash.
        answers = df.loc[df.question == q_old, "answer"]
        if answers.empty or not isinstance(answers.iloc[0], str):
            print(f"⚠️ Warning: skipping “{q_old}” (no usable answer in dataset).")
            continue
        ans_old = answers.iloc[0]

        # If that old answer was instructional, generate a concrete sample
        ans_old_concrete = ans_old
        if is_instructional(ans_old):
            try:
                ans_old_concrete = convert_to_sample_answer(q_old, ans_old, llm)
            except Exception as exc:
                # Best-effort: fall back to the raw text, but say so.
                print(f"⚠️ Warning: could not convert instructions for “{q_old}”: {exc}")

        eval_old = evaluate_answer(q_old, ans_old_concrete, rubric)
        old_scores.append(eval_old["overall_score"])

    if not old_scores:
        # 3.a) No relevant Q found → score purely by rubric
        print("None relevant → using rubric only:")
        new_eval = evaluate_answer(q_new, a_new, rubric)
        print(json.dumps(new_eval, indent=2))
        return

    # 3.b) Found relevant Qs → average their old scores
    avg_old = sum(old_scores) / len(old_scores)

    # 4) Now score the new answer by rubric
    new_eval = evaluate_answer(q_new, a_new, rubric)
    rub_score = new_eval["overall_score"]

    # 5) Combine 70% rubric + 30% avg_old
    combined = 0.7 * rub_score + 0.3 * avg_old
    print(f"\n • Rubric eval (70%):          {rub_score:.2f}/100")
    print(f" • Avg relevant old eval (30%): {avg_old:.2f}/100")
    print(f" → Combined final score:       {combined:.2f}/100\n")
    print("Full rubric breakdown for the new answer:")
    print(json.dumps(new_eval, indent=2))

if __name__ == "__main__":
    main()



→ No exact match. Checking relevance of top‐3 retrieved questions…
Relevant retrieved questions: ['What do you worry about?']

 • Rubric eval (70%):          24.00/100
 • Avg relevant old eval (30%): 85.00/100
 → Combined final score:       42.30/100

Full rubric breakdown for the new answer:
{
  "scores": [
    {
      "name": "Relevance",
      "score": 50,
      "explanation": "The answer addresses the question but lacks detail."
    },
    {
      "name": "Clarity",
      "score": 30,
      "explanation": "The response is unclear due to language and phrasing."
    },
    {
      "name": "Professionalism",
      "score": 20,
      "explanation": "The tone is informal and not suitable for an HR interview."
    },
    {
      "name": "Depth of Insight",
      "score": 10,
      "explanation": "The answer provides minimal insight into the candidate's qualifications."
    },
    {
      "name": "Positivity",
      "score": 40,
      "explanation": "The answer does not explicitly frame 