# LLM call

In [None]:
from typing import (Generic, List, Optional, Tuple, TypeVar, Any, AsyncIterator, 
                    Union, Sequence)
from langchain_core.messages import BaseMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.output_parsers import PydanticOutputParser
from langchain.output_parsers import RetryOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import Runnable, RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_core.prompt_values import PromptValue
from langchain_core.prompts import PromptTemplate
import langchain.evaluation.qa.eval_chain as eval_chain
import logging
import pandas as pd
from pydantic import BaseModel, Field

In [None]:
# Anything that can stand in for a single chat message.
# NOTE(review): presumably mirrors langchain_core's own alias of the same
# name — confirm, and consider importing it instead of redefining it.
MessageLikeRepresentation = Union[
    BaseMessage, list[str], tuple[str, str], str, dict[str, Any]
]
# Accepted model input: a prompt value, a raw string, or a sequence of
# message-like items. Used to annotate VLLMChatOpenAI._get_request_payload.
LanguageModelInput = Union[PromptValue, str, Sequence[MessageLikeRepresentation]]

In [None]:
# ChatOpenAI variant that sends the token limit under the older
# ``max_tokens`` request key.
# NOTE(review): presumably needed because the target vLLM server does not
# accept ``max_completion_tokens`` — confirm against the server version.
class VLLMChatOpenAI(ChatOpenAI):
    def _get_request_payload(
        self,
        input_: LanguageModelInput,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> dict:
        """Build the request payload, renaming the token-limit key.

        The payload is produced by the parent class; if it carries
        ``max_completion_tokens`` (the name that replaced ``max_tokens`` in
        the September 2024 release), the value is moved back under
        ``max_tokens``. Payloads without that key are returned unchanged.
        """
        request = super()._get_request_payload(input_, stop=stop, **kwargs)
        try:
            # pop() runs first, so nothing is written when the key is absent.
            request["max_tokens"] = request.pop("max_completion_tokens")
        except KeyError:
            pass
        return request

In [None]:
# Needed by the ``except`` clause in build_chain; without this import a
# parsing failure would surface as a NameError instead of being logged.
from langchain_core.exceptions import OutputParserException

logger = logging.getLogger(__name__)


def build_chain(llm: BaseChatModel,
                system_template: str,
                human_template: str,
                parser: Optional[PydanticOutputParser] = None) -> Runnable:
    """Assemble a prompt -> LLM chain, optionally with retrying structured parsing.

    Args:
        llm: Chat model that produces the completion (and is reused for
            parser retries).
        system_template: Template text for the system message.
        human_template: Template text for the human message.
        parser: Optional pydantic output parser. When given, its format
            instructions are injected into the chain input under the key
            ``response_format_description`` and the completion is parsed
            with up to 3 retries.

    Returns:
        A runnable. Without ``parser`` it yields a dict with the keys
        ``completion`` (the LLM output) and ``prompt_value`` (the rendered
        prompt). With ``parser`` it yields the parsed object.

    Raises:
        OutputParserException: re-raised from the returned runnable when the
            completion still cannot be parsed after the retries.
    """
    prompt = ChatPromptTemplate(
        messages=[
            SystemMessagePromptTemplate.from_template([{"text": system_template}]),
            HumanMessagePromptTemplate.from_template([{"text": human_template}]),
        ]
    )

    # Keep the rendered prompt next to the completion: the retry parser
    # needs both.
    chain = (
        prompt
        | RunnableParallel(completion=llm, prompt_value=RunnablePassthrough())
    )

    if parser is None:
        return chain

    retry_planner_parser = RetryOutputParser.from_llm(
        parser=parser,
        llm=llm,
        # On retry the original prompt is simply sent again.
        prompt=PromptTemplate.from_template("{prompt}"),
        max_retries=3,
    )

    def _do_parsing_retrying(x: dict):
        """Parse the completion in ``x``, retrying via the LLM on failure."""
        completion = x['completion'].content
        prompt_value = x['prompt_value']

        logger.info("Trying structured parsing, Received completion: %s", completion)

        try:
            return retry_planner_parser.parse_with_prompt(
                completion=completion,
                prompt_value=prompt_value,
            )
        except OutputParserException:
            logger.warning(
                "Proceeding without result due to parser errors (even after retrying). "
                "Prompt - %s",
                prompt_value,
            )
            raise

    return (
        # Expose the parser's format instructions as a prompt variable.
        RunnableLambda(lambda x: {**x, "response_format_description": parser.get_format_instructions()})
        | chain
        | RunnableLambda(_do_parsing_retrying, name="retry_planner_lambda")
    )

In [None]:
import os

# Endpoint and token passed to the chat client below. Both can be overridden
# through the environment so the notebook runs against another server without
# editing code; the defaults are the previously hard-coded values.
# VLLM_-prefixed names are used on purpose so that a real OPENAI_API_KEY in
# the environment is never sent to this endpoint by accident.
OPENAI_API_BASE = os.environ.get("VLLM_API_BASE", "http://10.32.2.11:8041/v1")
OPENAI_API_KEY = os.environ.get("VLLM_API_KEY", "token-abc123")

In [None]:
# Chat client used by the evaluation chain; connection settings come from
# OPENAI_API_BASE / OPENAI_API_KEY defined above.
llm = VLLMChatOpenAI(
    model="/model",
    base_url=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY,
    temperature=0.3,
    max_tokens=8096,
)

In [None]:
df = pd.read_json("ObliQA_trained.json")

# Pull the two answer columns out as plain lists. ``.tolist()`` is positional,
# so it also works when the frame's index is not 0..n-1 (label lookup with
# ``df[col][i]`` would raise KeyError in that case).
# "Or_A" feeds the "Golden Answer" slot and "A" the "Model Answer" slot of the
# evaluation prompt.
Original_Answers = df["Or_A"].tolist()
Answers = df["A"].tolist()

In [None]:
# Structured result of one golden-vs-generated comparison, as parsed from the
# LLM output by PydanticOutputParser.
# NOTE: documented with comments rather than a class docstring on purpose — a
# docstring would become part of the model's JSON schema and thereby change
# the format instructions sent to the LLM.
class ExtractedMeta(BaseModel):
    # Every field is optional and defaults to None, so a completion that omits
    # a score still parses; downstream code must be ready for None values.
    correctness: Optional[float] = Field(default=None, description="Correctness score")
    coverage: Optional[float] = Field(default=None, description="Coverage score")
    clarity: Optional[float] = Field(default=None, description="Clarity score")
    overall_assessment: Optional[str] = Field(default=None, description="1–3 sentence summary of the comparison.")

In [None]:
# System prompt for the LLM judge. The {response_format_description}
# placeholder is filled by build_chain with parser.get_format_instructions().
system_template = """Task:
You are given two answers to the same question:
Golden Answer – the correct or ideal reference answer.
Generated Answer – the model-produced answer to evaluate.
Your job is to compare the Generated Answer against the Golden Answer using the criteria below and then produce a short structured evaluation.

Criteria (USE ONLY THESE CRITERIA):
Correctness — Does the generated answer provide factually correct information compared to the golden answer?(NOTE: Being different from the golden answer is allowed if still correct.)
Coverage of Key Details — Does the generated answer include the important points, constraints, and nuances mentioned in the golden answer?
Clarity & Coherence — Is the answer well-structured, easy to understand, and logically flowing?

Represent the answer as following:
Correctness score - the float number from 0.0 to 1.0(the more - the better, where 1.0 - 100% correct answer)
Coverage score - the float number from 0.0 to 1.0(the more - the better, where 1.0 - 100% details mentioned)
Clarity score - the float number from 0.0 to 1.0(the more - the better, where 1.0 - 100% correct structure, understandable and logically correct)
Overall assessment - summary of scoring.

Follow the answer format:
{response_format_description}
"""

# Human prompt carrying the pair to compare; {answer} and {model_answer} are
# supplied per item when the chain is invoked.
human_template = """REAL DATA: The following section is the real data. You should use only this real data to prepare your answer. Extract all the necessary information for the answer.
Golden Answer: {answer}
Model Answer: {model_answer}
"""


# Parse completions into ExtractedMeta and build the judge chain around it.
parser = PydanticOutputParser(pydantic_object=ExtractedMeta)
meta_chain = build_chain(llm, system_template, human_template, parser)

In [None]:
result = []
for i in range(0, len(Original_Answers), 50):
    result.append(await meta_chain.abatch([{
            "answer": Original_Answers,
            "model_answer": Answers,
        } for j in range(min(50, len(Original_Answers)-i))] ))

In [None]:
# Sanity check: show the first evaluation of the first batch.
print(result[0][0])

In [None]:
# Flatten the per-batch lists into one list of evaluations, keeping order.
new_result = []
for batch in result:
    new_result.extend(batch)
print(new_result[0])

In [None]:
import json

# Turn every parsed evaluation into a plain dict so it can be serialized.
Result_dict = [
    {
        "Correctness": meta.correctness,
        "Coverage": meta.coverage,
        "Clarity": meta.clarity,
        "Overall_assessment": meta.overall_assessment,
    }
    for meta in new_result
]
print(Result_dict[0])

# Persist the scores for later comparison.
with open('zeroshot_result.json', 'w') as fout:
    json.dump(Result_dict, fout)

In [None]:
# Number of evaluations collected.
print(len(Result_dict))

In [None]:
# Load a previously saved metrics table for comparison. Note that this
# rebinds `df`, which earlier held the ObliQA answers.
# The averaging cell below reads its "Correctness", "Coverage" and "Clarity"
# columns.
df = pd.read_json("Metrics for unlearned.json")
print(df)

In [None]:
def _mean_score(values):
    """Return the arithmetic mean of ``values`` as a float.

    Missing entries (``None`` or NaN) are skipped instead of raising a
    TypeError or poisoning the result. The result is NaN when no valid entry
    remains, which includes empty input.
    """
    return float(pd.Series(list(values), dtype="float64").mean())


# "_u" averages come from the metrics table loaded into `df`;
# "_l" averages come from the evaluations collected in Result_dict.
avg_cor_u = _mean_score(df["Correctness"])
avg_cov_u = _mean_score(df["Coverage"])
avg_cla_u = _mean_score(df["Clarity"])

# Scores in Result_dict may be None (the parsed fields are optional), so they
# go through the same missing-value-tolerant helper.
avg_cor_l = _mean_score(row["Correctness"] for row in Result_dict)
avg_cov_l = _mean_score(row["Coverage"] for row in Result_dict)
avg_cla_l = _mean_score(row["Clarity"] for row in Result_dict)

print(f"Average Correctness: {avg_cor_u} vs {avg_cor_l}\nAverage Coverage: {avg_cov_u} vs {avg_cov_l}\nAverage Clarity: {avg_cla_u} vs {avg_cla_l}")