## Import LangChain

In [1]:
import os
import time
from pathlib import Path
import traceback
import json
import sys
import argparse

from langchain.vectorstores import FAISS
#from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
import json

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
#from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI
#from langchain import PromptTemplate 
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
#from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.callbacks import get_openai_callback
import APIKEY
# Publish the OpenAI key held in the local APIKEY module through the process environment.
os.environ["OPENAI_API_KEY"] = APIKEY.API_KEY_SERVICE_OPENAI

## Import LangSmith

In [2]:
from langsmith import Client

In [3]:
# LangSmith tracing settings, applied to the process environment in a single update.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        # "LANGCHAIN_PROJECT": "Tracing Walkthrough - Test2",  # optional project override (disabled)
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_API_KEY": APIKEY.LANGCHAIN_API_KEY,  # Update to your API key
    }
)

# Name of the LangSmith dataset that the cells below list and evaluate against.
DATASET_NAME = "ds_m460_trm"

In [4]:
# LangSmith client; constructed without arguments here, after the LANGCHAIN_* environment variables were set.
client = Client()
# Look up datasets matching DATASET_NAME. `datasets` is not read again in the cells shown here.
datasets = client.list_datasets(dataset_name=DATASET_NAME)

In [5]:
# Examples stored in the dataset; the next cell pulls items from this with next().
examples = client.list_examples(dataset_name=DATASET_NAME)

In [6]:
# Preview exactly one example (this takes the first item off `examples`).
print(next(examples))

dataset_id=UUID('a117ee3a-5545-4f7f-bb72-95f756df6116') inputs={'question': 'Can M460 TRNG generate 1000 random bits per second?', 'chat_history': ''} outputs={'output': 'No, the True Random Number Generator (TRNG) in the M460 series is capable of generating 800 random bits per second, as stated in the technical reference manual.'} id=UUID('09321f02-5cb7-4054-aa87-47329624483b') created_at=datetime.datetime(2024, 2, 23, 7, 7, 56, 147914, tzinfo=datetime.timezone.utc) modified_at=datetime.datetime(2024, 2, 27, 8, 32, 45, 224832, tzinfo=datetime.timezone.utc) runs=[] source_run_id=None


## Create the chain to be tested

In [7]:
# Directory of the saved FAISS index; passed to FAISS.load_local() in QA_LangChain_RQA_chain.
# NOTE(review): absolute, machine-specific path — consider making this configurable.
input_doc_pth =r'C:\Users\USER\Desktop\llma\LLM_Playing\pyinstaller_bg\doc_faiss\QA\TRM_M463_M467_pypdf'

# Model name given to the answering ChatOpenAI instance in QA_LangChain_RQA_chain.
gptmodel = 'gpt-4-0125-preview' #'gpt-4-0125-preview'

# Value of "k" in the retriever's search_kwargs (MMR search).
mmr_num = 5

# Chip family name spliced into the system prompt text below.
chip_type = 'm460'
# System prompt. `chip_type` is concatenated in now; {context} and {chat_history}
# stay as template placeholders to be filled when the prompt is formatted.
system_template = """Use the following pieces of context and chat history to answer the question at the end. The context is Nuvoton """+chip_type+""" Series Technical Reference Manual.
If you don't know the answer or the question has nothing to do with technical, don't try to make up an answer.
----------------
{context}
{chat_history}"""
# Chat prompt = system instructions followed by the user's {question}.
messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template("{question}")
        ]
# Handed to the chain through combine_docs_chain_kwargs={"prompt": qa_prompt}.
qa_prompt = ChatPromptTemplate.from_messages(messages)


def QA_LangChain_RQA_chain():
    """Build a new ConversationalRetrievalChain over the saved FAISS index.

    Reads the module-level settings ``gptmodel``, ``input_doc_pth``, ``mmr_num``
    and ``qa_prompt``. Each call constructs its own LLM, embeddings, vector
    store, retriever and conversation memory, and prints two progress banners.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm`` with
        ``return_source_documents=True``.
    """
    chat_model = ChatOpenAI(temperature=0, model=gptmodel)

    # Embedding model used to open the vector store.
    print("===== Load the embedding model =====", flush=True)
    embedding_model = OpenAIEmbeddings()

    # Open the FAISS index from disk and wrap it in an MMR retriever
    # ("similarity" is the other search type worth testing).
    faiss_store = FAISS.load_local(input_doc_pth, embedding_model)
    retriever_options = {
        "search_type": "mmr",
        "search_kwargs": {"k": mmr_num},
    }
    mmr_retriever = faiss_store.as_retriever(**retriever_options)

    print("===== Create a ConversationalRetrievalChain chain =====", flush=True)
    # Plain buffer memory keyed to the chain's "question" input and "answer" output.
    memory_options = {
        "memory_key": "chat_history",
        "input_key": "question",
        "output_key": "answer",
        "return_messages": True,
    }
    chat_memory = ConversationBufferMemory(**memory_options)

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=mmr_retriever,
        memory=chat_memory,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )

#test_chain = QA_LangChain_RQA_chain()

## Configure evaluators

#### - Custom evaluator that logs a heuristic evaluation.

In [8]:
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run


@run_evaluator
def check_not_idk(run: Run, example: Example):
    """Heuristic evaluator: flag answers in which the chain admits uncertainty.

    Args:
        run: The traced chain run; its ``outputs["answer"]`` text is inspected.
        example: The dataset example for the run (not used by this heuristic;
            dataset labels would be available in ``example.outputs[key]`` and
            model inputs in ``run.inputs[key]``).

    Returns:
        An ``EvaluationResult`` under the key ``"not_uncertain"`` with score 0
        when the answer contains "don't know" or "not sure" (case-insensitive),
        score 1 otherwise, and no score when the run has no text answer.
    """
    # `outputs` can be missing (e.g. the run produced nothing), so avoid indexing it blindly.
    run_outputs = run.outputs or {}
    agent_response = run_outputs.get("answer")
    if not isinstance(agent_response, str):
        return EvaluationResult(
            key="not_uncertain",
            score=None,
            comment="Run has no text under outputs['answer']; nothing to score.",
        )

    # Compare case-insensitively and treat the typographic apostrophe like the ASCII one.
    normalized = agent_response.lower().replace("\u2019", "'")
    hedged = any(marker in normalized for marker in ("don't know", "not sure"))
    return EvaluationResult(
        key="not_uncertain",
        score=0 if hedged else 1,
    )

#### - Custom LangChain string evaluators

In [9]:
import re
from typing import Any, Optional
from langchain_core.prompts import PromptTemplate
from langchain.evaluation import StringEvaluator

class GradeEvaluator(StringEvaluator):
    """LLM-based grader that scores a prediction against a reference answer.

    The grading prompt asks the model for a grade between 0 and 100; the first
    number in the model's reply is converted to a score in the range [0, 1].
    """

    # First integer or decimal number in the grader's reply.
    _GRADE_PATTERN = re.compile(r"\d+(?:\.\d+)?")

    def __init__(self):
        """Build the grading chain (prompt template piped into the chat model)."""
        llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

        template = """You are a technical professor responsible for checking the correctness of technical issues and 
grading the students' answers to questions. You are given a question, the student's answer, and the true answer. 
You are asked to score the student's answer on a scale from 0 to 100.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: score here between 0 to 100

Grade the student answers based ONLY on their factual accuracy and 
note if the values of the any parameters are correct.
Ignore differences in punctuation and phrasing between the student answer and true answer. 
It is OK if the student answer contains more information than the true answer, 
as long as it does not contain any conflicting statements. Begin!
QUESTION:{input}
STUDENT ANSWER:{prediction}
TRUE ANSWER:{reference}
GRADE:
"""
        self.eval_chain = PromptTemplate.from_template(template) | llm

    @property
    def requires_input(self) -> bool:
        """The grading prompt needs the original question."""
        return True

    @property
    def requires_reference(self) -> bool:
        """The grading prompt needs the reference ("true") answer."""
        return True

    @property
    def evaluation_name(self) -> str:
        """Name under which this evaluator's feedback is reported."""
        return "scored_relevance"

    @classmethod
    def _parse_score(cls, reply: Any) -> Optional[float]:
        """Turn the grader's reply into a score in [0, 1].

        Returns ``None`` when the reply is not text or contains no number,
        instead of raising on a failed regex match.
        """
        if not isinstance(reply, str):
            return None
        match = cls._GRADE_PATTERN.search(reply)
        if match is None:
            return None
        # The prompt asks for 0-100; clamp in case the model answers outside that range.
        return min(max(float(match.group(0)) / 100.0, 0.0), 1.0)

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Grade ``prediction`` for ``input`` against ``reference``.

        Args:
            prediction: The answer produced by the chain under test.
            input: The question that was asked.
            reference: The expected answer from the dataset.
            **kwargs: Forwarded as the second argument of ``eval_chain.invoke``.

        Returns:
            ``{"score": value}`` where ``value`` is in [0, 1], or ``None`` when
            no grade could be read from the model's reply.
        """
        evaluator_result = self.eval_chain.invoke(
            {"input": input, "prediction": prediction, "reference": reference}, kwargs
        )
        return {"score": self._parse_score(evaluator_result.content)}

#### - Combine the custom & built-in evaluators in RunEvalConfig

In [10]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig
from langchain_core.prompts.prompt import PromptTemplate

#_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
#You are grading the following question:
#{query}
#Here is the real answer:
#{answer}
#You are grading the following predicted answer:
#{result}
#Respond with CORRECT or INCORRECT:
#Grade:
#"""
#PROMPT = PromptTemplate(input_variables=['result', 'query', 'answer'], template=_PROMPT_TEMPLATE)

# Built-in QA evaluator, judged by gpt-3.5.
_qa_evaluator = RunEvalConfig.QA(llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo-0125'))

# Rubric for the labeled score-string evaluator; scores are divided by 10 via normalize_by.
_accuracy_criteria = {
    "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
}
_accuracy_evaluator = RunEvalConfig.LabeledScoreString(
    _accuracy_criteria,
    normalize_by=10,
    llm=ChatOpenAI(temperature=0, model='gpt-4-0125-preview'),
)

# Evaluation setup: the chain's 'question' input and 'answer' output are what get graded.
# (EvaluatorType.EMBEDDING_DISTANCE is another built-in that could be added to `evaluators`.)
evaluation_config = RunEvalConfig(
    input_key='question',
    prediction_key='answer',
    evaluators=[_qa_evaluator, _accuracy_evaluator],
    # Custom StringEvaluator or RunEvaluator objects go here; they are applied to each prediction.
    custom_evaluators=[GradeEvaluator()],
)

  warn_deprecated(


## Run the agent and evaluators

In [11]:
import functools

from langchain.smith import arun_on_dataset, run_on_dataset

# Run the chain factory over the dataset and apply the configured evaluators.
# QA_LangChain_RQA_chain is passed uncalled as the chain factory.
# NOTE(review): the project name is fixed; re-running may collide with an existing
# project of the same name — confirm and bump the suffix if needed.
chain_results = run_on_dataset(
    dataset_name=DATASET_NAME,
    llm_or_chain_factory=QA_LangChain_RQA_chain,
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name="rag-chain-qa-1",
    # Project metadata communicates the experiment parameters,
    # Useful for reviewing the test results
    project_metadata={
        "env": "evl_chain.ipynb",
        # Taken from the chain configuration so the recorded model cannot drift from the one used.
        "model": gptmodel,
        "prompt": "QA_0226",
        "input_doc": "TRM_M463_M467_pypdf",
        "langchain_type": "ConversationalRetrievalChain"
    },
)

===== Load the embedding model =====


  warn_deprecated(


===== Create a ConversationalRetrievalChain chain =====
View the evaluation results for project 'rag-chain-qa-1' at:
https://smith.langchain.com/o/a2f265bc-195d-4980-affd-877d4fb55720/datasets/a117ee3a-5545-4f7f-bb72-95f756df6116/compare?selectedSessions=be988627-6c73-485e-88ea-f5ef04ce35d2

View all tests for Dataset ds_m460_trm at:
https://smith.langchain.com/o/a2f265bc-195d-4980-affd-877d4fb55720/datasets/a117ee3a-5545-4f7f-bb72-95f756df6116
===== Load the embedding model =====
===== Create a ConversationalRetrievalChain chain =====
===== Load the embedding model =====
===== Create a ConversationalRetrievalChain chain =====
===== Load the embedding model =====
===== Create a ConversationalRetrievalChain chain =====

===== Load the embedding model =====
===== Load the embedding model =====
===== Load the embedding model =====
===== Create a ConversationalRetrievalChain chain =====
===== Create a ConversationalRetrievalChain chain =====
===== Create a ConversationalRetrievalChain chai

Unnamed: 0,feedback.correctness,feedback.score_string:accuracy,feedback.scored_relevance,error,execution_time,run_id
count,24.0,24.0,24.0,0.0,25.0,25
unique,,,,0.0,,25
top,,,,,,f969b26c-7653-466e-97a8-d8a4c37c9ecd
freq,,,,,,1
mean,0.791667,0.641667,0.772917,,16.201695,
std,0.414851,0.333514,0.232182,,11.824602,
min,0.0,0.1,0.0,,3.984836,
25%,1.0,0.3,0.75,,5.865446,
50%,1.0,0.7,0.85,,10.6206,
75%,1.0,1.0,0.9,,24.473121,
