# Evaluate Dataset

## Generate code with various LLMs and send it to the WMX3 engine to run, log, and plot.

In [34]:

from langchain_openai import ChatOpenAI, AzureOpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import Runnable
from langchain.schema.runnable.config import RunnableConfig

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, TextLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain.chains import LLMChain
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.retrievers import BM25Retriever, EnsembleRetriever

from langchain_groq import ChatGroq
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_core.messages import HumanMessage
from langchain_ollama import ChatOllama

from time import *

from CodeClient import *
from make_code_runnable import *
from plot_log import *
from tqdm import tqdm
import json
import os
import re
from datetime import datetime
from dotenv import load_dotenv,find_dotenv


# Load environment variables (e.g. API keys kept in a .env file) before any
# LLM client is constructed, so the client can pick up its credentials.
# Calling load_dotenv again later is harmless.
load_dotenv(find_dotenv())

# Global variable to store the name of the LLM; it is also used to build the
# log folder and log file names.
llm_name = "gpt-4o-mini"
llm = ChatOpenAI(name="MCCoder and QA", model_name=llm_name, temperature=0.2, streaming=True)

# # Groq
# llm_name = "llama-3.1-8b-instant"
# llm = ChatGroq(
#     temperature=0.2,
#     model=llm_name)   # llama-3.1-8b-instant,  llama3-70b-8192,  llama-3.1-70b-versatile, llama-3.1-405b-reasoning, mixtral-8x7b-32768

# Tongyi Qwen
# llm_name = "qwen-plus"
# llm = ChatTongyi(
#     temperature=0.2,
#     model=llm_name)   # qwen-turbo(8k), qwen-plus  (32k), qwen-max  (6k),  qwen-max-longcontext (28k)

# Ollama
# llm_name = "deepseek-coder-v2"
# llm = ChatOllama(
# model="deepseek-coder-v2",            # codellama:7b , codellama:34b, tinyllama, codegeex4, deepseek-coder-v2
# temperature=0.2)


# Prepare docs for RAG

# Load environment variables from the nearest .env file.
load_dotenv(find_dotenv()) 

# Preparation of documents for RAG-------------------------
# Embedding model used by the vector store for similarity retrieval.
embedding_model=OpenAIEmbeddings(model="text-embedding-3-large")   #text-embedding-3-large   #text-embedding-ada-002    #text-embedding-3-small

# Embedding model for Azure OpenAI, no need FQ.
# embedding_model = AzureOpenAIEmbeddings(model="text-embedding-3-large")


# Open the persisted Chroma vector store if its directory exists on disk.
vectorstore_path = "Vectorstore/chromadb-MCCoder"
if os.path.exists(vectorstore_path):
    vectorstore = Chroma(
                    embedding_function=embedding_model,
                    persist_directory=vectorstore_path,
                    ) 
    print("load from disk: " + vectorstore_path)
else:
    # Load from chunks and save to disk
    # vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model, persist_directory=vectorstore_path) 
    # NOTE(review): with the line above commented out, `vectorstore` is never
    # assigned in this branch, so coder_retrieval() and self_correct() will
    # fail with NameError later. `splits` is also only defined further below.
    print("load from chunks")



# Text loader for the sample codes; the resulting chunks feed the BM25 search.
loader = TextLoader("./docs/WMX3API_MCEval_Samplecodes.txt")
docs = loader.load()

# Chunk the sample codes with a dedicated separator.
separators = ['``']  # Adjust based on actual document structure, `` is the end of each code snippet.
text_splitter = RecursiveCharacterTextSplitter(separators=separators, keep_separator=True, chunk_size=1000, chunk_overlap=200, add_start_index=True)
splits = text_splitter.split_documents(docs)

# Module-level holder for the question currently being evaluated; it is set by
# EvalDataset() and read by self_correct().
user_question_global = ''

# Classifies a user question as a coding request (or not) and wraps it in a list.
def coder_router(user_question):
    """
    Decide whether a user question is a code-generation request.

    A question counts as a coding request when it begins (case insensitive)
    with 'Write a python code', 'Python code' or 'write python'. The question
    itself is never split or rewritten: it is always returned unchanged as the
    only element of the result list.

    Args:
        user_question (str): The user's question.

    Returns:
        tuple: (NoCoder, result). NoCoder is 0 when the question starts with
        one of the phrases above and 1 otherwise; result is a one-element list
        holding the original question.
    """
    starts_like_code_request = re.match(
        r'(?i)^(Write a python code|Python code|write python)', user_question
    ) is not None
    NoCoder = 0 if starts_like_code_request else 1
    return NoCoder, [user_question]



# This function retrieves and concatenates documents for each element in the input string array.
def coder_retrieval(coder_router_result, *, use_basic_rag=False, k=5):
    """
    Retrieve reference documents for each query string and build one context string.

    For every element of ``coder_router_result`` the retrieved documents are
    formatted with ``format_docs`` and appended after the element itself
    (separated by a newline). Each element/result pair is terminated by the
    separator "\\n----------\\n".

    Two retrieval modes are available:
      * basic: similarity search on the module-level ``vectorstore`` only;
      * hybrid (default): an equally weighted ensemble of a BM25 retriever
        built from the module-level ``splits`` and the similarity retriever.

    The retrievers are built once per call rather than once per element.

    Args:
        coder_router_result (list): An array of query strings.
        use_basic_rag (bool): Use plain similarity retrieval instead of the
            hybrid ensemble. Defaults to False.
        k (int): Number of documents requested from each retriever.
            Defaults to 5.

    Returns:
        str: The concatenated "element + retrieval result + separator"
             sections, or an empty string when there is nothing to retrieve.
    """
    # Nothing to look up: skip building the retrievers altogether.
    if not coder_router_result:
        return ""

    separator = "\n----------\n"
    vector_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": k})

    if use_basic_rag:
        # Basic retrieval
        active_retriever = vector_retriever
    else:
        # Fusion retrieval or hybrid search: BM25 keyword search combined with
        # the vector similarity search.
        bm25_retriever = BM25Retriever.from_documents(splits)
        bm25_retriever.k = k
        active_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever], weights=[0.5, 0.5])

    sections = [
        element + "\n" + format_docs(active_retriever.invoke(element)) + separator
        for element in coder_router_result
    ]
    return "".join(sections)


# Joins the page content of each document with double newline
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Extracts code snippets written in Python from the given text
def extract_code(text):
    """
    Pull the Python snippets out of an LLM reply.

    Everything found between a ```python opening fence and the next ``` fence
    is collected. Several snippets are joined with a "# ---" divider line.
    When the reply contains no such fenced block, the text is returned as is.

    Args:
        text (str): Raw text, typically the output of an LLM.

    Returns:
        str: The extracted code, or the unchanged input when no fenced block exists.
    """
    fenced_blocks = re.findall(r"```python(.*?)```", text, re.DOTALL)
    if fenced_blocks:
        return "\n\n# ---\n\n".join(fenced_blocks)
    # No fenced block found: fall back to the whole reply.
    return text


# Call LLM to generate code
def CoderLLM(user_question, code_context, task_id):
    """
    Ask the LLM to generate Python code for a question, given retrieved context.

    The raw LLM reply is also written to
    ``<log folder>/<task_id>_<llm_name>_direct_output.txt`` (the folder is
    created when missing), and the Python code extracted from the reply is
    returned.

    Args:
        user_question (str): The question / instruction to write code for.
        code_context (str): Retrieved sample codes placed in the prompt as Context.
        task_id: Identifier of the task, used in the output file name.

    Returns:
        str: The code extracted from the LLM reply by ``extract_code``.
    """

    # Prompt for code generation
    prompt_template = """Write a python code based on the following Question and Context. You need to choose the most relevant sample codes from the Context for a reference. And, note the following situations:
    1. Review the Question carefully and only if you find words as 'Axis number', 'IO Input' and 'IO Output'(case insensitive), add them to the first lines of the generated code in the following format: 
    # Axes = [Axis number 1, Axis number 2, ...]
    # Inputs = [byte.bit 1, byte.bit 2, ...]
    # Outputs = [byte.bit 1, byte.bit 2, ...]
    For instance, if the Question is '...Axis 9..., ...Axis 12..., ...Axis 2..., IO Input 0.3 and 1.2, ...IO Output 3.4 and 6.1', then exact the information after matching the keywords: "Axis", "Input", "Output":
    # Axes = [9, 12, 2]
    # Inputs = [0.3, 1.2, ...]
    # Outputs = [3.4, 6.1, ...]
    2. Include all the generated codes within one paragraph between ```python and ``` tags. 
    3. Don't import any library.
    4. Don't create any functions or example usage or unit test.
    5. You need to wait until the Axes reaches the target position and stops, after the motion API, unless otherwise specified. For instance, Wmx3Lib_cm.motion.Wait(4), while 4 is the Axis specified in Axes.
    6. Use StartPos for absolute positioning, as in 'Move Axis 4 to 200', and StartMov for relative positioning, as in 'Move Axis 4 by a distance of 200'.
    7. Strictly follow the Question for the specified profile type.
    8. If acceleration/acc, deceleration/dec, and velocity/speed are not specified in the user query, use the default values provided in the context's sample codes.
    ----------------------------------------------

    Question: 
    {question}

    Context: 
    {context}

        """

    prompt_code = ChatPromptTemplate.from_template(prompt_template)

    # prompt -> LLM -> plain string
    rag_chain = prompt_code | llm | StrOutputParser()

    codes = rag_chain.invoke({"context": code_context, "question": user_question})

    folder_path = f'/Users/yin/Documents/GitHub/MCCodeLog/{llm_name}'
    file_path = os.path.join(folder_path, f"{task_id}_{llm_name}_direct_output.txt")
    os.makedirs(folder_path, exist_ok=True)

    # Write the direct output of the LLM to the file. An explicit UTF-8
    # encoding keeps non-ASCII characters in the reply from failing on
    # platforms whose default encoding is not UTF-8.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(codes)

    # Get python code from the output of LLM
    return extract_code(codes)

# Corrects the provided error codes based on specified error information calling LLM
def self_correct(err_info, original_code):
    """
    Ask the LLM to correct code that failed, using the error text as guidance.

    The last line of ``err_info`` that contains 'Error:' is used as the query
    for a similarity search restricted to the FunctionPython reference
    documents of the module-level ``vectorstore``. When no such line exists,
    the whole (stripped) error text is used instead, so the search is never
    run with an empty query. The user question is read from the module-level
    ``user_question_global``.

    Args:
        err_info (str): Error output returned after running the code.
        original_code (str): The code that produced the error.

    Returns:
        str: The raw LLM reply containing the corrected code (not yet passed
             through ``extract_code``).
    """
    # Search to get the python function as a context for self correction.
    python_function_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6, "filter":{"source":"./docs/WMX3API_FunctionPython.json"}})

    # Keep the last line that mentions 'Error:'; fall back to the full error
    # text when there is none.
    error_lines = [line for line in err_info.split('\n') if 'Error:' in line]
    error_str = error_lines[-1] if error_lines else err_info.strip()

    print('\nerror string:----------------------\n' + error_str )
    python_function_result = python_function_retriever.invoke(error_str)
    err_ref = format_docs(python_function_result)
    print('\nerror ref:----------------------\n' + err_ref)

    # Remember to write "python" code in the prompt later
    template = """Correct the original code based on the user question, error infomation and FunctionPython reference. And, note the following situations:
    1. Only if the error is 'variable_name is not defined', and if the variable_name is in the user question , assigni it a value of None firstly.
    2. Only if an error information indicates that acc, dec, velocity, or other arguments are out of range, just assign them the default values presented in the preceding code samples. For instance, xxx.profile.acc = 10000, xxx.profile.dec = 10000.
    3. Only if an error information indicates that '... buffer memory has already been allocated...', free the buffer in the beginning.

        User question:
        {user_question}

        Original code:
        {original_code}

        Error information:
        {err_info}

        FunctionPython reference:
        {err_ref}


        """

    custom_rag_prompt = PromptTemplate.from_template(template)

    # prompt -> LLM -> plain string
    rag_chain = custom_rag_prompt | llm | StrOutputParser()

    code_corrected = rag_chain.invoke({"user_question": user_question_global, "original_code": original_code, "err_info": err_info, "err_ref": err_ref})

    return code_corrected


# Decompose tasks from user questions using a LLM
def task_decomposer_llm(user_question):
    """
    Split a user question into numbered sub-tasks with the help of the LLM.

    The LLM is asked to decompose the question only when it contains a
    consecutive numbered list. Its reply is then scanned line by line: a line
    is accepted as the next task when, after stripping surrounding whitespace,
    it starts with the next expected number followed by a period ('1.', '2.',
    ...). Blank lines, indentation and introductory lines therefore do not
    break the numbering.

    Args:
        user_question (str): The user's question.

    Returns:
        list: The task strings in order. When no numbered task is found, a
              one-element list holding the raw LLM reply.
    """
    template = """Only if the user question contains a consecutive numbered list as '1.', '2.', '3.', decompose the tasks; otherwise just output the user question. For example, the user question 'Write Python code to execute the following tasks: 1. Move Axis 1 to 200; 2. Move Axis 9 as a distance of 150; 3. Set IO output 4.3 to 1, and sleep for 1.5 seconds.' should be decomposed into three tasks as output adding 'Write python code to ':
    1. Write python code to Move Axis 1 to 200;
    2. Write python code to Move Axis 9 as a distance of 150;
    3. Write python code to Set IO output 4.3 to 1, and sleep for 1.5 seconds.

        User question:
        {question}

        Output:

        """

    custom_rag_prompt = PromptTemplate.from_template(template)

    # prompt -> LLM -> plain string
    rag_chain = custom_rag_prompt | llm | StrOutputParser()

    task_str = rag_chain.invoke({"question": user_question})

    tasks = []
    for raw_line in task_str.splitlines():
        line = raw_line.strip()
        # The expected number follows the tasks accepted so far, not the raw
        # line index. The lookahead keeps '1.' from matching a line that
        # starts with a decimal such as '1.5'.
        if re.match(rf"{len(tasks) + 1}\.(?!\d)", line):
            tasks.append(line)

    # If there are no multiple tasks, just output the reply as a single task.
    if not tasks:
        tasks.append(task_str)

    return tasks


# Compose the codes of several sub-tasks into one program using a LLM
def tasks_composer_llm(user_question, code_from_llm_str):
    """
    Merge the code generated for individual sub-tasks into one Python program.

    Args:
        user_question (str): The original user question the program must address.
        code_from_llm_str (str): The sub-tasks and their generated codes,
            concatenated into one string; passed to the prompt as Context_Codes.

    Returns:
        str: The code extracted from the LLM reply by ``extract_code``.
    """
    template = """Write a Python code that incorporates the Context_Codes (tasks) to address the following Question:

    Question: 
    {question}

    Context_Codes: 
    {context}

        """

    custom_rag_prompt = PromptTemplate.from_template(template)

    # prompt -> LLM -> plain string
    rag_chain = custom_rag_prompt | llm | StrOutputParser()

    # The keys must match the template variables {question} and {context}.
    code_from_composer_llm = rag_chain.invoke({"question": user_question, "context": code_from_llm_str})

    # Get python code from the output of LLM
    return extract_code(code_from_composer_llm)


# Send the code generated by the LLM to WMX3 engine
def RunCode(codes_from_llm, task_info):
    """
    Run LLM-generated code through SendCode, with one self-correction retry.

    The code is wrapped by ``make_code_runnable`` and sent with ``SendCode``.
    If the returned text contains 'error' (case insensitive), the LLM is asked
    once to correct the code via ``self_correct``, and the corrected code is
    sent again. The outcome of that retry is appended to the returned text and
    printed.

    Args:
        codes_from_llm (str): The code generated by the LLM.
        task_info: Task identifier forwarded to ``make_code_runnable``.

    Returns:
        str: The text returned by SendCode; after a retry it ends with either
             "Self-corrected." or "Self-correction but still got an error.".
    """
    first_result = SendCode(make_code_runnable(codes_from_llm, llm_name, task_info))

    # First attempt succeeded: nothing to correct.
    if 'error' not in first_result.lower():
        return first_result

    # Ask the LLM for a corrected version and run it once more.
    corrected_reply = self_correct(first_result, codes_from_llm)
    corrected_code = extract_code(corrected_reply)
    retry_result = SendCode(make_code_runnable(corrected_code, llm_name, task_info))

    if 'error' in retry_result.lower():
        verdict = "Self-correction but still got an error.\n\n"
    else:
        verdict = "Self-corrected.\n\n"

    print(verdict)
    return retry_result + verdict
    

# Evaluate dataset
def EvalDataset(task_ids=range(1, 106)):
    """
    Run the evaluation dataset through the code-generation pipeline and report results.

    For every task id the instruction is routed, decomposed, turned into code
    by the LLM, executed via ``RunCode`` and classified as correct,
    self-corrected or failed. Logs of correct tasks are plotted with
    ``plot_log``. Overall and per-difficulty statistics plus the error and
    self-correction logs are printed and written to a run-log file in the log
    folder of the current LLM.

    Tasks that cannot be evaluated are reported on stdout and skipped without
    being counted: ids missing from the dataset, and instructions that
    ``coder_router`` does not recognise as a coding request.

    Side effects: sets the module-level ``user_question_global`` to the
    instruction currently being processed.

    Args:
        task_ids (iterable): TaskId values to evaluate. Defaults to 1..105.
    """
    # Declare the use of the global variable to store user question
    global user_question_global

    def ratio(count, total):
        # Share of `count` in `total`; 0.0 when nothing was evaluated so the
        # report never divides by zero.
        return count / total if total else 0.0

    stat_keys = ('correct', 'syntax_error', 'api_error', 'self_corrected_error',
                 'self_corrected_correct', 'total_errors', 'total')

    # Read JSON file
    with open("./docs/WMX3API_MCEval_Evaluation_Dataset.json", "r", encoding="utf-8") as f:
        dataset = json.load(f)

    # Index the entries once by TaskId; the first entry wins on duplicates.
    entries_by_id = {}
    for item in dataset:
        entries_by_id.setdefault(item["TaskId"], item)

    # Initialize statistics dictionary (difficulty level -> counters)
    statistics = {level: dict.fromkeys(stat_keys, 0) for level in (1, 2, 3)}

    total_correct = 0
    total_syntax_error = 0
    total_api_error = 0
    total_self_corrected_error = 0
    total_self_corrected_correct = 0
    total_errors = 0

    # Initialize error log list
    error_log = []
    self_corrected_log = []

    # The log folder depends only on the LLM, so resolve it once up front; it
    # is needed after the loop even when no task was processed.
    folder_path = f'/Users/yin/Documents/GitHub/MCCodeLog/{llm_name}'

    start_time = datetime.now()  # Record start time

    # Iterate through task range
    for task_info in tqdm(task_ids, desc="Processing tasks"):
        # Get task information
        task_entry = entries_by_id.get(task_info)
        if task_entry is None:
            print(f"Task ID: {task_info} not found in the dataset, skipped.")
            continue

        user_question = task_entry["Instruction"]
        difficulty = task_entry["Difficulty"]
        task_id = task_entry["TaskId"]

        print(f"Task ID: {task_id} 🔽")

        user_question_global = user_question
        # Call coder_router function
        NoCoder, coder_router_result = coder_router(user_question)

        # Only coding tasks can be generated and run. Without this guard the
        # code below would use undefined values, or the values left over from
        # the previous task.
        if NoCoder != 0:
            print(f"Task ID: {task_id} is not a coding request, skipped.")
            continue

        tasks = task_decomposer_llm(user_question)
        # Initialize a code string from LLM
        code_from_llm_str = ''
        # NOTE(review): every sub-task is generated from the full question and
        # its retrieval context, not from the individual task text - confirm
        # that this is intended.
        for i, task in enumerate(tasks):
            coder_return = coder_retrieval(coder_router_result)  # Code context
            # Call CoderLLM function
            code_from_llm = CoderLLM(user_question, coder_return, task_id)
            code_from_llm_str += f'\n#---------task{i}:---------\n' + task + f'\n#---------code{i}:---------\n' + code_from_llm

        if len(tasks) == 1:
            # Single task: run the generated code directly
            CoderResult = RunCode(code_from_llm, task_info)
        else:
            # Multi tasks: compose the partial codes into one program first
            code_from_composer_llm = tasks_composer_llm(user_question, code_from_llm_str)
            CoderResult = RunCode(code_from_composer_llm, task_info)

        result_lower = CoderResult.lower()
        # Unknown difficulty levels get their own counters instead of failing.
        counts = statistics.setdefault(difficulty, dict.fromkeys(stat_keys, 0))

        # Init Correctness, if equals 1, then plot.
        Correctness = 0
        counts['total'] += 1
        # Check for "Self-correct" in the result
        if 'self-correct' in result_lower:
            self_corrected_log.append({'TaskId': task_info, 'Result': CoderResult})
            if 'self-correction but still got an error' in result_lower:
                error_log.append({'TaskId': task_info, 'Error': CoderResult})
                counts['total_errors'] += 1
                counts['self_corrected_error'] += 1
                total_self_corrected_error += 1
                total_errors += 1

                if 'syntaxerror' in result_lower:
                    counts['syntax_error'] += 1
                    total_syntax_error += 1
                else:
                    counts['api_error'] += 1
                    total_api_error += 1
            elif 'self-corrected' in result_lower:
                counts['correct'] += 1
                counts['self_corrected_correct'] += 1
                total_self_corrected_correct += 1
                total_correct += 1
                Correctness = 1
        else:
            counts['correct'] += 1
            total_correct += 1
            Correctness = 1

        if Correctness == 1:
            os.makedirs(folder_path, exist_ok=True)
            # Plot with the log file
            log_file_path = os.path.join(folder_path, f"{task_info}_{llm_name}_log.txt")
            plot_log(log_file_path)
            print('# -------------------------------------------------------------------------\n')

    end_time = datetime.now()  # Record end time
    total_runtime = end_time - start_time  # Calculate total runtime
    # total_seconds() also covers runs longer than a day (.seconds would wrap).
    runtime_seconds = int(total_runtime.total_seconds())
    total_runtime_str = f"{runtime_seconds // 60}m{runtime_seconds % 60}s"  # Format runtime

    # Define the folder path and the file name
    file_name = f'{llm_name}_Runlog_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_{total_runtime_str}.txt'
    file_path = f'{folder_path}/{file_name}'
    os.makedirs(folder_path, exist_ok=True)

    # Open the file in write mode
    with open(file_path, 'w', encoding='utf-8') as file:
        # Print and write overall statistics
        total_tasks = sum(counts['total'] for counts in statistics.values())
        overall_results = (
            f"Overall Results:\n"
            f"  Total Correct: {total_correct} ({ratio(total_correct, total_tasks):.2%})\n"
            f"      Total Self-corrected Correct: {total_self_corrected_correct} ({ratio(total_self_corrected_correct, total_tasks):.2%})\n"
            f"  Total Errors: {total_errors} ({ratio(total_errors, total_tasks):.2%})\n"
            f"      Total Syntax Error: {total_syntax_error} ({ratio(total_syntax_error, total_tasks):.2%})\n"
            f"      Total API Error: {total_api_error} ({ratio(total_api_error, total_tasks):.2%})\n"
            f"  Total Self-corrected Errors: {total_self_corrected_error} ({ratio(total_self_corrected_error, total_tasks):.2%})\n\n"
        )
        print(overall_results)
        file.write(overall_results)

        # Print and write statistics by difficulty
        for difficulty, counts in statistics.items():
            total_difficulty = counts['total']
            if total_difficulty != 0:
                difficulty_results = (
                    f"Difficulty: {difficulty}\n"
                    f"  Correct: {counts['correct']} ({counts['correct'] / total_difficulty:.2%})\n"
                    f"      Self-corrected Correct: {counts['self_corrected_correct']} ({counts['self_corrected_correct'] / total_difficulty:.2%})\n"
                    f"  Errors: {counts['total_errors']} ({counts['total_errors'] / total_difficulty:.2%})\n"
                    f"      Syntax Error: {counts['syntax_error']} ({counts['syntax_error'] / total_difficulty:.2%})\n"
                    f"      API Error: {counts['api_error']} ({counts['api_error'] / total_difficulty:.2%})\n"
                    f"  Self-corrected Errors: {counts['self_corrected_error']} ({counts['self_corrected_error'] / total_difficulty:.2%})\n\n"
                )
                print(difficulty_results)
                file.write(difficulty_results)

        # Print and write error log
        if error_log:
            error_log_results = "Error Log:\n"
            for error in error_log:
                error_log_results += f"  TaskId: {error['TaskId']}, Error: {error['Error']}\n"
            error_log_results += "\n"
            print(error_log_results)
            file.write(error_log_results)

        # Print and write self-corrected log
        if self_corrected_log:
            self_corrected_log_results = "Self-corrected Log:\n"
            for log in self_corrected_log:
                self_corrected_log_results += f"  TaskId: {log['TaskId']}, Result: {log['Result']}\n"
            self_corrected_log_results += "\n"
            print(self_corrected_log_results)
            file.write(self_corrected_log_results)



# Run the evaluation when this cell / script is executed.
EvalDataset()
    


load from disk: Vectorstore/chromadb-MCCoder


Processing tasks:   0%|          | 0/10 [00:00<?, ?it/s]

Task ID: 1 🔽
codeerr:
Traceback (most recent call last):
  File "\\mac\Home\Downloads\codedemo\sample.py", line 300, in <module>
    main()
  File "\\mac\Home\Downloads\codedemo\sample.py", line 204, in main
    ret = WMX3Log.SetCustomLog(0)
          ^^^^^^^^^^^^^^^^^^^^^^^
TypeError: Log.SetCustomLog() missing 1 required positional argument: 'input'
!!!
----------------------


error string:----------------------
TypeError: Log.SetCustomLog() missing 1 required positional argument: 'input'

error ref:----------------------
{
        "No": 699,
        "FunctionPython": "def SetLog(channel, input)\u00a0",
        "FunctionC++": "WMX3APIFUNC SetLog(unsigned int channel, LogInput *input)",
        "Parameters": "[in] channel The channel of the log operation. Each channel operates independently from all other channels. The first channel is 0. The number of available channels is defined by the maxLogChannel constant. \n[in] input A pointer to an object of a class that inherits the LogInpu

Processing tasks:  10%|█         | 1/10 [01:16<11:24, 76.07s/it]

codeerr:
  File "\\mac\Home\Downloads\codedemo\sample.py", line 200
    It appears that you have provided a description of two functions related to logging in a software or hardware environment. Let's break down each function and their details:
IndentationError: unexpected indent
!!!
----------------------

Self-correction but still got an error.


Task ID: 2 🔽


Processing tasks:  10%|█         | 1/10 [01:53<17:04, 113.79s/it]


ResponseError: an unknown error was encountered while running the model 