# CTC evaluation pipeline
- ground truth: human answer (image_label + human description + metaphorical meaning)
- score: 5-point scale (1–5)
- evaluation: prompt + GPT-4o
  

In [2]:
# import the OpenAI Python library for calling the OpenAI API
from openai import OpenAI
from copy import deepcopy
import json
import os

In [2]:
# System prompt for the judge model. It defines the judge's role, the 1-5 point
# evaluation standard, the constraints and the required output format.
# It is sent verbatim as the "system" message, so any edit to this text changes
# what the judge is asked to do.
sys_prompt = '''
# Role
You are an impartial judge, familiar with Chinese traditional culture and traditional paintings.

## Attention
You are responsible for evaluating the quality of the descriptions provided by the model for traditional Chinese paintings. Your evaluation should refer to the human answer and score based on the Evaluation Standard.

## Evaluation Standard：
- [1 point]: The description of the picture is incomplete and does not include any background information. It only mentions the most obvious elements in the picture, lacking recognition and understanding of traditional cultural elements.
  
- [2 points]: The description of the image is relatively complete, but the background information is limited. Some traditional cultural elements can be identified, but their meaning is not explained in depth. There is a lack of insight into the aesthetic characteristics or skills of the image.
  
- [3 points]: The description of the image is comprehensive and includes some background information. The traditional cultural elements can be accurately identified and briefly explained. There is a preliminary understanding of the aesthetic characteristics or skills of the picture, but there is a lack of in-depth analysis. Some historical or cultural background is mentioned, but it is not fully expanded.
  
- [4 points]: The description of the image is detailed and contains rich background information. It is able to explain the significance of traditional cultural elements in depth. It provides a relatively in-depth analysis of the aesthetic characteristics and skills of the image. It provides relevant historical and cultural background and attempts to explore the implication of the image, but the implication is misunderstood.
  
- [5 points]: The description of the image is comprehensive and accurate, and the background information is in-depth and comprehensive. It can not only accurately interpret traditional cultural elements, but also reveal their deep meaning. It has unique insights into the aesthetic characteristics and skills of the image. It deeply expounds on the relevant historical and cultural background, and can see through the philosophical thoughts, humanistic spirit or social values contained in the image, showing a deep understanding of Chinese traditional culture.

## Constraints
- Avoid any position biases and be as objective as possible
- Do not allow the length of the descriptions to influence your evaluation

## Workflow
Output your final verdict by strictly following this format: "[ratings]". 
'''

In [9]:
def evaluate_model(sys_prompt, eval_answer):
    """Send one judging request to GPT-4o and return the reply text.

    Args:
        sys_prompt: Text sent as the "system" message.
        eval_answer: Text sent as the "user" message.

    Returns:
        The ``content`` of the first choice's message in the completion.

    Any exception raised by the OpenAI client propagates to the caller.
    """
    # Read the key from the environment so a real credential never has to be
    # written into the notebook; the original placeholder stays as fallback.
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', 'xxx'))
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": eval_answer},
        ],
        temperature=0.0,
        top_p=0.95,
    )
    return completion.choices[0].message.content

In [None]:
# Read exp_CTC.json (the Chinese traditional painting records), ask the judge
# for a score on every record, and write the scored copy to a new file.
with open('exp_CTC.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Scores are attached to a deep copy, leaving the loaded `data` unmodified.
result_data = deepcopy(data)

for instance in result_data:
    # Missing fields become empty strings in the message sent to the judge.
    title, explanation, metaphorical_meaning, model_dep = (
        instance.get(field, '')
        for field in ('title', 'explanation', 'metaphorical meaning', 'model_dep')
    )

    eval_answer = f'''
    human answer: {{{title}{explanation}{metaphorical_meaning}}}
    model descriptions: {{{model_dep}}}
    '''

    try:
        score = evaluate_model(sys_prompt, eval_answer)
    except Exception as e:
        # Report the failing record and store '0' so the run continues.
        print(title, ':', e)
        score = '0'

    instance.update({'score': score})

with open('exp_CTC_with_scores.json', 'w', encoding='utf-8') as f:
    json.dump(result_data, f, ensure_ascii=False, indent=4)

print('Evaluation completed and results saved to exp_CTC_with_scores.json')

In [None]:
import re
# Aggregate the scores stored in exp_CTC_with_scores.json: overall average,
# plus one average per difficulty label and one per emotion label.
def parse_score(raw_score):
    """Return the first integer found in a stored score, or 0 if there is none.

    The value is converted with ``str`` before matching, so a score stored as
    ``None`` or as a number is handled instead of making ``re.search`` raise
    ``TypeError``.
    """
    if raw_score is None:
        return 0
    match = re.search(r'\d+', str(raw_score))
    return int(match.group()) if match else 0


def mean_score(total_score, count):
    """Return ``total_score / count``, or NaN when ``count`` is zero.

    NaN keeps the report line printable for a category with no records
    instead of aborting with ``ZeroDivisionError``.
    """
    return total_score / count if count else float('nan')


def summarize_scores(instances):
    """Accumulate score totals overall, per difficulty and per emotion.

    Args:
        instances: Iterable of dicts read with ``.get`` for the keys
            'score', 'difficulty' and 'emotion'.

    Returns:
        A tuple ``(overall, difficulty_scores, emotion_scores)``. ``overall``
        is ``{"total_score": int, "count": int}`` over every record; the other
        two map each known label to a dict of the same shape. Records whose
        label is not one of the known labels only count towards ``overall``.
    """
    overall = {"total_score": 0, "count": 0}
    difficulty_scores = {
        label: {"total_score": 0, "count": 0}
        for label in ("diff", "mid", "easy")
    }
    emotion_scores = {
        label: {"total_score": 0, "count": 0}
        for label in ("pos", "neu", "neg")
    }

    for instance in instances:
        score = parse_score(instance.get('score'))
        # Count the records actually present rather than a fixed dataset size.
        overall['total_score'] += score
        overall['count'] += 1

        grouped = (
            (difficulty_scores, instance.get('difficulty')),
            (emotion_scores, instance.get('emotion')),
        )
        for buckets, label in grouped:
            if label in buckets:
                buckets[label]['total_score'] += score
                buckets[label]['count'] += 1

    return overall, difficulty_scores, emotion_scores


# Runs when executed as a notebook cell or a script (both use "__main__");
# skipped on import so the helpers above can be reused without the file.
if __name__ == "__main__":
    with open('exp_CTC_with_scores.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    overall, difficulty_scores, emotion_scores = summarize_scores(data)

    count = overall['count']  # total number of paintings
    total_score = overall['total_score']
    average_score = mean_score(total_score, count)
    print(f'overall: {average_score:.2f}, total:{count}')

    # difficulty
    for difficulty, values in difficulty_scores.items():
        group_average = mean_score(values['total_score'], values['count'])
        print(f'{difficulty}_overall: {group_average:.2f}, total:{values["count"]}')

    # emotion
    for emotion, values in emotion_scores.items():
        group_average = mean_score(values['total_score'], values['count'])
        print(f'{emotion}_overall: {group_average:.2f}, total:{values["count"]}')