# Baseline 모델 개발 및 평가 지표 측정

In [None]:
import math
import re

from tqdm.notebook import tqdm

from eval import get_eval_data, pointwise_eval
from utils import summarize

### 비용 기반 후보 모델 선정

- Claude 3 Haiku
- Gemini 1.5 Flash
- GPT-3.5 Turbo (ChatGPT 3.5)

In [None]:
# Baseline instruction (in Korean) passed as the prompt for every summary in
# this notebook: "Please summarize the user conversation below in at most
# three sentences:". It is a plain literal because it has no placeholders.
PROMPT_BASELINE = "아래 사용자 대화에 대해 3문장 내로 요약해주세요:"

In [None]:
# Peek at the first evaluation conversation to see what the input looks like.
first_conversation = get_eval_data()[0]
print(first_conversation)

In [None]:
# Spot check with Claude 3 Haiku: summarize the first evaluation conversation
# using the baseline prompt, then have pointwise_eval grade that summary.
summary = summarize(
    conversation=get_eval_data()[0], prompt=PROMPT_BASELINE, model='claude-3-haiku-20240307'
)
eval_comment = pointwise_eval(get_eval_data()[0], summary)

# Summary on top, evaluator output underneath.
print(summary, eval_comment, sep='\n')

In [None]:
# Spot check with Gemini 1.5 Flash: summarize the first evaluation conversation
# using the baseline prompt, then have pointwise_eval grade that summary.
summary = summarize(
    conversation=get_eval_data()[0], prompt=PROMPT_BASELINE, model='gemini-1.5-flash-001'
)
eval_comment = pointwise_eval(get_eval_data()[0], summary)

# Summary on top, evaluator output underneath.
print(summary, eval_comment, sep='\n')

In [None]:
# Spot check with GPT-3.5 Turbo: summarize the first evaluation conversation
# using the baseline prompt, then have pointwise_eval grade that summary.
summary = summarize(
    conversation=get_eval_data()[0], prompt=PROMPT_BASELINE, model='gpt-3.5-turbo-0125'
)
eval_comment = pointwise_eval(get_eval_data()[0], summary)

# Summary on top, evaluator output underneath.
print(summary, eval_comment, sep='\n')

In [None]:
# Candidate models to compare, given by the model names passed to summarize().
models = [
    'claude-3-haiku-20240307',
    'gemini-1.5-flash-001',
    'gpt-3.5-turbo-0125',
]
# One list of integer scores per model, filled in evaluation order.
scores = {model: [] for model in models}
# The score is read from a double-bracketed integer such as "[[7]]" in the
# evaluation comment. The capture group takes the whole digit run, so a
# multi-digit score like "[[10]]" is read as 10 rather than as its first digit.
pattern = r'\[\[(\d+)\]\]'

# Load the evaluation set once instead of on every iteration.
# NOTE(review): assumes get_eval_data() returns the same data on each call.
eval_data = get_eval_data()

for model in models:
    # Index-based access is kept on purpose: the container type returned by
    # get_eval_data() is not visible here, only that it supports len() and [i].
    for i in tqdm(range(len(eval_data))):
        conversation = eval_data[i]
        summary = summarize(
            conversation=conversation,
            prompt=PROMPT_BASELINE,
            model=model,
        )
        eval_comment = pointwise_eval(conversation, summary)
        match = re.search(pattern, eval_comment)
        if match is None:
            # Fail with context instead of an AttributeError on None; scores
            # gathered so far remain available in `scores`.
            raise ValueError(
                f'No [[score]] found in the evaluation of sample {i} '
                f'for model {model}: {eval_comment!r}'
            )
        score = int(match.group(1))
        scores[model].append(score)

In [None]:
# Dump the raw per-sample scores, one line per model (score list first, then
# the model name).
for model, model_scores in scores.items():
    print(model_scores, model)

In [None]:
# Report each model's mean score and sample standard deviation as
# "<model>: <mean> / <std rounded to 2 decimals>".
for model, model_scores in scores.items():
    count = len(model_scores)
    if count == 0:
        # No scores were collected for this model; a mean would divide by zero.
        print(f'{model}: no scores')
        continue
    mean = sum(model_scores) / count
    if count > 1:
        # Sample variance: squared deviations divided by (n - 1).
        variance = sum((x - mean) ** 2 for x in model_scores) / (count - 1)
        std_dev = math.sqrt(variance)
    else:
        # The (n - 1) denominator is zero for a single observation, so the
        # sample standard deviation is undefined; report NaN instead of crashing.
        std_dev = math.nan
    print(f'{model}: {mean} / {round(std_dev, 2)}')

In [None]:
# Show the highest and lowest score recorded for each model.
for model, model_scores in scores.items():
    # default=None prints None for a model with no scores instead of raising
    # ValueError on the empty list.
    print(model, max(model_scores, default=None), min(model_scores, default=None))