In [None]:
from openai import OpenAI
from config import OPENAI_API_KEY
# Shared OpenAI client for this notebook; the API key is imported from the
# local `config` module rather than written into the notebook itself.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
from pydantic import BaseModel

class Response(BaseModel):
    """Result of one judge call: the reply text, token usage and raw response."""

    # Text content of the model's reply; `entries` reads it character by character.
    message: str
    # Total token count reported for the request.
    total_tokens: int
    # The complete API response object, kept alongside the extracted fields.
    obj: object

    def entries(self):
        """Translate the reply into a list of 'PASS'/'FAIL' verdicts.

        Every character of `message` yields one entry: the character '1'
        becomes 'PASS', and any other character (including whitespace)
        becomes 'FAIL'.
        """
        return ['PASS' if flag == '1' else 'FAIL' for flag in self.message]

In [None]:
# Read the judge's system prompt from disk once, when this cell runs.
with open('prompts/evaluation.txt') as prompt_file:
    SYSTEM_PROMPT = prompt_file.read()

def evaluate_output(output, real) -> Response:
    """Send one (student audit, ground truth) pair to the judge model.

    Args:
        output: Text inserted after "Student audit:" in the user message.
        real: Text inserted after "Ground truth:" in the user message.

    Returns:
        A `Response` holding the first choice's message content, the total
        token usage reported by the API, and the raw completion object.
    """
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [{
            "text": f"Student audit:\n{output}\n\nGround truth:{real}",
            "type": "text",
        }],
    }

    # temperature=0 with top_p=1 and zero penalties: no sampling tweaks are
    # applied on top of the greedy setting.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "text"},
    )

    return Response(
        message=completion.choices[0].message.content,
        total_tokens=completion.usage.total_tokens,
        obj=completion,
    )

In [None]:
from pathlib import Path

# Alias of the model being evaluated; it names both the working directory
# and the `<alias>-outputs.csv` file read from it.
model_alias = "llama3.1-8b"

# Directory (relative to the current working directory) for this model's files.
MODEL_DIR = Path(model_alias)

In [None]:
import pandas as pd
# Load the model's saved outputs; later cells read its `output` and `real` columns.
data = pd.read_csv(MODEL_DIR.joinpath(f'{model_alias}-outputs.csv'))

In [None]:
def prepare_data(data):
    """Normalise the outputs frame in place and return it.

    Two clean-ups are applied:

    1. Missing values in the ``real`` column are filled with the sentinel
       string "There is no vulnearbility" (spelling is intentional: it is
       the exact value compared against elsewhere in the notebook).
    2. Every literal two-character sequence backslash + ``n`` inside string
       cells is turned into a real newline character.

    Args:
        data: DataFrame with at least a ``real`` column. It is modified in
            place.

    Returns:
        The same DataFrame object that was passed in.
    """
    data.loc[data["real"].isnull(), 'real'] = "There is no vulnearbility"

    # Substring replacement needs regex=True: a plain `replace('\\n', '\n')`
    # only matches cells whose *entire* value is the two characters `\n`.
    # The regex r'\\n' matches a literal backslash followed by 'n'.
    # inplace=True so the edit is actually kept (the result used to be
    # discarded) while callers still get back the object they passed in.
    # Non-string cells and NaN are left untouched by the regex replace.
    data.replace(to_replace=r'\\n', value='\n', regex=True, inplace=True)
    return data

# Rebind `data` to the frame returned by prepare_data.
data = prepare_data(data)

In [None]:
# Price in dollars applied per one million tokens.
PRICE_PER_1M_INPUT_TOKENS = 0.150


def calculate_cost_milli_dollars(total_tokens):
    """Estimate the spend for `total_tokens` tokens.

    The dollar cost is `total_tokens / 1_000_000 * PRICE_PER_1M_INPUT_TOKENS`.
    NOTE: despite the function name, the dollar amount is scaled by 100, so
    the returned number is in hundredths of a dollar (cents), not
    thousandths.

    Args:
        total_tokens: Number of tokens to price. The single input-token
            rate above is applied to the whole count.

    Returns:
        The cost as a float, in units of 0.01 dollars.
    """
    return total_tokens / 1_000_000 * PRICE_PER_1M_INPUT_TOKENS * 100

In [None]:
from tqdm import tqdm
import csv

# Running total of tokens consumed by judge calls in this cell.
total_tokens = 0

# Sentinel text meaning "no vulnerability reported". The spelling must stay
# exactly as-is because it is compared against the values stored in `data`.
NO_VULNERABILITY = 'There is no vulnearbility'

# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate '\n' write an extra blank line after
# every row.
with open(MODEL_DIR / 'evaluation.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['id', 'cr0', 'cr1', 'cr2', 'cr3'])
    with tqdm(data.iterrows(), total=len(data), desc="Processing", unit="row") as progress_bar:
        for idx, row in progress_bar:
            vuln_out = row.output != NO_VULNERABILITY
            vuln_real = row.real != NO_VULNERABILITY

            # Both sides agree there is nothing to report: pass every
            # criterion without calling the judge.
            if not vuln_out and not vuln_real:
                w.writerow([idx, 'PASS', 'PASS', 'PASS', 'PASS'])
                continue

            # Exactly one side reports a vulnerability: fail every criterion
            # without calling the judge.
            if vuln_out != vuln_real:
                w.writerow([idx, 'FAIL', 'FAIL', 'FAIL', 'FAIL'])
                continue

            # Both sides report a vulnerability: the first criterion passes
            # and the remaining columns come from the judge's reply, one
            # entry per character of that reply.
            r = evaluate_output(row.output, row.real)
            w.writerow([idx, 'PASS'] + r.entries())
            total_tokens += r.total_tokens
            progress_bar.set_postfix({
                'Price (0.01$)': calculate_cost_milli_dollars(total_tokens),
                'tokens': total_tokens
            })