In [None]:
from pathlib import Path
# Short name of the model whose outputs are evaluated in this notebook.
MODEL_ALIAS = "llama3.1-8b"
# Relative directory named after the alias; the notebook reads the model's
# outputs from it and writes the evaluation CSV and metric figures into it.
MODEL_DIR = Path(MODEL_ALIAS)

## Run evaluations

In [None]:
from openai import OpenAI
from config import OPENAI_API_KEY
# Shared OpenAI client for the grading requests; the API key is read from the
# local `config` module rather than being written into the notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
from pydantic import BaseModel

class Response(BaseModel):
    """Result of a single grading request.

    Bundles the grader's reply text with the token usage and the raw API
    response so callers can both score the reply and track spending.
    """

    # Raw text of the grader's reply; `entries()` reads it one character at a time.
    message: str
    # Total token count reported by the API for the request.
    total_tokens: int
    # The complete API response object, kept for later inspection.
    obj: object

    def entries(self):
        """Translate the reply into a list of 'PASS' / 'FAIL' verdicts.

        Each non-whitespace character of ``message`` yields one verdict:
        '1' becomes 'PASS', anything else becomes 'FAIL'.

        Whitespace is ignored so that a trailing newline or digits separated
        by spaces do not turn into extra 'FAIL' entries.

        Returns:
            list[str]: one verdict per non-whitespace character, in order.
        """
        # str.split() with no arguments splits on any whitespace run, so
        # re-joining the pieces removes spaces, tabs and newlines alike.
        verdict_chars = ''.join(self.message.split())
        return ['PASS' if c == '1' else 'FAIL' for c in verdict_chars]

In [None]:
# Grading instructions sent as the system message of every evaluation request.
with open('prompts/evaluation.txt') as prompt_file:
    SYSTEM_PROMPT = prompt_file.read()

def evaluate_output(output, real) -> Response:
    """Ask the grader model to judge a student audit against the ground truth.

    Sends one chat completion request (system prompt plus a user message
    containing both texts) to ``gpt-4o-mini`` with temperature 0 and a
    256-token reply limit.

    Args:
        output: the audit text produced by the model under evaluation.
        real: the ground-truth description to compare against.

    Returns:
        Response: the reply text, the total token usage reported by the API,
        and the raw API response object.
    """
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [{
            "text": f"Student audit:\n{output}\n\nGround truth:{real}",
            "type": "text",
        }],
    }

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "text"},
    )

    return Response(
        message=completion.choices[0].message.content,
        total_tokens=completion.usage.total_tokens,
        obj=completion,
    )

In [None]:
from pathlib import Path

# Reuse the alias configured in the first cell instead of repeating the literal:
# the outputs file name and the plot title are built from MODEL_ALIAS, so a
# second hard-coded copy here could silently point MODEL_DIR at another model.
model_alias = MODEL_ALIAS

MODEL_DIR = Path(model_alias)

In [None]:
import pandas as pd
# Outputs of the model under evaluation, stored as '<alias>-outputs.csv' inside
# the model directory.
outputs_path = MODEL_DIR / f'{MODEL_ALIAS}-outputs.csv'
data = pd.read_csv(outputs_path)

In [None]:
def prepare_data(data):
    """Normalise the loaded outputs frame before evaluation.

    Two clean-ups are applied, both directly on the frame passed in:

    * missing values in the ``real`` column are replaced by the sentinel
      sentence "There is no vulnearbility" (the spelling is intentional: the
      evaluation loop compares against exactly this string);
    * in every string cell, the two-character sequence backslash + ``n`` is
      turned into a real newline.

    Args:
        data: DataFrame with at least a ``real`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    data.loc[data["real"].isnull(), 'real'] = "There is no vulnearbility"

    def _unescape_newlines(value):
        # Only strings can hold the escaped sequence; numbers and NaN pass through.
        return value.replace('\\n', '\n') if isinstance(value, str) else value

    # Series.replace would only match cells equal to the whole pattern, and the
    # result has to be stored back; hence an element-wise substring replacement
    # assigned column by column.
    for column in data.columns:
        data[column] = data[column].map(_unescape_newlines)
    return data

# Clean the loaded outputs and rebind `data` to the frame prepare_data returns.
data = prepare_data(data)

In [None]:
# Price in dollars charged per one million input tokens.
PRICE_PER_1M_INPUT_TOKENS = 0.150


def calculate_cost_milli_dollars(total_tokens):
    """Estimate the cost of ``total_tokens`` tokens.

    Every token is priced at the input-token rate. The dollar amount is
    multiplied by 100, so despite the function's name the returned number is
    expressed in hundredths of a dollar, which is the unit the progress bar
    shows under its 'Price (0.01$)' label.

    Args:
        total_tokens: number of tokens consumed so far.

    Returns:
        float: estimated cost in units of 0.01 dollars.
    """
    millions_of_tokens = total_tokens / 1_000_000
    return millions_of_tokens * PRICE_PER_1M_INPUT_TOKENS * 100

In [None]:
from tqdm import tqdm
import csv

# Running total of tokens consumed by the grading requests (drives the cost readout).
total_tokens = 0

# newline='' is required by the csv module: without it the writer's own line
# endings get translated again on some platforms, producing blank rows.
with open(MODEL_DIR / 'evaluation.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['id', 'cr0', 'cr1', 'cr2', 'cr3'])
    with tqdm(data.iterrows(), total=len(data), desc="Processing", unit="row") as progress_bar:
        for idx, row in progress_bar:
            # A row "reports a vulnerability" when its text differs from the
            # sentinel sentence (spelling must match the one used in prepare_data).
            vuln_out = row.output != 'There is no vulnearbility'
            vuln_real = row.real != 'There is no vulnearbility'

            # Both agree that nothing is wrong: every criterion passes, no API call.
            if not vuln_out and not vuln_real:
                w.writerow([idx, 'PASS', 'PASS', 'PASS', 'PASS'])
                continue

            # Disagreement on whether a vulnerability exists: every criterion fails.
            if vuln_out != vuln_real:
                w.writerow([idx, 'FAIL', 'FAIL', 'FAIL', 'FAIL'])
                continue

            # Both report a vulnerability: cr0 passes and the grader model decides
            # the remaining criteria (one verdict per entry returned by entries()).
            r = evaluate_output(row.output, row.real)
            w.writerow([idx, 'PASS'] + r.entries())
            total_tokens += r.total_tokens
            progress_bar.set_postfix({
                'Price (0.01$)': calculate_cost_milli_dollars(total_tokens),
                'tokens': total_tokens
            })

## Plot evaluations

In [None]:
import pandas as pd

# Reload the verdicts written by the evaluation loop; the 'id' column only
# labels the rows, so it is dropped before the metrics are computed.
evaluation_path = MODEL_DIR / 'evaluation.csv'
data = pd.read_csv(evaluation_path).drop(columns=['id'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(data):
    """Compute accuracy and F1 per criterion, plus an overall score.

    Each cell of ``data`` is treated as passed when it equals 'PASS'. Every
    column is scored against an all-ones reference vector, and one extra
    "overall" series (a row passes only if all of its columns pass) is scored
    the same way and appended last.

    Args:
        data: DataFrame with one column per criterion holding 'PASS'/'FAIL'.

    Returns:
        tuple[list, list]: ``(accuracies, f1_scores)``, each with one value
        per column followed by the overall value.
    """
    passed = data == 'PASS'
    column_names = list(passed.columns)
    all_passed = passed.all(axis=1)
    # Reference vector: the ideal outcome is a pass on every row.
    reference = [1] * len(passed[column_names[0]])

    # Score the individual criteria first, then the combined verdict.
    scored_series = [passed[name] for name in column_names] + [all_passed]

    acc = [accuracy_score(series, reference) for series in scored_series]
    f1 = [f1_score(series, reference) for series in scored_series]

    return acc, f1

# Compute metrics for individual criteria
labels = ['Criterion 1', 'Criterion 2', 'Criterion 3', 'Criterion 4', 'Overall']
colors = ['turquoise', 'limegreen', 'darkorange', 'orangered', 'violet']
accuracy, f1_scores = compute_metrics(data)

fontdict = {'size': 11, 'weight': 'bold'}
# Tick positions are pinned explicitly so the custom tick labels set below
# always line up with them instead of depending on the automatic locator.
y_ticks = [0.2 * n for n in range(6)]


def _draw_metric_panel(axis, values, ylabel):
    """Draw one bar panel: bars, value annotations and the 'Overall' separator."""
    bars = axis.bar(labels, values, color=colors)
    axis.set_ylabel(ylabel, fontsize=14)
    axis.set_ylim([0, 1])
    axis.set_yticks(y_ticks)
    axis.grid(axis='y')
    # Print each value just above its bar, in the bar's own colour.
    for idx, bar in enumerate(bars):
        axis.text(bar.get_x() + bar.get_width() / 2.0 + 0.2, values[idx]+0.02, f'{values[idx]:.3f}', color=colors[idx], va='center', ha='right', fontdict=fontdict)
    # Dashed line halfway between the last criterion and the 'Overall' bar
    # (categorical bars sit at integer x positions 0 .. len(labels) - 1).
    axis.axvline(x=len(labels) - 1.5, color='black', linestyle='--', linewidth=1.5)
    return bars


# Create plots
fig, ax = plt.subplots(2, 1, figsize=(6, 7), sharex=True)

# Plot Accuracy
_draw_metric_panel(ax[0], accuracy, 'Accuracy')
ax[0].set_title(f'{MODEL_ALIAS} Performance Metrics', fontsize=16)

# Plot F1 Score
bars = _draw_metric_panel(ax[1], f1_scores, 'F1 Score')

# The panels are stacked with no vertical gap, so the top of the lower axis
# coincides with the bottom of the upper one. The upper axis therefore carries
# a combined '0.0 / 1.0' label at that boundary and the lower axis leaves its
# own top label empty.
ax[0].set_yticklabels(['0.0\n1.0', *[f'{tick:.1f}' for tick in y_ticks[1:]]])
ax[1].set_yticklabels([*[f'{tick:.1f}' for tick in y_ticks[:-1]], ''])


plt.tight_layout()
plt.subplots_adjust(hspace=0.00)
plt.savefig(MODEL_DIR / 'metrics.pdf', format='pdf', bbox_inches='tight', pad_inches=0)
plt.savefig(MODEL_DIR / 'metrics.png', format='png', bbox_inches='tight', pad_inches=0)
plt.show()