In [None]:
from pathlib import Path
# Short name of the model being evaluated; reused below in the plot title and
# in the name of the saved metrics PDF.
MODEL_ALIAS = 'llama3.1-8b-4bit'
# Per-model directory that the notebook reads model outputs from and writes
# evaluation CSVs and figures to.
MODEL_DIR = Path('runs') / MODEL_ALIAS

# Evaluations

In [None]:
from openai import OpenAI
from config import OPENAI_API_KEY
# Shared OpenAI client used by `prompt`; the API key comes from the local
# `config` module rather than being hard-coded in the notebook.
client = OpenAI(
    api_key=OPENAI_API_KEY
)

In [None]:
from pydantic import BaseModel

class Response(BaseModel):
    """Reply of the evaluator model together with its token usage."""
    # Raw text content of the first completion choice.
    message: str
    # Token count reported by the API for the whole request.
    total_tokens: int
    # The untouched API response object, kept for later inspection.
    obj: object

    def entries(self):
        """Translate the verdict string into a list of 'PASS'/'FAIL' labels.

        Each non-whitespace character of ``message`` produces one label:
        '1' becomes 'PASS', any other character becomes 'FAIL'.

        Whitespace (for example a trailing newline or spaces between the
        digits) is skipped, so it cannot add spurious 'FAIL' entries and
        shift the columns of the CSV rows built from this list.

        Returns:
            list[str]: one label per non-whitespace character, in order.
        """
        return [
            'PASS' if c == '1' else 'FAIL'
            for c in self.message
            if not c.isspace()
        ]

In [None]:
def prompt(messages) -> Response:
    """Send a chat conversation to gpt-4o-mini and wrap the reply.

    Args:
        messages: Chat messages, passed unchanged to the completions API.

    Returns:
        Response: text of the first choice, the total token usage reported
        for the request, and the raw API response object.
    """
    # Fixed request options: temperature 0, plain-text reply, at most 256 tokens.
    request_options = dict(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "text"},
    )
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]

    return Response(
        message=first_choice.message.content,
        total_tokens=completion.usage.total_tokens,
        obj=completion,
    )

In [None]:
# Price in dollars charged per one million tokens (input-token rate).
PRICE_PER_1M_INPUT_TOKENS = 0.150
def calculate_cost_milli_dollars(total_tokens):
    """Estimate the cost of `total_tokens` tokens at the input-token rate.

    NOTE: despite the ``milli_dollars`` name, the dollar amount is multiplied
    by 100, so the result is in hundredths of a dollar. This matches the
    'Price (0.01$)' label under which the progress bars display it.

    Args:
        total_tokens: Number of tokens to price. Callers pass the combined
            token count, all of which is priced at the input rate.

    Returns:
        float: estimated cost in units of 0.01 dollars.
    """
    millions_of_tokens = total_tokens / 1_000_000
    return millions_of_tokens * PRICE_PER_1M_INPUT_TOKENS * 100

## Run description evaluations

In [None]:
# Read the system prompt that `evaluate_descriptions` sends to the evaluator
# model; the path is relative to the notebook's working directory.
with open('evaluation-prompts/description.txt', 'r') as f:
    SYSTEM_PROMPT_DESC = f.read()

def evaluate_descriptions(output, real) -> Response:
    """Ask the evaluator model to grade a generated description.

    Args:
        output: Description produced by the model under test; inserted
            after the 'Student audit:' label.
        real: Reference description; inserted after the 'Ground truth:' label.

    Returns:
        Response: the evaluator's reply as returned by `prompt`.
    """
    system_message = {
        "role": "system",
        "content": [{
            "type": "text",
            "text": SYSTEM_PROMPT_DESC
        }]
    }
    user_text = f"Student audit:\n{output}\n\nGround truth:{real}"
    user_message = {
        "role": "user",
        "content": [{
            "text": user_text,
            "type": "text"
        }]
    }
    return prompt([system_message, user_message])

In [None]:
import pandas as pd
# Load this model's generated descriptions; the evaluation loop below reads
# the `output` and `real` columns of every row.
data = pd.read_csv(MODEL_DIR/'descriptions.csv')

In [None]:
def prepare_data(data):
    """Return a cleaned copy of the descriptions frame.

    The cleaning steps are:

    * missing ``real`` values become "There is no vulnerability";
    * missing ``output`` values become the empty string;
    * in every string cell the two-character sequence backslash + 'n' is
      turned into a real newline.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely.

    Args:
        data: DataFrame with at least the columns ``real`` and ``output``.

    Returns:
        pandas.DataFrame: a new frame with the replacements applied.
    """
    def _unescape_newlines(value):
        # Non-string cells (numbers, NaN, ...) pass through unchanged.
        if isinstance(value, str):
            return value.replace('\\n', '\n')
        return value

    data = data.copy()
    data['real'] = data['real'].fillna("There is no vulnerability")
    data['output'] = data['output'].fillna("")
    # Element-wise substring replacement; `Series.replace` would only match
    # cells whose entire value equals the pattern.
    for column in data.columns:
        data[column] = data[column].map(_unescape_newlines)
    return data

# Fill in missing `real`/`output` values before the evaluation loop, which
# calls `.startswith` on both columns.
data = prepare_data(data)

In [None]:
from tqdm import tqdm
import csv

# Running token count over all evaluator calls, shown in the progress bar.
total_tokens = 0

# newline='' lets the csv writer control line endings itself, as the csv
# module requires for files it writes to.
with open(MODEL_DIR / 'descriptions-eval.csv', 'w', newline='') as f:
    w = csv.writer(f)
    # cr0 records whether model and reference agree that a vulnerability
    # exists; cr1..cr3 are filled from the evaluator's reply.
    w.writerow(['id', 'cr0', 'cr1', 'cr2', 'cr3'])
    with tqdm(data.iterrows(), total=len(data), desc="Processing", unit="row") as progress_bar:
        for idx, row in progress_bar:
            vuln_out = not row.output.startswith('There is no vulnerability')
            vuln_real = not row.real.startswith('There is no vulnerability')

            # Both sides report no vulnerability: pass everything without
            # calling the evaluator.
            if not vuln_out and not vuln_real:
                w.writerow([idx, 'PASS', 'PASS', 'PASS', 'PASS'])
                continue

            # The two sides disagree on whether a vulnerability exists: fail
            # everything without calling the evaluator.
            if vuln_out != vuln_real:
                w.writerow([idx, 'FAIL', 'FAIL', 'FAIL', 'FAIL'])
                continue

            # Both sides describe a vulnerability: let the evaluator grade it.
            r = evaluate_descriptions(row.output, row.real)
            w.writerow([idx, 'PASS'] + r.entries())
            total_tokens += r.total_tokens
            progress_bar.set_postfix({
                'Price (0.01$)': calculate_cost_milli_dollars(total_tokens),
                'tokens': total_tokens
            })

## Run recommendation evaluations

In [None]:
# Read the system prompt that `evaluate_recommendations` sends to the
# evaluator model; the path is relative to the notebook's working directory.
with open('evaluation-prompts/recommendation.txt', 'r') as f:
    SYSTEM_PROMPT_RECOMMENDATIONS = f.read()

In [None]:
def evaluate_recommendations(output, real):
    """Ask the evaluator model to grade a generated recommendation.

    Args:
        output: Recommendation produced by the model under test; inserted
            after the 'Output:' label.
        real: Reference recommendation; inserted after the 'Real:' label.

    Returns:
        The `Response` object returned by `prompt`.
    """
    system_message = {
        "role": "system",
        "content": [{
            "type": "text",
            "text": SYSTEM_PROMPT_RECOMMENDATIONS
        }]
    }
    user_text = f"Output:\n{output}\n\nReal:{real}"
    user_message = {
        "role": "user",
        "content": [{
            "text": user_text,
            "type": "text"
        }]
    }

    return prompt([system_message, user_message])

In [None]:
import pandas as pd
# Load this model's generated recommendations; the loop below reads the
# `output` and `real` columns of every row.
# NOTE(review): unlike the descriptions frame, this one is not passed through
# `prepare_data`, so missing values reach the prompt as-is — confirm intended.
data = pd.read_csv(MODEL_DIR/'recommendations.csv')

In [None]:
from tqdm import tqdm
import csv

# Running token count over all evaluator calls, shown in the progress bar.
total_tokens = 0

# newline='' lets the csv writer control line endings itself, as the csv
# module requires for files it writes to.
with open(MODEL_DIR / 'recommendations-eval.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['id', 'cr'])
    with tqdm(data.iterrows(), total=len(data), desc="Processing", unit="row") as progress_bar:
        for idx, row in progress_bar:
            # Every row is graded by the evaluator; its verdict labels follow
            # the row id.
            r = evaluate_recommendations(row.output, row.real)
            w.writerow([idx] + r.entries())
            total_tokens += r.total_tokens
            progress_bar.set_postfix({
                'Price (0.01$)': calculate_cost_milli_dollars(total_tokens),
                'tokens': total_tokens
            })

# Plot evaluations

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from pydantic import BaseModel

class Statistic(BaseModel):
    """Accuracy and F1 score of one named evaluation run."""
    # Label printed between angle brackets in the text form.
    name: str
    accuracy: float
    f1: float

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return self.__str__()

    def __str__(self):
        """Format both scores with three decimals, followed by the name."""
        return f'acc={self.accuracy:.3f} f1={self.f1:.3f} <{self.name}>'

# Calculate accuracy and F1 score of the combined (all-criteria) verdict
def compute_overall_metrics(data, name) -> Statistic:
    """Score how many rows pass every criterion.

    A row counts as passed only when all of its cells equal 'PASS'. The
    resulting per-row verdicts are compared against an all-ones vector.

    Args:
        data: DataFrame of 'PASS'/'FAIL' strings, one column per criterion.
        name: Label stored on the returned `Statistic`.

    Returns:
        Statistic: accuracy and binary F1 of the per-row overall verdict.
    """
    passed = data == 'PASS'
    first_column = passed.columns[0]
    # Reference vector: every row is expected to pass.
    ideal = [1] * len(passed[first_column])
    overall = passed.all(axis=1)

    # The verdicts go in the first positional slot of both sklearn metrics
    # and the all-ones reference in the second.
    accuracy = accuracy_score(overall, ideal)
    f1 = f1_score(overall, ideal, average='binary')
    return Statistic(name=name, accuracy=accuracy, f1=f1)

## Description evaluations

In [None]:
import pandas as pd

# Reload the description verdicts written by the evaluation loop; `id` is
# dropped so that only the criterion columns (cr0..cr3) remain.
data = pd.read_csv(MODEL_DIR / 'descriptions-eval.csv').drop(columns=['id'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

# Calculate accuracy and F1 score for each criterion
def compute_metrics(data):
    """Score every criterion column and the combined verdict.

    Args:
        data: DataFrame of 'PASS'/'FAIL' strings, one column per criterion.

    Returns:
        tuple[list, list]: ``(acc, f1)``. Each list has one score per column
        in column order, followed by a final score for rows that pass all
        criteria.
    """
    passed = data == 'PASS'
    criteria = list(passed.columns)
    # Reference vector: every row is expected to pass.
    ideal = [1] * len(passed[criteria[0]])

    # One boolean series per criterion, with the all-criteria verdict last.
    scored = [passed[criterion] for criterion in criteria]
    scored.append(passed.all(axis=1))

    acc = [accuracy_score(series, ideal) for series in scored]
    f1 = [f1_score(series, ideal) for series in scored]

    return acc, f1

# Compute metrics for individual criteria
labels = ['CRIT 1', 'CRIT 2', 'CRIT 3', 'CRIT 4', 'OVERALL']
colors = ['turquoise', 'limegreen', 'darkorange', 'orangered', 'violet']
accuracy, f1_scores = compute_metrics(data)

# Create plots: accuracy on top, F1 below, sharing the x axis
fig, ax = plt.subplots(2, 1, figsize=(6, 7), sharex=True)

# Plot Accuracy
bars = ax[0].bar(labels, accuracy, color=colors)
ax[0].set_ylabel('Accuracy', fontsize=14)
ax[0].set_ylim([0, 1])
ax[0].set_title(f'{MODEL_ALIAS} Performance Metrics', fontsize=16)
ax[0].grid(axis='y')
fontdict = {'size': 18, 'weight': 'bold'}
# Write each value slightly above its bar, in the bar's own colour
for idx, bar in enumerate(bars):
    ax[0].text(bar.get_x() + bar.get_width() / 2.0 + 0.4, accuracy[idx]+0.04, f'{accuracy[idx]:.3f}', color=colors[idx], va='center', ha='right', fontdict=fontdict)

# Plot F1 Score
bars = ax[1].bar(labels, f1_scores, color=colors)
ax[1].set_ylabel('F1 Score (Binary)', fontsize=14)
ax[1].set_ylim([0, 1])
ax[1].grid(axis='y')

# Pin the tick positions before relabelling them: set_yticklabels is only
# reliable on a fixed set of ticks, and each label list must have one entry
# per tick.
y_ticks = [n / 5 for n in range(6)]
# The two panels touch (hspace=0), so the lower panel leaves its top tick
# blank and the upper panel's bottom label shows both boundary values.
ax[1].set_yticks(y_ticks)
ax[1].set_yticklabels([f'{0.2 * n:.1f}' for n in range(0, 5)] + [''])
ax[0].set_yticks(y_ticks)
ax[0].set_yticklabels(['0.0\n1.0', *[f'{0.2 * n:.1f}' for n in range(1, 6)]])

for idx, bar in enumerate(bars):
    ax[1].text(bar.get_x() + bar.get_width() / 2.0 + 0.4, f1_scores[idx]+0.04, f'{f1_scores[idx]:.3f}', color=colors[idx], va='center', ha='right', fontdict=fontdict)

# Dashed vertical line at x = 0.3 + 4 bar widths, drawn in both panels
ax[0].axvline(x=0.3 + bars[0].get_width() * 4, color='black', linestyle='--', linewidth=1.5)
ax[1].axvline(x=0.3 + bars[0].get_width() * 4, color='black', linestyle='--', linewidth=1.5)


plt.tight_layout()
plt.subplots_adjust(hspace=0.00)
plt.savefig(MODEL_DIR / f'{MODEL_ALIAS}-metrics.pdf', format='pdf', bbox_inches='tight', pad_inches=0)
plt.savefig(MODEL_DIR / 'metrics.png', format='png', bbox_inches='tight', pad_inches=0)
plt.show()

### Description Comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Collect the description-evaluation file of every run directory. pathlib is
# used instead of parsing shell `ls` output, so the result does not depend on
# the shell, its aliases or the platform. Run directories without an
# evaluation file are skipped.
directories = sorted(Path('runs').glob('*/descriptions-eval.csv'))
if not directories:
    raise FileNotFoundError("No 'descriptions-eval.csv' found under 'runs/'")

stats = []
for eval_path in directories:
    data = pd.read_csv(eval_path).drop(columns=['id'])
    # The run directory name doubles as the model name.
    stats.append(compute_overall_metrics(data, eval_path.parent.name))

# Extracting model names, accuracies, and f1 scores
model_names = [stat.name for stat in stats]
accuracies = [stat.accuracy for stat in stats]
f1_scores = [stat.f1 for stat in stats]

# Creating the plots
fig, ax = plt.subplots(1, 2, figsize=(12, 6))  # One row with two columns
main_color = 'turquoise'
highlight_color = 'limegreen'
secondary_color = 'dodgerblue'
highlight_model = 'deepseek-coder-6.7b-finetuned'
secondary_model = 'gpt-4o-mini'

# Every bar gets the main colour; the two special models are recoloured only
# when they are actually present among the runs.
colors = [main_color] * len(model_names)
for special_model, special_color in (
    (highlight_model, highlight_color),
    (secondary_model, secondary_color),
):
    if special_model in model_names:
        colors[model_names.index(special_model)] = special_color

# Leave a little headroom above the tallest bar, capped at 1
y_lim = min(max(max(accuracies), max(f1_scores)) + 0.1, 1)

# Draw the accuracy panel (left) and the F1 panel (right) the same way
panels = zip(ax, (accuracies, f1_scores), ('Accuracy', 'F1 Score (Binary)'))
for panel, values, title in panels:
    panel_bars = panel.bar(model_names, values, color=colors)
    panel.set_ylim([0, y_lim])
    panel.set_title(title, fontsize=16)
    panel.grid(axis='y')
    panel.set_xticks(np.arange(len(model_names)))
    panel.set_xticklabels(model_names, rotation=45, ha='right', fontsize=14)

    # Adding the values above the bars
    for bar, value in zip(panel_bars, values):
        panel.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height() + 0.01, f'{value:.3f}', ha='center', va='bottom', fontsize=12, color='black')

plt.tight_layout()
plt.savefig('all-descriptions.pdf', format='pdf', bbox_inches='tight', pad_inches=0)
plt.savefig('all-descriptions.png', format='png', bbox_inches='tight', pad_inches=0)
plt.show()

## Recommendation comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Collect the recommendation-evaluation file of every run directory. pathlib
# is used instead of parsing shell `ls` output, so the result does not depend
# on the shell, its aliases or the platform. Run directories without an
# evaluation file are skipped.
directories = sorted(Path('runs').glob('*/recommendations-eval.csv'))
if not directories:
    raise FileNotFoundError("No 'recommendations-eval.csv' found under 'runs/'")

stats = []
for eval_path in directories:
    data = pd.read_csv(eval_path).drop(columns=['id'])
    # The run directory name doubles as the model name.
    stats.append(compute_overall_metrics(data, eval_path.parent.name))

# Extracting model names, accuracies, and f1 scores
model_names = [stat.name for stat in stats]
accuracies = [stat.accuracy for stat in stats]
f1_scores = [stat.f1 for stat in stats]

# Creating the plots
fig, ax = plt.subplots(1, 2, figsize=(12, 6))  # One row with two columns
main_color = 'turquoise'
highlight_color = 'limegreen'
secondary_color = 'dodgerblue'
highlight_model = 'deepseek-coder-6.7b-finetuned'
secondary_model = 'gpt-4o-mini'

# Every bar gets the main colour; the two special models are recoloured only
# when they are actually present among the runs.
colors = [main_color] * len(model_names)
for special_model, special_color in (
    (highlight_model, highlight_color),
    (secondary_model, secondary_color),
):
    if special_model in model_names:
        colors[model_names.index(special_model)] = special_color

# Leave a little headroom above the tallest bar, capped at 1
y_lim = min(max(max(accuracies), max(f1_scores)) + 0.1, 1)

# Draw the accuracy panel (left) and the F1 panel (right) the same way
panels = zip(ax, (accuracies, f1_scores), ('Accuracy', 'F1 Score (Binary)'))
for panel, values, title in panels:
    panel_bars = panel.bar(model_names, values, color=colors)
    panel.set_ylim([0, y_lim])
    panel.set_title(title, fontsize=16)
    panel.grid(axis='y')
    panel.set_xticks(np.arange(len(model_names)))
    panel.set_xticklabels(model_names, rotation=45, ha='right', fontsize=14)

    # Adding the values above the bars
    for bar, value in zip(panel_bars, values):
        panel.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height() + 0.01, f'{value:.3f}', ha='center', va='bottom', fontsize=12, color='black')

plt.tight_layout()
plt.savefig('all-recommendations.pdf', format='pdf', bbox_inches='tight', pad_inches=0)
plt.savefig('all-recommendations.png', format='png', bbox_inches='tight', pad_inches=0)
plt.show()