In [None]:
import os
import gc
import json
import time
import torch
import accelerate
import pandas as pd
from tqdm import tqdm
from getpass import getpass
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

In [None]:
# Read the sample set from "vul_sample.csv", then keep a three-column subset of it
df = pd.read_csv("../data/vul_sample.csv")
parsed_data = df.loc[:, ["CWE ID", "func_before", "len"]]

In [None]:
# Prompt for the API token interactively (input is not echoed) and expose it
# through the process environment instead of storing it in the notebook.
os.environ.update(
    {"HUGGINGFACEHUB_API_TOKEN": getpass("Введите ваш HuggingFaceHub API ключ")}
)

In [None]:
# Hugging Face Hub repository id of the model to query. It is passed to the
# endpoint as `repo_id`, and the part after "/" names the results file.
model_id = "codellama/CodeLlama-13b-hf"

In [None]:
# LLM wrapper bound to the Hub repository named by `model_id`.
# Note: despite the variable name, the model used is whatever `model_id` selects.
bloom = HuggingFaceEndpoint(repo_id=model_id)

In [None]:
# Prompt text: a system instruction fixing the answer format, followed by the
# code under review, which is substituted for the {vulnerable_code} placeholder.
template = """System: You are a security researcher, expert in detecting security vulnerabilities. Provide response only in following format: vulnerability: <YES or NO> | vulnerability type: <CWE ID> | explanation: <explanation for prediction>. Use N/A in other fields if there are no vulnerabilities. Do not include anything else in response.
User: Evaluate the security of the following code snippet for potential vulnerabilities:
{vulnerable_code}

Response:"""
prompt = PromptTemplate(
    input_variables=["vulnerable_code"],
    template=template,
)

In [None]:
# Combine the prompt template with the endpoint LLM into a single chain object
llm_chain = LLMChain(llm=bloom, prompt=prompt)

In [None]:
results = []

# Upper bound on attempts per sample, so that an error which repeats on every
# call cannot keep the notebook retrying the same row forever.
MAX_ATTEMPTS = 5
# Pause between attempts, in seconds.
RETRY_DELAY_SECONDS = 60 * 10

# Resolve the output location before the (long) loop and make sure its directory
# exists, so a missing folder is not discovered only after every row was processed.
# The last path component of the repository id is used, which also works for ids
# that have no "owner/" prefix.
model_name = model_id.split("/")[-1]
output_file = f"../data/collected_generated_text/results_API_{model_name}.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Index labels of rows that still failed after MAX_ATTEMPTS attempts.
failed_rows = []

# Running iterations over all samples from parsed_data
for index, row in tqdm(parsed_data.iterrows(), total=len(parsed_data), desc="Processing", unit=" row"):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        # Sending a question to a model
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments during the request.
            start_time = time.perf_counter()
            generated_text = llm_chain.invoke(row["func_before"])['text'].strip()
            end_time = time.perf_counter()
        except Exception as e:
            if attempt == MAX_ATTEMPTS:
                print(f"Error occurred: {e}. Giving up on row {index} after {MAX_ATTEMPTS} attempts.")
                failed_rows.append(index)
                break
            print(f"Error occurred: {e}. Going for a break.")
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        print(f"Expected CWE ID: {row['CWE ID']}. The model answers:\n{generated_text}")
        print(f"\nExecution time: {(end_time - start_time):.2f} seconds")

        # A fresh dict per row; keys are unchanged so the JSON layout of
        # successful rows stays the same.
        result = {
            "Expected CWE ID:": row["CWE ID"],
            "generated_text": generated_text,
            "lead_time": end_time - start_time,
        }
        results.append(result)
        break

if failed_rows:
    print(f"Skipped {len(failed_rows)} row(s) after repeated errors: {failed_rows}")

# Saving results to a JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)