In [1]:
!python.exe -m pip install --upgrade pip
!pip install --upgrade  pyarrow
!pip install --upgrade  pandas
!pip install --upgrade  langchain
!pip install --upgrade  accelerate
!pip install --upgrade transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.17.1%2Bcu118-cp310-cp310-win_amd64.whl (4.9 MB)
     ---------------------------------------- 0.0/4.9 MB ? eta -:--:--
      --------------------------------------- 0.1/4.9 MB 2.0 MB/s eta 0:00:03
     - -------------------------------------- 0.2/4.9 MB 2.2 MB/s eta 0:00:03
     --- ------------------------------------ 0.4/4.9 MB 3.3 MB/s eta 0:00:02
     ----- ---------------------------------- 0.7/4.9 MB 3.9 MB/s eta 0:00:02
     --------- ------------------------------ 1.1/4.9 MB 5.0 MB/s eta 0:00:01
     --------------- ------------------------ 1.9/4.9 MB 7.2 MB/s eta 0:00:01
     ------------------------------ --------- 3.8/4.9 MB 11.5 MB/s eta 0:00:01
     ---------------------------------------  4.9/4.9 MB 14.2 MB/s eta 0:00:01
     ---------------------------------------- 4.9/4.9 MB 12.5 MB/s eta 0:00:00
Collecting torchaudio
  Dow

In [2]:
import gc
import json
import time
from pathlib import Path

import torch
import accelerate
import pandas as pd
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the model and its tokenizer onto CUDA
model_id = "stabilityai/stable-code-3b"

# 8-bit quantization settings; they only take effect if passed as
# `quantization_config` in the call below (currently commented out).
config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=config,  # enable if there is not enough VRAM
    device_map="cuda",
    torch_dtype=torch.float16,  # weights are loaded in half precision
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:27<00:00, 13.51s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Prompt: system instructions, the code snippet under review, and the slot for the model's answer
template = """System: You are a security researcher, expert in detecting security vulnerabilities. Provide response only in following format: vulnerability: <YES or NO> | vulnerability type: <CWE ID> | explanation: <explanation for prediction>. Use N/A in other fields if there are no vulnerabilities. Do not include anything else in response.
User: Evaluate the security of the following code snippet for potential vulnerabilities:
{vulnerable_code}

Response:"""
prompt = PromptTemplate(input_variables=["vulnerable_code"], template=template)

# Text-generation pipeline over the already loaded model and tokenizer:
# sampling is enabled, at most 64 new tokens are generated per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,
    temperature=0.3,
    do_sample=True,
)

# Wrap the pipeline for LangChain and bind it to the prompt in a single chain
bloom = HuggingFacePipeline(pipeline=pipe)
llm_chain = LLMChain(llm=bloom, prompt=prompt)

In [6]:
# Read the samples from "vul_sample.csv" and keep only the columns selected below
df = pd.read_csv("vul_sample.csv")
selected_columns = ["CWE ID", "func_before", "len"]
parsed_data = df[selected_columns]

In [8]:
# Run the chain on every row of `parsed_data`, print each answer with its timing,
# and dump all collected answers to a JSON file named after the model.
total_rows = len(parsed_data)
# NOTE(review): these two counters are initialised but never updated in this cell;
# presumably a later scoring step uses them — confirm.
correct_predictions_binary_classification = 0
correct_predictions_multiclass = 0
results = []

# Resolve the output file and create its directory BEFORE the loop: otherwise a missing
# directory is only discovered by open() after all rows were processed, and every result is lost.
output_path = Path("collected_generated_text") / f'results_{model_id.split("/")[1]}.json'
output_path.parent.mkdir(parents=True, exist_ok=True)

# `processed_rows` counts completed rows (1-based), so the progress reaches 100 on the last row
# and does not depend on the DataFrame's index labels.
for processed_rows, (_, row) in enumerate(parsed_data.iterrows(), start=1):
    result = {}
    with torch.no_grad():
        start_time = time.time()
        generated_text = llm_chain.invoke(row["func_before"])['text'].strip()
        torch.cuda.empty_cache()
    # Stop the clock before printing so that console output is not counted as generation time.
    end_time = time.time()

    print("Expected CWE ID:", row["CWE ID"])
    print(generated_text)
    print(f"\nВремя выполнения: {(end_time - start_time):.2f} секунд")
    print(f"Процесс обработки: {(processed_rows / total_rows) * 100}")

    result["Expected CWE ID:"] = row["CWE ID"]
    result["generated_text"] = generated_text
    result["lead_time"] = end_time - start_time

    results.append(result)

    print('-'*40)

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Expected CWE ID: CWE-416
System: You are a security researcher, expert in detecting security vulnerabilities. Provide response only in following format: vulnerability: <YES or NO> | vulnerability type: <CWE ID> | explanation: <explanation for prediction>. Use N/A in other fields if there are no vulnerabilities. Do not include anything else in response.
User: Evaluate the security of the following code snippet for potential vulnerabilities:
PDFiumEngine::PDFiumEngine(PDFEngine::Client* client)
    : client_(client),
      current_zoom_(1.0),
      current_rotation_(0),
      doc_loader_(this),
      password_tries_remaining_(0),
      doc_(nullptr),
      form_(nullptr),
      defer_page_unload_(false),
      selecting_(false),
      mouse_down_state_(PDFiumPage::NONSELECTABLE_AREA,
                        PDFiumPage::LinkTarget()),
      next_page_to_search_(-1),
      last_page_to_search_(-1),
      last_character_index_to_search_(-1),
      permissions_(0),
      permissions_handler_