In [1]:
!pip install transformers datasets accelerate evaluate torch scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [3]:
def load_and_preprocess_dataset(dataset_name="DetectVul/devign", seed=42):
    """Load a dataset and carve its ``train`` split into train/validation/test.

    Only the ``train`` split of the loaded dataset is used. It is divided
    80% / 10% / 10% into train, validation and test; any other splits that
    ``load_dataset`` returns are not used.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        seed: Seed forwarded to both ``train_test_split`` calls so the same
            partition is produced on every run. Pass ``None`` to get a fresh
            random partition each time (the behaviour before this parameter
            existed).

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"test"`` and
        ``"validation"``.
    """

    def convert_labels(example):
        # NOTE(review): the value counts printed later in this notebook still
        # list False/True for `target`, so the stored column type may not be
        # changed by this cast -- confirm whether this step is needed.
        return {"func": example["func"], "target": int(example["target"])}

    # Only the train split is consumed below, so only that split is mapped.
    train_split = load_dataset(dataset_name)["train"].map(convert_labels)

    # First cut: 80% train / 20% hold-out. Second cut: halve the hold-out.
    train_holdout = train_split.train_test_split(test_size=0.2, seed=seed)
    valid_test = train_holdout["test"].train_test_split(test_size=0.5, seed=seed)

    dataset = DatasetDict({
        "train": train_holdout["train"],
        "test": valid_test["test"],
        "validation": valid_test["train"],
    })

    print(f"Dataset loaded! Train: {len(dataset['train'])}, Test: {len(dataset['test'])}, Validation: {len(dataset['validation'])}")

    return dataset

raw_datasets = load_and_preprocess_dataset()

README.md:   0%|          | 0.00/2.96k [00:00<?, ?B/s]

(…)-00000-of-00001-396a063c42dfdb0a.parquet:   0%|          | 0.00/64.2M [00:00<?, ?B/s]

(…)-00000-of-00001-5d4ba937305086b9.parquet:   0%|          | 0.00/7.93M [00:00<?, ?B/s]

(…)-00000-of-00001-e0e162fa10729371.parquet:   0%|          | 0.00/8.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21854 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/21854 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Dataset loaded! Train: 17483, Test: 2186, Validation: 2185


In [4]:
# Class balance of the training split: one row per distinct `target` value
# together with how many examples carry it.
df = pd.DataFrame(raw_datasets["train"][:])
target_stats = (
    df["target"]
    .value_counts()
    .rename_axis("Target Value")
    .reset_index(name="Count")
)
print(target_stats)

   Target Value  Count
0         False   9518
1          True   7965


In [5]:
def tokenize_dataset(dataset, tokenizer_name="microsoft/graphcodebert-base", max_length=512):
    """Tokenize the ``func`` column and expose ``target`` as an integer ``labels`` column.

    Args:
        dataset: Mapping of splits (must contain ``"train"``) whose rows have
            ``func`` and ``target`` columns.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every example is truncated and padded to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        The mapped dataset with the tokenizer's output columns and ``labels``
        added, and with ``func``, ``target`` (and ``label``, if present)
        removed. Other original columns are left in place.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def preprocess_function(examples):
        # Every example is truncated/padded to exactly `max_length` tokens.
        encoded = tokenizer(
            examples["func"], truncation=True, padding="max_length", max_length=max_length
        )
        encoded["labels"] = [int(label) for label in examples["target"]]
        return encoded

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    # Drop the raw text and the source label column(s) now that `labels` exists.
    columns_to_remove = ["func", "target"]
    if "label" in tokenized_datasets["train"].column_names:
        columns_to_remove.append("label")

    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)

    print("Tokenization complete! 'target' correctly converted to 'labels'.")
    return tokenized_datasets

tokenized_datasets = tokenize_dataset(raw_datasets)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/17483 [00:00<?, ? examples/s]

Map:   0%|          | 0/2186 [00:00<?, ? examples/s]

Map:   0%|          | 0/2185 [00:00<?, ? examples/s]

Tokenization complete! 'target' correctly converted to 'labels'.


In [6]:
# Preview the tokenized training split. Only the first five rows are pulled out
# of the dataset: converting the whole split (every row padded to the maximum
# length) into Python lists just to print five rows is needlessly expensive.
df_tokenized = pd.DataFrame(tokenized_datasets["train"][:5])
print(df_tokenized.head())

      id project                                 commit_id  \
0  17395  FFmpeg  f19af812a32c1398d48c3550d11dbc6aafbb2bfc   
1  22778    qemu  6049f4f831c6f409031dfa09282b38d0cbaecad8   
2  23176    qemu  c546194f260fb3e391193cb8cc33505618077ecb   
3   8036    qemu  c20b7fa4b2fedd979bcb0cc974bb5d08a10e3448   
4  10998    qemu  53cb28cbfea038f8ad50132dc8a684e638c7d48b   

                                          func_clean  \
0  static int adx_encode_header(AVCodecContext *a...   
1  void cpu_loop (CPUState *env)\n\n{\n\n    int ...   
2  void qmp_block_set_io_throttle(const char *dev...   
3  static void monitor_protocol_event_handler(voi...   
4  static void phys_page_set(AddressSpaceDispatch...   

                                           vul_lines  \
0  {'code': ['	struct {', '		uint32_t freq;', '		...   
1                        {'code': [], 'line_no': []}   
2                        {'code': [], 'line_no': []}   
3  {'code': ['    qemu_mutex_lock(&monitor_event_...   
4         

In [7]:
def load_model(model_name="microsoft/graphcodebert-base", num_labels=2):
    """Load a pretrained checkpoint for sequence classification.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_labels: Number of output classes requested from ``from_pretrained``.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        The object returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

model = load_model()

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./graphcodebert_results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both use the "epoch" strategy; logging
    # uses a 10-step interval.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=10,
)

In [19]:
def compute_metrics(eval_pred):
    """Turn an evaluation pass's ``(logits, labels)`` into classification metrics.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"``. Precision, recall and F1 are computed with
        ``average="binary"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 states the fallback explicitly: if an evaluation pass has
    # no positive predictions (or no positive labels), the undefined ratio is
    # reported as 0 without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
    }

In [20]:
# Wire the model, its training configuration, the metric function and the data
# splits together. The validation split is passed as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

print("Trainer initialized!")

Trainer initialized!


In [21]:
import os

# NOTE(review): this cell runs after the Trainer has already been constructed,
# and TrainingArguments is created with report_to="none" -- confirm whether
# this environment variable is still needed here.
os.environ.update({"WANDB_DISABLED": "true"})

In [26]:
# Run fine-tuning with the `trainer` built earlier in this notebook. The call is
# the cell's last expression, so its return value is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
1,0.598,0.609952,0.63524,0.608268,0.60767,0.607969
2,0.4581,0.664848,0.63524,0.633172,0.514258,0.567553
3,0.3573,0.756457,0.627002,0.615826,0.528024,0.568555




TrainOutput(global_step=1641, training_loss=0.48409023752014935, metrics={'train_runtime': 3097.3071, 'train_samples_per_second': 16.934, 'train_steps_per_second': 0.53, 'total_flos': 1.379991174257664e+16, 'train_loss': 0.48409023752014935, 'epoch': 3.0})

In [27]:
import pandas as pd

# Score the test split and lay the returned metric dict out as a two-column
# table (metric name, value).
results = trainer.evaluate(tokenized_datasets["test"])

df_results = pd.DataFrame(
    {"Metric": list(results.keys()), "Value": list(results.values())}
)

print("\nEvaluation Metrics on Test Data:\n")
print(df_results.to_string(index=False))




Evaluation Metrics on Test Data:

                 Metric     Value
              eval_loss  0.611772
          eval_accuracy  0.634035
         eval_precision  0.614341
            eval_recall  0.611969
          eval_f1-score  0.613153
           eval_runtime 40.978300
eval_samples_per_second 53.345000
  eval_steps_per_second  1.684000
                  epoch  3.000000


In [28]:
# Get the model's outputs for the test split; the predicted class for each row
# is the argmax over the last axis of those outputs.
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)

# Per-class precision / recall / F1 against the test split's `labels` column.
print("Detailed Classification Report:")
print(
    classification_report(
        y_true=tokenized_datasets["test"]["labels"],
        y_pred=preds,
        target_names=["Non-Vulnerable", "Vulnerable"],
    )
)



Detailed Classification Report:
                precision    recall  f1-score   support

Non-Vulnerable       0.65      0.65      0.65      1150
    Vulnerable       0.61      0.61      0.61      1036

      accuracy                           0.63      2186
     macro avg       0.63      0.63      0.63      2186
  weighted avg       0.63      0.63      0.63      2186

