In [17]:
import os 
import json
import pandas as pd

In [18]:
models = ['linevul', 'codebert', 'PLBART', 'devign',  'regvd', 'vulberta_cnn', 'vulberta_mlp']

In [19]:
with open("../data/devign/test.jsonl", 'r') as f:
    data = []
    for  line in f:
        try:
            data.append(json.loads(line.strip()))
        except Exception as e:
            print(e)


In [20]:
scores = {model: {} for model in models}
for model in models:
    model_d = json.load(open(os.path.join(model, "{}_line_scores.json".format(model)), "r"))
    for x in model_d:
        scores[model][x['idx']] = x['scores']

In [21]:
predictions = {model:{} for model in models}
for model in models:
    df_pred = pd.read_csv("predictions/{}_predictions.csv".format(model))
    for idx, row in df_pred.iterrows():
        predictions[model][row['idx']] = row['prediction']

In [22]:
def maximum_aggrement(freq, model_threshold):
    aggrement_lines = []
    for i, fr in enumerate(freq):
        if fr >= model_threshold: 
            aggrement_lines.append(i)
    return aggrement_lines

In [23]:
import shutil

In [24]:
# keywords = ['if', 'while', 'for', 'error', 'print', 'fail', 'alloc', 'free', 'memset', 'memcpy']

# models_keyword = {}
# for model in models:
#     models_keyword[model] = {}
#     for key in keywords:
#         models_keyword[model][key] = []

# models_data = {model: 0 for model in models}

In [25]:

# for sample in data:
#     # if sample['target'] == 0: 
#     #     continue

#     ind = sample['idx']
#     lines = sample['func'].split("\n")
#     keys_values = {key: 0 for key in keywords}
#     for line in lines:
#         for key in keywords:
#             if key in line:
#                 keys_values[key] += 1

#     for model in models:
#         if ind in predictions[model]:
#             models_data[model] += 1
#             paired_data_1 = [(scores[model][ind][i]  , i) for i in range(len(scores[model][ind]))]
#             assert len(paired_data_1) == len(lines)
#             sorted_data_1 = sorted(paired_data_1, reverse=True)
#             data_1_lines = set([k[1] for k in sorted_data_1[:10]])
#             keywrod_lines = {key: 0 for key in keywords}
#             for line_no in data_1_lines:
#                 line = lines[line_no].lower()
#                 for key in keywords:
#                     keywrod_lines[key] += (1 if line.count(key) > 0 else 0)
#             for key in keywords:
#                 models_keyword[model][key].append((keywrod_lines[key]/ keys_values[key]) if keys_values[key] else 0)

In [26]:
keywords = ['if', 'while', 'for', 'error', 'print', 'fail', 'alloc', 'free', 'memset', 'memcpy']

models_keyword = {}
for model in models:
    models_keyword[model] = {}
    for key in keywords:
        models_keyword[model][key] = []

models_data = {model: 0 for model in models}


for sample in data:
    # if sample['target'] == 0: 
    #     continue

    ind = sample['idx']
    lines = sample['func'].split("\n")

    for model in models:
        if ind in predictions[model]:
            models_data[model] += 1
            paired_data_1 = [(scores[model][ind][i]  , i) for i in range(len(scores[model][ind]))]
            assert len(paired_data_1) == len(lines)
            
            last_line = len(lines)
            while last_line >=1 and scores[model][ind][last_line - 1] == 0:
                last_line -= 1

            keys_values = {key: 0 for key in keywords}
            for line_no,line in enumerate(lines):
                if line_no >= last_line:
                    break
                for key in keywords:
                    if key in line:
                        keys_values[key] += 1

            line_threshold = min(last_line, 10)
            sorted_data_1 = sorted(paired_data_1, reverse=True)
            data_1_lines = set([k[1] for k in sorted_data_1[:line_threshold]])
            assert len(data_1_lines) <= 10
            keywrod_lines = {key: 0 for key in keywords}
            for line_no in data_1_lines:
                line = lines[line_no].lower()
                for key in keywords:
                    keywrod_lines[key] += (1 if line.count(key) > 0 else 0)
            for key in keywords:
                models_keyword[model][key].append((keywrod_lines[key]/ keys_values[key]) if keys_values[key] else 0)

In [27]:
key_percentage = {model: {} for model in models}
for model in models:
    for key in keywords:
        key_percentage[model][key]  = sum(models_keyword[model][key]) / len(models_keyword[model][key]) 

In [36]:
data_1 = []
for key in keywords:
    row_data= [key]
    for model in models:
        row_data.append(round(key_percentage[model][key], 3))
    data_1.append(row_data)

In [37]:
df = pd.DataFrame(data_1, columns= ['feature'] + models)

In [38]:
print(df.to_latex())

\begin{tabular}{llrrrrrrr}
\toprule
{} & feature &  linevul &  codebert &  PLBART &  devign &  regvd &  vulberta\_cnn &  vulberta\_mlp \\
\midrule
0 &      if &    0.411 &     0.447 &   0.596 &   0.380 &  0.335 &         0.268 &         0.276 \\
1 &   while &    0.040 &     0.038 &   0.047 &   0.056 &  0.043 &         0.033 &         0.028 \\
2 &     for &    0.222 &     0.178 &   0.209 &   0.233 &  0.156 &         0.104 &         0.083 \\
3 &   error &    0.121 &     0.100 &   0.097 &   0.072 &  0.102 &         0.070 &         0.085 \\
4 &   print &    0.092 &     0.068 &   0.065 &   0.040 &  0.086 &         0.053 &         0.065 \\
5 &    fail &    0.030 &     0.023 &   0.018 &   0.014 &  0.027 &         0.020 &         0.034 \\
6 &   alloc &    0.100 &     0.098 &   0.080 &   0.078 &  0.106 &         0.055 &         0.074 \\
7 &    free &    0.050 &     0.059 &   0.046 &   0.048 &  0.057 &         0.052 &         0.073 \\
8 &  memset &    0.023 &     0.032 &   0.022 &   0.011 &  0.0

In [30]:
df.to_csv("frequent_keyword.csv")

In [31]:
df_data = []
for key in keywords:
    values = []
    for sample in data:
        ind = sample['idx']
        lines = sample['func'].split("\n")
        while lines.count('') > 0:
            lines.remove('')
        n_lines = len(lines)
        n_key = 0
        for line in lines:
            if key in line:
                n_key += 1
        values.append((n_key/n_lines) if n_lines else 0)
    df_data.append(sum(values)/ len(values))
print(keywords)
df_l = pd.DataFrame([df_data], columns=keywords)
df_l.to_csv("keyword_by_lines.csv")

['if', 'while', 'for', 'error', 'print', 'fail', 'alloc', 'free', 'memset', 'memcpy']
