Three standard models (two linear, one bayesian) on a bag-of-words vector

In [84]:
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

df = pd.read_csv('preprocessed_data.csv', index_col=0)

In [85]:
df.head(n=2)

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"['static', 'void', 'ipcomp_free_scratches', '(...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"['static', 'void', 'ipcomp_free_scratches', '(...",49


In [86]:
print(df.count())

function               22024
isVulnerable           22024
tokenized_functions    22024
size                   22024
dtype: int64


In [87]:
vectorizer = CountVectorizer(min_df=0, tokenizer=None, preprocessor=None)
vectorizer.fit(df['tokenized_functions'])


In [88]:
vectorizer.vocabulary_

{'static': 35258,
 'void': 40359,
 'ipcomp_free_scratches': 17690,
 'int': 17038,
 '__percpu': 736,
 'scratches': 32338,
 'if': 16295,
 'ipcomp_scratch_users': 17691,
 'return': 30546,
 'ipcomp_scratches': 17692,
 'for_each_possible_cpu': 12451,
 'vfree': 39877,
 'per_cpu_ptr': 26965,
 'free_percpu': 12680,
 'null': 24827,
 '__exit': 532,
 'nbd_cleanup': 23219,
 'struct': 35510,
 'nbd_device': 23226,
 'nbd': 23216,
 'list_head': 19718,
 'del_list': 7868,
 'genl_unregister_family': 13259,
 'nbd_genl_family': 23230,
 'nbd_dbg_close': 23222,
 'mutex_lock': 22981,
 'nbd_index_mutex': 23232,
 'idr_for_each': 16065,
 'nbd_index_idr': 23231,
 'nbd_exit_cb': 23229,
 'mutex_unlock': 22987,
 'while': 40790,
 'list_empty': 19705,
 'list_first_entry': 19707,
 'list': 19696,
 'list_del_init': 19702,
 'refcount_read': 29916,
 'refs': 29927,
 'printk': 28162,
 'kern_err': 18678,
 'possibly': 27910,
 'leaking': 19546,
 'device': 8206,
 'nbd_put': 23235,
 'destroy_workqueue': 8017,
 'nbd_del_wq': 23225

In [89]:
vectorizer.transform(df['tokenized_functions']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [90]:
functions = df['tokenized_functions'].values
y = df['isVulnerable'].values

functions_train, functions_test, y_train, y_test = train_test_split(functions, y, test_size=0.2, random_state=42)

In [91]:
vectorizer = CountVectorizer(min_df=0, tokenizer=None, preprocessor=None)
vectorizer.fit(functions_train)

X_train = vectorizer.transform(functions_train)
X_test = vectorizer.transform(functions_test)

In [92]:
X_train

<17619x41439 sparse matrix of type '<class 'numpy.int64'>'
	with 700268 stored elements in Compressed Sparse Row format>

In [93]:
y_train_1 = (y_train == 1)
y_test_1 = (y_test == 1)

In [94]:
sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train_1)

In [95]:
cross_val_score(sgd_clf, X_train, y_train_1, cv=3, scoring="accuracy", verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s finished


array([0.7149668 , 0.69776945, 0.71752086])

In [96]:
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_1, cv=3)

In [97]:
confusion_matrix(y_train_1, y_train_pred)

array([[6108, 2716],
       [2460, 6335]])

In [98]:
precision_score(y_train_1, y_train_pred)

0.699922660479505

In [99]:
recall_score(y_train_1, y_train_pred)

0.7202956225127913

In [100]:
f1_score(y_train_1, y_train_pred)

0.7099630169225597

In [101]:
lr_clf = LogisticRegression(max_iter=2000)
lr_clf.fit(X_train, y_train_1)

In [102]:
cross_val_score(lr_clf, X_train, y_train_1, cv=3, scoring="accuracy", verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   22.1s finished


array([0.73625064, 0.73301549, 0.74646688])

In [103]:
y_train_pred = cross_val_predict(lr_clf, X_train, y_train_1, cv=3)

In [104]:
confusion_matrix(y_train_1, y_train_pred)

array([[6502, 2322],
       [2284, 6511]])

In [105]:
precision_score(y_train_1, y_train_pred)

0.7371221555530397

In [106]:
recall_score(y_train_1, y_train_pred)

0.7403069926094372

In [107]:
f1_score(y_train_1, y_train_pred)

0.7387111413660087

In [108]:
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train, y_train_1)

In [109]:
cross_val_score(mnb_clf, X_train, y_train_1, cv=3, scoring="accuracy", verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


array([0.42159033, 0.42312277, 0.4287417 ])

In [110]:
y_train_pred = cross_val_predict(mnb_clf, X_train, y_train_1, cv=3)

In [111]:
confusion_matrix(y_train_1, y_train_pred)

array([[3923, 4901],
       [5239, 3556]])

In [112]:
precision_score(y_train_1, y_train_pred)

0.420480075676954

In [113]:
recall_score(y_train_1, y_train_pred)

0.4043206367254122

In [114]:
f1_score(y_train_1, y_train_pred)

0.4122420588917227