Setting up the DataFrames (all rows and distinct rows) and testing them out

In [69]:
import sqlite3
import pandas as pd

# Load the (function source, label) pairs from the SQLite database twice:
# once with every row and once with duplicate (function, isVulnerable)
# pairs collapsed by SQL DISTINCT.
conn = sqlite3.connect('vulture.db')
try:
    df = pd.read_sql_query("SELECT function, isVulnerable FROM functions", conn)
    df_distinct = pd.read_sql_query("SELECT distinct function, isVulnerable FROM functions", conn)
finally:
    # Release the connection even if one of the queries raises, so a failed
    # cell does not leave the database handle open in the kernel.
    conn.close()

In [70]:
# Print the first record as a sanity check on the loaded columns.
print(df.iloc[0, :])

function        static void ipcomp_free_scratches(void)\n{\n\t...
isVulnerable                                                    0
Name: 0, dtype: object


In [71]:
from nltk.tokenize import wordpunct_tokenize

In [72]:
# Preview the first two rows of the freshly loaded frame.
df.head(2)

Unnamed: 0,function,isVulnerable
0,static void ipcomp_free_scratches(void)\n{\n\t...,0
1,static void ipcomp_free_scratches(void)\n{\n\t...,1


In [73]:
# Tokenize each function's source text with NLTK's wordpunct_tokenize and
# store the token list in a new column, for the full and the distinct frame.
for frame in (df, df_distinct):
    frame['tokenized_functions'] = frame['function'].apply(wordpunct_tokenize)

In [74]:
# Preview the first two rows again, now including the token lists.
df.head(2)

Unnamed: 0,function,isVulnerable,tokenized_functions
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,..."
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,..."


In [75]:
# Record the number of tokens in each function, then preview the result.
df['size'] = df['tokenized_functions'].map(len)
df.head(5)

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110


In [76]:
# Show the three functions with the highest token count.
df.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
19876,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6270
19877,static int ext4_fill_super(struct super_block ...,1,"[static, int, ext4_fill_super, (, struct, supe...",6266
27134,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6215


In [77]:
# Average number of tokens per function.
df.loc[:, 'size'].mean()

299.56643630499883

In [78]:
# Count how many functions have more tokens than the cap.
limit = 1000
column = df['size']
# Summing the boolean mask counts the rows where the comparison is True.
count = (column > limit).sum()
print(count)

1276


In [79]:
# Total number of (non-null) size entries, for comparison with the count above.
total_rows = column.count()
print(total_rows)

43410


In [80]:
# Keep only the functions with at most `limit` tokens. `limit` is the cap
# set in the earlier counting cell, so the filter and the reported count of
# oversized functions cannot drift apart. The vectorized comparison builds
# the boolean mask without a Python-level loop over every row.
df2 = df[df['size'] <= limit]

In [81]:
# Per-column non-null counts of the unfiltered frame.
print(df.count(axis=0))

function               43410
isVulnerable           43410
tokenized_functions    43410
size                   43410
dtype: int64


In [82]:
# Confirm the filter: the largest remaining functions should not exceed the cap.
df2.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
685,struct net_device {\n\tchar\t\t\tname[IFNAMSIZ...,1,"[struct, net_device, {, char, name, [, IFNAMSI...",1000
1290,"int avc_ca_pmt(struct firedtv *fdtv, char *msg...",0,"[int, avc_ca_pmt, (, struct, firedtv, *, fdtv,...",1000
1292,"int avc_ca_pmt(struct firedtv *fdtv, char *msg...",0,"[int, avc_ca_pmt, (, struct, firedtv, *, fdtv,...",1000


In [83]:
# Persist the filtered frame. With pandas' defaults the index is written as
# the first column, and the token lists are stored as their string form.
df2.to_csv(path_or_buf='preprocessed_data.csv')

In [84]:
# Per-column non-null counts of the filtered frame.
print(df2.count(axis=0))

function               42134
isVulnerable           42134
tokenized_functions    42134
size                   42134
dtype: int64


In [91]:
# Apply the same preprocessing to the de-duplicated frame: record each
# function's token count, then drop functions longer than `limit` tokens.
# `limit` is the cap set in the earlier counting cell, and the vectorized
# comparison replaces a Python-level loop over every row.
df_distinct['size'] = df_distinct['tokenized_functions'].apply(len)
df_distinct = df_distinct[df_distinct['size'] <= limit]

In [92]:
# Persist the filtered distinct frame, then preview its first rows.
df_distinct.to_csv(path_or_buf='distinct_data.csv')
df_distinct.head(5)

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,struct mcb_bus *mcb_alloc_bus(struct device *c...,0,"[struct, mcb_bus, *, mcb_alloc_bus, (, struct,...",171


In [93]:
# Per-column non-null counts of the filtered distinct frame.
print(df_distinct.count(axis=0))

function               8791
isVulnerable           8791
tokenized_functions    8791
size                   8791
dtype: int64
