Setting up the dataframe and testing it out

In [1]:
import sqlite3
import pandas as pd

# Load the labelled functions from the SQLite database into two frames:
#   df          - every (function, isVulnerable) row as stored
#   df_distinct - the same columns with exact duplicate rows removed; DISTINCT
#                 applies to the (function, isVulnerable) pair, so one function
#                 text can still appear once per label.
conn = sqlite3.connect('vulture.db')
try:
    df = pd.read_sql_query("SELECT function, isVulnerable FROM functions", conn)
    df_distinct = pd.read_sql_query("SELECT distinct function, isVulnerable FROM functions", conn)
finally:
    # Release the database handle even if one of the queries raises.
    conn.close()

In [2]:
# Sanity check: print the first record returned by the query.
first_row = df.iloc[0]
print(first_row)

function        static void ipcomp_free_scratches(void)\n{\n\t...
isVulnerable                                                    0
Name: 0, dtype: object


In [3]:
from nltk.tokenize import wordpunct_tokenize

In [4]:
# Show the first two rows of the raw frame.
df.head(2)

Unnamed: 0,function,isVulnerable
0,static void ipcomp_free_scratches(void)\n{\n\t...,0
1,static void ipcomp_free_scratches(void)\n{\n\t...,1


In [5]:
# Split each function's source text into tokens with NLTK's wordpunct_tokenize,
# storing the token list alongside the original text in both frames.
for frame in (df, df_distinct):
    frame['tokenized_functions'] = frame['function'].apply(wordpunct_tokenize)

In [6]:
# Confirm the new tokenized_functions column on the first two rows.
df.head(2)

Unnamed: 0,function,isVulnerable,tokenized_functions
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,..."
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,..."


In [7]:
# Number of tokens per function. Always read this column as df['size']:
# attribute access (df.size) is the DataFrame's own element count, not this column.
df['size'] = df['tokenized_functions'].map(len)
df.head(5)

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,struct mcb_bus *mcb_alloc_bus(struct device *c...,0,"[struct, mcb_bus, *, mcb_alloc_bus, (, struct,...",171


In [8]:
# The three longest functions by token count.
df.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
11708,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6270
11709,static int ext4_fill_super(struct super_block ...,1,"[static, int, ext4_fill_super, (, struct, supe...",6266
15906,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6215


In [9]:
# Average number of tokens per function across the full frame.
token_counts = df['size']
token_counts.mean()

269.57263325377886

In [10]:
# How many functions exceed the token threshold?
limit = 1000
column = df['size']
# Summing the boolean mask counts the rows where the comparison is True.
count = (column > limit).sum()
print(count)

646


In [11]:
# Total number of (non-null) size entries, for comparison with the count above.
total_rows = column.count()
print(total_rows)

25140


In [12]:
# Keep only functions with at most `limit` tokens (inclusive). The vectorised
# comparison builds the boolean mask without a Python-level loop, and reusing
# `limit` keeps this filter in step with the threshold counted earlier.
df2 = df[df['size'] <= limit]

In [13]:
# Non-null entries per column in the unfiltered frame.
full_counts = df.count()
print(full_counts)

function               25140
isVulnerable           25140
tokenized_functions    25140
size                   25140
dtype: int64


In [14]:
# Longest functions remaining after the filter; sizes should not exceed the cut-off.
df2.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
381,struct net_device {\n\tchar\t\t\tname[IFNAMSIZ...,1,"[struct, net_device, {, char, name, [, IFNAMSI...",1000
754,"int avc_ca_pmt(struct firedtv *fdtv, char *msg...",0,"[int, avc_ca_pmt, (, struct, firedtv, *, fdtv,...",1000
2988,"int avc_ca_pmt(struct firedtv *fdtv, char *msg...",0,"[int, avc_ca_pmt, (, struct, firedtv, *, fdtv,...",1000


In [15]:
# Persist the filtered frame. The index is written as the first CSV column
# (pandas default), so readers should load it with index_col=0.
preprocessed_path = 'preprocessed_data.csv'
df2.to_csv(preprocessed_path)

In [16]:
# Non-null entries per column after dropping the over-long functions.
filtered_counts = df2.count()
print(filtered_counts)

function               24494
isVulnerable           24494
tokenized_functions    24494
size                   24494
dtype: int64


In [17]:
# Apply the same preprocessing to the de-duplicated frame: compute the token
# count per function, then keep rows with at most `limit` tokens (inclusive).
# The vectorised mask replaces a Python-level loop and reuses the shared
# threshold rather than repeating the literal.
df_distinct['size'] = df_distinct['tokenized_functions'].apply(len)
df_distinct = df_distinct[df_distinct['size'] <= limit]

In [18]:
# Persist the filtered, de-duplicated frame (index written as the first CSV
# column by default), then preview its first five rows.
distinct_path = 'distinct_data.csv'
df_distinct.to_csv(distinct_path)
df_distinct.head(5)

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,struct mcb_bus *mcb_alloc_bus(struct device *c...,0,"[struct, mcb_bus, *, mcb_alloc_bus, (, struct,...",171


In [19]:
# Non-null entries per column in the filtered, de-duplicated frame.
distinct_counts = df_distinct.count()
print(distinct_counts)

function               8785
isVulnerable           8785
tokenized_functions    8785
size                   8785
dtype: int64
