Setting up the dataframe and testing it out

In [1]:
import sqlite3
import pandas as pd

# Load the labelled function samples from the local SQLite database.
#   df          -- every (function, isVulnerable) row of the `functions` table
#   df_distinct -- the same projection with duplicate rows removed by SQL DISTINCT
conn = sqlite3.connect('vulture.db')
try:
    df = pd.read_sql_query("SELECT function, isVulnerable FROM functions", conn)
    df_distinct = pd.read_sql_query("SELECT distinct function, isVulnerable FROM functions", conn)
finally:
    # Close the handle even when a query raises (e.g. the table is missing),
    # so a failed run does not leave an open connection in the kernel.
    conn.close()

In [2]:
# Print the first row as a quick check that both columns were loaded.
print(df.iloc[0])

function        static void ipcomp_free_scratches(void)\n{\n\t...
isVulnerable                                                    0
Name: 0, dtype: object


In [3]:
from nltk.tokenize import wordpunct_tokenize

In [4]:
# Preview the first two rows of the full frame.
df.head(n=2)

Unnamed: 0,function,isVulnerable
0,static void ipcomp_free_scratches(void)\n{\n\t...,0
1,static void ipcomp_free_scratches(void)\n{\n\t...,1


In [7]:
# Tokenize the source text of every function in both frames; each cell of the
# new 'tokenized_functions' column holds the token list for that function.
for frame in (df, df_distinct):
    frame['tokenized_functions'] = frame['function'].apply(wordpunct_tokenize)

In [8]:
# Preview the first two rows, including the 'tokenized_functions' column.
df.head(n=2)

Unnamed: 0,function,isVulnerable,tokenized_functions
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,..."
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,..."


In [9]:
# Record the number of tokens of each function in a 'size' column,
# then display the first five rows.
df['size'] = df['tokenized_functions'].map(len)
df.head()

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,struct mcb_bus *mcb_alloc_bus(struct device *c...,0,"[struct, mcb_bus, *, mcb_alloc_bus, (, struct,...",171


In [10]:
# Show the three rows with the highest token count.
df.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
11708,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6270
11709,static int ext4_fill_super(struct super_block ...,1,"[static, int, ext4_fill_super, (, struct, supe...",6266
15906,static int ext4_fill_super(struct super_block ...,0,"[static, int, ext4_fill_super, (, struct, supe...",6215


In [11]:
# Mean token count per function over the full frame.
df['size'].mean()

269.57263325377886

In [16]:
# Count how many functions exceed `limit` tokens.
# NOTE(review): the filtering cells further down keep size <= 500, not 700 --
# confirm which threshold is the intended one.
limit = 700
column = df['size']
# Summing the boolean mask counts the rows where the comparison is True.
count = (column > limit).sum()
print(count)

1610


In [17]:
# Number of non-null entries in the size column, i.e. the total the
# over-limit count can be compared against.
print(column.count())

25140


In [22]:
# Keep only the functions with at most 500 tokens.
# The mask is built with a vectorized comparison rather than a Python-level
# list comprehension over every row; it selects exactly the same rows.
df2 = df[df['size'] <= 500]

In [23]:
# Per-column non-null counts of the size-filtered frame.
print(df2.count())

function               22024
isVulnerable           22024
tokenized_functions    22024
size                   22024
dtype: int64


In [24]:
# Show the three largest remaining functions to confirm the size cap was applied.
df2.nlargest(n=3, columns='size')

Unnamed: 0,function,isVulnerable,tokenized_functions,size
3719,static int raid1_run(struct mddev *mddev)\n{\n...,1,"[static, int, raid1_run, (, struct, mddev, *, ...",500
8035,static int pxa3xx_gcu_probe(struct platform_de...,1,"[static, int, pxa3xx_gcu_probe, (, struct, pla...",500
11234,"nft_do_chain(struct nft_pktinfo *pkt, void *pr...",0,"[nft_do_chain, (, struct, nft_pktinfo, *, pkt,...",500


In [25]:
# Write the size-filtered frame to disk.
# With to_csv defaults the row index is written as an unnamed first column,
# and the token lists in 'tokenized_functions' are stored as their text
# representation, so a consumer has to parse them back into lists.
df2.to_csv('preprocessed_data.csv')

In [26]:
# Per-column non-null counts of df2 after the CSV export.
print(df2.count())

function               22024
isVulnerable           22024
tokenized_functions    22024
size                   22024
dtype: int64


In [28]:
# Compute the token count for the de-duplicated frame and keep only the
# functions with at most 500 tokens.
df_distinct['size'] = df_distinct['tokenized_functions'].apply(len)
# Vectorized boolean mask instead of a Python-level list comprehension;
# the selected rows are identical.
df_distinct = df_distinct[df_distinct['size'] <= 500]

In [29]:
# Persist the filtered, de-duplicated frame, then display its first five rows.
df_distinct.to_csv('distinct_data.csv')
df_distinct.head()

Unnamed: 0,function,isVulnerable,tokenized_functions,size
0,static void ipcomp_free_scratches(void)\n{\n\t...,0,"[static, void, ipcomp_free_scratches, (, void,...",53
1,static void ipcomp_free_scratches(void)\n{\n\t...,1,"[static, void, ipcomp_free_scratches, (, void,...",49
2,static void __exit nbd_cleanup(void)\n{\n\tstr...,0,"[static, void, __exit, nbd_cleanup, (, void, )...",110
3,static void __exit nbd_cleanup(void)\n{\n\tstr...,1,"[static, void, __exit, nbd_cleanup, (, void, )...",110
4,struct mcb_bus *mcb_alloc_bus(struct device *c...,0,"[struct, mcb_bus, *, mcb_alloc_bus, (, struct,...",171


In [30]:
# Per-column non-null counts of the filtered, de-duplicated frame.
print(df_distinct.count())

function               7568
isVulnerable           7568
tokenized_functions    7568
size                   7568
dtype: int64
