Convolutional Neural Network
https://realpython.com/python-keras-text-classification/

In [8]:
import sqlite3
import pandas as pd

df = pd.read_csv('preprocessed_data.csv', index_col=0)

In [9]:
from sklearn.model_selection import train_test_split

functions = df['function'].values
y = df['isVulnerable'].values

functions_train, functions_test, y_train, y_test = train_test_split(functions, y, test_size=0.2, random_state=42)


In [10]:
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import wordpunct_tokenize

tokenizer = Tokenizer(analyzer=wordpunct_tokenize)
tokenizer.fit_on_texts(functions_train)

X_train = tokenizer.texts_to_sequences(functions_train)
X_test = tokenizer.texts_to_sequences(functions_test)

vocab_size = len(tokenizer.word_index) + 1


In [11]:
print(vocab_size)

42726


In [12]:
from keras.utils import pad_sequences

maxlen = 500

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0])

[   36  2563     1     9    53    10    53     7    11  5081    66  3088
  2563     4    53     6   259    38    79    33    53     5     8    24
    53     2  2812    50   122  4726     1    53    27    11  2724     1
    53     5     8     1  3089     1    53   152  5082    42  5083     7
    11     9    53    10  4321     6  1463     1    53     2   417   152
  4321     3  2071    13    53     2  1151     5     8     1  5084     1
  5085     4    53    27  5086     1    53     5     8     1  4321    50
  3089     1  4321   152 22963     7  5087     1    53     5  2724     1
    53     5  5088     1    53     5    12    12    52    11  2071    13
    53     2  1151     5    12  5089     1    53     5  5090     1    53
     5    12     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [13]:
from keras.models import Sequential
from keras import layers

embedding_dim=100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          4272600   
                                                                 
 conv1d (Conv1D)             (None, 496, 128)          64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 4,338,029
Trainable params: 4,338,029
Non-trainable params: 0
______________________________________________

In [14]:
from keras.backend import clear_session
clear_session()

history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/10


2023-05-03 18:09:20.466276: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 35238000 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2023-05-03 18:26:58.384384: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 35238000 exceeds 10% of free system memory.


Training Accuracy: 0.5147
Testing Accuracy:  0.5024
