In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

##  Data Preprocessing

In [2]:
# Load the train / validate / test splits, which were pre-formatted as pandas
# DataFrames (see data_formatting.ipynb for how they were produced).
# NOTE: pickle files can execute arbitrary code when loaded; only read files
# produced by a trusted source.
train_data, validate_data, test_data = map(
    pd.read_pickle,
    (
        "../dataset/train.pickle",
        "../dataset/validate.pickle",
        "../dataset/test.pickle",
    ),
)

### Tokenize the source code

#### BoW

For data batching convenience, the paper trained only on functions with token length $10 \leq l \leq 500$, padded to the maximum length of **500**  
The paper does not say whether the zero padding goes at the beginning or at the end, so I assume it is appended at the end (for a CNN this makes little difference).

`text_to_word_sequence` does not work here because it expects a single string rather than a collection of strings, so the cells below are left commented out.

In [None]:
# train_tokenized = tf.keras.preprocessing.text.text_to_word_sequence(train_data[0])
# x_train = tf.keras.preprocessing.sequence.pad_sequences(train_tokenized, maxlen=500, padding="post")

In [None]:
# validate_tokenized = tf.keras.preprocessing.text.text_to_word_sequence(validate_data[0])
# x_validate = tf.keras.preprocessing.sequence.pad_sequences(validate_tokenized, maxlen=500, padding="post")

In [None]:
# test_tokenized = tf.keras.preprocessing.text.text_to_word_sequence(test_data[0])
# x_test = tf.keras.preprocessing.sequence.pad_sequences(test_tokenized, maxlen=500, padding="post")

#### Init the Tokenizer

#### BoW

In [3]:
# The paper does not state how many distinct words to track; 10000 is used here.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
# The tokenizer must be fitted before texts_to_sequences can be used.
# fit_on_texts takes a list of strings: column 0 of the training frame.
tokenizer.fit_on_texts(train_data[0].tolist())

For data batching convenience, the paper trained only on functions with token length $10 \leq l \leq 500$, padded to the maximum length of **500**  
The paper does not say whether the zero padding goes at the beginning or at the end, so I assume it is appended at the end (for a CNN this makes little difference).

In [None]:
# Encode each training text (column 0) as a sequence of token ids using the
# fitted tokenizer.
train_tokenized = tokenizer.texts_to_sequences(train_data[0])
# Bring every sequence to exactly 500 ids, padding at the end ("post").
# NOTE(review): sequences longer than 500 are cut by pad_sequences using its
# default truncation side - confirm the data was already filtered to <= 500.
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=train_tokenized,
    maxlen=500,
    padding="post",
)

In [None]:
# Encode each validation text (column 0) as a sequence of token ids using the
# tokenizer that was fitted on the training split.
validate_tokenized = tokenizer.texts_to_sequences(validate_data[0])
# Bring every sequence to exactly 500 ids, padding at the end ("post").
x_validate = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=validate_tokenized,
    maxlen=500,
    padding="post",
)

In [None]:
# Encode each test text (column 0) as a sequence of token ids using the
# tokenizer that was fitted on the training split.
test_tokenized = tokenizer.texts_to_sequences(test_data[0])
# Bring every sequence to exactly 500 ids, padding at the end ("post").
x_test = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=test_tokenized,
    maxlen=500,
    padding="post",
)

In [None]:
# Targets: every column after the first (column 0 holds the source text),
# cast to integers so they can be fed to the model as labels.
y_train = train_data.iloc[:, 1:].astype(int)
y_validate = validate_data.iloc[:, 1:].astype(int)
y_test = test_data.iloc[:, 1:].astype(int)

## Model Design

This dataset is highly imbalanced, so I am adjusting the training weights, following the TensorFlow tutorial on imbalanced data:
https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

In [45]:
# Count how many training rows carry at least one positive label.
# A row is "vulnerable" when any label column (every column after the first)
# is truthy; otherwise it is "clear".
# The counts are computed explicitly instead of unpacking value_counts():
# value_counts() orders its result by frequency, so unpacking it only yields
# (clear, vulnerable) while clear rows are the majority, and it fails outright
# when only one of the two classes is present.
is_vulnerable = train_data[train_data.columns[1:]].astype(bool).any(axis=1)
vulnerable = int(is_vulnerable.sum())
total = len(is_vulnerable)
clear = total - vulnerable
print("Total: {}\n    Vulnerable: {} ({:.2f}% of total)\n".format(total, vulnerable, 100 * vulnerable / total))

Total: 1019471
    Vulnerable: 65904 (6.46% of total)



In [None]:
# Simple CNN over token-id sequences:
# embedding -> 1-D convolution -> max-pooling -> dropout -> dense head.
model = tf.keras.Sequential()
# input_dim matches the tokenizer vocabulary size (10000) and input_length
# matches the padded sequence length (500) used during preprocessing.
model.add(tf.keras.layers.Embedding(input_dim=10000, output_dim=13, input_length=500))
model.add(tf.keras.layers.Conv1D(filters=512, kernel_size=9, activation="relu"))
model.add(tf.keras.layers.MaxPool1D(pool_size=3))
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=64, activation="relu"))
model.add(tf.keras.layers.Dense(units=16, activation="relu"))
# I am using the sigmoid rather than the softmax mentioned in the paper:
# each of the 5 outputs is an independent probability.
# NOTE(review): presumably one unit per label column - confirm y has 5 columns.
model.add(tf.keras.layers.Dense(units=5, activation="sigmoid"))

# Adam optimization with the parameter stated in the paper.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
# TODO(review): confirm 0.005 is the learning rate the paper actually reports.
adam = tf.keras.optimizers.Adam(learning_rate=0.005)

# Define the evaluation metrics
METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

# Binary cross-entropy pairs with the sigmoid outputs above.
model.compile(optimizer=adam, loss="binary_crossentropy", metrics=METRICS)
model.summary()

In [None]:
# Re-weight the imbalanced training data.
# The original call passed `class_weight`, which is never defined in this
# notebook (NameError on a fresh kernel). A two-class `class_weight` dict also
# does not fit multi-column targets, because Keras maps each row to a single
# class index. Per-row sample weights express the same intent directly: a row
# is "vulnerable" if any of its labels is set, and each group is scaled so
# that both contribute equally to the total loss.
is_vulnerable = y_train.to_numpy().astype(bool).any(axis=1)
n_total = is_vulnerable.size
n_vulnerable = int(is_vulnerable.sum())
n_clear = n_total - n_vulnerable
# max(..., 1) guards against division by zero if one group is empty.
class_weight = {
    0: n_total / (2.0 * max(n_clear, 1)),
    1: n_total / (2.0 * max(n_vulnerable, 1)),
}
sample_weight = np.where(is_vulnerable, class_weight[1], class_weight[0])

# Train with batches of 128 for 10 epochs, reporting validation metrics
# after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    verbose=1,
    sample_weight=sample_weight,
    validation_data=(x_validate, y_validate),
)

In [29]:
# Persist the trained model under the name "Simple_CNN_imbalanced"
# (path is relative to the notebook's working directory).
model.save(filepath="Simple_CNN_imbalanced")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: Simple_CNN\assets


In [None]:
# Score the trained model on the held-out test split; `results` holds the
# loss followed by the metrics configured at compile time.
results = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=128,
)

