In [7]:
import tensorflow as tf
# import tensorflow_addons as tfa
import keras
import pandas as pd
import pickle
import string
import numpy as np
from gensim.models import KeyedVectors as word2vec
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import fasttext as ft

In [2]:
# Input artefacts, all given relative to the notebook's working directory.
vectors_text_path = '../data/target_vecs.txt'  # not read by any cell visible in this notebook
all_funcs_data_path = '../data/bcb_funcs_all.tsv'  # tab-separated table: function id, function source
pairs_id_path = '../data/bcb_pair_ids.pkl'  # pickled table whose rows are (id1, id2, label)

In [3]:
embeddings_dim = 256  # width of each token vector; also used as the LSTM unit count
max_sequence_length = 32  # tokens kept per function; shorter inputs are zero-padded
output_dim = 6  # number of units in the softmax output layer (one per label class)

In [4]:
# Function table: the TSV has no header row, so columns are addressed as 0 (id) and 1 (source).
all_funcs = pd.read_csv(all_funcs_data_path, sep="\t", header=None)

# Pair table: unpickled, then converted to a plain NumPy array.
# NOTE(review): pickle can execute arbitrary code on load; only open files you trust.
with open(pairs_id_path, 'rb') as pairs_file:
    pair_ids = pickle.load(pairs_file).to_numpy()

In [5]:
# Show the function table; pandas abbreviates long frames to their first and last rows.
all_funcs

Unnamed: 0,0,1
0,74,"static void copy(String src, String dest) ..."
1,661,"public void convert(File src, File dest) t..."
2,1362,public static int[] bubbleSort(int... a) {...
3,1363,public static int[] bubbleSortOtimizado(in...
4,2450,public void process(String dir) {\n ...
...,...,...
59613,23653940,"JarAgletClassLoader(URL codebase, Certific..."
59614,23655348,"public boolean fileCompare(String from, St..."
59615,23672349,public static byte[] calculateMD4(byte[] d...
59616,23672350,public static byte[] calculateMD5(byte[] d...


In [6]:
# 80/20 split of the pair rows, stratified on column 2 (the label) so both parts
# keep the same class mix; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    pair_ids,
    test_size=0.2,
    random_state=42,
    stratify=pair_ids[:, 2],
)

In [8]:
# fastText model used to look up a vector for each code token.
fasttext_model_path = "../data/fasttext.bin"
code2vec = ft.load_model(fasttext_model_path)



In [10]:
def preproces(s, max_len=None, dim=None, embed=None):
    """Convert a source string into a fixed-length list of token vectors.

    The text is lower-cased and split on single space characters; every
    non-empty token (after stripping surrounding whitespace) is looked up in
    the embedding model. Only the first ``max_len`` tokens are kept, and
    shorter inputs are padded at the end with zero vectors.

    Args:
        s: Source text of one function.
        max_len: Number of vectors to return. Defaults to the notebook-level
            ``max_sequence_length``.
        dim: Length of each padding vector. Defaults to the notebook-level
            ``embeddings_dim`` (previously a hard-coded 256).
        embed: Object supporting ``embed[token] -> vector``. Defaults to the
            notebook-level ``code2vec`` model.

    Returns:
        A list of exactly ``max_len`` 1-D arrays (empty when ``max_len <= 0``).
    """
    if max_len is None:
        max_len = max_sequence_length
    if dim is None:
        dim = embeddings_dim
    if embed is None:
        embed = code2vec

    # Padding rows are float32 so they do not upcast the stacked dataset to
    # float64 (assumes the embedding model returns float32 vectors, as fastText
    # does - TODO confirm if another model is plugged in).
    final = [np.zeros(dim, dtype=np.float32) for _ in range(max_len)]

    counter = 0
    # NOTE(review): only the space character separates tokens; tokens joined
    # solely by tabs/newlines stay glued together - confirm this is intended.
    for word in s.lower().split(" "):
        # Check the budget before writing so max_len == 0 cannot index past the end.
        if counter >= max_len:
            break
        word = word.strip()
        if word:
            final[counter] = embed[word]
            counter += 1
    return final

In [11]:
# Lookup table: function id (column 0) -> token-vector sequence built from its
# source text (column 1). If an id occurs more than once, the last row wins.
functions = {
    int(func_id): preproces(source)
    for func_id, source in zip(all_funcs[0], all_funcs[1])
}

In [12]:
def get_keras_dataset(data):
    """Build the two-input feature arrays and one-hot labels for the model.

    Args:
        data: Iterable of ``(id1, id2, label)`` rows. Ids are looked up in the
            notebook-level ``functions`` table.

    Returns:
        ``(x, y)`` where ``x`` is a two-element list ``[first_inputs,
        second_inputs]`` of equally long arrays and ``y`` is the one-hot label
        matrix with ``output_dim`` columns.

    Pairs that reference an id missing from ``functions`` are dropped; the
    number of dropped pairs is printed when it is non-zero.
    """
    left, right, labels = [], [], []
    skipped = 0
    for id1, id2, label in data:
        # Resolve BOTH sides before appending anything. Appending the first side
        # and then failing on the second would leave the two lists misaligned.
        try:
            first = functions[id1]
            second = functions[id2]
        except KeyError:
            skipped += 1
            continue
        left.append(first)
        right.append(second)
        labels.append(label)

    if skipped:
        print(f"skipped {skipped} pair(s) with an unknown function id")

    # One array per model input; avoids materialising a single 4-D array first.
    x = [np.array(left), np.array(right)]
    print((2,) + x[0].shape)

    # Fix the class count so every split gets the same label width as the
    # model's output layer, even if a split happens to lack the highest class.
    y = to_categorical(labels, num_classes=output_dim)
    print(y.shape)

    return x, y

In [13]:
# Vectorise the training and held-out pairs; each call prints the input and label shapes.
x,y  = get_keras_dataset(train_data)
test_x, test_y = get_keras_dataset(test_data)

(2, 78013, 32, 256)
(78013, 6)
(2, 19502, 32, 256)
(19502, 6)


In [14]:
from keras.layers import *
from keras.models import *

In [44]:
# code2vec is a fastText model here, which exposes its vocabulary as `.words`
# (the gensim-style `.index2word` attribute does not exist on it).
vocab_size = len(code2vec.words)

In [17]:
# Disabled alternative: an Embedding layer over token ids. The active model
# receives pre-computed token vectors directly, so these lines stay commented out.
# embedding_layer = code2vec.get_keras_embedding(train_embeddings=False)
# embedding_layer = Embedding(vocab_size, output_dim=embeddings_dim, input_length=max_sequence_length)

# A single LSTM instance is applied to both inputs, so the two branches share weights.
# NOTE(review): there is no Masking layer, so zero-padded timesteps are processed
# like real tokens - confirm this is intended.
lstm_layer = LSTM(embeddings_dim, dropout=0.3, recurrent_dropout=0.3)

# First function of the pair: a (max_sequence_length, embeddings_dim) vector sequence.
in_1 = Input(shape=(max_sequence_length, embeddings_dim))
# emb_1 = embedding_layer(in_1)
lstm_1 = lstm_layer(in_1)

# Second function of the pair, encoded by the same LSTM.
in_2 = Input(shape=(max_sequence_length, embeddings_dim))
# emb_2 = embedding_layer(in_2)
lstm_2 = lstm_layer(in_2)


# Classification head: concatenate both encodings, then
# BatchNorm -> Dropout -> Dense(512, relu) -> BatchNorm -> Dropout -> softmax.
merged = concatenate([lstm_1, lstm_2])
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)
merged = Dense(512, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)
# One probability per label class.
preds = Dense(output_dim, activation='softmax')(merged)

# Two-input, single-output model; inputs must be passed as [first, second].
model = Model(inputs=[in_1, in_2], outputs=preds)

In [18]:
# Categorical cross-entropy over the one-hot labels, optimised with Adam.
# The metric order here fixes the order of the values model.evaluate returns
# after the loss: accuracy, recall, precision.
model.compile(loss='categorical_crossentropy',
              optimizer='adam', 
              metrics=['acc',tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [19]:
# Print the layer graph with output shapes and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 32, 256)      0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 32, 256)      0                                            
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 256)          525312      input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 512)          0           lstm_3[0][0]               

In [20]:
# Train for 25 epochs in shuffled mini-batches of 64, holding out 20% of the
# training samples for per-epoch validation.
model.fit(
    x, y,
    batch_size=64,
    epochs=25,
    initial_epoch=0,
    shuffle=True,
    validation_split=0.2,
)


Train on 62410 samples, validate on 15603 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.callbacks.History at 0x7fb7936ebb50>

In [21]:
# Score the held-out pairs. The returned list is [loss, acc, recall, precision],
# following the metric order given to model.compile.
model.evaluate(test_x, test_y)



[0.48474594374862456,
 0.8235565423965454,
 0.7708112597465515,
 0.8187830448150635]