In [None]:
import tensorflow as tf
from tensorflow import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
from pathlib import Path

import itertools as it
import functools as fn
from tqdm import tqdm

In [None]:
# Load the table of labelled code pairs. Later cells read its `code1` and
# `code2` columns as file paths and its `similar` column as the label.
data_path = Path('./dataset/data_df.csv')
data_df = pd.read_csv(data_path)
data_df.head()

# 전처리

In [None]:
def preprocessing(code):
    """Turn raw Python source text into a flat list of string tokens.

    Steps, in order: every run of four spaces becomes a tab marker,
    ``#`` comments and import statements are dropped, double quotes are
    normalised to single quotes, punctuation/operators are split out as
    their own tokens, and newlines / tabs are emitted as the literal
    tokens ``<n>`` / ``<t>``.

    Args:
        code: Source text as a single string.

    Returns:
        list[str]: The whitespace-separated tokens.
    """
    code = re.sub(r'    ', r'\t ', code)
    # Strip comments up to (not including) the line break, so a comment on a
    # final line without a trailing newline is removed as well.
    # NOTE: a '#' inside a string literal is still treated as a comment start.
    code = re.sub(r"#.*", '', code)
    # Drop whole `import ...` / `from ... import ...` lines. Anchoring at the
    # start of the line avoids leaving a dangling `from x` fused onto the next
    # line, and `\b` avoids eating identifiers that merely contain "import".
    code = re.sub(r"^[ \t]*(?:from[ \t]+\S+[ \t]+)?import\b.*\n?", '', code,
                  flags=re.MULTILINE)
    code = re.sub(r'"', r"'", code)
    # Surround each punctuation character (and newline) with spaces so that
    # the final split() yields it as a separate token.
    code = re.sub(r"([\n:(){}\[\]\*\/\%\+\-\,\=.'])", r' \1 ', code)
    code = re.sub(r'\n', r"<n>", code)
    code = re.sub(r'\t', r"<t>", code)
    return code.split()

In [None]:
import re
import config as cfg

# Smoke-test the tokenizer on the first labelled pair.
# The files are read as UTF-8 explicitly, matching how the corpus files are
# opened elsewhere in this notebook; without `encoding` the result depends on
# the platform's locale default.
with open(data_df.code1[0], 'r', encoding='utf-8') as f:
    code1 = f.read()
with open(data_df.code2[0], 'r', encoding='utf-8') as f:
    code2 = f.read()

code_docs = [preprocessing(code) for code in (code1, code2)]

print(f'{len(code_docs)=}')
# print(code_docs)

# 단어장 생성

In [None]:
import os
# Collect the path of every file under ./dataset/code/<label>/.
# Each entry directly under code_path is treated as a label directory.
code_path = './dataset/code/'
classes = os.listdir(code_path)
source_list = [
    code_path + label + '/' + file
    for label in classes
    for file in os.listdir(code_path + label)
]

len(source_list)

In [None]:
# Spot-check one collected path. 478 is a hard-coded index, so this raises
# IndexError if fewer files were found.
print(source_list[478])

In [None]:
# Read every collected source file into memory as one UTF-8 string each.
code_docs = []
for src_path in tqdm(source_list):
    with open(src_path, mode='r', encoding='utf-8') as src_file:
        source_text = src_file.read()
    code_docs.append(source_text)
len(code_docs)

In [None]:
# Length of every document, computed once and reused for both the maximum
# and its position. np.argmax returns the first index of the maximum.
doc_lengths = [len(code_doc) for code_doc in code_docs]
max_len = max(doc_lengths)
print(f'{max_len=}')
max_len_id = np.argmax(doc_lengths)
print(source_list[max_len_id])
# print(code_docs[max_len_id])

In [None]:
# max() compares the documents themselves (lexicographic order), not their
# lengths. The index lookup is done once and reused for both prints.
max_code = max(code_docs)
max_code_idx = code_docs.index(max_code)

print("code_docs.index(max_id)")
print(max_code_idx)
print()
print("source_list[code_docs.index(max_id)]")
print(source_list[max_code_idx])
print()
# print(max_code)

In [None]:
from collections import Counter
import config as cfg

# Replace each raw source string with its token list.
# preprocessing() already returns a list of tokens, so calling .split() on
# its result (as this cell did before) raises AttributeError.
# NOTE: this overwrites code_docs in place, so the cell is not re-runnable
# without re-reading the files first.
for i in tqdm(range(len(code_docs))):
    code_docs[i] = preprocessing(code_docs[i])

In [None]:
# Same length check as before tokenisation. code_docs is expected to hold
# token lists here (the preceding cell replaces each document with the
# output of preprocessing), so the lengths are token counts.
doc_lengths = [len(code_doc) for code_doc in code_docs]
max_len = max(doc_lengths)
print(f'{max_len=}')
max_len_id = np.argmax(doc_lengths)
print(source_list[max_len_id])
# print(code_docs[max_len_id])

In [None]:
# max() orders the documents by element-wise comparison, not by length.
# The index lookup is done once and reused for both prints.
max_code = max(code_docs)
max_code_idx = code_docs.index(max_code)

print("code_docs.index(max_id)")
print(max_code_idx)
print()
print("source_list[code_docs.index(max_id)]")
print(source_list[max_code_idx])
print()
# print(max_code)

In [None]:
# Count token frequencies over the whole corpus, one document at a time.
# (Concatenating all documents into a single array first was tried and ran
# out of memory.)
counter = Counter()
for code_doc in tqdm(code_docs):
    counter.update(code_doc)

# Keep the most frequent tokens so that, together with the two special
# tokens, the vocabulary holds at most 10000 entries.
most_counter = counter.most_common(10000-2)

# Index 0 is '<pad>' and index 1 is '<unk>'; real tokens start at index 2.
vocab = ['<pad>', '<unk>']
vocab.extend(key for key, _ in most_counter)
word_to_index = {word: index for index, word in enumerate(vocab)}

In [None]:
# Vocabulary size, including the '<pad>' and '<unk>' entries.
print(len(word_to_index))

# Input pipeline

In [None]:
# Release the large intermediates from vocabulary building; only
# word_to_index is needed from here on.
del code_docs, most_counter, source_list, counter

In [None]:
# Input-pipeline constants. Neither name is referenced in the cells visible
# in this part of the notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32

In [None]:
def tokenize(path):
    """Read one source file and map it to a list of vocabulary indices.

    Args:
        path: Path of a UTF-8 encoded source file.

    Returns:
        list[int]: One index per token produced by ``preprocessing``;
        tokens missing from ``word_to_index`` map to the '<unk>' index.
    """
    with open(path, 'r', encoding='utf8') as f:
        words = preprocessing(f.read())

    code_sequence = []
    for word in words:
        if word not in word_to_index:
            word = '<unk>'
        code_sequence.append(word_to_index[word])
    return code_sequence

def make_tensor(paths):
    """Tokenize every file in ``paths`` and stack them into one padded array.

    Args:
        paths: Iterable of source-file paths.

    Returns:
        The result of Keras ``pad_sequences`` with ``maxlen=cfg.max_len``;
        over-long sequences are truncated from the front
        (``truncating='pre'``).
    """
    code_sequences = [tokenize(path) for path in tqdm(paths)]
    return keras.preprocessing.sequence.pad_sequences(
        code_sequences, maxlen=cfg.max_len, truncating='pre')
    
def make_dataset(code1_paths, code2_paths, similar):
    """Build a tf.data.Dataset of ``((code1, code2), similar)`` elements.

    Args:
        code1_paths: Paths of the first file of each pair.
        code2_paths: Paths of the second file of each pair.
        similar: Per-pair labels, sliced along the first axis.

    Returns:
        tf.data.Dataset: Each element is a 2-tuple whose first item is the
        pair of padded token-id sequences and whose second item is the label.
    """
    first_ds = tf.data.Dataset.from_tensor_slices(make_tensor(code1_paths))
    second_ds = tf.data.Dataset.from_tensor_slices(make_tensor(code2_paths))
    pair_ds = tf.data.Dataset.zip((first_ds, second_ds))

    label_ds = tf.data.Dataset.from_tensor_slices(similar)
    return tf.data.Dataset.zip((pair_ds, label_ds))

In [None]:
# Tokenize and pad both files of every pair listed in data_df (each file is
# read from disk here) and pair them with the `similar` label.
data_ds = make_dataset(data_df['code1'],data_df['code2'],data_df['similar'])

In [None]:
# Peek at one element. make_dataset zips (code1, code2) first and then zips
# that pair with the label, so each element is ((code1, code2), similar) and
# must be unpacked with the nested pattern; a flat three-name unpack raises
# ValueError.
for (code1, code2), similar in data_ds.take(1):
    print(code1.shape)
    print(code2.shape)
    print(similar)

# 데이터셋 나누기

In [None]:
# Total number of (pair, label) elements in the dataset.
len(data_ds)

In [None]:
# 80 % of the elements (rounded down) go to training; the rest to validation.
total_examples = len(data_ds)
num_train_ds = int(total_examples * 0.8)
print(f"{num_train_ds=}")
num_val_ds = total_examples - num_train_ds
print(f"{num_val_ds=}")

In [None]:
# The first num_train_ds elements form the training set, the remainder the
# validation set.
# NOTE(review): data_ds is not shuffled in any cell visible here, so the split
# follows the row order of the CSV — confirm that this is intended.
train_ds = data_ds.take(num_train_ds)
val_ds = data_ds.skip(num_train_ds)

In [None]:
# Sanity check: the two sizes should match num_train_ds and num_val_ds.
print(f"{len(train_ds)=}")
print(f"{len(val_ds)=}")

# 모델 만들기

In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model

In [None]:
# Self-attention hyperparameters. The names mirror the constructor parameters
# of the Encoder / MultiHeadAttention definitions in this notebook.
d_model = 512
num_layers = 6
num_heads = 8
dff = 2048

## 포지셔널 인코딩(Positional Encoding)

In [None]:
def get_angles(pos, i, d_model):
    """Compute the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices; consecutive
            even/odd pairs share the same rate because of ``i // 2``.
        d_model: Embedding width used to scale the exponent.

    Returns:
        ``pos`` multiplied by the per-dimension angle rate (broadcast over
        the shapes of ``pos`` and ``i``).
    """
    exponent = (2 * (i//2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Embedding width (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model): sine of the angle
        in even columns, cosine in odd columns.
    """
    positions = np.arange(position)[:, np.newaxis]   # (position, 1)
    dims = np.arange(d_model)[np.newaxis, :]         # (1, d_model)
    table = get_angles(positions, dims, d_model)     # (position, d_model)

    # Even columns (2i) carry the sine, odd columns (2i+1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

## 마스킹(Masking)
- 패딩을 계산하지 않게 해줌.

In [8]:
def create_padding_mask(seq):
    """Mark padding positions (token id 0) in a batch of sequences.

    Args:
        seq: Tensor of token ids, indexed as (batch, position).

    Returns:
        A float32 tensor of shape (batch_size, 1, 1, seq_len) holding 1.0
        where ``seq`` equals 0 and 0.0 elsewhere; the two singleton axes let
        it broadcast against the attention logits.
    """
    is_pad = tf.math.equal(seq, 0)
    mask = tf.cast(is_pad, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

## 스케일드 닷-프로덕트 어텐션(Scaled dot-product Attention)

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
  """Compute softmax(q·kᵀ / sqrt(depth)) · v together with the weights.

  q, k and v must share their leading dimensions, and k and v must share
  their penultimate dimension (seq_len_k == seq_len_v).

  Args:
    q: query, shape (..., seq_len_q, depth)
    k: key, shape (..., seq_len_k, depth)
    v: value, shape (..., seq_len_v, depth_v)
    mask: float tensor broadcastable to (..., seq_len_q, seq_len_k), with
          non-zero entries at positions to suppress; pass None to skip
          masking.

  Returns:
    (output, attention_weights) where output has shape
    (..., seq_len_q, depth_v) and attention_weights has shape
    (..., seq_len_q, seq_len_k).
  """
  depth = tf.cast(tf.shape(k)[-1], tf.float32)

  # Raw similarity scores, scaled by sqrt(depth).
  logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)

  # Masked positions receive a large negative logit so that softmax assigns
  # them (almost) zero weight.
  if mask is not None:
    logits += (mask * -1e9)

  # Normalise over the key axis so each query's weights sum to 1.
  attention_weights = tf.nn.softmax(logits, axis=-1)

  return tf.matmul(attention_weights, v), attention_weights

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Build the two-layer position-wise feed-forward block.

    Args:
        d_model: Output width of the block.
        dff: Width of the hidden ReLU layer.

    Returns:
        A tf.keras.Sequential of Dense(dff, relu) followed by Dense(d_model).
    """
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (..., dff)
    project = tf.keras.layers.Dense(d_model)                # (..., d_model)
    return tf.keras.Sequential([expand, project])

In [9]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on scaled_dot_product_attention.

    The inputs are projected with three Dense layers, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head,
    concatenated again and passed through a final Dense layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # The model width must divide evenly across the heads.
        assert d_model % num_heads == 0

        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Input projections for query, key and value.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are concatenated.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq_len, d_model) to (batch, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from ``q`` over ``k``/``v``.

        Note the argument order: value, key, query, mask.

        Returns:
            (output, attention_weights) with output of shape
            (batch_size, seq_len_q, d_model) and attention_weights of shape
            (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads: (batch, num_heads, seq_len, depth).
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        per_head_out, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        # Back to (batch, seq_len_q, num_heads, depth), then merge the heads
        # into a single d_model-wide axis.
        per_head_out = tf.transpose(per_head_out, perm=[0, 2, 1, 3])
        merged = tf.reshape(per_head_out, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to ``x`` of shape (batch_size, input_seq_len, d_model).

        Args:
            x: Input sequence representations.
            training: Forwarded to both dropout layers.
            mask: Attention mask forwarded to the self-attention layer.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sub-layer: x serves as value, key and query.
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attn_normed = self.layernorm1(x + attended)

        # Feed-forward sub-layer.
        transformed = self.ffn(attn_normed)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(attn_normed + transformed)
    
class Encoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding, then a stack of EncoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        # Fixed sinusoidal table of shape (1, maximum_position_encoding, d_model).
        # (A learned position embedding, as used in BERT, would be an alternative.)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff, rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode token ids ``x`` of shape (batch_size, input_seq_len).

        Args:
            x: Token-id tensor.
            training: Forwarded to dropout and to every encoder layer.
            mask: Attention mask forwarded to every encoder layer.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]

        # Embed, scale by sqrt(d_model), then add the positional encoding for
        # the first seq_len positions.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        # enc_layers holds exactly num_layers blocks; apply them in order.
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x
    
def code_similar_model(num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, training=False, rate=0.1):
    """Build a two-input model that scores the similarity of two token sequences.

    Both inputs are encoded by the same Encoder instance (shared weights).
    The two encodings are combined through a pairwise dot-product matrix,
    average-pooled, and mapped to a single sigmoid output.

    Args:
        num_layers, d_model, num_heads, dff, input_vocab_size,
        maximum_position_encoding, rate: Forwarded to ``Encoder``.
        training: Value passed as the ``training`` argument of both encoder
            calls when the graph is built.
        rate: Also used for the dropout layer before the output.

    Returns:
        tf.keras.Model with inputs ``[code1, code2]`` and one sigmoid output.

    NOTE(review): ``training`` is fixed when the graph is built, so with the
    default ``False`` the encoder's dropout looks permanently disabled, even
    under ``fit`` — confirm whether that is intended.
    NOTE(review): with ``shape=(None,)`` inputs the pooled tensor has an
    undefined last dimension, which the final Dense layer probably cannot be
    built on; a fixed input length may be required — confirm.
    """
    encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate)

    # Keep the Input placeholders under their own names. They are what must be
    # handed to tf.keras.Model below; rebinding them to the encoder outputs
    # (as before) made the model's "inputs" intermediate tensors.
    code1_input = tf.keras.Input(shape=(None,), name="code1")
    code2_input = tf.keras.Input(shape=(None,), name="code2")

    code1_enc_padding_mask = create_padding_mask(code1_input)
    code1_encoded = encoder(code1_input, training, code1_enc_padding_mask)
    # code1_encoded.shape == (batch_size, seq_len_1, d_model)
    code2_enc_padding_mask = create_padding_mask(code2_input)
    code2_encoded = encoder(code2_input, training, code2_enc_padding_mask)
    # code2_encoded.shape == (batch_size, seq_len_2, d_model)

    # Pairwise dot products between every position of code1 and of code2:
    # (batch_size, seq_len_1, seq_len_2).
    code2_transposed = tf.transpose(code2_encoded, [0, 2, 1])
    x = tf.matmul(code1_encoded, code2_transposed)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(rate)(x)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    return tf.keras.Model(inputs=[code1_input, code2_input],
                          outputs=outputs)