In [None]:
import tensorflow as tf
from tensorflow import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
from pathlib import Path

import itertools as it
import functools as fn
from tqdm import tqdm

In [None]:
# Load the pair table. Later cells read the columns `code1`, `code2`
# (used as file paths to open) and `similar` (used as the label) from it.
data_path = Path('./dataset/data_df.csv')
data_df = pd.read_csv(data_path)
# Show the first rows as the cell output.
data_df.head()

# 전처리

In [None]:
import re


def preprocessing(code):
    """Split a source-code string into a flat list of tokens.

    The text is rewritten by a fixed sequence of regex substitutions and then
    split on whitespace:

    1. every run of four spaces becomes a tab (followed by one space),
    2. ``#`` comments are removed up to the end of their line -- including a
       comment on the last line of a file that has no trailing newline,
    3. double quotes are normalised to single quotes,
    4. newlines and the punctuation/operator characters in the class below
       are padded with spaces so each becomes a token of its own,
    5. newlines become the ``<n>`` token and tabs become the ``<t>`` token.

    Args:
        code: Source code as a single string.

    Returns:
        list[str]: The tokens, in order of appearance.

    Note:
        ``#`` is treated as a comment start even inside a string literal.
    """
    substitutions = (
        (r'    ', r'\t '),
        # `.` never matches a newline, so this strips to end-of-line and
        # leaves the line break itself in place.
        (r'#.*', ''),
        (r'"', r"'"),
        (r"([\n:(){}\[\]\*\/\%\+\-\,\=.'])", r' \1 '),
        (r'\n', '<n>'),
        (r'\t', '<t>'),
    )
    for pattern, replacement in substitutions:
        code = re.sub(pattern, replacement, code)
    return code.split()

In [None]:
import re
import config as cfg

# Smoke-test preprocessing() on the first code pair listed in data_df.
# The encoding is given explicitly (as in the other file reads of this
# notebook) so decoding does not depend on the platform default.
with open(data_df.code1[0], 'r', encoding='utf-8') as f:
    code1 = f.read()
with open(data_df.code2[0], 'r', encoding='utf-8') as f:
    code2 = f.read()

# One token list per file.
code_docs = [preprocessing(code_doc) for code_doc in (code1, code2)]

print(f'{len(code_docs)=}')
# print(code_docs)  # uncomment to inspect the token lists

# 단어장 생성

In [None]:
import os
code_path = './dataset/code/'

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# source_list (and therefore any index into it) reproducible between runs.
classes = sorted(os.listdir(code_path))

# One path per file, grouped by the sub-directory (label) it lives in.
source_list = [
    code_path + label + '/' + file
    for label in classes
    for file in sorted(os.listdir(code_path + label))
]

# source_list[145:155]
len(source_list)

In [None]:
# Spot-check one collected path. The index 478 is hard-coded, so this raises
# IndexError when fewer than 479 files were found.
print(source_list[478])

In [None]:
from tqdm import tqdm
def _read_source(path):
    """Return the full text of the UTF-8 encoded file at `path`."""
    with open(path,'r',encoding='utf-8') as source_file:
        return source_file.read()


# Raw contents of every collected source file, in source_list order.
code_docs = [_read_source(src_path) for src_path in tqdm(source_list)]
len(code_docs)

In [None]:
# Size of every document, computed once and reused for both lookups below.
doc_lengths = [len(code_doc) for code_doc in code_docs]

# Largest size, and the path of the (first) document that reaches it.
max_len = max(doc_lengths)
max_len_id = np.argmax(doc_lengths)
print(f'{max_len=}')
print(source_list[max_len_id])
# print(code_docs[max_len_id])

In [None]:
# NOTE(review): max() without a key compares the documents themselves
# (lexicographic order), not their lengths -- confirm this is intended.
max_code = max(code_docs)
# Position of that document; looked up once and reused for both prints.
max_code_index = code_docs.index(max_code)

print("code_docs.index(max_id)")
print(max_code_index)
print()
print("source_list[code_docs.index(max_id)]")
print(source_list[max_code_index])
print()
# print(max_code)

In [None]:
from collections import Counter
import config as cfg

# Replace each raw document with its token list, in place, so the raw text
# and the tokens of the whole corpus are never held at the same time.
# preprocessing() already returns a list of tokens; the former extra
# `.split()` on that list raised AttributeError.
# Not idempotent: re-running this cell would feed token lists back into
# preprocessing(), so reload code_docs first.
for i in tqdm(range(len(code_docs))):
    code_docs[i] = preprocessing(code_docs[i])

In [None]:
# Per-document sizes, computed once (these are token counts when code_docs
# holds the token lists produced by the tokenisation cell).
doc_lengths = list(map(len, code_docs))

# Largest size, and the path of the (first) document that reaches it.
max_len = max(doc_lengths)
max_len_id = np.argmax(doc_lengths)
print(f'{max_len=}')
print(source_list[max_len_id])
# print(code_docs[max_len_id])

In [None]:
# NOTE(review): max() without a key compares the documents element by element
# (lexicographic order), not by length -- confirm this is intended.
max_code = max(code_docs)
# Position of that document; looked up once and reused for both prints.
max_code_index = code_docs.index(max_code)

print("code_docs.index(max_id)")
print(max_code_index)
print()
print("source_list[code_docs.index(max_id)]")
print(source_list[max_code_index])
print()
# print(max_code)

In [None]:
# Total vocabulary size, including the special tokens below.
VOCAB_SIZE = 10000
SPECIAL_TOKENS = ['<pad>', '<unk>']

# Count token frequencies over the whole corpus, one document at a time
# (an earlier attempt that concatenated all documents first ran out of memory).
# Counter.update() adds the counts in place; `counter += Counter(doc)` built a
# temporary Counter and rescanned the whole accumulator for every document.
counter = Counter()
for code_doc in tqdm(code_docs):
    counter.update(code_doc)

# Keep the most frequent tokens, leaving room for the special tokens.
most_counter = counter.most_common(VOCAB_SIZE - len(SPECIAL_TOKENS))
vocab = SPECIAL_TOKENS + [key for key, _ in most_counter]
# print(vocab)

# Token -> integer id; '<pad>' is 0 and '<unk>' is 1.
word_to_index = {word: index for index, word in enumerate(vocab)}

In [None]:
# Report the number of entries in the token -> id mapping.
print(len(word_to_index))

# Input pipeline

In [45]:
# Drop the large vocabulary-building intermediates; only word_to_index is
# needed from here on. Re-running this cell raises NameError because the
# names no longer exist.
del code_docs, most_counter, source_list, counter

In [35]:
# tf.data sentinel that lets the runtime pick a tuning value dynamically.
# NOTE(review): neither constant is used in the visible cells yet.
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32

In [50]:
def tokenize(path):
    """Read one source file and encode it as a list of vocabulary ids.

    The file is read as UTF-8, split into tokens with `preprocessing`, and
    every token is looked up in the module-level `word_to_index` mapping.
    Tokens missing from the mapping are encoded with the id of '<unk>'.

    Args:
        path: Path of the source file to encode.

    Returns:
        list: One integer id per token, in order of appearance.
    """
    with open(path,'r',encoding='utf8') as source_file:
        raw_code = source_file.read()

    code_sequence = []
    for token in preprocessing(raw_code):
        # Out-of-vocabulary tokens fall back to the unknown-token entry.
        if token not in word_to_index:
            token = '<unk>'
        code_sequence.append(word_to_index[token])
    return code_sequence

def make_tensor(paths):
    """Encode every file in `paths` and stack the results into one array.

    Each path is encoded with `tokenize` (a tqdm progress bar is shown), then
    all sequences are brought to length `cfg.max_len` by Keras
    `pad_sequences`: longer sequences lose tokens from the front
    (`truncating='pre'`), shorter ones are padded using the function's
    defaults.

    Args:
        paths: Iterable of source-file paths.

    Returns:
        The 2-D integer array produced by `pad_sequences`, one row per path.
    """
    code_sequences = [tokenize(path) for path in tqdm(paths)]
    return keras.preprocessing.sequence.pad_sequences(
        code_sequences,
        maxlen=cfg.max_len,
        truncating='pre',
    )
    
def make_dataset(code1_paths,code2_paths,similar):
    """Build a tf.data.Dataset of (code1, code2, similar) examples.

    Both path collections are encoded with `make_tensor`; the two resulting
    arrays and `similar` are each sliced along their first axis and zipped,
    so element i of the dataset is the i-th entry of all three inputs.

    Args:
        code1_paths: Paths of the first file of every pair.
        code2_paths: Paths of the second file of every pair.
        similar: One label per pair.

    Returns:
        tf.data.Dataset yielding 3-tuples (code1, code2, similar).
    """
    from_slices = tf.data.Dataset.from_tensor_slices

    # code1 is encoded before code2, so the progress bars appear in that order.
    code1_ds = from_slices(make_tensor(code1_paths))
    code2_ds = from_slices(make_tensor(code2_paths))
    similar_ds = from_slices(similar)

    return tf.data.Dataset.zip((code1_ds, code2_ds, similar_ds))

In [49]:
# Encode every pair listed in data_df; this reads and tokenises all files.
data_ds = make_dataset(
    code1_paths=data_df['code1'],
    code2_paths=data_df['code2'],
    similar=data_df['similar'],
)

In [53]:
# Peek at the first example to check the sequence shapes and the label.
# Note: this rebinds the notebook-level names code1, code2 and similar.
for code1, code2, similar in data_ds.take(1):
    print(code1.shape)
    print(code2.shape)
    print(similar)

(512,)
(512,)
tf.Tensor(0, shape=(), dtype=int64)


# 데이터셋 나누기

In [54]:
# Number of examples in the zipped dataset.
len(data_ds)

9996

In [56]:
# 80 % of the examples (rounded down) go to training, the rest to validation.
dataset_size = len(data_ds)

num_train_ds = int(dataset_size * 0.8)
print(f"{num_train_ds=}")

num_val_ds = dataset_size - num_train_ds
print(f"{num_val_ds=}")

num_train_ds=7996
num_val_ds=2000


In [57]:
# The first num_train_ds examples form the training set; everything after
# them forms the validation set.
# NOTE(review): the split follows the row order of data_df with no shuffling
# -- confirm the CSV rows are not grouped by label.
train_ds = data_ds.take(num_train_ds)
val_ds = data_ds.skip(num_train_ds)

In [60]:
# Confirm the sizes of the two splits.
print(f"{len(train_ds)=}")
print(f"{len(val_ds)=}")

len(train_ds)=7996
len(val_ds)=2000


# 모델 만들기

In [63]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model

In [61]:
# Self-attention hyperparameters.
# NOTE(review): none of these are consumed in the visible cells yet; the
# meanings below are inferred from the names -- confirm when the model is built.
# presumably the embedding / model width
d_model = 512
# presumably the number of stacked layers
num_layers = 6
# presumably the number of attention heads
num_heads = 8
# presumably the hidden width of the feed-forward sub-layer
dff = 2048

## 포지셔널 인코딩

In [None]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000**(2*(i//2)/d_model)``.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices; indices 2k and 2k+1
            share the same rate because of the ``i // 2``.
        d_model: Width of the embedding (converted to float32 for the division).

    Returns:
        ``pos`` multiplied by the per-dimension rate, broadcast with NumPy rules.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    The original cell was an unfinished stub (`def positional_encoding()`
    with no colon or body, i.e. a SyntaxError). This completes it with the
    standard Transformer formulation on top of `get_angles`.
    NOTE(review): the parameter list is not given by the stub -- confirm it
    matches the intended call site.

    Args:
        position: Number of positions (sequence length) to encode.
        d_model: Width of the embedding.

    Returns:
        tf.Tensor of dtype float32 and shape (1, position, d_model); even
        embedding indices hold sin(angle), odd indices hold cos(angle).
    """
    # (position, 1) positions against (1, d_model) dimension indices.
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model,
    )

    # Sine on even embedding indices, cosine on odd ones.
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over a batch.
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Keras layer skeleton for multi-head attention.

    Only the constructor exists so far: it validates and stores the
    dimensions. NOTE(review): no projection layers or `call` method are
    defined in the visible source, so the layer cannot be applied to inputs
    yet.

    Args:
        embedding_dim: Total width of the embedding; must be divisible by
            `num_heads`.
        num_heads: Number of attention heads.

    Raises:
        AssertionError: If `embedding_dim` is not a multiple of `num_heads`.
    """

    def __init__(self, embedding_dim, num_heads=8):

        assert embedding_dim % num_heads == 0,"embedding_dim % num_heads != 0"

        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        # Width handled by each head. The original line was the bare
        # expression `self.projection_dim`, which raised AttributeError.
        self.projection_dim = embedding_dim // num_heads
        
    