In [1]:
import os

from utils_park import *
from vocab import *

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

from collections import Counter
import numpy as np
import pandas as pd	
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Root directory of the newsbt dataset. Named data_dir (not dir) so the builtin
# dir() is not shadowed for the rest of the kernel session.
data_dir = '../newsbt_data/'

# code_bpe files of each split; the indexing cells at the end of this notebook
# save their output under the name "code_bpe".
train_code_path = os.path.join(data_dir, 'train/code_bpe')
valid_code_path = os.path.join(data_dir, 'valid/code_bpe')
test_code_path = os.path.join(data_dir, 'test/code_bpe')

# nl files of each split (their lengths are plotted below as comment lengths).
train_nl_path = os.path.join(data_dir, 'train/nl')
valid_nl_path = os.path.join(data_dir, 'valid/nl')
test_nl_path = os.path.join(data_dir, 'test/nl')

In [None]:
# Per-line token counts of a data file
def count_token(f_name):
    """Return the number of whitespace-separated tokens in each entry of a file.

    Args:
        f_name: Path handed to ``read_file`` (star-imported helper; presumably
            returns the file's lines as strings -- confirm in its module).

    Returns:
        A list with one integer per entry returned by ``read_file``: the
        number of pieces ``str.split()`` produces for that entry.
    """
    return [len(entry.split()) for entry in read_file(f_name)]

# Summary statistics
def cal(len_list):
    """Print summary statistics for a sequence of token counts.

    Prints, in this order and with five decimals each: mean, variance,
    standard deviation, mode, median, maximum and minimum.

    Args:
        len_list: Non-empty sequence of numbers (``max`` raises ``ValueError``
            on an empty sequence).

    Returns:
        None. The statistics are only printed.
    """
    average = np.mean(len_list)
    variance = np.var(len_list)
    deviation = np.std(len_list)
    largest = max(len_list)
    smallest = min(len_list)
    median = np.median(len_list)

    # Most frequent value; on ties Counter keeps the value it saw first.
    mode = Counter(len_list).most_common(1)[0][0]

    report = (
        ('mean', average),
        ('variance', variance),
        ('std', deviation),
        ('mode', mode),
        ('mid', median),
        ('max', largest),
        ('min', smallest),
    )
    for label, value in report:
        print(f'{label}: {value:.5f}')

In [None]:
# Per-line token counts of the file at train_code_path, then their summary statistics.
# NOTE(review): train_code_path is assigned in two cells of this notebook
# ('train/code_bpe' and 'train/train.token.code'); this cell reads whichever ran last.
train_code_tkn = count_token(train_code_path)
cal(train_code_tkn)

In [None]:
# Per-line token counts of the file at test_code_path, then their summary statistics.
# NOTE(review): test_code_path is assigned in two cells of this notebook
# ('test/code_bpe' and 'test/test.token.code'); this cell reads whichever ran last.
test_code_tkn = count_token(test_code_path)
cal(test_code_tkn)

In [None]:
# Per-line token counts of the file at valid_code_path, then their summary statistics.
# NOTE(review): valid_code_path is assigned in two cells of this notebook
# ('valid/code_bpe' and 'valid/valid.token.code'); this cell reads whichever ran last.
valid_code_tkn = count_token(valid_code_path)
cal(valid_code_tkn)

In [None]:
# Pool the code token counts of the three splits (train, test, valid order)
# and print summary statistics over the combined list.
all_code_tkn = train_code_tkn + test_code_tkn + valid_code_tkn
cal(all_code_tkn)

In [None]:
# Per-line token counts of the file at train_nl_path, then their summary statistics.
train_nl_tkn = count_token(train_nl_path)
cal(train_nl_tkn)

In [None]:
# Per-line token counts of the file at test_nl_path, then their summary statistics.
test_nl_tkn = count_token(test_nl_path)
cal(test_nl_tkn)

In [None]:
# Per-line token counts of the file at valid_nl_path, then their summary statistics.
valid_nl_tkn = count_token(valid_nl_path)
cal(valid_nl_tkn)

In [None]:
# Pool the nl token counts of the three splits (train, test, valid order)
# and print summary statistics over the combined list.
all_nl_tkn = train_nl_tkn + test_nl_tkn + valid_nl_tkn
cal(all_nl_tkn)

In [None]:
# Histogram of the pooled code token counts.
# The 'seaborn' style name was deprecated in Matplotlib 3.6 in favour of
# 'seaborn-v0_8' and no longer resolves on newer releases, so use whichever
# name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use([seaborn_style])
plt.figure(figsize=(8, 6))
plt.xlabel('Code Length(Tokens)')
plt.ylabel('Count')
plt.hist(all_code_tkn, bins=20, rwidth=0.9)
plt.show()

In [None]:
# Histogram of the pooled nl token counts.
# The 'seaborn' style name was deprecated in Matplotlib 3.6 in favour of
# 'seaborn-v0_8' and no longer resolves on newer releases, so use whichever
# name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use([seaborn_style])
plt.figure(figsize=(8, 6))
plt.xlabel('Comment Length(Tokens)')
plt.ylabel('Count')
plt.hist(all_nl_tkn, bins=15, rwidth=0.9)
plt.show()

In [2]:
# Root directory of the newsbt dataset. Named data_dir (not dir) so the builtin
# dir() is not shadowed for the rest of the kernel session.
data_dir = '../newsbt_data/'
# NOTE(review): out_path is not referenced by any cell visible in this notebook,
# and its '40000' differs from the vocab_size=52000 used when training the
# tokenizers -- confirm it is still needed.
out_path = os.path.join(data_dir, '40000_vocab_park/')

# Special tokens.
PAD = '<pad>'
UNK = '<unk>'
SOS = '<start>'
EOS = '<end>'

# Ids of the special tokens, listed in the same order the tokens are passed to
# BpeTrainer -- TODO confirm the trained vocabularies assign exactly these ids.
PAD_ID = 0
UNK_ID = 1
SOS_ID = 2
EOS_ID = 3

# Raw token/type files of each split. The *_code_path names are rebound here:
# an earlier cell points them at the code_bpe files instead.
train_code_path = os.path.join(data_dir, 'train/train.token.code')
train_type_path = os.path.join(data_dir, 'train/type')

valid_code_path = os.path.join(data_dir, 'valid/valid.token.code')
valid_type_path = os.path.join(data_dir, 'valid/type')

test_code_path = os.path.join(data_dir, 'test/test.token.code')
test_type_path = os.path.join(data_dir, 'test/type')

In [3]:
# Load the code file of every split with read_file (train, test, valid order).
train_code, test_code, valid_code = map(
    read_file,
    (train_code_path, test_code_path, valid_code_path),
)

# Load the type file of every split the same way.
train_type, test_type, valid_type = map(
    read_file,
    (train_type_path, test_type_path, valid_type_path),
)

# Build the BPE vocabularies

In [5]:
# BPE tokenizer for the code files.
# unk_token must be set on the model itself: listing "<unk>" only as a trainer
# special token adds it to the vocabulary, but the model then has no unknown
# token to emit for symbols it never saw during training.
t1 = Tokenizer(BPE(unk_token="<unk>"))
t1.pre_tokenizer = Whitespace()
# NOTE(review): vocab_size=52000 is a hard-coded upper bound -- confirm it is the
# intended size for this dataset.
trainer = BpeTrainer(
    vocab_size=52000,
    show_progress=True,
    special_tokens=["<pad>", "<unk>", "<start>", "<end>"],
)
# Only the training split is used to learn the merges.
t1.train(files=["../newsbt_data/train/train.token.code"], trainer=trainer)






In [6]:
# Create the output directory first so saving also works on a fresh checkout
# where ../newsbt_data/vocabulary does not exist yet.
os.makedirs("../newsbt_data/vocabulary", exist_ok=True)
t1.save("../newsbt_data/vocabulary/code_bpe.json")

In [7]:
# BPE tokenizer for the type files.
# unk_token must be set on the model itself: listing "<unk>" only as a trainer
# special token adds it to the vocabulary, but the model then has no unknown
# token to emit for symbols it never saw during training.
t2 = Tokenizer(BPE(unk_token="<unk>"))
t2.pre_tokenizer = Whitespace()
# NOTE(review): vocab_size=52000 is a hard-coded upper bound -- confirm it is the
# intended size for this dataset.
trainer = BpeTrainer(
    vocab_size=52000,
    show_progress=True,
    special_tokens=["<pad>", "<unk>", "<start>", "<end>"],
)
# Only the training split is used to learn the merges.
t2.train(files=["../newsbt_data/train/type"], trainer=trainer)






In [8]:
# Create the output directory first so saving also works on a fresh checkout
# where ../newsbt_data/vocabulary does not exist yet.
os.makedirs("../newsbt_data/vocabulary", exist_ok=True)
t2.save("../newsbt_data/vocabulary/type_bpe.json")

### Indexing the newsbt code and type data

In [9]:
# Reload both tokenizers from their saved JSON files (code first, then type).
my_tokenizer1, my_tokenizer2 = map(
    Tokenizer.from_file,
    (
        "../newsbt_data/vocabulary/code_bpe.json",
        "../newsbt_data/vocabulary/type_bpe.json",
    ),
)

In [10]:
def make_tokens(tokenizer, lines):
    """Encode every entry of ``lines`` and collect the resulting id sequences.

    Args:
        tokenizer: Object whose ``encode(text)`` returns a result exposing
            an ``ids`` attribute.
        lines: Iterable of inputs passed one by one to ``tokenizer.encode``.

    Returns:
        A list holding the ``ids`` of each encoding, in input order.
    """
    return [tokenizer.encode(text).ids for text in lines]

In [11]:
# Output directories for the indexed files, one per split.
train_out_path, test_out_path, valid_out_path = (
    '../newsbt_data/train/',
    '../newsbt_data/test/',
    '../newsbt_data/valid/',
)

In [12]:
# Encode the code lines of every split with the code tokenizer
# (train, test, valid order).
train_data1, test_data1, valid_data1 = (
    make_tokens(my_tokenizer1, split_lines)
    for split_lines in (train_code, test_code, valid_code)
)

# Hand each split's id sequences to save_idx_file under the name "code_bpe"
# (presumably written into the given output directory -- confirm in the helper).
save_idx_file(train_out_path, 'code_bpe', train_data1)
save_idx_file(test_out_path, 'code_bpe', test_data1)
save_idx_file(valid_out_path, 'code_bpe', valid_data1)

In [13]:
# Encode the type lines of every split with the type tokenizer
# (train, test, valid order).
train_data2, test_data2, valid_data2 = (
    make_tokens(my_tokenizer2, split_lines)
    for split_lines in (train_type, test_type, valid_type)
)

# Hand each split's id sequences to save_idx_file under the name "type_bpe"
# (presumably written into the given output directory -- confirm in the helper).
save_idx_file(train_out_path, 'type_bpe', train_data2)
save_idx_file(test_out_path, 'type_bpe', test_data2)
save_idx_file(valid_out_path, 'type_bpe', valid_data2)
