In [1]:
import numpy as np
from model.layers import Embedding, Sigmoid, Softmax, Linear
from preprocess import *

In [54]:
import numpy as np
from model.layers import Embedding, Sigmoid, Softmax, Linear
from utils import _Huffman_Tree

class BCELoss:
    def __init__(self):
        self.params = None
        self.grads = None
        self.eps = 1e-8

        self.y_pred , self.target = None, None
        self.loss = None

    def forward(self, y_pred, target, dim = 1):

        self.y_pred = y_pred
        self.target = target
        self.dim = dim
        number = target.shape[0]
        
        self.loss = -self.target * np.log(self.y_pred + self.eps) - (1 - self.target) * np.log(1 - self.y_pred + self.eps)
        self.loss = np.sum(self.loss, axis = dim)/number

        return self.loss

    def backward(self, dout = 1):
        dx = (self.y_pred - self.target) / (self.y_pred * (1 - self.y_pred) + + self.eps) 
        return dx * dout

class Hsoftmax:
    def __init__(self, vocab_size, projection, sample_size):
        self.Embedding = Embedding(vocab_size, projection)
        self.HSvector = Embedding(vocab_size - 1 , projection)
        self.sigmoid = Sigmoid()
        self.sample_size = sample_size

        self.layers = [self.Embedding, self.HSvector]

        self.params = []
        self.grads = []

        for layer in self.layers:
            self.params.append(layer.params)
            self.grads.append(layer.grads)

    def forward(self, x, label):
        '''
        inputs : 1 x D(projection)
        label : 1 x [direction_path(1, depth), idx_path(1, depth)]
        label 과 output 의 argmax를 비교해서 같으면 1 틀리면 0 을 부여한 후 이를 target vector로 설정해야됨
        '''

        dir_path = np.array(label[0])
        idx_path = np.expand_dims(label[1], 1)
        self.x = x
        
        self.hidden = self.Embedding.forward(x)

        self.hirearchy_vectors = self.HSvector.forward(dir_path)

        out = np.matmul(self.hirearchy_vectors , self.hidden.T )

        out = self.sigmoid.forward(out)

        mask = np.zeros_like(out)
        mask[mask >= 0.5] = 1

        target = np.zeros_like(out)
        target[mask == idx_path] = 1

        return out , target

    def backward(self, dout):

        W_in, W_out = self.params
        #length x 1
        d_sig = self.sigmoid.backward(dout)

        #vocab -1 x hidden
        d_lin = np.matmul(d_sig , self.hidden)
        d_h = np.matmul(dout.T, self.hirearchy_vectors)

        self.HSvector.backward(d_lin)
        self.Embedding.backward(d_h)

In [3]:
path = "./data/text8.txt"
words = recall_word(path)

In [4]:
word2idx, idx2word, count = corpus_making_version2(words)
word_id = word_id_gen(words, word2idx, count)
train_data, label = train_token_gen(word_id, 3, 7)
node, max_depth = Huffman_Tree(count)
print(count[0])
labels = node[label]

Words Count complete


Changing Word to Index: 100%|█████████████████████████████████████████| 17005207/17005207 [00:06<00:00, 2668797.45it/s]
Huffman_Tree: 100%|██████████████████████████████████████████████████████████| 49999/49999 [00:00<00:00, 187025.89it/s]


[0, 418390]


In [9]:
labels

array([[array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0]),
        array([49998, 49996, 49993, 49986, 49974, 49950, 49911, 49840, 49714,
       49475, 49037])],
       [array([1, 0, 1, 1, 1]),
        array([49998, 49997, 49994, 49989, 49979])],
       [array([1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1]),
        array([49998, 49997, 49994, 49989, 49978, 49959, 49927, 49867, 49764,
       49570, 49211, 48567, 47461, 45672, 42947])],
       [array([1, 1, 1, 0, 1, 0, 1, 1, 1, 0]),
        array([49998, 49997, 49995, 49991, 49982, 49966, 49938, 49889, 49805,
       49648])],
       [array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1]),
        array([49998, 49997, 49994, 49989, 49978, 49958, 49924, 49860, 49750,
       49544, 49164])],
       [array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
        array([49998, 49997, 49995, 49990, 49981, 49964, 49934, 49880, 49786,
       49611, 49288])]], dtype=object)

In [11]:
train_data

46

In [55]:
total_num = len(word2idx)
model = Hsoftmax(total_num, 200, 3)
criterion = BCELoss()

In [8]:
label = np.array([[path, idx_path] for _, _, idx_path ,path in b_tree])

In [58]:
batch_train = train_set_idx[np.random.choice(100, 1)]

x_train = batch_train[:,3]
label_train_idx = np.delete(batch_train , 3)

#N x [path(2C), idx_path(2C)]
label_train = label[np.random.choice(label_train_idx, 3)]

NameError: name 'train_set_idx' is not defined

In [59]:
for x, lab in zip(repeat(x_train), label_train):
    
    x = [x]
    y, t = model.forward(x, lab)
    loss = criterion.forward(y, t, dim = 0)
    dloss = criterion.backward()
    model.backward(dloss)

NameError: name 'x_train' is not defined

In [55]:
label_train

array([[array([47157, 47155, 47152, 47145, 47132, 47107, 47065, 46991, 46859,
       46609, 46149, 45345]),
        array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1])],
       [array([47157, 47155, 47152, 47146, 47134, 47112, 47075, 47004, 46881,
       46647, 46218, 45467, 44196, 42198, 39199, 34931, 29140]),
        array([0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1])],
       [array([47157, 47156, 47154, 47149, 47139, 47121, 47091, 47035, 46936,
       46754, 46417]),
        array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1])]], dtype=object)

In [57]:
from itertools import repeat

In [16]:
import os
import zipfile
import collections
import math
import os
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf


url = 'http://mattmahoney.net/dc/'


In [5]:
def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

Found and verified text8.zip


In [19]:
filename = 'text8.zip'
def read_data(filename):
    ''' zip 파일에 포함된 텍스트 파일을 읽어서 단어 리스트 생성. 포함된 파일은 1개. 
    zip 파일은 30mb, txt 파일은 100mb. '''
    with zipfile.ZipFile(filename) as f:
        names = f.namelist()                # ['text8']
        contents = f.read(names[0])         # 크기 : 100,000,000바이트
        text = tf.compat.as_str(contents)   # 크기 : 100,000,000
        return text.split()                 # 갯수 : 17005207


vocabulary = read_data(filename)
print('Data size', len(vocabulary))         # 17005207

Data size 17005207


In [20]:
# Step 2: 사전을 구축하고 거의 등장하지 않는 단어를 UNK 토큰으로 대체.
# UNK는 unknown 약자로 출현 빈도가 낮은 단어들을 모두 대체한다. UNK 갯수는 418391.
vocabulary_size = 50000


def build_dataset(words, n_words):
    # count : [['UNK', -1], ('the', 1061396), ('of', 593677), ('and', 416629), ...]
    # 크기는 50,000개. UNK가 들어 있고, -1을 뺐으니까 처음에 전달된 크기 사용.
    # 빈도가 높은 5만개 추출.
    # count에 포함된 마지막 데이터는 ('hif', 9). 9번 나왔는데 드물다고 얘기할 수 있는지는 의문.
    unique = collections.Counter(words) # 중복 단어 제거
    print(unique)
    orders = unique.most_common(n_words - 1)        # 단어에 대한 빈도 계산. 갯수를 지정하지 않으면 전체 계산.
    count = [['UNK', -1]]
    count.extend(orders)

    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)


    data = []
    for word in words:
        if word in dictionary:          # word가 dictionary에 존재한다면
            index = dictionary[word]
        else:
            index = 0                   # UNK는 0번째에 위치
            count[0][1] += 1            # 갯수 : 418391
        data.append(index)


    return data, count, list(dictionary.keys())

data, count, ordered_words = build_dataset(vocabulary, vocabulary_size)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [21]:
ordered_words

['UNK',
 'the',
 'of',
 'and',
 'one',
 'in',
 'a',
 'to',
 'zero',
 'nine',
 'two',
 'is',
 'as',
 'eight',
 'for',
 's',
 'five',
 'three',
 'was',
 'by',
 'that',
 'four',
 'six',
 'seven',
 'with',
 'on',
 'are',
 'it',
 'from',
 'or',
 'his',
 'an',
 'be',
 'this',
 'which',
 'at',
 'he',
 'also',
 'not',
 'have',
 'were',
 'has',
 'but',
 'other',
 'their',
 'its',
 'first',
 'they',
 'some',
 'had',
 'all',
 'more',
 'most',
 'can',
 'been',
 'such',
 'many',
 'who',
 'new',
 'used',
 'there',
 'after',
 'when',
 'into',
 'american',
 'time',
 'these',
 'only',
 'see',
 'may',
 'than',
 'world',
 'i',
 'b',
 'would',
 'd',
 'no',
 'however',
 'between',
 'about',
 'over',
 'years',
 'states',
 'people',
 'war',
 'during',
 'united',
 'known',
 'if',
 'called',
 'use',
 'th',
 'system',
 'often',
 'state',
 'so',
 'history',
 'will',
 'up',
 'while',
 'where',
 'city',
 'being',
 'english',
 'then',
 'any',
 'both',
 'under',
 'out',
 'made',
 'well',
 'her',
 'e',
 'number',
 'g

In [164]:
data

[5234,
 3081,
 12,
 6,
 195,
 2,
 3134,
 46,
 59,
 156,
 128,
 742,
 477,
 10572,
 134,
 1,
 27350,
 2,
 1,
 103,
 855,
 3,
 1,
 15068,
 0,
 2,
 1,
 151,
 855,
 3581,
 1,
 195,
 11,
 191,
 59,
 5,
 6,
 10713,
 215,
 7,
 1325,
 105,
 455,
 20,
 59,
 2732,
 363,
 7,
 3673,
 1,
 709,
 2,
 372,
 27,
 41,
 37,
 54,
 540,
 98,
 12,
 6,
 1424,
 2758,
 19,
 568,
 687,
 7089,
 1,
 248,
 5234,
 11,
 1053,
 28,
 1,
 321,
 249,
 44612,
 2878,
 793,
 187,
 5234,
 12,
 6,
 201,
 603,
 11,
 1,
 1135,
 20,
 2622,
 26,
 8984,
 3,
 280,
 32,
 4148,
 142,
 60,
 26,
 6438,
 4187,
 2,
 154,
 33,
 363,
 5234,
 37,
 1138,
 7,
 448,
 345,
 1819,
 20,
 4861,
 1,
 6754,
 2,
 7574,
 1775,
 567,
 1,
 94,
 1,
 248,
 11065,
 12,
 52,
 7089,
 90,
 27,
 271,
 38,
 5949,
 4862,
 20300,
 29,
 0,
 42,
 318,
 6,
 25637,
 528,
 7574,
 372,
 5,
 259,
 2,
 154,
 26,
 1207,
 12,
 7574,
 201,
 1577,
 3,
 15201,
 333,
 1775,
 7089,
 4861,
 345,
 765,
 161,
 407,
 5691,
 756,
 2,
 4106,
 1132,
 4332,
 1537,
 3,
 568,
 8118,
 99

In [22]:
# Step 3: skip-gram 모델에 사용할 학습 데이터를 생성할 함수 작성

def generate_batch(data, batch_size, num_skips, skip_window, data_index):
    ''' 
    Stochastic Gradient Descent 알고리즘에 사용할 minibatch 생성.
    :param data : 단어 인덱스 리스트
    :param batch_size : SGD 알고리즘에 적용할 데이터 갯수. 한 번에 처리할 크기.
    :param num_skips : context window에서 구축할 (target, context) 쌍의 갯수.
    :param skip_window : skip-gram 모델에 사용할 윈도우 크기.
         1이라면 목표 단어(target) 양쪽에 1개 단어이므로 context window 크기는 3이 된다. (단어, target, 단어)
         2라면 5가 된다. (단어 단어 target 단어 단어)
    :param data_index : 첫 번째 context window에 들어갈 data에서의 시작 위치.
    '''

    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    temp = 'batch_size {}, num_skips {}, skip_window {}, data_index {}'

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # span은 assert에 기술한 코드 때문에 항상 num_skips보다 크다.
    span = 2 * skip_window + 1                      # context = skip_window + target + skip_window
    assert span > num_skips

    # deque
    # 처음과 마지막의 양쪽 끝에서 일어나는 입출력에 대해 가장 좋은 성능을 내는 자료구조
    # maxlen 옵션이 없으면 크기 제한도 없고, 있다면 지정한 크기만큼만 사용 가능.
    # maxlen을 3으로 전달하면 3개만 저장할 수 있고, 새로운 요소를 추가하면 반대쪽 요소가 자동으로 삭제됨.
    # 여기서는 자동 삭제 기능 때문에 사용.
    # data_index 번째부터 span 크기만큼 단어 인덱스 저장
    # 첫 번째 context 윈도우 구성
    
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)   # 다음 단어 인덱스로 이동. len(data) = 17005207

    for i in range(batch_size // num_skips):

        targets = list(range(span))     # 1. 0부터 span-1까지의 정수로 채운 다음
        targets.pop(skip_window)        # 2. skip_window번째 삭제
        np.random.shuffle(targets)      # 3. 난수를 사용해서 섞는다.

        start = i * num_skips
        batch[start:start+num_skips] = buffer[skip_window]

        for j in range(num_skips):
            labels[start+j, 0] = buffer[targets[j]]
            #print(targets[j], '**')     # (2, 0), (0, 2), (0, 2), (0, 2)

        # 새로운 요소가 들어가면서 가장 먼저 들어간 데이터 삭제
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels, data_index


In [23]:
batch, labels, data_index = generate_batch(data, batch_size=15, num_skips=5, skip_window=5, data_index=0)

In [150]:
labels, batch

(array([[3134],
        [  59],
        [ 195],
        [3081],
        [ 156],
        [  12],
        [ 156],
        [   6],
        [   2],
        [ 195],
        [ 156],
        [  59],
        [   2],
        [3134],
        [ 128]]),
 array([   2,    2,    2,    2,    2, 3134, 3134, 3134, 3134, 3134,   46,
          46,   46,   46,   46]))

In [129]:
import torch

In [130]:
torch.Tensor(a)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])

In [131]:
emb = torch.nn.Embedding(3,2)

In [141]:
test = torch.tensor(np.array([[1],[2],[1]])).to(torch.long)

In [146]:
emb(test.flatten())

tensor([[-0.6819,  0.1654],
        [ 1.6444, -0.6906],
        [-0.6819,  0.1654]], grad_fn=<EmbeddingBackward>)

In [145]:
test.flatten()

tensor([1, 2, 1])

In [153]:
np.random.permutation(100)

array([47, 61, 72, 77, 62, 58, 36, 27,  4, 57, 21, 42, 95, 37, 78,  2, 53,
       20, 68, 46, 35,  1, 91, 63, 64, 67,  3, 70, 55, 65, 94, 25, 84, 96,
       60, 89, 15,  6,  5, 83, 26, 45, 31, 18, 98, 10, 86, 66, 41, 85, 75,
       49, 24, 11, 14, 76, 97,  0, 43, 30, 38, 99, 59, 22, 82, 52, 13, 29,
       50, 34, 17, 40, 39, 92, 81, 33, 74, 48, 19, 73, 88, 87, 54, 16, 23,
       28, 69, 80, 56, 93, 44,  8, 12, 32, 79, 51,  7,  9, 90, 71])

In [156]:
import pickle

with open("./bestmodel.pickle", 'rb') as f:
    x = pickle.load(f)

In [157]:
x

[array([[-1.54175470e+09, -1.79445650e+09, -1.87232727e+09, ...,
         -1.16198965e+09, -1.01097462e+09, -5.63138488e+08],
        [-6.66701178e+09, -7.71014195e+09, -8.05382719e+09, ...,
         -4.96211278e+09, -4.39597926e+09, -2.41403984e+09],
        [-3.38580451e+09, -4.04628273e+09, -4.20251767e+09, ...,
         -2.68509276e+09, -2.16867249e+09, -1.28164479e+09],
        ...,
        [ 6.02995247e-01,  4.42277608e-01,  1.59471883e-02, ...,
          1.78316802e-01,  3.63981967e-01,  6.16207138e-01],
        [-3.27359057e+08, -3.32138774e+08, -3.55516187e+08, ...,
         -1.84993688e+08, -2.38512356e+08, -9.87504215e+07],
        [ 3.31288206e-01,  6.36649251e-01,  1.85021861e-01, ...,
          3.60247087e-02,  6.85274421e-01,  1.12200348e-01]]),
 array([[0.42303893, 0.97166472, 0.92592606, ..., 0.9241728 , 0.04349063,
         0.35868405],
        [0.79671503, 0.47587896, 0.57932334, ..., 0.03032535, 0.74273915,
         0.09871113],
        [0.74237253, 0.07914116, 0.54

In [198]:
np.random.choice([1,2,3], 5, p =[0.2,0.5,0.2])

ValueError: probabilities do not sum to 1

In [188]:
wo = {1:3,2:4}

In [178]:
def ch(wo):
    wo.pop(1)

In [179]:
ch(wo)

In [180]:
wo

{2: 4}

In [184]:
class ch:
    def __init__(self, wh):
    
        self.wh = wh
        self.wh.pop(1)

In [189]:
a = ch(wo)

In [190]:
wo

{2: 4}

In [193]:
a.wh

{2: 4}

In [194]:
1 - (1e-5 / 0.01) ** 0.5

0.9683772233983162

In [195]:
400000/len(word_id)

0.02352220705105207

In [202]:
np.random.choice([1,2], 1 , p = [0.9,0.1])

array([1])

In [203]:
import random

In [206]:
np.random.choice(10,1)

array([2])