In [1]:
import datetime
import io
import os
import pathlib
import random

import matplotlib.pyplot as plt
import neptune
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Dense
from tensorflow.keras.models import Model
# Report whether TensorFlow can see at least one physical GPU.
if tf.config.experimental.list_physical_devices("GPU"):
    print("GPU ", "사용 가능")
else:
    print("GPU ", "사용 불가능")

GPU  사용 가능


In [2]:
class NeptuneLogger(Callback):
    """Keras callback that forwards training metrics to Neptune.

    Every entry of the ``logs`` mapping handed to a hook is sent through
    ``neptune.log_metric``: batch-level entries are named ``batch_<name>``
    and epoch-level entries ``epoch_<name>``.
    """

    def on_batch_end(self, batch, logs=None):
        """Log each metric of the finished batch as ``batch_<name>``."""
        # ``logs`` may be None; a None default also avoids sharing one mutable
        # ``{}`` between calls.
        for log_name, log_value in (logs or {}).items():
            neptune.log_metric(f'batch_{log_name}', log_value)

    def on_epoch_end(self, epoch, logs=None):
        """Log each metric of the finished epoch as ``epoch_<name>``."""
        for log_name, log_value in (logs or {}).items():
            neptune.log_metric(f'epoch_{log_name}', log_value)

In [3]:
def sen(x):
    """Split a comma-separated string into the list of its fields."""
    return x.split(',')

def remove_values_from_list(the_list, val):
    """Return a new list with the items of ``the_list`` that are not equal to ``val``."""
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item)
    return kept

def raw_data(file, vals):
    """Load an Excel sheet and clean its comma-separated '기관' column.

    Args:
        file: Path (or other source) passed straight to ``pd.read_excel``.
        vals: Iterable of names to drop from every row's list.

    Returns:
        The loaded DataFrame where '기관' holds, per row, the list of
        comma-split names with every entry equal to one of ``vals`` removed,
        plus a new '기관1' column with that list re-joined by ','.
    """
    frame = pd.read_excel(file)

    # Split each cell into a list of names, then filter the unwanted names
    # one value at a time.
    institutions = frame['기관'].apply(sen)
    for unwanted in vals:
        institutions = institutions.map(
            lambda names, unwanted=unwanted: remove_values_from_list(names, unwanted)
        )
    frame['기관'] = institutions

    # String form of the cleaned list, used later to build the vocabulary.
    frame['기관1'] = [','.join(str(name) for name in names) for names in frame['기관']]
    return frame

In [4]:
def make_word2int(listseries):
    """Build the vocabulary and its word -> integer index mapping.

    Args:
        listseries: pandas Series of comma-separated strings.

    Returns:
        ``(words, word2int)`` where ``words`` is the sorted list of unique,
        non-empty tokens (surrounding spaces stripped) and ``word2int`` maps
        each token to its position in ``words``.
    """
    # Join all rows into one string, then split it back into single tokens.
    joined = listseries.str.cat(sep=', ')
    tokens = (token.strip(' ') for token in joined.split(","))

    # Sort the unique tokens: the iteration order of a set of strings changes
    # between interpreter runs (hash randomisation), which would assign
    # different indices to the same word on every kernel restart and make
    # saved embeddings impossible to line up with a rebuilt vocabulary.
    words = sorted({token for token in tokens if token})
    word2int = {word: index for index, word in enumerate(words)}

    return words, word2int

In [5]:
def make_ngram(listseries, WINDOW_SIZE, mapping=None):
    """Build (input, label) index pairs from co-occurring words.

    For every word of every sentence, each word found within ``WINDOW_SIZE``
    positions to the left or right is paired with it, unless the neighbor is
    equal to the word itself.

    Args:
        listseries: Iterable of sentences, each a list of words
            (e.g. a pandas Series whose cells are lists).
        WINDOW_SIZE: Number of positions on each side that count as context.
        mapping: Word -> integer index dict. When omitted, the notebook-level
            ``word2int`` is used, as the original implementation did.

    Returns:
        DataFrame with columns 'input' and 'label' holding the mapped indices.
        NOTE(review): words missing from ``mapping`` become NaN here - confirm
        that the sentences are tokenised the same way as the vocabulary.
    """
    if mapping is None:
        mapping = word2int

    data = []
    for sentence in listseries:
        for idx, word in enumerate(sentence):
            start = max(idx - WINDOW_SIZE, 0)
            # +1 because the slice end is exclusive; without it the right-hand
            # context was one word shorter than the left-hand one.
            stop = idx + WINDOW_SIZE + 1
            for neighbor in sentence[start:stop]:
                if neighbor != word:
                    data.append([word, neighbor])

    df = pd.DataFrame(data, columns=['input', 'label'])
    df['input'] = df['input'].map(mapping)
    df['label'] = df['label'].map(mapping)
    return df

In [6]:
def make_pairset(data):
    """Turn the rows of ``data`` into tuples.

    Returns:
        ``(pairs, pairs_set)``: the list of row tuples in row order, and the
        set of those tuples for fast membership tests.
    """
    pairs = list(map(tuple, data.to_numpy()))
    return pairs, set(pairs)

In [7]:
def generate_batch(pairs, n_positive = 50, negative_ratio = 1, classification = False,
                   n_items = None, positive_set = None):
    """Endlessly yield training batches of positive and negative index pairs.

    Each batch holds ``n_positive`` pairs sampled from ``pairs`` (label 1) and
    ``n_positive * negative_ratio`` random pairs that are not in
    ``positive_set`` (label 0 when ``classification`` is true, otherwise -1).
    Rows are shuffled before being yielded.

    Args:
        pairs: List of positive ``(index1, index2)`` tuples.
        n_positive: Positive samples per batch; ``random.sample`` raises
            ValueError when this exceeds ``len(pairs)``.
        negative_ratio: Negative samples drawn per positive sample.
        classification: Selects the negative label (0 instead of -1).
        n_items: Exclusive upper bound for randomly drawn indices. Defaults to
            ``len(words)`` from the notebook namespace.
        positive_set: Set of positive pairs used to reject sampled negatives.
            Defaults to the notebook-level ``pairs_set``.

    Yields:
        ``({'company1': first_indices, 'company2': second_indices}, labels)``.

    Note:
        Negative sampling retries until it finds a pair outside
        ``positive_set``; it never terminates if no such pair exists.
    """
    # Total batch size = positives + negatives.
    batch_size = n_positive * (1 + negative_ratio)

    # Negative label: 0 for classification, -1 for the regression setup.
    neg_label = 0 if classification else -1

    # Backward-compatible fallbacks to the notebook-level globals.
    if n_items is None:
        n_items = len(words)
    if positive_set is None:
        positive_set = pairs_set

    while True:
        # A fresh array per batch: the yielded columns are views into it, so
        # reusing one buffer would overwrite batches the consumer still holds.
        batch = np.zeros((batch_size, 3))

        # Randomly chosen positive examples fill the first n_positive rows.
        for idx, (book_id, link_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (book_id, link_id, 1)

        # Continue right after the positives (also defined when n_positive == 0).
        idx = n_positive

        # Add negative examples until the batch is full.
        while idx < batch_size:
            random_book = random.randrange(n_items)
            random_link = random.randrange(n_items)

            # Keep the pair only if it is not a known positive example.
            if (random_book, random_link) not in positive_set:
                batch[idx, :] = (random_book, random_link, neg_label)
                idx += 1

        # Shuffle the rows so positives and negatives are interleaved.
        np.random.shuffle(batch)
        yield {'company1': batch[:, 0], 'company2': batch[:, 1]}, batch[:, 2]

In [8]:
def book_embedding_model(OPTIMIZER='Adam', embedding_size = 100, classification = False,
                         vocab_size = None):
    """Build and compile the two-input embedding model (functional API).

    Each input index is looked up in its own embedding table; the two
    embeddings are combined with a normalized dot product along the embedding
    axis, giving one number per sample.

    Args:
        OPTIMIZER: Optimizer passed to ``model.compile``.
        embedding_size: Output dimension of both embedding layers.
        classification: If true, a sigmoid ``Dense(1)`` layer is appended and
            the model is compiled with binary cross-entropy and an accuracy
            metric; otherwise it is compiled with mean squared error.
        vocab_size: Number of rows of each embedding table. Defaults to
            ``len(words)`` from the notebook namespace.

    Returns:
        The compiled Keras ``Model`` with inputs named 'company1'/'company2'.
    """
    if vocab_size is None:
        vocab_size = len(words)

    # Both inputs are a single index per sample.
    company1 = Input(name = 'company1', shape = [1])
    company2 = Input(name = 'company2', shape = [1])

    # Embedding lookups (shape will be (None, 1, embedding_size)).
    company1_embedding = Embedding(name = 'company1_embedding',
                                   input_dim = vocab_size,
                                   output_dim = embedding_size)(company1)
    company2_embedding = Embedding(name = 'company2_embedding',
                                   input_dim = vocab_size,
                                   output_dim = embedding_size)(company2)

    # Normalized dot product along the embedding axis (shape will be (None, 1, 1)).
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([company1_embedding, company2_embedding])

    # Reshape to a single number per sample (shape will be (None, 1)).
    merged = Reshape(target_shape = [1])(merged)

    if classification:
        # Classification: extra sigmoid layer, binary cross-entropy loss.
        merged = Dense(1, activation = 'sigmoid')(merged)
        loss, metrics = 'binary_crossentropy', ['accuracy']
    else:
        # Regression on the similarity: mean squared error, no extra metrics.
        loss, metrics = 'mse', None

    model = Model(inputs = [company1, company2], outputs = merged)
    model.compile(optimizer = OPTIMIZER, loss = loss, metrics = metrics)

    return model

---
# 파일읽고 단어 지우기

In [9]:
# Source workbook and the names to strip from every row's list.
file = 'embemb.xlsx'
vals =['한국거래소' ,'정부','유가증권', 'LP','금융위원회','금융감독원','한경로보', '대신증권' ,'코스닥','씽크풀','한국경제신문','대상','후보추천위원회','프랑스령 기아나' ]

# The assignment below rebinds the name `raw_data` from the loader function to
# the loaded DataFrame. Keep an alias to the function first so that re-running
# this cell does not fail with "'DataFrame' object is not callable".
if callable(raw_data):
    load_raw_data = raw_data
raw_data = load_raw_data(file, vals)

# 윈도우 사이즈 지정하고 학습데이터 만들기

In [10]:
# Seed every random source the pipeline uses, not only NumPy:
# generate_batch samples with the stdlib `random` module, and
# tf.random.set_seed fixes TensorFlow's global seed (presumably the source of
# the Keras weight initialisation - confirm for the installed TF version).
SEED = 100
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

WINDOW_SIZE = 20

# Vocabulary -> co-occurrence pairs -> list/set of positive pairs.
words, word2int = make_word2int(raw_data['기관1'])
data = make_ngram(raw_data['기관'], WINDOW_SIZE)
pairs, pairs_set = make_pairset(data)

# 임베딩 차원과 옵티마이져 배치사이즈 지정

In [11]:
# Training hyper-parameters.
EMBEDDING_SIZE = 250   # output dimension of the embedding layers
OPTIMIZER = 'Adam'
N_POSITIVE = 32768     # x: positive pairs per batch
NEGATIVE_RATIO = 1     # y: negative pairs drawn per positive pair
# Total batch size is x * (1 + y).

# 파라미터 전체지정후 넵튠켜기

In [21]:
# Neptune project, in "workspace/project" form.
WHERE = 'sgeconomics/natural'
# Experiment (model) name.
MODEL_NAME = 'init'
# API token: read from the environment so the secret is never stored in the
# notebook. The token that used to be hardcoded here must be treated as leaked
# and revoked in the Neptune UI.
TOKEN = os.environ.get('NEPTUNE_API_TOKEN')
if not TOKEN:
    raise RuntimeError('Set the NEPTUNE_API_TOKEN environment variable before running this cell.')
# Hyper-parameters recorded with the experiment.
PARAMS ={
        'window size' : WINDOW_SIZE,
        'embedding dim' : EMBEDDING_SIZE,
        'Optimizer' : OPTIMIZER,
        'positive' :  N_POSITIVE,
        'negative ratio' : NEGATIVE_RATIO
        }
# Experiment tags.
TAG=['adv']

In [22]:
# Select the Neptune project, then open an experiment that records the run's
# name, hyper-parameters and tags.
neptune.init(WHERE, api_token=TOKEN)
neptune.create_experiment(
    name=MODEL_NAME,
    params=PARAMS,
    tags=TAG,
)



https://ui.neptune.ai/sgeconomics/natural/e/NAT-6


Experiment(NAT-6)

# 에포크 지정하고 학습시키기

In [12]:
# Number of training epochs.
EPOCH = 1000

In [None]:
# Build the model and the endless batch generator, then train.
# One epoch is len(pairs) // N_POSITIVE generator steps.
model = book_embedding_model(OPTIMIZER, EMBEDDING_SIZE)
gen = generate_batch(pairs, n_positive=N_POSITIVE, negative_ratio=NEGATIVE_RATIO)
h = model.fit(
    gen,
    epochs=EPOCH,
    steps_per_epoch=len(pairs) // N_POSITIVE,
    # callbacks=[NeptuneLogger()],  # currently disabled
    verbose=1,
)

  ...
    to  
  ['...']
Train for 300 steps
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000

# 모델저장하기

In [17]:
# Save the trained model to model.h5 in the working directory, then pass that
# file name to neptune.log_artifact.
model.save('./model.h5')
neptune.log_artifact('model.h5')

In [18]:
# Weight matrix of the first embedding layer: one row per vocabulary index.
company1_layer = model.get_layer('company1_embedding')
company1_weights = company1_layer.get_weights()[0]

# Write one tab-separated vector per line to vecs.tsv and the matching word
# per line to meta.tsv. `with` closes both files even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for key, value in word2int.items():
        # Every index, including 0, is a real word; no row is reserved for padding.
        vec = company1_weights[value]
        out_m.write(key + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Pass both exported file names to neptune.log_artifact.
neptune.log_artifact('vecs.tsv')
neptune.log_artifact('meta.tsv')

# 넵튠 추적 종료하기

In [19]:
# Stop Neptune tracking for this run.
neptune.stop()